diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -15,7 +15,7 @@ "logits/rejected": -2.649298906326294, "logps/chosen": -296.01092529296875, "logps/rejected": -290.09039306640625, - "loss": 0.7406, + "loss": 0.8979, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -29,7 +29,7 @@ "logits/rejected": -2.7385971546173096, "logps/chosen": -339.8030090332031, "logps/rejected": -247.9777374267578, - "loss": 0.829, + "loss": 1.0768, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -39,3328 +39,3328 @@ { "epoch": 0.01, "learning_rate": 4.166666666666666e-08, - "logits/chosen": -2.6828837394714355, - "logits/rejected": -2.717893123626709, - "logps/chosen": -246.95411682128906, - "logps/rejected": -230.5216522216797, - "loss": 0.7515, - "rewards/accuracies": 0.46875, - "rewards/chosen": -1.3528901035897434e-05, - "rewards/margins": -0.00024310783192049712, - "rewards/rejected": 0.00022957894543651491, + "logits/chosen": -2.6829440593719482, + "logits/rejected": -2.71822190284729, + "logps/chosen": -246.91204833984375, + "logps/rejected": -230.60743713378906, + "loss": 0.9685, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0004071760340593755, + "rewards/margins": 0.0010353168472647667, + "rewards/rejected": -0.0006281408714130521, "step": 4 }, { "epoch": 0.01, "learning_rate": 6.25e-08, - "logits/chosen": -2.6765449047088623, - "logits/rejected": -2.7000393867492676, - "logps/chosen": -300.23077392578125, - "logps/rejected": -223.28292846679688, - "loss": 0.8154, - "rewards/accuracies": 0.4375, - "rewards/chosen": -9.068308281712234e-05, - "rewards/margins": -0.00012977616279385984, - "rewards/rejected": 3.9093021769076586e-05, + "logits/chosen": -2.6755082607269287, + "logits/rejected": -2.6991946697235107, + "logps/chosen": -300.23638916015625, + "logps/rejected": -223.28158569335938, + "loss": 1.0262, + "rewards/accuracies": 0.46875, + "rewards/chosen": -0.00014714643475599587, + "rewards/margins": -0.00019944699306506664, + "rewards/rejected": 5.230050737736747e-05, "step": 6 }, { "epoch": 0.02, "learning_rate": 8.333333333333333e-08, - "logits/chosen": -2.599257469177246, - "logits/rejected": -2.6808290481567383, - "logps/chosen": -276.9845886230469, - "logps/rejected": -264.2096252441406, - "loss": 0.7941, - "rewards/accuracies": 0.4375, - "rewards/chosen": 0.0009412524523213506, - "rewards/margins": -0.00016639442765153944, - "rewards/rejected": 0.0011076468508690596, + "logits/chosen": -2.5998125076293945, + "logits/rejected": -2.68131685256958, + "logps/chosen": -276.9141845703125, + "logps/rejected": -264.1656799316406, + "loss": 0.9865, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0016451478004455566, + "rewards/margins": 9.797064558370039e-05, + "rewards/rejected": 0.0015471772057935596, "step": 8 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-07, - "logits/chosen": -2.6191229820251465, - "logits/rejected": -2.5384933948516846, - "logps/chosen": -259.14239501953125, - "logps/rejected": -247.5746307373047, - "loss": 0.8451, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.002248743548989296, - "rewards/margins": 0.00199419679120183, - "rewards/rejected": 0.0002545467286836356, + "logits/chosen": -2.6202027797698975, + "logits/rejected": -2.5399973392486572, + "logps/chosen": -258.94354248046875, + "logps/rejected": -247.22210693359375, + "loss": 1.0437, + "rewards/accuracies": 0.546875, + "rewards/chosen": 0.0042375801131129265, + "rewards/margins": 0.0004578869848046452, + "rewards/rejected": 0.00377969304099679, "step": 10 }, { "epoch": 0.03, "learning_rate": 1.25e-07, - "logits/chosen": -2.6755356788635254, - "logits/rejected": -2.6597650051116943, - "logps/chosen": -240.25408935546875, - "logps/rejected": -215.57801818847656, - "loss": 0.7591, - "rewards/accuracies": 0.46875, - "rewards/chosen": 0.003154001198709011, - "rewards/margins": 0.0002609168004710227, - "rewards/rejected": 0.002893084194511175, + "logits/chosen": -2.6750617027282715, + "logits/rejected": -2.6589012145996094, + "logps/chosen": -239.78677368164062, + "logps/rejected": -215.15127563476562, + "loss": 0.898, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.007827259600162506, + "rewards/margins": 0.0006665607215836644, + "rewards/rejected": 0.007160698529332876, "step": 12 }, { "epoch": 0.03, "learning_rate": 1.4583333333333335e-07, - "logits/chosen": -2.7294678688049316, - "logits/rejected": -2.6904141902923584, - "logps/chosen": -315.6171875, - "logps/rejected": -283.3052978515625, - "loss": 0.7765, - "rewards/accuracies": 0.546875, - "rewards/chosen": 0.007693930994719267, - "rewards/margins": 0.0004415067960508168, - "rewards/rejected": 0.007252424024045467, + "logits/chosen": -2.7313432693481445, + "logits/rejected": -2.692768096923828, + "logps/chosen": -314.3973388671875, + "logps/rejected": -282.2184753417969, + "loss": 0.9963, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.019892539829015732, + "rewards/margins": 0.0017717612208798528, + "rewards/rejected": 0.01812077686190605, "step": 14 }, { "epoch": 0.03, "learning_rate": 1.6666666666666665e-07, - "logits/chosen": -2.674433469772339, - "logits/rejected": -2.635763168334961, - "logps/chosen": -280.9161071777344, - "logps/rejected": -286.4499816894531, - "loss": 0.7959, - "rewards/accuracies": 0.390625, - "rewards/chosen": 0.010045092552900314, - "rewards/margins": -0.0013794752303510904, - "rewards/rejected": 0.011424567550420761, + "logits/chosen": -2.677159070968628, + "logits/rejected": -2.638185977935791, + "logps/chosen": -278.99517822265625, + "logps/rejected": -284.5109558105469, + "loss": 0.9918, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.029254097491502762, + "rewards/margins": -0.0015605262015014887, + "rewards/rejected": 0.030814625322818756, "step": 16 }, { "epoch": 0.04, "learning_rate": 1.875e-07, - "logits/chosen": -2.6775548458099365, - "logits/rejected": -2.6488728523254395, - "logps/chosen": -281.817138671875, - "logps/rejected": -226.1727294921875, - "loss": 0.7651, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.021169912070035934, - "rewards/margins": 0.003915595356374979, - "rewards/rejected": 0.017254315316677094, + "logits/chosen": -2.6816577911376953, + "logits/rejected": -2.6529228687286377, + "logps/chosen": -278.4587097167969, + "logps/rejected": -222.79689025878906, + "loss": 0.9539, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.05475425347685814, + "rewards/margins": 0.003741688560694456, + "rewards/rejected": 0.051012564450502396, "step": 18 }, { "epoch": 0.04, "learning_rate": 2.0833333333333333e-07, - "logits/chosen": -2.6540913581848145, - "logits/rejected": -2.652208089828491, - "logps/chosen": -255.7197265625, - "logps/rejected": -211.753662109375, - "loss": 0.812, + "logits/chosen": -2.6595523357391357, + "logits/rejected": -2.657313585281372, + "logps/chosen": -251.251220703125, + "logps/rejected": -207.63731384277344, + "loss": 0.975, "rewards/accuracies": 0.65625, - "rewards/chosen": 0.031227679923176765, - "rewards/margins": 0.004203509539365768, - "rewards/rejected": 0.027024172246456146, + "rewards/chosen": 0.07591262459754944, + "rewards/margins": 0.007725052069872618, + "rewards/rejected": 0.06818757951259613, "step": 20 }, { "epoch": 0.05, "learning_rate": 2.2916666666666663e-07, - "logits/chosen": -2.7285475730895996, - "logits/rejected": -2.690129280090332, - "logps/chosen": -283.28863525390625, - "logps/rejected": -303.4035339355469, - "loss": 0.8139, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.04402179270982742, - "rewards/margins": 0.0007559025543741882, - "rewards/rejected": 0.04326589033007622, + "logits/chosen": -2.734027624130249, + "logits/rejected": -2.698840618133545, + "logps/chosen": -278.4404602050781, + "logps/rejected": -298.30548095703125, + "loss": 1.0373, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.09250329434871674, + "rewards/margins": -0.0017432029126212, + "rewards/rejected": 0.09424649924039841, "step": 22 }, { "epoch": 0.05, "learning_rate": 2.5e-07, - "logits/chosen": -2.625800609588623, - "logits/rejected": -2.6034979820251465, - "logps/chosen": -281.58795166015625, - "logps/rejected": -234.80442810058594, - "loss": 0.9627, - "rewards/accuracies": 0.546875, - "rewards/chosen": 0.063897043466568, - "rewards/margins": 0.006137948948889971, - "rewards/rejected": 0.05775909498333931, + "logits/chosen": -2.638693332672119, + "logits/rejected": -2.6179325580596924, + "logps/chosen": -277.06597900390625, + "logps/rejected": -229.88592529296875, + "loss": 1.1041, + "rewards/accuracies": 0.484375, + "rewards/chosen": 0.10911618173122406, + "rewards/margins": 0.002172104548662901, + "rewards/rejected": 0.10694406926631927, "step": 24 }, { "epoch": 0.05, "learning_rate": 2.708333333333333e-07, - "logits/chosen": -2.7099432945251465, - "logits/rejected": -2.715182304382324, - "logps/chosen": -313.54315185546875, - "logps/rejected": -269.708984375, - "loss": 0.7945, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.09343694150447845, - "rewards/margins": 0.011352106928825378, - "rewards/rejected": 0.08208482712507248, + "logits/chosen": -2.7313644886016846, + "logits/rejected": -2.7374155521392822, + "logps/chosen": -309.45098876953125, + "logps/rejected": -265.25311279296875, + "loss": 0.9993, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.1343587040901184, + "rewards/margins": 0.007715387269854546, + "rewards/rejected": 0.1266433298587799, "step": 26 }, { "epoch": 0.06, "learning_rate": 2.916666666666667e-07, - "logits/chosen": -2.683866024017334, - "logits/rejected": -2.666459083557129, - "logps/chosen": -289.9292297363281, - "logps/rejected": -266.2529602050781, - "loss": 0.963, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.11043840646743774, - "rewards/margins": 0.009698072448372841, - "rewards/rejected": 0.10074033588171005, + "logits/chosen": -2.7133333683013916, + "logits/rejected": -2.69704008102417, + "logps/chosen": -286.62066650390625, + "logps/rejected": -262.74371337890625, + "loss": 1.1579, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.14352405071258545, + "rewards/margins": 0.0076911491341888905, + "rewards/rejected": 0.13583290576934814, "step": 28 }, { "epoch": 0.06, "learning_rate": 3.1249999999999997e-07, - "logits/chosen": -2.639746904373169, - "logits/rejected": -2.635244369506836, - "logps/chosen": -250.9815673828125, - "logps/rejected": -256.66644287109375, - "loss": 0.782, - "rewards/accuracies": 0.609375, - "rewards/chosen": 0.11398710310459137, - "rewards/margins": 0.006070964969694614, - "rewards/rejected": 0.10791613161563873, + "logits/chosen": -2.675769805908203, + "logits/rejected": -2.6729490756988525, + "logps/chosen": -248.2070770263672, + "logps/rejected": -253.27151489257812, + "loss": 0.9782, + "rewards/accuracies": 0.578125, + "rewards/chosen": 0.14173206686973572, + "rewards/margins": -0.0001335320994257927, + "rewards/rejected": 0.1418655961751938, "step": 30 }, { "epoch": 0.07, "learning_rate": 3.333333333333333e-07, - "logits/chosen": -2.6974880695343018, - "logits/rejected": -2.791822910308838, - "logps/chosen": -311.99462890625, - "logps/rejected": -263.93084716796875, - "loss": 0.7708, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.13753816485404968, - "rewards/margins": 0.02569321170449257, - "rewards/rejected": 0.11184494942426682, + "logits/chosen": -2.732271909713745, + "logits/rejected": -2.826098918914795, + "logps/chosen": -309.562255859375, + "logps/rejected": -261.3126220703125, + "loss": 0.9497, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.1618621051311493, + "rewards/margins": 0.023834748193621635, + "rewards/rejected": 0.1380273401737213, "step": 32 }, { "epoch": 0.07, "learning_rate": 3.541666666666667e-07, - "logits/chosen": -2.683668613433838, - "logits/rejected": -2.696622610092163, - "logps/chosen": -290.19091796875, - "logps/rejected": -280.1837158203125, - "loss": 0.8665, - "rewards/accuracies": 0.640625, - "rewards/chosen": 0.13525506854057312, - "rewards/margins": 0.01829436421394348, - "rewards/rejected": 0.11696070432662964, + "logits/chosen": -2.7240211963653564, + "logits/rejected": -2.733490467071533, + "logps/chosen": -287.8714294433594, + "logps/rejected": -277.6007080078125, + "loss": 1.1113, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.15844975411891937, + "rewards/margins": 0.01565898023545742, + "rewards/rejected": 0.1427907645702362, "step": 34 }, { "epoch": 0.08, "learning_rate": 3.75e-07, - "logits/chosen": -2.703218698501587, - "logits/rejected": -2.66160249710083, - "logps/chosen": -331.1356201171875, - "logps/rejected": -277.3096618652344, - "loss": 0.8742, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.14532874524593353, - "rewards/margins": 0.02477584406733513, - "rewards/rejected": 0.1205528974533081, + "logits/chosen": -2.745270252227783, + "logits/rejected": -2.7103464603424072, + "logps/chosen": -328.4593505859375, + "logps/rejected": -273.5399475097656, + "loss": 1.0344, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.17209194600582123, + "rewards/margins": 0.013841740787029266, + "rewards/rejected": 0.15825021266937256, "step": 36 }, { "epoch": 0.08, "learning_rate": 3.958333333333333e-07, - "logits/chosen": -2.605548620223999, - "logits/rejected": -2.507476806640625, - "logps/chosen": -275.4030456542969, - "logps/rejected": -268.85137939453125, - "loss": 0.784, - "rewards/accuracies": 0.671875, - "rewards/chosen": 0.14142535626888275, - "rewards/margins": 0.016934942454099655, - "rewards/rejected": 0.1244904026389122, + "logits/chosen": -2.660860300064087, + "logits/rejected": -2.5762624740600586, + "logps/chosen": -272.0681457519531, + "logps/rejected": -265.1504211425781, + "loss": 0.9973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17477430403232574, + "rewards/margins": 0.013274325989186764, + "rewards/rejected": 0.1614999771118164, "step": 38 }, { "epoch": 0.08, "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -2.599735736846924, - "logits/rejected": -2.6093332767486572, - "logps/chosen": -259.8871154785156, - "logps/rejected": -238.1406707763672, - "loss": 0.7943, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.1309652328491211, - "rewards/margins": 0.03087472915649414, - "rewards/rejected": 0.10009051114320755, + "logits/chosen": -2.645998239517212, + "logits/rejected": -2.6596755981445312, + "logps/chosen": -257.1060485839844, + "logps/rejected": -234.7177734375, + "loss": 0.9584, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.15877613425254822, + "rewards/margins": 0.024456296116113663, + "rewards/rejected": 0.13431984186172485, "step": 40 }, { "epoch": 0.09, "learning_rate": 4.375e-07, - "logits/chosen": -2.678443670272827, - "logits/rejected": -2.6648781299591064, - "logps/chosen": -245.9468994140625, - "logps/rejected": -246.77980041503906, - "loss": 0.7385, - "rewards/accuracies": 0.609375, - "rewards/chosen": 0.12853887677192688, - "rewards/margins": 0.01891843043267727, - "rewards/rejected": 0.10962046682834625, + "logits/chosen": -2.7350802421569824, + "logits/rejected": -2.7258996963500977, + "logps/chosen": -242.8642578125, + "logps/rejected": -243.5854034423828, + "loss": 0.8996, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.1593654453754425, + "rewards/margins": 0.017801081761717796, + "rewards/rejected": 0.14156436920166016, "step": 42 }, { "epoch": 0.09, "learning_rate": 4.5833333333333327e-07, - "logits/chosen": -2.603628158569336, - "logits/rejected": -2.5344913005828857, - "logps/chosen": -331.3908996582031, - "logps/rejected": -259.43768310546875, - "loss": 0.7334, - "rewards/accuracies": 0.828125, - "rewards/chosen": 0.17403613030910492, - "rewards/margins": 0.08045163750648499, - "rewards/rejected": 0.09358450770378113, + "logits/chosen": -2.650637626647949, + "logits/rejected": -2.592097282409668, + "logps/chosen": -327.2317810058594, + "logps/rejected": -254.522705078125, + "loss": 0.8849, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21562764048576355, + "rewards/margins": 0.07289348542690277, + "rewards/rejected": 0.14273415505886078, "step": 44 }, { "epoch": 0.1, "learning_rate": 4.791666666666667e-07, - "logits/chosen": -2.6753909587860107, - "logits/rejected": -2.6965551376342773, - "logps/chosen": -255.45181274414062, - "logps/rejected": -280.82305908203125, - "loss": 0.7035, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.1322551965713501, - "rewards/margins": 0.049186140298843384, - "rewards/rejected": 0.0830690786242485, + "logits/chosen": -2.7370920181274414, + "logits/rejected": -2.7568929195404053, + "logps/chosen": -252.72396850585938, + "logps/rejected": -275.8587646484375, + "loss": 0.8863, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15953370928764343, + "rewards/margins": 0.02682127058506012, + "rewards/rejected": 0.1327124387025833, "step": 46 }, { "epoch": 0.1, "learning_rate": 5e-07, - "logits/chosen": -2.5472424030303955, - "logits/rejected": -2.5995569229125977, - "logps/chosen": -269.2645568847656, - "logps/rejected": -236.20904541015625, - "loss": 0.7093, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.11034946143627167, - "rewards/margins": 0.03396361321210861, - "rewards/rejected": 0.07638585567474365, + "logits/chosen": -2.6029889583587646, + "logits/rejected": -2.656250476837158, + "logps/chosen": -264.8302307128906, + "logps/rejected": -231.60470581054688, + "loss": 0.8326, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.154692605137825, + "rewards/margins": 0.03226301446557045, + "rewards/rejected": 0.12242957949638367, "step": 48 }, { "epoch": 0.1, "learning_rate": 4.999731868769026e-07, - "logits/chosen": -2.683974266052246, - "logits/rejected": -2.6413497924804688, - "logps/chosen": -319.9888916015625, - "logps/rejected": -328.697265625, - "loss": 0.938, - "rewards/accuracies": 0.734375, - "rewards/chosen": 0.1664372831583023, - "rewards/margins": 0.0846775621175766, - "rewards/rejected": 0.08175972104072571, + "logits/chosen": -2.7349958419799805, + "logits/rejected": -2.7008442878723145, + "logps/chosen": -314.25421142578125, + "logps/rejected": -323.0506591796875, + "loss": 1.1576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22378462553024292, + "rewards/margins": 0.08555849641561508, + "rewards/rejected": 0.13822612166404724, "step": 50 }, { "epoch": 0.11, "learning_rate": 4.998927532591591e-07, - "logits/chosen": -2.695955514907837, - "logits/rejected": -2.660606861114502, - "logps/chosen": -229.35089111328125, - "logps/rejected": -203.45083618164062, - "loss": 0.7139, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.10423125326633453, - "rewards/margins": 0.0388810932636261, - "rewards/rejected": 0.06535016745328903, + "logits/chosen": -2.748030662536621, + "logits/rejected": -2.717869281768799, + "logps/chosen": -223.426513671875, + "logps/rejected": -197.9170684814453, + "loss": 0.862, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.16347481310367584, + "rewards/margins": 0.04278700798749924, + "rewards/rejected": 0.12068779766559601, "step": 52 }, { "epoch": 0.11, "learning_rate": 4.997587164001815e-07, - "logits/chosen": -2.6461551189422607, - "logits/rejected": -2.5729639530181885, - "logps/chosen": -267.51885986328125, - "logps/rejected": -215.0835723876953, - "loss": 0.7405, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.12834982573986053, - "rewards/margins": 0.09162335097789764, - "rewards/rejected": 0.03672647103667259, + "logits/chosen": -2.6950979232788086, + "logits/rejected": -2.6328446865081787, + "logps/chosen": -262.1427001953125, + "logps/rejected": -208.02978515625, + "loss": 0.9206, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.18211160600185394, + "rewards/margins": 0.0748472511768341, + "rewards/rejected": 0.10726435482501984, "step": 54 }, { "epoch": 0.12, "learning_rate": 4.99571105051544e-07, - "logits/chosen": -2.5733587741851807, - "logits/rejected": -2.5592525005340576, - "logps/chosen": -275.8002014160156, - "logps/rejected": -224.9879150390625, - "loss": 0.7809, - "rewards/accuracies": 0.640625, - "rewards/chosen": 0.12791308760643005, - "rewards/margins": 0.06309904903173447, - "rewards/rejected": 0.0648140236735344, + "logits/chosen": -2.6122078895568848, + "logits/rejected": -2.6001956462860107, + "logps/chosen": -269.42315673828125, + "logps/rejected": -216.87860107421875, + "loss": 0.9742, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1916835755109787, + "rewards/margins": 0.04577651992440224, + "rewards/rejected": 0.14590705931186676, "step": 56 }, { "epoch": 0.12, "learning_rate": 4.993299594568162e-07, - "logits/chosen": -2.6415131092071533, - "logits/rejected": -2.591508626937866, - "logps/chosen": -280.14813232421875, - "logps/rejected": -254.98541259765625, - "loss": 1.0116, - "rewards/accuracies": 0.671875, - "rewards/chosen": 0.12386615574359894, - "rewards/margins": 0.06462900340557098, - "rewards/rejected": 0.05923713743686676, + "logits/chosen": -2.6804134845733643, + "logits/rejected": -2.6375977993011475, + "logps/chosen": -272.0626525878906, + "logps/rejected": -243.4908447265625, + "loss": 1.2327, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20472098886966705, + "rewards/margins": 0.030538024380803108, + "rewards/rejected": 0.1741829514503479, "step": 58 }, { "epoch": 0.13, "learning_rate": 4.990353313429303e-07, - "logits/chosen": -2.635352611541748, - "logits/rejected": -2.581376314163208, - "logps/chosen": -300.7115173339844, - "logps/rejected": -253.92111206054688, - "loss": 0.7298, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.12783905863761902, - "rewards/margins": 0.11859284341335297, - "rewards/rejected": 0.00924623105674982, + "logits/chosen": -2.658156156539917, + "logits/rejected": -2.6140589714050293, + "logps/chosen": -293.338134765625, + "logps/rejected": -241.25381469726562, + "loss": 0.8916, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.20157276093959808, + "rewards/margins": 0.06565352529287338, + "rewards/rejected": 0.1359192430973053, "step": 60 }, { "epoch": 0.13, "learning_rate": 4.986872839090852e-07, - "logits/chosen": -2.7381272315979004, - "logits/rejected": -2.6615114212036133, - "logps/chosen": -310.81658935546875, - "logps/rejected": -246.3937530517578, - "loss": 0.7147, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.08360395580530167, - "rewards/margins": 0.0849369466304779, - "rewards/rejected": -0.0013329838402569294, + "logits/chosen": -2.762118339538574, + "logits/rejected": -2.692058563232422, + "logps/chosen": -300.54962158203125, + "logps/rejected": -234.56646728515625, + "loss": 0.8949, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.18627376854419708, + "rewards/margins": 0.06933394819498062, + "rewards/rejected": 0.11693982779979706, "step": 62 }, { "epoch": 0.13, "learning_rate": 4.982858918131906e-07, - "logits/chosen": -2.7732505798339844, - "logits/rejected": -2.7643208503723145, - "logps/chosen": -309.32196044921875, - "logps/rejected": -286.745361328125, - "loss": 0.7717, - "rewards/accuracies": 0.640625, - "rewards/chosen": 0.10828227549791336, - "rewards/margins": 0.12604959309101105, - "rewards/rejected": -0.01776731386780739, + "logits/chosen": -2.7757301330566406, + "logits/rejected": -2.768958806991577, + "logps/chosen": -298.272216796875, + "logps/rejected": -271.7611389160156, + "loss": 1.0205, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21877986192703247, + "rewards/margins": 0.08670471608638763, + "rewards/rejected": 0.13207514584064484, "step": 64 }, { "epoch": 0.14, "learning_rate": 4.978312411558517e-07, - "logits/chosen": -2.7538211345672607, - "logits/rejected": -2.766498565673828, - "logps/chosen": -260.09765625, - "logps/rejected": -237.29440307617188, - "loss": 0.7561, - "rewards/accuracies": 0.734375, - "rewards/chosen": 0.09531789273023605, - "rewards/margins": 0.13862968981266022, - "rewards/rejected": -0.043311797082424164, + "logits/chosen": -2.751561164855957, + "logits/rejected": -2.7719786167144775, + "logps/chosen": -249.63597106933594, + "logps/rejected": -221.65524291992188, + "loss": 0.953, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19993475079536438, + "rewards/margins": 0.08685498684644699, + "rewards/rejected": 0.11307975649833679, "step": 66 }, { "epoch": 0.14, "learning_rate": 4.97323429461901e-07, - "logits/chosen": -2.736222505569458, - "logits/rejected": -2.6993401050567627, - "logps/chosen": -218.64276123046875, - "logps/rejected": -247.08609008789062, - "loss": 0.7268, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.102709099650383, - "rewards/margins": 0.15037541091442108, - "rewards/rejected": -0.047666311264038086, + "logits/chosen": -2.7238869667053223, + "logits/rejected": -2.692014217376709, + "logps/chosen": -207.89959716796875, + "logps/rejected": -230.82594299316406, + "loss": 0.9133, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.2101408690214157, + "rewards/margins": 0.09520590305328369, + "rewards/rejected": 0.11493496596813202, "step": 68 }, { "epoch": 0.15, "learning_rate": 4.967625656594781e-07, - "logits/chosen": -2.6768088340759277, - "logits/rejected": -2.720437526702881, - "logps/chosen": -255.47183227539062, - "logps/rejected": -234.1626739501953, - "loss": 0.7297, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.06858564168214798, - "rewards/margins": 0.10109753906726837, - "rewards/rejected": -0.032511889934539795, + "logits/chosen": -2.6451263427734375, + "logits/rejected": -2.690622568130493, + "logps/chosen": -243.95298767089844, + "logps/rejected": -219.01947021484375, + "loss": 0.9029, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18377384543418884, + "rewards/margins": 0.06485382467508316, + "rewards/rejected": 0.11892002075910568, "step": 70 }, { "epoch": 0.15, "learning_rate": 4.961487700566646e-07, - "logits/chosen": -2.685685157775879, - "logits/rejected": -2.728847026824951, - "logps/chosen": -277.7132873535156, - "logps/rejected": -273.0968017578125, - "loss": 0.843, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.08615937829017639, - "rewards/margins": 0.09151909500360489, - "rewards/rejected": -0.0053597185760736465, + "logits/chosen": -2.639003038406372, + "logits/rejected": -2.6975741386413574, + "logps/chosen": -263.14605712890625, + "logps/rejected": -257.71453857421875, + "loss": 1.0418, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.23183217644691467, + "rewards/margins": 0.08336945623159409, + "rewards/rejected": 0.14846271276474, "step": 72 }, { "epoch": 0.15, "learning_rate": 4.954821743156767e-07, - "logits/chosen": -2.749175548553467, - "logits/rejected": -2.7879812717437744, - "logps/chosen": -277.5850524902344, - "logps/rejected": -289.2405090332031, - "loss": 0.7339, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.037747230380773544, - "rewards/margins": 0.07498619705438614, - "rewards/rejected": -0.03723897039890289, + "logits/chosen": -2.713226318359375, + "logits/rejected": -2.762162208557129, + "logps/chosen": -262.9385070800781, + "logps/rejected": -272.867431640625, + "loss": 0.8948, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18421295285224915, + "rewards/margins": 0.057721108198165894, + "rewards/rejected": 0.12649184465408325, "step": 74 }, { "epoch": 0.16, "learning_rate": 4.947629214246236e-07, - "logits/chosen": -2.7705671787261963, - "logits/rejected": -2.763953447341919, - "logps/chosen": -252.2857666015625, - "logps/rejected": -292.7469177246094, - "loss": 0.7657, - "rewards/accuracies": 0.671875, - "rewards/chosen": 0.017970919609069824, - "rewards/margins": 0.09476744383573532, - "rewards/rejected": -0.0767965242266655, + "logits/chosen": -2.7411000728607178, + "logits/rejected": -2.7211222648620605, + "logps/chosen": -237.08746337890625, + "logps/rejected": -274.0372314453125, + "loss": 0.9851, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.16995400190353394, + "rewards/margins": 0.059653621166944504, + "rewards/rejected": 0.11030039191246033, "step": 76 }, { "epoch": 0.16, "learning_rate": 4.939911656668361e-07, - "logits/chosen": -2.682727336883545, - "logits/rejected": -2.6415486335754395, - "logps/chosen": -290.85369873046875, - "logps/rejected": -280.05572509765625, - "loss": 0.7518, + "logits/chosen": -2.652937650680542, + "logits/rejected": -2.613910675048828, + "logps/chosen": -273.22845458984375, + "logps/rejected": -262.1705627441406, + "loss": 0.9483, "rewards/accuracies": 0.671875, - "rewards/chosen": 0.0839323028922081, - "rewards/margins": 0.10557042807340622, - "rewards/rejected": -0.02163812704384327, + "rewards/chosen": 0.26018452644348145, + "rewards/margins": 0.1029711663722992, + "rewards/rejected": 0.15721337497234344, "step": 78 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-07, - "logits/chosen": -2.7483415603637695, - "logits/rejected": -2.7103254795074463, - "logps/chosen": -307.75933837890625, - "logps/rejected": -324.3096923828125, - "loss": 0.7522, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.03351260721683502, - "rewards/margins": 0.10701370239257812, - "rewards/rejected": -0.0735010951757431, + "logits/chosen": -2.7028257846832275, + "logits/rejected": -2.6714606285095215, + "logps/chosen": -291.77081298828125, + "logps/rejected": -303.716064453125, + "loss": 0.9756, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.19339807331562042, + "rewards/margins": 0.060962654650211334, + "rewards/rejected": 0.1324354112148285, "step": 80 }, { "epoch": 0.17, "learning_rate": 4.922908189595017e-07, - "logits/chosen": -2.8289828300476074, - "logits/rejected": -2.8077309131622314, - "logps/chosen": -252.59323120117188, - "logps/rejected": -278.0732727050781, - "loss": 0.7646, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.07634499669075012, - "rewards/margins": 0.1510338932275772, - "rewards/rejected": -0.0746888816356659, + "logits/chosen": -2.7801144123077393, + "logits/rejected": -2.7747814655303955, + "logps/chosen": -241.05018615722656, + "logps/rejected": -259.6797180175781, + "loss": 1.0054, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1917755901813507, + "rewards/margins": 0.0825289785861969, + "rewards/rejected": 0.10924658179283142, "step": 82 }, { "epoch": 0.18, "learning_rate": 4.913625927427995e-07, - "logits/chosen": -2.673996686935425, - "logits/rejected": -2.7177605628967285, - "logps/chosen": -283.3115234375, - "logps/rejected": -235.6822967529297, - "loss": 0.7361, - "rewards/accuracies": 0.671875, - "rewards/chosen": 0.027728475630283356, - "rewards/margins": 0.19945986568927765, - "rewards/rejected": -0.1717313826084137, + "logits/chosen": -2.629251480102539, + "logits/rejected": -2.685054302215576, + "logps/chosen": -269.2608947753906, + "logps/rejected": -215.30453491210938, + "loss": 0.9148, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.16823476552963257, + "rewards/margins": 0.13618838787078857, + "rewards/rejected": 0.032046347856521606, "step": 84 }, { "epoch": 0.18, "learning_rate": 4.903825930468148e-07, - "logits/chosen": -2.790689468383789, - "logits/rejected": -2.8265206813812256, - "logps/chosen": -271.7046813964844, - "logps/rejected": -289.9587097167969, - "loss": 0.7311, - "rewards/accuracies": 0.734375, - "rewards/chosen": 0.07139424234628677, - "rewards/margins": 0.22358588874340057, - "rewards/rejected": -0.152191624045372, + "logits/chosen": -2.7462263107299805, + "logits/rejected": -2.7930428981781006, + "logps/chosen": -259.4129638671875, + "logps/rejected": -268.3623046875, + "loss": 0.9258, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.19431179761886597, + "rewards/margins": 0.13053934276103973, + "rewards/rejected": 0.06377246230840683, "step": 86 }, { "epoch": 0.18, "learning_rate": 4.893510300863676e-07, - "logits/chosen": -2.6858608722686768, - "logits/rejected": -2.6849381923675537, - "logps/chosen": -234.48670959472656, - "logps/rejected": -243.48057556152344, - "loss": 0.6433, - "rewards/accuracies": 0.765625, - "rewards/chosen": 0.03533104807138443, - "rewards/margins": 0.19699391722679138, - "rewards/rejected": -0.16166287660598755, + "logits/chosen": -2.6440467834472656, + "logits/rejected": -2.6448612213134766, + "logps/chosen": -217.25962829589844, + "logps/rejected": -218.20297241210938, + "loss": 0.8001, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.20760184526443481, + "rewards/margins": 0.11648894846439362, + "rewards/rejected": 0.09111291170120239, "step": 88 }, { "epoch": 0.19, "learning_rate": 4.882681251368548e-07, - "logits/chosen": -2.6847596168518066, - "logits/rejected": -2.627260684967041, - "logps/chosen": -242.67559814453125, - "logps/rejected": -233.02520751953125, - "loss": 0.7216, + "logits/chosen": -2.6310863494873047, + "logits/rejected": -2.591268539428711, + "logps/chosen": -226.30935668945312, + "logps/rejected": -206.21791076660156, + "loss": 0.9086, "rewards/accuracies": 0.78125, - "rewards/chosen": -0.0019026286900043488, - "rewards/margins": 0.2595129609107971, - "rewards/rejected": -0.26141557097435, + "rewards/chosen": 0.1617593765258789, + "rewards/margins": 0.1551017016172409, + "rewards/rejected": 0.006657673045992851, "step": 90 }, { "epoch": 0.19, "learning_rate": 4.871341104867864e-07, - "logits/chosen": -2.861926794052124, - "logits/rejected": -2.8458104133605957, - "logps/chosen": -313.4859619140625, - "logps/rejected": -299.7122497558594, - "loss": 0.6911, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.011806448921561241, - "rewards/margins": 0.1803116649389267, - "rewards/rejected": -0.1921180933713913, + "logits/chosen": -2.8109049797058105, + "logits/rejected": -2.8163559436798096, + "logps/chosen": -292.8932189941406, + "logps/rejected": -274.3307800292969, + "loss": 0.9104, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.1941211074590683, + "rewards/margins": 0.1324244737625122, + "rewards/rejected": 0.061696626245975494, "step": 92 }, { "epoch": 0.2, "learning_rate": 4.859492293879573e-07, - "logits/chosen": -2.814488410949707, - "logits/rejected": -2.763936758041382, - "logps/chosen": -251.5836181640625, - "logps/rejected": -268.69342041015625, - "loss": 0.6986, - "rewards/accuracies": 0.796875, - "rewards/chosen": 0.0034225257113575935, - "rewards/margins": 0.2998045086860657, - "rewards/rejected": -0.29638200998306274, + "logits/chosen": -2.7723848819732666, + "logits/rejected": -2.7233808040618896, + "logps/chosen": -236.38687133789062, + "logps/rejected": -240.41268920898438, + "loss": 0.8922, + "rewards/accuracies": 0.84375, + "rewards/chosen": 0.15539003908634186, + "rewards/margins": 0.16896475851535797, + "rewards/rejected": -0.013574732467532158, "step": 94 }, { "epoch": 0.2, "learning_rate": 4.847137360032699e-07, - "logits/chosen": -2.740324020385742, - "logits/rejected": -2.6555376052856445, - "logps/chosen": -267.32647705078125, - "logps/rejected": -311.2066955566406, - "loss": 0.6782, - "rewards/accuracies": 0.78125, - "rewards/chosen": 0.04199559986591339, - "rewards/margins": 0.22325468063354492, - "rewards/rejected": -0.18125906586647034, + "logits/chosen": -2.708160877227783, + "logits/rejected": -2.632722854614258, + "logps/chosen": -252.2537078857422, + "logps/rejected": -284.7792053222656, + "loss": 0.8878, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1927236020565033, + "rewards/margins": 0.10970807820558548, + "rewards/rejected": 0.08301550149917603, "step": 96 }, { "epoch": 0.21, "learning_rate": 4.834278953522137e-07, - "logits/chosen": -2.788088798522949, - "logits/rejected": -2.769721508026123, - "logps/chosen": -259.531005859375, - "logps/rejected": -282.8291931152344, - "loss": 0.6506, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.01584019511938095, - "rewards/margins": 0.3319249749183655, - "rewards/rejected": -0.347765177488327, + "logits/chosen": -2.7499423027038574, + "logits/rejected": -2.743983745574951, + "logps/chosen": -238.63363647460938, + "logps/rejected": -246.7994842529297, + "loss": 0.8417, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.19313326478004456, + "rewards/margins": 0.18060123920440674, + "rewards/rejected": 0.012532031163573265, "step": 98 }, { "epoch": 0.21, "learning_rate": 4.820919832540181e-07, - "logits/chosen": -2.8391647338867188, - "logits/rejected": -2.7747175693511963, - "logps/chosen": -323.83184814453125, - "logps/rejected": -294.0677795410156, - "loss": 0.7505, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.0377802811563015, - "rewards/margins": 0.21989765763282776, - "rewards/rejected": -0.25767797231674194, + "logits/chosen": -2.7913339138031006, + "logits/rejected": -2.738832473754883, + "logps/chosen": -300.489013671875, + "logps/rejected": -263.0144958496094, + "loss": 0.9361, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.19564741849899292, + "rewards/margins": 0.14279228448867798, + "rewards/rejected": 0.05285511910915375, "step": 100 }, { "epoch": 0.21, "learning_rate": 4.807062862684873e-07, - "logits/chosen": -2.8058762550354004, - "logits/rejected": -2.7266106605529785, - "logps/chosen": -286.4516906738281, - "logps/rejected": -269.9130554199219, - "loss": 0.7046, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.2667284309864044, - "rewards/margins": 0.10346898436546326, - "rewards/rejected": -0.3701974153518677, + "logits/chosen": -2.743293285369873, + "logits/rejected": -2.6962647438049316, + "logps/chosen": -251.50848388671875, + "logps/rejected": -228.89454650878906, + "loss": 0.8655, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0827035903930664, + "rewards/margins": 0.042715832591056824, + "rewards/rejected": 0.03998776525259018, "step": 102 }, { "epoch": 0.22, "learning_rate": 4.792711016345321e-07, - "logits/chosen": -2.8397815227508545, - "logits/rejected": -2.845283031463623, - "logps/chosen": -359.9697570800781, - "logps/rejected": -326.89508056640625, - "loss": 0.7683, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.029000351205468178, - "rewards/margins": 0.4069358706474304, - "rewards/rejected": -0.43593621253967285, + "logits/chosen": -2.760998010635376, + "logits/rejected": -2.807874917984009, + "logps/chosen": -336.2560119628906, + "logps/rejected": -285.0055236816406, + "loss": 0.9719, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.20813709497451782, + "rewards/margins": 0.22517752647399902, + "rewards/rejected": -0.017040422186255455, "step": 104 }, { "epoch": 0.22, "learning_rate": 4.777867372064105e-07, - "logits/chosen": -2.812641143798828, - "logits/rejected": -2.7523622512817383, - "logps/chosen": -313.5459289550781, - "logps/rejected": -299.8595275878906, - "loss": 0.7817, + "logits/chosen": -2.752171039581299, + "logits/rejected": -2.718291759490967, + "logps/chosen": -285.46630859375, + "logps/rejected": -255.75071716308594, + "loss": 1.0186, "rewards/accuracies": 0.75, - "rewards/chosen": -0.13160283863544464, - "rewards/margins": 0.31865066289901733, - "rewards/rejected": -0.4502534866333008, + "rewards/chosen": 0.1491929143667221, + "rewards/margins": 0.15835797786712646, + "rewards/rejected": -0.009165056981146336, "step": 106 }, { "epoch": 0.23, "learning_rate": 4.7625351138769166e-07, - "logits/chosen": -2.7898976802825928, - "logits/rejected": -2.7862889766693115, - "logps/chosen": -327.0060119628906, - "logps/rejected": -322.96783447265625, - "loss": 0.7708, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.08684093505144119, - "rewards/margins": 0.4070696532726288, - "rewards/rejected": -0.49391061067581177, + "logits/chosen": -2.723923683166504, + "logits/rejected": -2.7391700744628906, + "logps/chosen": -300.93133544921875, + "logps/rejected": -279.8222961425781, + "loss": 0.9867, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17390599846839905, + "rewards/margins": 0.23636136949062347, + "rewards/rejected": -0.06245535984635353, "step": 108 }, { "epoch": 0.23, "learning_rate": 4.7467175306295647e-07, - "logits/chosen": -2.6663591861724854, - "logits/rejected": -2.5961318016052246, - "logps/chosen": -261.4271240234375, - "logps/rejected": -309.5064697265625, - "loss": 0.7365, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.0995909571647644, - "rewards/margins": 0.18599346280097961, - "rewards/rejected": -0.2855844497680664, + "logits/chosen": -2.644249677658081, + "logits/rejected": -2.5753843784332275, + "logps/chosen": -237.38717651367188, + "logps/rejected": -275.41326904296875, + "loss": 0.9589, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14080852270126343, + "rewards/margins": 0.08546096086502075, + "rewards/rejected": 0.055347561836242676, "step": 110 }, { "epoch": 0.23, "learning_rate": 4.7304180152725024e-07, - "logits/chosen": -2.7205381393432617, - "logits/rejected": -2.632664680480957, - "logps/chosen": -324.07098388671875, - "logps/rejected": -303.51422119140625, - "loss": 0.6995, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.09924812614917755, - "rewards/margins": 0.3877854347229004, - "rewards/rejected": -0.48703351616859436, + "logits/chosen": -2.676513433456421, + "logits/rejected": -2.626890182495117, + "logps/chosen": -301.16455078125, + "logps/rejected": -262.0887451171875, + "loss": 0.8776, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.1298159807920456, + "rewards/margins": 0.20259486138820648, + "rewards/rejected": -0.0727788656949997, "step": 112 }, { "epoch": 0.24, "learning_rate": 4.7136400641330245e-07, - "logits/chosen": -2.7004384994506836, - "logits/rejected": -2.6967031955718994, - "logps/chosen": -273.4053039550781, - "logps/rejected": -285.5749206542969, - "loss": 0.6863, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.04174273461103439, - "rewards/margins": 0.429397314786911, - "rewards/rejected": -0.471140056848526, + "logits/chosen": -2.6685991287231445, + "logits/rejected": -2.69344425201416, + "logps/chosen": -252.33763122558594, + "logps/rejected": -245.7454071044922, + "loss": 0.9084, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.16893407702445984, + "rewards/margins": 0.24177873134613037, + "rewards/rejected": -0.07284467667341232, "step": 114 }, { "epoch": 0.24, "learning_rate": 4.6963872761652834e-07, - "logits/chosen": -2.7354209423065186, - "logits/rejected": -2.6918838024139404, - "logps/chosen": -313.76708984375, - "logps/rejected": -291.7537536621094, - "loss": 0.7426, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.0618835911154747, - "rewards/margins": 0.23076358437538147, - "rewards/rejected": -0.29264718294143677, + "logits/chosen": -2.704777956008911, + "logits/rejected": -2.6885030269622803, + "logps/chosen": -292.1341247558594, + "logps/rejected": -264.8712463378906, + "loss": 0.9878, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1544463038444519, + "rewards/margins": 0.17826814949512482, + "rewards/rejected": -0.023821817710995674, "step": 116 }, { "epoch": 0.25, "learning_rate": 4.6786633521783005e-07, - "logits/chosen": -2.6053059101104736, - "logits/rejected": -2.5342700481414795, - "logps/chosen": -244.05995178222656, - "logps/rejected": -259.55023193359375, - "loss": 0.7067, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.07344765961170197, - "rewards/margins": 0.4161744713783264, - "rewards/rejected": -0.34272682666778564, + "logits/chosen": -2.596072196960449, + "logits/rejected": -2.5699691772460938, + "logps/chosen": -226.89651489257812, + "logps/rejected": -230.52536010742188, + "loss": 0.932, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24508199095726013, + "rewards/margins": 0.29755982756614685, + "rewards/rejected": -0.05247785896062851, "step": 118 }, { "epoch": 0.25, "learning_rate": 4.6604720940421207e-07, - "logits/chosen": -2.674229145050049, - "logits/rejected": -2.6460635662078857, - "logps/chosen": -293.4000244140625, - "logps/rejected": -237.8984375, - "loss": 0.6553, + "logits/chosen": -2.6633219718933105, + "logits/rejected": -2.6796035766601562, + "logps/chosen": -273.4510803222656, + "logps/rejected": -208.31605529785156, + "loss": 0.838, "rewards/accuracies": 0.65625, - "rewards/chosen": -0.08343453705310822, - "rewards/margins": 0.23878370225429535, - "rewards/rejected": -0.32221823930740356, + "rewards/chosen": 0.11605505645275116, + "rewards/margins": 0.1424492746591568, + "rewards/rejected": -0.026394207030534744, "step": 120 }, { "epoch": 0.26, "learning_rate": 4.6418174038722924e-07, - "logits/chosen": -2.7431387901306152, - "logits/rejected": -2.6589853763580322, - "logps/chosen": -258.30517578125, - "logps/rejected": -337.9000244140625, - "loss": 0.7977, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.044009700417518616, - "rewards/margins": 0.3760453462600708, - "rewards/rejected": -0.42005497217178345, + "logits/chosen": -2.7611799240112305, + "logits/rejected": -2.7173991203308105, + "logps/chosen": -239.86370849609375, + "logps/rejected": -300.7042541503906, + "loss": 1.085, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14040501415729523, + "rewards/margins": 0.18850228190422058, + "rewards/rejected": -0.04809727519750595, "step": 122 }, { "epoch": 0.26, "learning_rate": 4.6227032831928483e-07, - "logits/chosen": -2.644824981689453, - "logits/rejected": -2.6608684062957764, - "logps/chosen": -300.009033203125, - "logps/rejected": -324.9970397949219, - "loss": 0.6938, + "logits/chosen": -2.674177646636963, + "logits/rejected": -2.7022745609283447, + "logps/chosen": -276.10906982421875, + "logps/rejected": -292.9422912597656, + "loss": 0.9295, "rewards/accuracies": 0.640625, - "rewards/chosen": -0.11674674600362778, - "rewards/margins": 0.20092640817165375, - "rewards/rejected": -0.31767311692237854, + "rewards/chosen": 0.12225355207920074, + "rewards/margins": 0.11937911063432693, + "rewards/rejected": 0.0028744498267769814, "step": 124 }, { "epoch": 0.26, "learning_rate": 4.603133832077953e-07, - "logits/chosen": -2.586251974105835, - "logits/rejected": -2.579716920852661, - "logps/chosen": -283.69183349609375, - "logps/rejected": -308.4037780761719, - "loss": 0.6959, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.10065947473049164, - "rewards/margins": 0.31346243619918823, - "rewards/rejected": -0.4141218662261963, + "logits/chosen": -2.640671968460083, + "logits/rejected": -2.6465303897857666, + "logps/chosen": -259.9521789550781, + "logps/rejected": -271.70806884765625, + "loss": 0.9038, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.13673703372478485, + "rewards/margins": 0.18390166759490967, + "rewards/rejected": -0.047164615243673325, "step": 126 }, { "epoch": 0.27, "learning_rate": 4.5831132482724193e-07, - "logits/chosen": -2.683396577835083, - "logits/rejected": -2.652515172958374, - "logps/chosen": -294.0975646972656, - "logps/rejected": -284.01007080078125, - "loss": 0.6737, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.09628719091415405, - "rewards/margins": 0.3271844983100891, - "rewards/rejected": -0.4234716594219208, + "logits/chosen": -2.72942852973938, + "logits/rejected": -2.74281644821167, + "logps/chosen": -270.63800048828125, + "logps/rejected": -247.06167602539062, + "loss": 0.8911, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.1383085548877716, + "rewards/margins": 0.19229631125926971, + "rewards/rejected": -0.053987741470336914, "step": 128 }, { "epoch": 0.27, "learning_rate": 4.5626458262912735e-07, - "logits/chosen": -2.662360191345215, - "logits/rejected": -2.656447410583496, - "logps/chosen": -292.4847412109375, - "logps/rejected": -313.1526184082031, - "loss": 0.7135, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.13693922758102417, - "rewards/margins": 0.4902416467666626, - "rewards/rejected": -0.6271809339523315, + "logits/chosen": -2.722701072692871, + "logits/rejected": -2.730611562728882, + "logps/chosen": -267.2499084472656, + "logps/rejected": -263.76068115234375, + "loss": 0.9224, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1154087707400322, + "rewards/margins": 0.24867050349712372, + "rewards/rejected": -0.13326174020767212, "step": 130 }, { "epoch": 0.28, "learning_rate": 4.541735956498554e-07, - "logits/chosen": -2.7437210083007812, - "logits/rejected": -2.72233247756958, - "logps/chosen": -291.6712646484375, - "logps/rejected": -298.3407287597656, - "loss": 0.6636, + "logits/chosen": -2.757941484451294, + "logits/rejected": -2.76700496673584, + "logps/chosen": -267.6918640136719, + "logps/rejected": -252.36151123046875, + "loss": 0.8856, "rewards/accuracies": 0.828125, - "rewards/chosen": 0.008928142488002777, - "rewards/margins": 0.6095040440559387, - "rewards/rejected": -0.6005759239196777, + "rewards/chosen": 0.24872201681137085, + "rewards/margins": 0.3895059823989868, + "rewards/rejected": -0.14078396558761597, "step": 132 }, { "epoch": 0.28, "learning_rate": 4.520388124165564e-07, - "logits/chosen": -2.68888521194458, - "logits/rejected": -2.673058271408081, - "logps/chosen": -303.44732666015625, - "logps/rejected": -304.8455810546875, - "loss": 0.7579, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.08947059512138367, - "rewards/margins": 0.37194037437438965, - "rewards/rejected": -0.4614109992980957, + "logits/chosen": -2.7388994693756104, + "logits/rejected": -2.7547032833099365, + "logps/chosen": -280.05029296875, + "logps/rejected": -264.361572265625, + "loss": 0.9747, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.1444995105266571, + "rewards/margins": 0.201070636510849, + "rewards/rejected": -0.05657113343477249, "step": 134 }, { "epoch": 0.28, "learning_rate": 4.498606908508753e-07, - "logits/chosen": -2.6336781978607178, - "logits/rejected": -2.641366958618164, - "logps/chosen": -310.1975402832031, - "logps/rejected": -321.4742736816406, - "loss": 0.7511, + "logits/chosen": -2.71738862991333, + "logits/rejected": -2.7438583374023438, + "logps/chosen": -283.8454284667969, + "logps/rejected": -280.550048828125, + "loss": 0.978, "rewards/accuracies": 0.734375, - "rewards/chosen": -0.1359916627407074, - "rewards/margins": 0.3668138384819031, - "rewards/rejected": -0.5028054714202881, + "rewards/chosen": 0.12752971053123474, + "rewards/margins": 0.22109274566173553, + "rewards/rejected": -0.09356305003166199, "step": 136 }, { "epoch": 0.29, "learning_rate": 4.476396981707453e-07, - "logits/chosen": -2.741149425506592, - "logits/rejected": -2.744832992553711, - "logps/chosen": -283.7612609863281, - "logps/rejected": -319.18011474609375, - "loss": 0.7273, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.14727121591567993, - "rewards/margins": 0.3416295051574707, - "rewards/rejected": -0.48890072107315063, + "logits/chosen": -2.8418924808502197, + "logits/rejected": -2.845505714416504, + "logps/chosen": -257.6434326171875, + "logps/rejected": -278.16436767578125, + "loss": 0.9079, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11390725523233414, + "rewards/margins": 0.19265054166316986, + "rewards/rejected": -0.07874329388141632, "step": 138 }, { "epoch": 0.29, "learning_rate": 4.453763107901675e-07, - "logits/chosen": -2.769141674041748, - "logits/rejected": -2.6976804733276367, - "logps/chosen": -343.0176086425781, - "logps/rejected": -339.2708740234375, - "loss": 0.684, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.13109339773654938, - "rewards/margins": 0.3063846528530121, - "rewards/rejected": -0.43747806549072266, + "logits/chosen": -2.854187250137329, + "logits/rejected": -2.8112897872924805, + "logps/chosen": -316.2648010253906, + "logps/rejected": -302.19903564453125, + "loss": 0.905, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.13643448054790497, + "rewards/margins": 0.2031940519809723, + "rewards/rejected": -0.06675955653190613, "step": 140 }, { "epoch": 0.3, "learning_rate": 4.4307101421701755e-07, - "logits/chosen": -2.628458023071289, - "logits/rejected": -2.610029458999634, - "logps/chosen": -299.93060302734375, - "logps/rejected": -290.9799499511719, - "loss": 0.6642, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.17024263739585876, - "rewards/margins": 0.28561675548553467, - "rewards/rejected": -0.45585939288139343, + "logits/chosen": -2.7740957736968994, + "logits/rejected": -2.7839348316192627, + "logps/chosen": -277.0834045410156, + "logps/rejected": -256.514404296875, + "loss": 0.8363, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.05822940915822983, + "rewards/margins": 0.16943329572677612, + "rewards/rejected": -0.1112038865685463, "step": 142 }, { "epoch": 0.3, "learning_rate": 4.4072430294890166e-07, - "logits/chosen": -2.5924410820007324, - "logits/rejected": -2.5785272121429443, - "logps/chosen": -262.089599609375, - "logps/rejected": -269.3330383300781, - "loss": 0.6441, - "rewards/accuracies": 0.734375, - "rewards/chosen": 0.034515030682086945, - "rewards/margins": 0.3341864347457886, - "rewards/rejected": -0.2996714115142822, + "logits/chosen": -2.720773458480835, + "logits/rejected": -2.7185139656066895, + "logps/chosen": -244.2642364501953, + "logps/rejected": -243.1986541748047, + "loss": 0.8459, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.2127687633037567, + "rewards/margins": 0.25109627842903137, + "rewards/rejected": -0.038327496498823166, "step": 144 }, { "epoch": 0.31, "learning_rate": 4.3833668036708483e-07, - "logits/chosen": -2.625112771987915, - "logits/rejected": -2.6584105491638184, - "logps/chosen": -306.502685546875, - "logps/rejected": -345.8301086425781, - "loss": 0.6941, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.10175588726997375, - "rewards/margins": 0.30893003940582275, - "rewards/rejected": -0.4106859564781189, + "logits/chosen": -2.8051412105560303, + "logits/rejected": -2.811025381088257, + "logps/chosen": -282.9385986328125, + "logps/rejected": -306.4236145019531, + "loss": 0.8821, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13388508558273315, + "rewards/margins": 0.15050604939460754, + "rewards/rejected": -0.016620971262454987, "step": 146 }, { "epoch": 0.31, "learning_rate": 4.3590865862851263e-07, - "logits/chosen": -2.617330551147461, - "logits/rejected": -2.5643858909606934, - "logps/chosen": -298.26141357421875, - "logps/rejected": -267.2621154785156, - "loss": 0.6176, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.04226139560341835, - "rewards/margins": 0.45013248920440674, - "rewards/rejected": -0.49239394068717957, + "logits/chosen": -2.768535852432251, + "logits/rejected": -2.7534749507904053, + "logps/chosen": -275.9228515625, + "logps/rejected": -232.7934112548828, + "loss": 0.8053, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1811244785785675, + "rewards/margins": 0.32883143424987793, + "rewards/rejected": -0.1477069854736328, "step": 148 }, { "epoch": 0.31, "learning_rate": 4.3344075855595097e-07, - "logits/chosen": -2.4896128177642822, - "logits/rejected": -2.4251513481140137, - "logps/chosen": -294.1082763671875, - "logps/rejected": -305.47332763671875, - "loss": 0.6897, - "rewards/accuracies": 0.828125, - "rewards/chosen": -0.11761671304702759, - "rewards/margins": 0.43049168586730957, - "rewards/rejected": -0.5481083989143372, + "logits/chosen": -2.7373950481414795, + "logits/rejected": -2.725128412246704, + "logps/chosen": -265.9374694824219, + "logps/rejected": -259.4600830078125, + "loss": 0.8794, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16409148275852203, + "rewards/margins": 0.2520677149295807, + "rewards/rejected": -0.08797623217105865, "step": 150 }, { "epoch": 0.32, "learning_rate": 4.309335095262675e-07, - "logits/chosen": -2.3300864696502686, - "logits/rejected": -2.2978248596191406, - "logps/chosen": -284.1370849609375, - "logps/rejected": -352.7837219238281, - "loss": 0.695, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.26125186681747437, - "rewards/margins": 0.5214322209358215, - "rewards/rejected": -0.7826840877532959, + "logits/chosen": -2.6658806800842285, + "logits/rejected": -2.657099723815918, + "logps/chosen": -253.30223083496094, + "logps/rejected": -305.234619140625, + "loss": 0.8846, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.047096915543079376, + "rewards/margins": 0.35428985953330994, + "rewards/rejected": -0.30719298124313354, "step": 152 }, { "epoch": 0.32, "learning_rate": 4.2838744935687716e-07, - "logits/chosen": -2.383730888366699, - "logits/rejected": -2.2686848640441895, - "logps/chosen": -285.3652038574219, - "logps/rejected": -313.19207763671875, - "loss": 0.6355, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.1003919169306755, - "rewards/margins": 0.49685102701187134, - "rewards/rejected": -0.5972429513931274, + "logits/chosen": -2.7290966510772705, + "logits/rejected": -2.736837863922119, + "logps/chosen": -266.3750915527344, + "logps/rejected": -278.6961364746094, + "loss": 0.8613, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.0895094946026802, + "rewards/margins": 0.3417929410934448, + "rewards/rejected": -0.2522834241390228, "step": 154 }, { "epoch": 0.33, "learning_rate": 4.258031241903777e-07, - "logits/chosen": -2.2691562175750732, - "logits/rejected": -2.1100411415100098, - "logps/chosen": -295.82708740234375, - "logps/rejected": -299.9715881347656, - "loss": 0.6957, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.07401493936777115, - "rewards/margins": 0.35916173458099365, - "rewards/rejected": -0.4331766366958618, + "logits/chosen": -2.735307216644287, + "logits/rejected": -2.681828737258911, + "logps/chosen": -277.9707946777344, + "logps/rejected": -269.47174072265625, + "loss": 0.8695, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.104547880589962, + "rewards/margins": 0.23272603750228882, + "rewards/rejected": -0.1281781643629074, "step": 156 }, { "epoch": 0.33, "learning_rate": 4.2318108837739986e-07, - "logits/chosen": -2.101970672607422, - "logits/rejected": -2.052926540374756, - "logps/chosen": -275.3291931152344, - "logps/rejected": -346.45135498046875, - "loss": 0.7169, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.17267772555351257, - "rewards/margins": 0.42162394523620605, - "rewards/rejected": -0.594301700592041, + "logits/chosen": -2.6491639614105225, + "logits/rejected": -2.6681437492370605, + "logps/chosen": -263.2922668457031, + "logps/rejected": -314.55487060546875, + "loss": 0.9629, + "rewards/accuracies": 0.578125, + "rewards/chosen": -0.052308231592178345, + "rewards/margins": 0.22302857041358948, + "rewards/rejected": -0.2753368020057678, "step": 158 }, { "epoch": 0.33, "learning_rate": 4.2052190435769554e-07, - "logits/chosen": -2.161992073059082, - "logits/rejected": -2.0577569007873535, - "logps/chosen": -329.9812316894531, - "logps/rejected": -323.7291259765625, - "loss": 0.6115, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.0013326676562428474, - "rewards/margins": 0.4901263415813446, - "rewards/rejected": -0.4914590120315552, + "logits/chosen": -2.7281954288482666, + "logits/rejected": -2.705580234527588, + "logps/chosen": -313.15869140625, + "logps/rejected": -296.6554870605469, + "loss": 0.8021, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.1668926179409027, + "rewards/margins": 0.3876148462295532, + "rewards/rejected": -0.2207222282886505, "step": 160 }, { "epoch": 0.34, "learning_rate": 4.1782614253949255e-07, - "logits/chosen": -1.8093454837799072, - "logits/rejected": -1.6997994184494019, - "logps/chosen": -252.4735565185547, - "logps/rejected": -264.23297119140625, - "loss": 0.7009, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.2441805750131607, - "rewards/margins": 0.25481969118118286, - "rewards/rejected": -0.49900031089782715, + "logits/chosen": -2.7059178352355957, + "logits/rejected": -2.6537256240844727, + "logps/chosen": -230.8391571044922, + "logps/rejected": -236.34918212890625, + "loss": 0.8959, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.027836445719003677, + "rewards/margins": 0.19232578575611115, + "rewards/rejected": -0.2201622575521469, "step": 162 }, { "epoch": 0.34, "learning_rate": 4.1509438117713863e-07, - "logits/chosen": -1.5648189783096313, - "logits/rejected": -1.543358325958252, - "logps/chosen": -320.0599060058594, - "logps/rejected": -306.7493896484375, - "loss": 0.6848, - "rewards/accuracies": 0.71875, - "rewards/chosen": 0.039286356419324875, - "rewards/margins": 0.518799901008606, - "rewards/rejected": -0.47951358556747437, + "logits/chosen": -2.6949987411499023, + "logits/rejected": -2.7387402057647705, + "logps/chosen": -300.81646728515625, + "logps/rejected": -271.0618896484375, + "loss": 0.9386, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.2317204475402832, + "rewards/margins": 0.354358971118927, + "rewards/rejected": -0.1226385086774826, "step": 164 }, { "epoch": 0.35, "learning_rate": 4.123272062470633e-07, - "logits/chosen": -1.3095282316207886, - "logits/rejected": -1.038232684135437, - "logps/chosen": -291.6402282714844, - "logps/rejected": -298.0380554199219, - "loss": 0.6891, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.02619929611682892, - "rewards/margins": 0.43957069516181946, - "rewards/rejected": -0.4657699763774872, + "logits/chosen": -2.696976900100708, + "logits/rejected": -2.692206621170044, + "logps/chosen": -272.76568603515625, + "logps/rejected": -259.8251037597656, + "loss": 0.8614, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.16254612803459167, + "rewards/margins": 0.24618662893772125, + "rewards/rejected": -0.08364049345254898, "step": 166 }, { "epoch": 0.35, "learning_rate": 4.0952521132208267e-07, - "logits/chosen": -0.8358496427536011, - "logits/rejected": -0.8311337828636169, - "logps/chosen": -288.705078125, - "logps/rejected": -253.3335723876953, - "loss": 0.6821, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.1505282074213028, - "rewards/margins": 0.26645365357398987, - "rewards/rejected": -0.41698184609413147, + "logits/chosen": -2.735821485519409, + "logits/rejected": -2.703927516937256, + "logps/chosen": -264.80657958984375, + "logps/rejected": -223.46334838867188, + "loss": 0.8696, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.08845643699169159, + "rewards/margins": 0.20673608779907227, + "rewards/rejected": -0.11827965825796127, "step": 168 }, { "epoch": 0.36, "learning_rate": 4.0668899744407567e-07, - "logits/chosen": -0.6292511820793152, - "logits/rejected": 0.11955951899290085, - "logps/chosen": -217.69808959960938, - "logps/rejected": -266.32525634765625, - "loss": 0.7194, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.19091403484344482, - "rewards/margins": 0.39874905347824097, - "rewards/rejected": -0.589663028717041, + "logits/chosen": -2.765240430831909, + "logits/rejected": -2.7325382232666016, + "logps/chosen": -188.49044799804688, + "logps/rejected": -217.9078369140625, + "loss": 0.8982, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.10116222500801086, + "rewards/margins": 0.20665103197097778, + "rewards/rejected": -0.10548880696296692, "step": 170 }, { "epoch": 0.36, "learning_rate": 4.0381917299505686e-07, - "logits/chosen": -0.4040292203426361, - "logits/rejected": -0.05061528459191322, - "logps/chosen": -281.4028015136719, - "logps/rejected": -348.06854248046875, - "loss": 0.6614, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.022851143032312393, - "rewards/margins": 0.5195820331573486, - "rewards/rejected": -0.5424332022666931, + "logits/chosen": -2.7241485118865967, + "logits/rejected": -2.7119617462158203, + "logps/chosen": -256.798583984375, + "logps/rejected": -291.5596008300781, + "loss": 0.8701, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.22319145500659943, + "rewards/margins": 0.200535386800766, + "rewards/rejected": 0.02265605702996254, "step": 172 }, { "epoch": 0.36, "learning_rate": 4.00916353566676e-07, - "logits/chosen": 0.4304525554180145, - "logits/rejected": 0.37021252512931824, - "logps/chosen": -268.30401611328125, - "logps/rejected": -340.36376953125, - "loss": 0.6454, + "logits/chosen": -2.669443130493164, + "logits/rejected": -2.7155041694641113, + "logps/chosen": -232.86203002929688, + "logps/rejected": -283.644287109375, + "loss": 0.8918, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2065473347902298, - "rewards/margins": 0.4695810079574585, - "rewards/rejected": -0.6761283874511719, + "rewards/chosen": 0.1478724479675293, + "rewards/margins": 0.256805956363678, + "rewards/rejected": -0.10893349349498749, "step": 174 }, { "epoch": 0.37, "learning_rate": 3.979811618281705e-07, - "logits/chosen": 0.9382424354553223, - "logits/rejected": -0.011063352227210999, - "logps/chosen": -303.7531433105469, - "logps/rejected": -319.4146423339844, - "loss": 0.7602, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.31537631154060364, - "rewards/margins": 0.3811299502849579, - "rewards/rejected": -0.6965062618255615, + "logits/chosen": -2.7021071910858154, + "logits/rejected": -2.764340877532959, + "logps/chosen": -257.8997497558594, + "logps/rejected": -252.3794708251953, + "loss": 1.0393, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.14315754175186157, + "rewards/margins": 0.16931167244911194, + "rewards/rejected": -0.026154138147830963, "step": 176 }, { "epoch": 0.37, "learning_rate": 3.9501422739279953e-07, - "logits/chosen": 0.4460795521736145, - "logits/rejected": 0.570868968963623, - "logps/chosen": -307.8954772949219, - "logps/rejected": -339.8791198730469, - "loss": 0.6882, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.10756506770849228, - "rewards/margins": 0.5310941934585571, - "rewards/rejected": -0.6386592388153076, + "logits/chosen": -2.7709646224975586, + "logits/rejected": -2.7708492279052734, + "logps/chosen": -278.5252990722656, + "logps/rejected": -275.7145080566406, + "loss": 0.9124, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1861363798379898, + "rewards/margins": 0.18314936757087708, + "rewards/rejected": 0.0029869992285966873, "step": 178 }, { "epoch": 0.38, "learning_rate": 3.920161866827889e-07, - "logits/chosen": 0.07825072109699249, - "logits/rejected": 0.6348622441291809, - "logps/chosen": -309.1721496582031, - "logps/rejected": -288.8408203125, - "loss": 0.6567, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.1812726855278015, - "rewards/margins": 0.5873675346374512, - "rewards/rejected": -0.7686402797698975, + "logits/chosen": -2.8050036430358887, + "logits/rejected": -2.7901365756988525, + "logps/chosen": -275.84417724609375, + "logps/rejected": -228.7900390625, + "loss": 0.8712, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1520070731639862, + "rewards/margins": 0.3201393783092499, + "rewards/rejected": -0.16813232004642487, "step": 180 }, { "epoch": 0.38, "learning_rate": 3.889876827928156e-07, - "logits/chosen": 0.37462693452835083, - "logits/rejected": 0.3019832670688629, - "logps/chosen": -253.56881713867188, - "logps/rejected": -316.3982238769531, - "loss": 0.5877, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.04077187180519104, - "rewards/margins": 0.5590379238128662, - "rewards/rejected": -0.5182660222053528, + "logits/chosen": -2.6767191886901855, + "logits/rejected": -2.6746373176574707, + "logps/chosen": -233.00840759277344, + "logps/rejected": -267.5483703613281, + "loss": 0.8053, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.2463759481906891, + "rewards/margins": 0.276143342256546, + "rewards/rejected": -0.029767364263534546, "step": 182 }, { "epoch": 0.39, "learning_rate": 3.859293653520604e-07, - "logits/chosen": 0.6516929268836975, - "logits/rejected": 0.2800888419151306, - "logps/chosen": -276.3404541015625, - "logps/rejected": -317.9354248046875, - "loss": 0.7184, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.1999325156211853, - "rewards/margins": 0.5805739164352417, - "rewards/rejected": -0.780506432056427, + "logits/chosen": -2.745319366455078, + "logits/rejected": -2.786708116531372, + "logps/chosen": -244.93020629882812, + "logps/rejected": -252.83175659179688, + "loss": 0.9758, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11416977643966675, + "rewards/margins": 0.24363908171653748, + "rewards/rejected": -0.12946929037570953, "step": 184 }, { "epoch": 0.39, "learning_rate": 3.828418903848593e-07, - "logits/chosen": -0.025208771228790283, - "logits/rejected": 0.8900982737541199, - "logps/chosen": -265.7961120605469, - "logps/rejected": -337.1889343261719, - "loss": 0.6888, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3380471467971802, - "rewards/margins": 0.5418481826782227, - "rewards/rejected": -0.8798953294754028, + "logits/chosen": -2.835418224334717, + "logits/rejected": -2.731480836868286, + "logps/chosen": -221.95022583007812, + "logps/rejected": -263.40478515625, + "loss": 0.9023, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.1004117801785469, + "rewards/margins": 0.24246561527252197, + "rewards/rejected": -0.14205384254455566, "step": 186 }, { "epoch": 0.39, "learning_rate": 3.797259201699833e-07, - "logits/chosen": 0.05765300244092941, - "logits/rejected": 0.576537013053894, - "logps/chosen": -331.6053466796875, - "logps/rejected": -351.186767578125, - "loss": 0.6485, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3514668941497803, - "rewards/margins": 0.43950024247169495, - "rewards/rejected": -0.7909672260284424, + "logits/chosen": -2.8170595169067383, + "logits/rejected": -2.7756195068359375, + "logps/chosen": -281.1588134765625, + "logps/rejected": -278.281494140625, + "loss": 0.82, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.1529981791973114, + "rewards/margins": 0.21491265296936035, + "rewards/rejected": -0.061914458870887756, "step": 188 }, { "epoch": 0.4, "learning_rate": 3.765821230985757e-07, - "logits/chosen": 0.3566046953201294, - "logits/rejected": 0.24242231249809265, - "logps/chosen": -333.481689453125, - "logps/rejected": -358.510986328125, - "loss": 0.6358, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.3405124843120575, - "rewards/margins": 0.591222882270813, - "rewards/rejected": -0.9317353963851929, + "logits/chosen": -2.73687744140625, + "logits/rejected": -2.7898876667022705, + "logps/chosen": -280.7138977050781, + "logps/rejected": -280.09661865234375, + "loss": 0.8229, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18716539442539215, + "rewards/margins": 0.3347574472427368, + "rewards/rejected": -0.14759206771850586, "step": 190 }, { "epoch": 0.4, "learning_rate": 3.734111735307796e-07, - "logits/chosen": 0.5186017155647278, - "logits/rejected": 0.6986968517303467, - "logps/chosen": -291.35626220703125, - "logps/rejected": -325.29412841796875, - "loss": 0.6586, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.3297085464000702, - "rewards/margins": 0.641376256942749, - "rewards/rejected": -0.9710848331451416, + "logits/chosen": -2.600639581680298, + "logits/rejected": -2.5767810344696045, + "logps/chosen": -245.0036163330078, + "logps/rejected": -253.24960327148438, + "loss": 0.838, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.13381804525852203, + "rewards/margins": 0.38445743918418884, + "rewards/rejected": -0.2506394386291504, "step": 192 }, { "epoch": 0.41, "learning_rate": 3.7021375165108377e-07, - "logits/chosen": 0.40191513299942017, - "logits/rejected": 0.24818360805511475, - "logps/chosen": -289.5572509765625, - "logps/rejected": -323.7060241699219, - "loss": 0.6506, + "logits/chosen": -2.7061498165130615, + "logits/rejected": -2.6843795776367188, + "logps/chosen": -246.30758666992188, + "logps/rejected": -262.097900390625, + "loss": 0.8548, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2648981511592865, - "rewards/margins": 0.49969348311424255, - "rewards/rejected": -0.7645915746688843, + "rewards/chosen": 0.16759857535362244, + "rewards/margins": 0.316108763217926, + "rewards/rejected": -0.1485101878643036, "step": 194 }, { "epoch": 0.41, "learning_rate": 3.6699054332241985e-07, - "logits/chosen": -0.4896765351295471, - "logits/rejected": -0.1318652182817459, - "logps/chosen": -350.1439208984375, - "logps/rejected": -288.9870910644531, - "loss": 0.7068, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.22379332780838013, - "rewards/margins": 0.44880181550979614, - "rewards/rejected": -0.6725951433181763, + "logits/chosen": -2.8462295532226562, + "logits/rejected": -2.83266019821167, + "logps/chosen": -315.21429443359375, + "logps/rejected": -233.60684204101562, + "loss": 0.9461, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.12550334632396698, + "rewards/margins": 0.24429593980312347, + "rewards/rejected": -0.11879261583089828, "step": 196 }, { "epoch": 0.41, "learning_rate": 3.6374223993904124e-07, - "logits/chosen": -0.05389963835477829, - "logits/rejected": 0.3870914578437805, - "logps/chosen": -330.797119140625, - "logps/rejected": -357.25347900390625, - "loss": 0.6192, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.20667432248592377, - "rewards/margins": 0.6827113032341003, - "rewards/rejected": -0.8893855810165405, + "logits/chosen": -2.7146401405334473, + "logits/rejected": -2.7143666744232178, + "logps/chosen": -297.3031921386719, + "logps/rejected": -299.88665771484375, + "loss": 0.8354, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.12826502323150635, + "rewards/margins": 0.4439820945262909, + "rewards/rejected": -0.31571707129478455, "step": 198 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-07, - "logits/chosen": -0.1867658495903015, - "logits/rejected": 0.04192525893449783, - "logps/chosen": -306.168212890625, - "logps/rejected": -340.8019104003906, - "loss": 0.594, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.1515503227710724, - "rewards/margins": 0.3953551650047302, - "rewards/rejected": -0.546905517578125, + "logits/chosen": -2.8062236309051514, + "logits/rejected": -2.8035285472869873, + "logps/chosen": -277.0640563964844, + "logps/rejected": -294.4193420410156, + "loss": 0.751, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.13949106633663177, + "rewards/margins": 0.22257088124752045, + "rewards/rejected": -0.08307983726263046, "step": 200 }, { "epoch": 0.42, "learning_rate": 3.571731403507635e-07, - "logits/chosen": -0.3911605179309845, - "logits/rejected": 0.24688732624053955, - "logps/chosen": -315.493408203125, - "logps/rejected": -362.08209228515625, - "loss": 0.667, + "logits/chosen": -2.8031527996063232, + "logits/rejected": -2.770803213119507, + "logps/chosen": -288.0368957519531, + "logps/rejected": -313.0611877441406, + "loss": 0.8395, "rewards/accuracies": 0.71875, - "rewards/chosen": -0.19705705344676971, - "rewards/margins": 0.5905351638793945, - "rewards/rejected": -0.7875921726226807, + "rewards/chosen": 0.07750790566205978, + "rewards/margins": 0.37489083409309387, + "rewards/rejected": -0.2973828911781311, "step": 202 }, { "epoch": 0.43, "learning_rate": 3.5385375325047163e-07, - "logits/chosen": 0.018862135708332062, - "logits/rejected": 0.042824022471904755, - "logps/chosen": -346.4378662109375, - "logps/rejected": -371.5933837890625, - "loss": 0.7231, - "rewards/accuracies": 0.640625, - "rewards/chosen": -0.16377538442611694, - "rewards/margins": 0.5193297863006592, - "rewards/rejected": -0.6831051707267761, + "logits/chosen": -2.7978999614715576, + "logits/rejected": -2.8142027854919434, + "logps/chosen": -313.62689208984375, + "logps/rejected": -315.30224609375, + "loss": 0.9523, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.16433416306972504, + "rewards/margins": 0.2845282554626465, + "rewards/rejected": -0.12019409239292145, "step": 204 }, { "epoch": 0.43, "learning_rate": 3.505120890024195e-07, - "logits/chosen": -0.5048182010650635, - "logits/rejected": 0.798430323600769, - "logps/chosen": -311.4440002441406, - "logps/rejected": -332.12078857421875, - "loss": 0.5901, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.1354539841413498, - "rewards/margins": 0.5432121753692627, - "rewards/rejected": -0.6786662340164185, + "logits/chosen": -2.7964963912963867, + "logits/rejected": -2.734177350997925, + "logps/chosen": -277.01202392578125, + "logps/rejected": -279.23516845703125, + "loss": 0.7622, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.2088654786348343, + "rewards/margins": 0.3586757779121399, + "rewards/rejected": -0.1498102843761444, "step": 206 }, { "epoch": 0.44, "learning_rate": 3.4714886441024573e-07, - "logits/chosen": 0.18471813201904297, - "logits/rejected": 0.5488951206207275, - "logps/chosen": -312.63348388671875, - "logps/rejected": -304.498291015625, - "loss": 0.6626, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.16734659671783447, - "rewards/margins": 0.5686672925949097, - "rewards/rejected": -0.7360140085220337, + "logits/chosen": -2.721184253692627, + "logits/rejected": -2.680152416229248, + "logps/chosen": -281.4962158203125, + "logps/rejected": -251.9808349609375, + "loss": 0.8599, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.14402586221694946, + "rewards/margins": 0.35486528277397156, + "rewards/rejected": -0.2108393907546997, "step": 208 }, { "epoch": 0.44, "learning_rate": 3.4376480090239047e-07, - "logits/chosen": 0.2744285464286804, - "logits/rejected": 0.882749080657959, - "logps/chosen": -260.8684997558594, - "logps/rejected": -292.6208190917969, - "loss": 0.7122, + "logits/chosen": -2.7290287017822266, + "logits/rejected": -2.7316441535949707, + "logps/chosen": -217.75003051757812, + "logps/rejected": -235.74436950683594, + "loss": 0.9026, "rewards/accuracies": 0.671875, - "rewards/chosen": -0.3560115098953247, - "rewards/margins": 0.36396852135658264, - "rewards/rejected": -0.719980001449585, + "rewards/chosen": 0.07517298310995102, + "rewards/margins": 0.22638867795467377, + "rewards/rejected": -0.15121567249298096, "step": 210 }, { "epoch": 0.44, "learning_rate": 3.403606243773448e-07, - "logits/chosen": 0.10863735526800156, - "logits/rejected": 0.9924963116645813, - "logps/chosen": -277.6872863769531, - "logps/rejected": -296.84375, - "loss": 0.5924, + "logits/chosen": -2.7563090324401855, + "logits/rejected": -2.689605236053467, + "logps/chosen": -245.2488250732422, + "logps/rejected": -244.4375457763672, + "loss": 0.7967, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.19410894811153412, - "rewards/margins": 0.49725741147994995, - "rewards/rejected": -0.6913663744926453, + "rewards/chosen": 0.13027535378932953, + "rewards/margins": 0.29757967591285706, + "rewards/rejected": -0.16730432212352753, "step": 212 }, { "epoch": 0.45, "learning_rate": 3.3693706504794243e-07, - "logits/chosen": -0.0544244647026062, - "logits/rejected": 0.6754695773124695, - "logps/chosen": -276.8150634765625, - "logps/rejected": -309.6656494140625, - "loss": 0.6933, + "logits/chosen": -2.724280595779419, + "logits/rejected": -2.6927483081817627, + "logps/chosen": -240.44247436523438, + "logps/rejected": -244.09005737304688, + "loss": 0.9075, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.18632037937641144, - "rewards/margins": 0.4902322292327881, - "rewards/rejected": -0.6765525937080383, + "rewards/chosen": 0.1774054914712906, + "rewards/margins": 0.1982022523880005, + "rewards/rejected": -0.020796751603484154, "step": 214 }, { "epoch": 0.45, "learning_rate": 3.334948572847253e-07, - "logits/chosen": 0.12526825070381165, - "logits/rejected": 0.2164376825094223, - "logps/chosen": -288.12420654296875, - "logps/rejected": -301.5098876953125, - "loss": 0.644, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.2477717101573944, - "rewards/margins": 0.510245680809021, - "rewards/rejected": -0.7580174207687378, + "logits/chosen": -2.67922306060791, + "logits/rejected": -2.6972858905792236, + "logps/chosen": -251.50338745117188, + "logps/rejected": -240.28338623046875, + "loss": 0.8234, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.1184362918138504, + "rewards/margins": 0.26418930292129517, + "rewards/rejected": -0.14575302600860596, "step": 216 }, { "epoch": 0.46, "learning_rate": 3.300347394584172e-07, - "logits/chosen": -0.517288863658905, - "logits/rejected": -0.44046011567115784, - "logps/chosen": -271.6811218261719, - "logps/rejected": -325.8108215332031, - "loss": 0.6557, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.2316058874130249, - "rewards/margins": 0.4710268974304199, - "rewards/rejected": -0.7026327848434448, + "logits/chosen": -2.7296793460845947, + "logits/rejected": -2.7728610038757324, + "logps/chosen": -239.6943359375, + "logps/rejected": -272.48577880859375, + "loss": 0.8447, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.0882621705532074, + "rewards/margins": 0.25764429569244385, + "rewards/rejected": -0.16938212513923645, "step": 218 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-07, - "logits/chosen": -0.6968811750411987, - "logits/rejected": -0.789453387260437, - "logps/chosen": -290.6319580078125, - "logps/rejected": -339.7631530761719, - "loss": 0.635, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.15982553362846375, - "rewards/margins": 0.5405714511871338, - "rewards/rejected": -0.7003970146179199, + "logits/chosen": -2.6759114265441895, + "logits/rejected": -2.745467185974121, + "logps/chosen": -264.6651916503906, + "logps/rejected": -283.7639465332031, + "loss": 0.8546, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.09984200447797775, + "rewards/margins": 0.24024665355682373, + "rewards/rejected": -0.14040464162826538, "step": 220 }, { "epoch": 0.46, "learning_rate": 3.230637461492043e-07, - "logits/chosen": -0.4320661425590515, - "logits/rejected": -0.4025118052959442, - "logps/chosen": -280.6953430175781, - "logps/rejected": -280.9771728515625, - "loss": 0.6636, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.15578195452690125, - "rewards/margins": 0.5092009902000427, - "rewards/rejected": -0.6649829745292664, + "logits/chosen": -2.51967716217041, + "logits/rejected": -2.5612313747406006, + "logps/chosen": -251.1386260986328, + "logps/rejected": -229.07337951660156, + "loss": 0.8895, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.13978508114814758, + "rewards/margins": 0.2857304513454437, + "rewards/rejected": -0.14594532549381256, "step": 222 }, { "epoch": 0.47, "learning_rate": 3.1955436597911315e-07, - "logits/chosen": -0.9900290369987488, - "logits/rejected": -0.5774145126342773, - "logps/chosen": -257.3419494628906, - "logps/rejected": -318.7354431152344, - "loss": 0.6765, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.020992910489439964, - "rewards/margins": 0.5579670071601868, - "rewards/rejected": -0.5369741320610046, + "logits/chosen": -2.5901856422424316, + "logits/rejected": -2.5820000171661377, + "logps/chosen": -231.24229431152344, + "logps/rejected": -267.16851806640625, + "loss": 0.8667, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.28198927640914917, + "rewards/margins": 0.3032940626144409, + "rewards/rejected": -0.021304823458194733, "step": 224 }, { "epoch": 0.47, "learning_rate": 3.160300660508064e-07, - "logits/chosen": -1.4757821559906006, - "logits/rejected": -0.815751314163208, - "logps/chosen": -307.86767578125, - "logps/rejected": -318.9004821777344, - "loss": 0.661, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.030079301446676254, - "rewards/margins": 0.5298373103141785, - "rewards/rejected": -0.5599166750907898, + "logits/chosen": -2.7369868755340576, + "logits/rejected": -2.7302329540252686, + "logps/chosen": -288.5295715332031, + "logps/rejected": -280.9129333496094, + "loss": 0.8476, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.16330157220363617, + "rewards/margins": 0.3433426320552826, + "rewards/rejected": -0.18004107475280762, "step": 226 }, { "epoch": 0.48, "learning_rate": 3.1249160234418644e-07, - "logits/chosen": -0.9963082671165466, - "logits/rejected": -0.20200899243354797, - "logps/chosen": -275.8343200683594, - "logps/rejected": -320.10943603515625, - "loss": 0.6312, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.09299668669700623, - "rewards/margins": 0.47309738397598267, - "rewards/rejected": -0.5660940408706665, + "logits/chosen": -2.6574654579162598, + "logits/rejected": -2.6182937622070312, + "logps/chosen": -254.96800231933594, + "logps/rejected": -263.0310974121094, + "loss": 0.8401, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11566654592752457, + "rewards/margins": 0.11097730696201324, + "rewards/rejected": 0.0046892352402210236, "step": 228 }, { "epoch": 0.48, "learning_rate": 3.0893973387735683e-07, - "logits/chosen": -0.29111677408218384, - "logits/rejected": -0.016862310469150543, - "logps/chosen": -243.76763916015625, - "logps/rejected": -311.628662109375, - "loss": 0.6769, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.1675092577934265, - "rewards/margins": 0.49733197689056396, - "rewards/rejected": -0.6648411750793457, + "logits/chosen": -2.6950528621673584, + "logits/rejected": -2.6576457023620605, + "logps/chosen": -223.60475158691406, + "logps/rejected": -263.3905334472656, + "loss": 0.9043, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.03411956876516342, + "rewards/margins": 0.21657957136631012, + "rewards/rejected": -0.1824600100517273, "step": 230 }, { "epoch": 0.49, "learning_rate": 3.05375222543809e-07, - "logits/chosen": -0.8505387306213379, - "logits/rejected": -0.23501801490783691, - "logps/chosen": -294.7572021484375, - "logps/rejected": -310.2570495605469, - "loss": 0.7179, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.1568153351545334, - "rewards/margins": 0.5180144906044006, - "rewards/rejected": -0.6748298406600952, + "logits/chosen": -2.7462079524993896, + "logits/rejected": -2.752439260482788, + "logps/chosen": -269.77349853515625, + "logps/rejected": -253.81222534179688, + "loss": 0.9722, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.09302152693271637, + "rewards/margins": 0.20340323448181152, + "rewards/rejected": -0.11038171499967575, "step": 232 }, { "epoch": 0.49, "learning_rate": 3.017988329489923e-07, - "logits/chosen": -0.2259301245212555, - "logits/rejected": 0.060025911778211594, - "logps/chosen": -325.3514099121094, - "logps/rejected": -357.01263427734375, - "loss": 0.7001, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.1160479336977005, - "rewards/margins": 0.5354103446006775, - "rewards/rejected": -0.6514582633972168, + "logits/chosen": -2.7211241722106934, + "logits/rejected": -2.724308729171753, + "logps/chosen": -298.8631896972656, + "logps/rejected": -296.299072265625, + "loss": 0.8966, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.1488339602947235, + "rewards/margins": 0.19315680861473083, + "rewards/rejected": -0.044322848320007324, "step": 234 }, { "epoch": 0.49, "learning_rate": 2.9821133224630223e-07, - "logits/chosen": -0.24249279499053955, - "logits/rejected": 0.2804643511772156, - "logps/chosen": -306.503662109375, - "logps/rejected": -328.2669372558594, - "loss": 0.6416, + "logits/chosen": -2.687819719314575, + "logits/rejected": -2.67553973197937, + "logps/chosen": -269.1780090332031, + "logps/rejected": -264.85516357421875, + "loss": 0.7953, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.3325122892856598, - "rewards/margins": 0.5438024401664734, - "rewards/rejected": -0.8763147592544556, + "rewards/chosen": 0.04074423760175705, + "rewards/margins": 0.282941073179245, + "rewards/rejected": -0.24219684302806854, "step": 236 }, { "epoch": 0.5, "learning_rate": 2.946134899725226e-07, - "logits/chosen": -0.21587029099464417, - "logits/rejected": 0.22806911170482635, - "logps/chosen": -272.1225891113281, - "logps/rejected": -317.8914489746094, - "loss": 0.7015, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2932060956954956, - "rewards/margins": 0.544987142086029, - "rewards/rejected": -0.8381932377815247, + "logits/chosen": -2.7366926670074463, + "logits/rejected": -2.6992833614349365, + "logps/chosen": -236.1741180419922, + "logps/rejected": -250.46524047851562, + "loss": 0.9427, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.06627872586250305, + "rewards/margins": 0.2302100658416748, + "rewards/rejected": -0.16393133997917175, "step": 238 }, { "epoch": 0.5, "learning_rate": 2.910060778827554e-07, - "logits/chosen": 0.44503313302993774, - "logits/rejected": 0.7824755311012268, - "logps/chosen": -242.06944274902344, - "logps/rejected": -281.43572998046875, - "loss": 0.6554, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.23596498370170593, - "rewards/margins": 0.4384329915046692, - "rewards/rejected": -0.6743979454040527, + "logits/chosen": -2.56319260597229, + "logits/rejected": -2.5229761600494385, + "logps/chosen": -205.67662048339844, + "logps/rejected": -219.51187133789062, + "loss": 0.8552, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.12796306610107422, + "rewards/margins": 0.1831222027540207, + "rewards/rejected": -0.05515911057591438, "step": 240 }, { "epoch": 0.51, "learning_rate": 2.873898697848762e-07, - "logits/chosen": 0.17147797346115112, - "logits/rejected": 0.5953517556190491, - "logps/chosen": -300.029296875, - "logps/rejected": -331.022216796875, - "loss": 0.6662, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.23151984810829163, - "rewards/margins": 0.441323846578598, - "rewards/rejected": -0.6728437542915344, + "logits/chosen": -2.6426453590393066, + "logits/rejected": -2.6488285064697266, + "logps/chosen": -266.0320739746094, + "logps/rejected": -267.57720947265625, + "loss": 0.8925, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.10845239460468292, + "rewards/margins": 0.14684635400772095, + "rewards/rejected": -0.03839395195245743, "step": 242 }, { "epoch": 0.51, "learning_rate": 2.837656413735479e-07, - "logits/chosen": 0.24648509919643402, - "logits/rejected": 0.7520670294761658, - "logps/chosen": -287.30377197265625, - "logps/rejected": -344.3001708984375, - "loss": 0.6139, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.20726029574871063, - "rewards/margins": 0.7070189118385315, - "rewards/rejected": -0.9142792820930481, + "logits/chosen": -2.6461117267608643, + "logits/rejected": -2.66263747215271, + "logps/chosen": -248.2384033203125, + "logps/rejected": -269.6097106933594, + "loss": 0.8246, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.183393195271492, + "rewards/margins": 0.35076749324798584, + "rewards/rejected": -0.16737432777881622, "step": 244 }, { "epoch": 0.51, "learning_rate": 2.801341700638307e-07, - "logits/chosen": 0.4327443242073059, - "logits/rejected": 0.28958457708358765, - "logps/chosen": -314.9183349609375, - "logps/rejected": -296.54376220703125, - "loss": 0.6514, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.24840864539146423, - "rewards/margins": 0.4947468638420105, - "rewards/rejected": -0.7431554794311523, + "logits/chosen": -2.7126615047454834, + "logits/rejected": -2.7577171325683594, + "logps/chosen": -276.72100830078125, + "logps/rejected": -236.76815795898438, + "loss": 0.8063, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.13356462121009827, + "rewards/margins": 0.2789640724658966, + "rewards/rejected": -0.14539945125579834, "step": 246 }, { "epoch": 0.52, "learning_rate": 2.7649623482442274e-07, - "logits/chosen": 0.5047562718391418, - "logits/rejected": 1.313357949256897, - "logps/chosen": -284.20257568359375, - "logps/rejected": -354.37872314453125, - "loss": 0.6634, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.3625330328941345, - "rewards/margins": 0.5466379523277283, - "rewards/rejected": -0.9091709852218628, + "logits/chosen": -2.632189989089966, + "logits/rejected": -2.605233669281006, + "logps/chosen": -243.29464721679688, + "logps/rejected": -292.3890686035156, + "loss": 0.8651, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.04654625803232193, + "rewards/margins": 0.3358207941055298, + "rewards/rejected": -0.28927454352378845, "step": 248 }, { "epoch": 0.52, "learning_rate": 2.7285261601056697e-07, - "logits/chosen": 0.19208696484565735, - "logits/rejected": 0.3003866970539093, - "logps/chosen": -306.0182189941406, - "logps/rejected": -343.263671875, - "loss": 0.6481, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.1929151564836502, - "rewards/margins": 0.7904263138771057, - "rewards/rejected": -0.9833414554595947, + "logits/chosen": -2.7081713676452637, + "logits/rejected": -2.6980645656585693, + "logps/chosen": -279.4450988769531, + "logps/rejected": -274.97735595703125, + "loss": 0.8543, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.07281598448753357, + "rewards/margins": 0.37329408526420593, + "rewards/rejected": -0.30047810077667236, "step": 250 }, { "epoch": 0.53, "learning_rate": 2.692040951966617e-07, - "logits/chosen": 0.5424776077270508, - "logits/rejected": 0.7004806995391846, - "logps/chosen": -312.521728515625, - "logps/rejected": -328.9450378417969, - "loss": 0.5763, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.0869397521018982, - "rewards/margins": 0.742694616317749, - "rewards/rejected": -0.8296343088150024, + "logits/chosen": -2.6583333015441895, + "logits/rejected": -2.739175796508789, + "logps/chosen": -289.65753173828125, + "logps/rejected": -276.938720703125, + "loss": 0.7868, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.14170213043689728, + "rewards/margins": 0.45127326250076294, + "rewards/rejected": -0.30957114696502686, "step": 252 }, { "epoch": 0.53, "learning_rate": 2.655514550086086e-07, - "logits/chosen": 0.3900845944881439, - "logits/rejected": 1.1437915563583374, - "logps/chosen": -270.412353515625, - "logps/rejected": -285.83740234375, - "loss": 0.6635, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.039971936494112015, - "rewards/margins": 0.6791943907737732, - "rewards/rejected": -0.6392224431037903, + "logits/chosen": -2.648866891860962, + "logits/rejected": -2.6364998817443848, + "logps/chosen": -258.0337829589844, + "logps/rejected": -243.5597686767578, + "loss": 0.8754, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.1637575477361679, + "rewards/margins": 0.380203515291214, + "rewards/rejected": -0.21644596755504608, "step": 254 }, { "epoch": 0.54, "learning_rate": 2.618954789559356e-07, - "logits/chosen": -0.6006118059158325, - "logits/rejected": 0.0894058495759964, - "logps/chosen": -329.71051025390625, - "logps/rejected": -360.0182800292969, - "loss": 0.7222, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2027689814567566, - "rewards/margins": 0.3933987617492676, - "rewards/rejected": -0.5961677432060242, + "logits/chosen": -2.735349655151367, + "logits/rejected": -2.7232978343963623, + "logps/chosen": -306.3507995605469, + "logps/rejected": -313.97503662109375, + "loss": 0.9665, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.030827980488538742, + "rewards/margins": 0.16656307876110077, + "rewards/rejected": -0.13573509454727173, "step": 256 }, { "epoch": 0.54, "learning_rate": 2.582369512637302e-07, - "logits/chosen": 0.022289007902145386, - "logits/rejected": 0.7651575207710266, - "logps/chosen": -239.86917114257812, - "logps/rejected": -276.7130126953125, - "loss": 0.6653, - "rewards/accuracies": 0.828125, - "rewards/chosen": 0.07738906890153885, - "rewards/margins": 0.4737430810928345, - "rewards/rejected": -0.3963540196418762, + "logits/chosen": -2.6541173458099365, + "logits/rejected": -2.620288610458374, + "logps/chosen": -222.8006591796875, + "logps/rejected": -244.00851440429688, + "loss": 0.8402, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.24807412922382355, + "rewards/margins": 0.3173832297325134, + "rewards/rejected": -0.06930903345346451, "step": 258 }, { "epoch": 0.54, "learning_rate": 2.5457665670441937e-07, - "logits/chosen": 0.08083158731460571, - "logits/rejected": 0.5649360418319702, - "logps/chosen": -276.38232421875, - "logps/rejected": -330.9521789550781, - "loss": 0.7062, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.10466089844703674, - "rewards/margins": 0.6767792701721191, - "rewards/rejected": -0.7814401984214783, + "logits/chosen": -2.726813793182373, + "logits/rejected": -2.6844475269317627, + "logps/chosen": -253.82244873046875, + "logps/rejected": -278.234375, + "loss": 0.8644, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.12093760073184967, + "rewards/margins": 0.37519997358322144, + "rewards/rejected": -0.25426238775253296, "step": 260 }, { "epoch": 0.55, "learning_rate": 2.509153804294318e-07, - "logits/chosen": 0.3234708309173584, - "logits/rejected": 0.7615450024604797, - "logps/chosen": -269.09417724609375, - "logps/rejected": -293.904052734375, - "loss": 0.6256, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.07213997095823288, - "rewards/margins": 0.43272051215171814, - "rewards/rejected": -0.504860520362854, + "logits/chosen": -2.7090988159179688, + "logits/rejected": -2.6777806282043457, + "logps/chosen": -245.5247802734375, + "logps/rejected": -253.37815856933594, + "loss": 0.8277, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1635538786649704, + "rewards/margins": 0.2631555199623108, + "rewards/rejected": -0.09960167109966278, "step": 262 }, { "epoch": 0.55, "learning_rate": 2.4725390780077905e-07, - "logits/chosen": 0.46226564049720764, - "logits/rejected": 0.5075880885124207, - "logps/chosen": -314.9215087890625, - "logps/rejected": -332.1287536621094, - "loss": 0.7012, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.053367726504802704, - "rewards/margins": 0.6606994867324829, - "rewards/rejected": -0.7140672206878662, + "logits/chosen": -2.6489369869232178, + "logits/rejected": -2.6580944061279297, + "logps/chosen": -288.50787353515625, + "logps/rejected": -283.6526184082031, + "loss": 0.9073, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.2107684165239334, + "rewards/margins": 0.44007429480552673, + "rewards/rejected": -0.22930584847927094, "step": 264 }, { "epoch": 0.56, "learning_rate": 2.435930242225919e-07, - "logits/chosen": 0.2664536237716675, - "logits/rejected": 1.190063714981079, - "logps/chosen": -302.0753479003906, - "logps/rejected": -365.6280822753906, - "loss": 0.6695, + "logits/chosen": -2.7068402767181396, + "logits/rejected": -2.696272373199463, + "logps/chosen": -278.40228271484375, + "logps/rejected": -313.0991516113281, + "loss": 0.8511, "rewards/accuracies": 0.765625, - "rewards/chosen": -0.07531341910362244, - "rewards/margins": 0.6120296120643616, - "rewards/rejected": -0.6873430609703064, + "rewards/chosen": 0.16141736507415771, + "rewards/margins": 0.3234710991382599, + "rewards/rejected": -0.16205376386642456, "step": 266 }, { "epoch": 0.56, "learning_rate": 2.399335149726463e-07, - "logits/chosen": 0.32580170035362244, - "logits/rejected": 0.5576289892196655, - "logps/chosen": -325.13726806640625, - "logps/rejected": -381.2005615234375, - "loss": 0.6745, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.10618472844362259, - "rewards/margins": 0.5702216625213623, - "rewards/rejected": -0.6764063835144043, + "logits/chosen": -2.625251054763794, + "logits/rejected": -2.63790225982666, + "logps/chosen": -304.51580810546875, + "logps/rejected": -328.6022644042969, + "loss": 0.8956, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.10003012418746948, + "rewards/margins": 0.25045347213745117, + "rewards/rejected": -0.1504233181476593, "step": 268 }, { "epoch": 0.57, "learning_rate": 2.3627616503391812e-07, - "logits/chosen": 0.29107674956321716, - "logits/rejected": 0.5633612871170044, - "logps/chosen": -277.61590576171875, - "logps/rejected": -348.748291015625, - "loss": 0.6212, + "logits/chosen": -2.7186062335968018, + "logits/rejected": -2.7630484104156494, + "logps/chosen": -248.68087768554688, + "logps/rejected": -298.2981872558594, + "loss": 0.8321, "rewards/accuracies": 0.765625, - "rewards/chosen": -0.12373210489749908, - "rewards/margins": 0.578803300857544, - "rewards/rejected": -0.7025353908538818, + "rewards/chosen": 0.1656179130077362, + "rewards/margins": 0.36365240812301636, + "rewards/rejected": -0.19803446531295776, "step": 270 }, { "epoch": 0.57, "learning_rate": 2.3262175892620062e-07, - "logits/chosen": 0.6590377688407898, - "logits/rejected": 0.6782903075218201, - "logps/chosen": -301.8876647949219, - "logps/rejected": -348.36968994140625, - "loss": 0.732, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.06287754327058792, - "rewards/margins": 0.7469431757926941, - "rewards/rejected": -0.8098206520080566, + "logits/chosen": -2.71309494972229, + "logits/rejected": -2.756438732147217, + "logps/chosen": -279.9858093261719, + "logps/rejected": -294.4936828613281, + "loss": 0.9392, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1561407744884491, + "rewards/margins": 0.42720139026641846, + "rewards/rejected": -0.27106061577796936, "step": 272 }, { "epoch": 0.57, "learning_rate": 2.2897108053782e-07, - "logits/chosen": 0.5858335494995117, - "logits/rejected": 1.1848359107971191, - "logps/chosen": -278.7359619140625, - "logps/rejected": -322.556884765625, - "loss": 0.6291, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.1839427649974823, - "rewards/margins": 0.5925223231315613, - "rewards/rejected": -0.7764650583267212, + "logits/chosen": -2.7783031463623047, + "logits/rejected": -2.743617057800293, + "logps/chosen": -255.32431030273438, + "logps/rejected": -272.09881591796875, + "loss": 0.855, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.05017377436161041, + "rewards/margins": 0.3220583498477936, + "rewards/rejected": -0.2718845307826996, "step": 274 }, { "epoch": 0.58, "learning_rate": 2.2532491295748865e-07, - "logits/chosen": 0.5746503472328186, - "logits/rejected": 1.5114972591400146, - "logps/chosen": -256.6669616699219, - "logps/rejected": -291.76373291015625, - "loss": 0.677, + "logits/chosen": -2.752013683319092, + "logits/rejected": -2.6955549716949463, + "logps/chosen": -232.36911010742188, + "logps/rejected": -245.0435333251953, + "loss": 0.824, "rewards/accuracies": 0.71875, - "rewards/chosen": -0.17329725623130798, - "rewards/margins": 0.6033150553703308, - "rewards/rejected": -0.7766122817993164, + "rewards/chosen": 0.06968133896589279, + "rewards/margins": 0.37909212708473206, + "rewards/rejected": -0.30941081047058105, "step": 276 }, { "epoch": 0.58, "learning_rate": 2.2168403830632769e-07, - "logits/chosen": 1.2941956520080566, - "logits/rejected": 1.145210862159729, - "logps/chosen": -253.27915954589844, - "logps/rejected": -313.1325378417969, - "loss": 0.63, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.18560369312763214, - "rewards/margins": 0.5705838203430176, - "rewards/rejected": -0.7561875581741333, + "logits/chosen": -2.6277527809143066, + "logits/rejected": -2.630091905593872, + "logps/chosen": -228.22360229492188, + "logps/rejected": -266.8860778808594, + "loss": 0.8247, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.06495171785354614, + "rewards/margins": 0.3586746156215668, + "rewards/rejected": -0.29372289776802063, "step": 278 }, { "epoch": 0.59, "learning_rate": 2.1804923757009882e-07, - "logits/chosen": 0.9151768684387207, - "logits/rejected": 1.2065914869308472, - "logps/chosen": -335.712890625, - "logps/rejected": -334.68817138671875, - "loss": 0.5678, - "rewards/accuracies": 0.875, - "rewards/chosen": -0.24476921558380127, - "rewards/margins": 0.7057139873504639, - "rewards/rejected": -0.9504832029342651, + "logits/chosen": -2.7561256885528564, + "logits/rejected": -2.795393466949463, + "logps/chosen": -303.15875244140625, + "logps/rejected": -270.75982666015625, + "loss": 0.7497, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.08077248930931091, + "rewards/margins": 0.3919723629951477, + "rewards/rejected": -0.3111998736858368, "step": 280 }, { "epoch": 0.59, "learning_rate": 2.1442129043167873e-07, - "logits/chosen": 0.8291666507720947, - "logits/rejected": 1.7541477680206299, - "logps/chosen": -330.841064453125, - "logps/rejected": -347.6577453613281, - "loss": 0.6978, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.29731371998786926, - "rewards/margins": 0.6203598380088806, - "rewards/rejected": -0.917673647403717, + "logits/chosen": -2.7110955715179443, + "logits/rejected": -2.6533727645874023, + "logps/chosen": -293.0301818847656, + "logps/rejected": -282.8433837890625, + "loss": 0.9302, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.08079522103071213, + "rewards/margins": 0.350325345993042, + "rewards/rejected": -0.26953014731407166, "step": 282 }, { "epoch": 0.59, "learning_rate": 2.1080097510381294e-07, - "logits/chosen": 0.8476366400718689, - "logits/rejected": 1.6469848155975342, - "logps/chosen": -290.6492004394531, - "logps/rejected": -338.7218017578125, - "loss": 0.7014, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.2652449905872345, - "rewards/margins": 0.51957768201828, - "rewards/rejected": -0.7848227024078369, + "logits/chosen": -2.803464889526367, + "logits/rejected": -2.781702756881714, + "logps/chosen": -255.2788848876953, + "logps/rejected": -279.19287109375, + "loss": 0.9151, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.08845841884613037, + "rewards/margins": 0.27799126505851746, + "rewards/rejected": -0.1895328313112259, "step": 284 }, { "epoch": 0.6, "learning_rate": 2.0718906816218595e-07, - "logits/chosen": 1.3053678274154663, - "logits/rejected": 1.4397318363189697, - "logps/chosen": -312.151123046875, - "logps/rejected": -353.7760009765625, - "loss": 0.5969, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.3702758550643921, - "rewards/margins": 0.655584454536438, - "rewards/rejected": -1.02586030960083, + "logits/chosen": -2.7392845153808594, + "logits/rejected": -2.7706708908081055, + "logps/chosen": -268.2358703613281, + "logps/rejected": -271.9430847167969, + "loss": 0.7782, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.06887663900852203, + "rewards/margins": 0.2764078974723816, + "rewards/rejected": -0.20753124356269836, "step": 286 }, { "epoch": 0.6, "learning_rate": 2.035863443788411e-07, - "logits/chosen": 1.1411147117614746, - "logits/rejected": 1.3497332334518433, - "logps/chosen": -334.48223876953125, - "logps/rejected": -358.02911376953125, - "loss": 0.6894, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.24735981225967407, - "rewards/margins": 0.7227092981338501, - "rewards/rejected": -0.9700690507888794, + "logits/chosen": -2.6818222999572754, + "logits/rejected": -2.71232533454895, + "logps/chosen": -294.4471130371094, + "logps/rejected": -282.6849670410156, + "loss": 0.8824, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.1529916673898697, + "rewards/margins": 0.3696192800998688, + "rewards/rejected": -0.2166275829076767, "step": 288 }, { "epoch": 0.61, "learning_rate": 1.9999357655598891e-07, - "logits/chosen": 1.477621078491211, - "logits/rejected": 2.1223461627960205, - "logps/chosen": -277.4604187011719, - "logps/rejected": -316.42803955078125, - "loss": 0.6614, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.5241543650627136, - "rewards/margins": 0.5040414333343506, - "rewards/rejected": -1.0281956195831299, + "logits/chosen": -2.6832895278930664, + "logits/rejected": -2.72611141204834, + "logps/chosen": -232.6370391845703, + "logps/rejected": -240.6990966796875, + "loss": 0.8592, + "rewards/accuracies": 0.703125, + "rewards/chosen": -0.07592064142227173, + "rewards/margins": 0.19498607516288757, + "rewards/rejected": -0.2709067165851593, "step": 290 }, { "epoch": 0.61, "learning_rate": 1.9641153536023642e-07, - "logits/chosen": 0.708802342414856, - "logits/rejected": 1.7905950546264648, - "logps/chosen": -260.52166748046875, - "logps/rejected": -309.6279602050781, - "loss": 0.6517, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.22097410261631012, - "rewards/margins": 0.6726060509681702, - "rewards/rejected": -0.8935801982879639, + "logits/chosen": -2.757486581802368, + "logits/rejected": -2.7176427841186523, + "logps/chosen": -230.83448791503906, + "logps/rejected": -239.2701416015625, + "loss": 0.8443, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.0758976861834526, + "rewards/margins": 0.2658996880054474, + "rewards/rejected": -0.19000205397605896, "step": 292 }, { "epoch": 0.62, "learning_rate": 1.928409891572757e-07, - "logits/chosen": 0.6345776915550232, - "logits/rejected": 1.2965819835662842, - "logps/chosen": -296.1861572265625, - "logps/rejected": -325.6953430175781, - "loss": 0.6571, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.15726947784423828, - "rewards/margins": 0.33030712604522705, - "rewards/rejected": -0.48757660388946533, + "logits/chosen": -2.7726337909698486, + "logits/rejected": -2.7439818382263184, + "logps/chosen": -268.2599182128906, + "logps/rejected": -282.44757080078125, + "loss": 0.8611, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12199299037456512, + "rewards/margins": 0.1770920306444168, + "rewards/rejected": -0.055099040269851685, "step": 294 }, { "epoch": 0.62, "learning_rate": 1.8928270384706582e-07, - "logits/chosen": 0.06032524257898331, - "logits/rejected": 0.7254040241241455, - "logps/chosen": -320.2947998046875, - "logps/rejected": -417.827392578125, - "loss": 0.823, - "rewards/accuracies": 0.625, - "rewards/chosen": -0.05728043615818024, - "rewards/margins": 0.4396788477897644, - "rewards/rejected": -0.49695926904678345, + "logits/chosen": -2.6347806453704834, + "logits/rejected": -2.6196320056915283, + "logps/chosen": -285.2722473144531, + "logps/rejected": -349.5801696777344, + "loss": 1.0746, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.29294514656066895, + "rewards/margins": 0.10743236541748047, + "rewards/rejected": 0.1855127513408661, "step": 296 }, { "epoch": 0.62, "learning_rate": 1.8573744269954297e-07, - "logits/chosen": -0.6211626529693604, - "logits/rejected": 0.5601530075073242, - "logps/chosen": -387.068359375, - "logps/rejected": -362.6905517578125, - "loss": 0.7065, - "rewards/accuracies": 0.703125, - "rewards/chosen": 0.007479578256607056, - "rewards/margins": 0.5313453078269958, - "rewards/rejected": -0.5238656997680664, + "logits/chosen": -2.745089530944824, + "logits/rejected": -2.7034072875976562, + "logps/chosen": -363.0157470703125, + "logps/rejected": -315.6820068359375, + "loss": 0.9456, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24800607562065125, + "rewards/margins": 0.30178576707839966, + "rewards/rejected": -0.05377965793013573, "step": 298 }, { "epoch": 0.63, "learning_rate": 1.8220596619089573e-07, - "logits/chosen": 0.012296931818127632, - "logits/rejected": 0.47643163800239563, - "logps/chosen": -278.79449462890625, - "logps/rejected": -358.36175537109375, - "loss": 0.6023, - "rewards/accuracies": 0.828125, - "rewards/chosen": -0.02508646249771118, - "rewards/margins": 0.529964029788971, - "rewards/rejected": -0.5550505518913269, + "logits/chosen": -2.70751953125, + "logits/rejected": -2.7504544258117676, + "logps/chosen": -259.25885009765625, + "logps/rejected": -320.9954528808594, + "loss": 0.8518, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1702701300382614, + "rewards/margins": 0.35165777802467346, + "rewards/rejected": -0.18138763308525085, "step": 300 }, { "epoch": 0.63, "learning_rate": 1.7868903184043885e-07, - "logits/chosen": 0.22885966300964355, - "logits/rejected": 0.6849693655967712, - "logps/chosen": -245.32431030273438, - "logps/rejected": -260.42095947265625, - "loss": 0.6808, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.2312414050102234, - "rewards/margins": 0.5006843209266663, - "rewards/rejected": -0.7319257855415344, + "logits/chosen": -2.802051305770874, + "logits/rejected": -2.777665138244629, + "logps/chosen": -218.00796508789062, + "logps/rejected": -211.69900512695312, + "loss": 0.882, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.04192201793193817, + "rewards/margins": 0.2866283059120178, + "rewards/rejected": -0.24470627307891846, "step": 302 }, { "epoch": 0.64, "learning_rate": 1.7518739404812155e-07, - "logits/chosen": 0.10696160048246384, - "logits/rejected": 0.900860071182251, - "logps/chosen": -304.24365234375, - "logps/rejected": -300.4913330078125, - "loss": 0.6239, - "rewards/accuracies": 0.765625, - "rewards/chosen": 0.06586351990699768, - "rewards/margins": 0.6996278762817383, - "rewards/rejected": -0.6337643265724182, + "logits/chosen": -2.714080333709717, + "logits/rejected": -2.7069191932678223, + "logps/chosen": -284.216796875, + "logps/rejected": -254.74819946289062, + "loss": 0.8666, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2661323547363281, + "rewards/margins": 0.44246548414230347, + "rewards/rejected": -0.17633305490016937, "step": 304 }, { "epoch": 0.64, "learning_rate": 1.717018039327053e-07, - "logits/chosen": 0.37065258622169495, - "logits/rejected": 1.0153319835662842, - "logps/chosen": -274.7767639160156, - "logps/rejected": -315.3498840332031, - "loss": 0.6113, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.13434213399887085, - "rewards/margins": 0.5673910975456238, - "rewards/rejected": -0.7017332315444946, + "logits/chosen": -2.7300875186920166, + "logits/rejected": -2.7512590885162354, + "logps/chosen": -250.939208984375, + "logps/rejected": -264.42950439453125, + "loss": 0.8208, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.10403341054916382, + "rewards/margins": 0.29656291007995605, + "rewards/rejected": -0.19252946972846985, "step": 306 }, { "epoch": 0.64, "learning_rate": 1.6823300917064458e-07, - "logits/chosen": 0.0030569732189178467, - "logits/rejected": 1.1464507579803467, - "logps/chosen": -292.8100280761719, - "logps/rejected": -329.35333251953125, - "loss": 0.5978, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.3217441141605377, - "rewards/margins": 0.6382726430892944, - "rewards/rejected": -0.9600168466567993, + "logits/chosen": -2.806004524230957, + "logits/rejected": -2.7536754608154297, + "logps/chosen": -254.1464385986328, + "logps/rejected": -263.18927001953125, + "loss": 0.7796, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06489163637161255, + "rewards/margins": 0.36326807737350464, + "rewards/rejected": -0.2983764708042145, "step": 308 }, { "epoch": 0.65, "learning_rate": 1.647817538357072e-07, - "logits/chosen": 0.3857354521751404, - "logits/rejected": 1.0852301120758057, - "logps/chosen": -329.72882080078125, - "logps/rejected": -318.942138671875, - "loss": 0.7355, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.1711738556623459, - "rewards/margins": 0.6827853322029114, - "rewards/rejected": -0.8539592027664185, + "logits/chosen": -2.758282423019409, + "logits/rejected": -2.767810821533203, + "logps/chosen": -294.16033935546875, + "logps/rejected": -253.86602783203125, + "loss": 0.9443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1845110058784485, + "rewards/margins": 0.3877091705799103, + "rewards/rejected": -0.20319817960262299, "step": 310 }, { "epoch": 0.65, "learning_rate": 1.6134877823936607e-07, - "logits/chosen": 0.8364083766937256, - "logits/rejected": 1.8597906827926636, - "logps/chosen": -264.943359375, - "logps/rejected": -348.2406921386719, - "loss": 0.5214, + "logits/chosen": -2.6954281330108643, + "logits/rejected": -2.6631717681884766, + "logps/chosen": -239.2750701904297, + "logps/rejected": -275.7354431152344, + "loss": 0.7568, "rewards/accuracies": 0.84375, - "rewards/chosen": -0.09745172411203384, - "rewards/margins": 1.012548565864563, - "rewards/rejected": -1.1100002527236938, + "rewards/chosen": 0.15923139452934265, + "rewards/margins": 0.5441789627075195, + "rewards/rejected": -0.3849475383758545, "step": 312 }, { "epoch": 0.66, "learning_rate": 1.5793481877199943e-07, - "logits/chosen": 0.5367469191551208, - "logits/rejected": 1.0159227848052979, - "logps/chosen": -320.8637390136719, - "logps/rejected": -364.44586181640625, - "loss": 0.6954, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.26059049367904663, - "rewards/margins": 0.5368437170982361, - "rewards/rejected": -0.7974342107772827, + "logits/chosen": -2.765436887741089, + "logits/rejected": -2.7954540252685547, + "logps/chosen": -280.85205078125, + "logps/rejected": -299.5735168457031, + "loss": 0.9009, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.13952624797821045, + "rewards/margins": 0.2882372736930847, + "rewards/rejected": -0.14871099591255188, "step": 314 }, { "epoch": 0.66, "learning_rate": 1.5454060774493065e-07, - "logits/chosen": 1.5845757722854614, - "logits/rejected": 0.7877135872840881, - "logps/chosen": -269.8088684082031, - "logps/rejected": -316.855712890625, - "loss": 0.6368, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.14322423934936523, - "rewards/margins": 0.6599938869476318, - "rewards/rejected": -0.8032180070877075, + "logits/chosen": -2.6711347103118896, + "logits/rejected": -2.751641035079956, + "logps/chosen": -233.09951782226562, + "logps/rejected": -261.0928039550781, + "loss": 0.853, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22386965155601501, + "rewards/margins": 0.46945852041244507, + "rewards/rejected": -0.24558885395526886, "step": 316 }, { "epoch": 0.67, "learning_rate": 1.5116687323334464e-07, - "logits/chosen": 0.47069552540779114, - "logits/rejected": 1.2589174509048462, - "logps/chosen": -289.40130615234375, - "logps/rejected": -347.61376953125, - "loss": 0.6596, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.15590085089206696, - "rewards/margins": 0.6152348518371582, - "rewards/rejected": -0.771135687828064, + "logits/chosen": -2.7912518978118896, + "logits/rejected": -2.7529494762420654, + "logps/chosen": -257.3804016113281, + "logps/rejected": -287.87310791015625, + "loss": 0.8653, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.16430845856666565, + "rewards/margins": 0.33803790807724, + "rewards/rejected": -0.17372946441173553, "step": 318 }, { "epoch": 0.67, "learning_rate": 1.478143389201113e-07, - "logits/chosen": 0.5592608451843262, - "logits/rejected": 1.0087881088256836, - "logps/chosen": -281.7051696777344, - "logps/rejected": -335.6897277832031, - "loss": 0.5898, - "rewards/accuracies": 0.859375, - "rewards/chosen": 0.045632533729076385, - "rewards/margins": 0.8765643835067749, - "rewards/rejected": -0.8309319019317627, + "logits/chosen": -2.5865142345428467, + "logits/rejected": -2.6156907081604004, + "logps/chosen": -257.1618957519531, + "logps/rejected": -277.2457580566406, + "loss": 0.7507, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.291064977645874, + "rewards/margins": 0.5375569462776184, + "rewards/rejected": -0.24649198353290558, "step": 320 }, { "epoch": 0.67, "learning_rate": 1.4448372394055246e-07, - "logits/chosen": -0.018945127725601196, - "logits/rejected": 0.802770733833313, - "logps/chosen": -328.23980712890625, - "logps/rejected": -333.0008239746094, - "loss": 0.6108, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.1781616359949112, - "rewards/margins": 0.6047887206077576, - "rewards/rejected": -0.7829503417015076, + "logits/chosen": -2.7859787940979004, + "logits/rejected": -2.7700722217559814, + "logps/chosen": -295.9079895019531, + "logps/rejected": -275.94921875, + "loss": 0.8344, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.14515657722949982, + "rewards/margins": 0.3575909733772278, + "rewards/rejected": -0.21243438124656677, "step": 322 }, { "epoch": 0.68, "learning_rate": 1.4117574272818386e-07, - "logits/chosen": -0.5540781021118164, - "logits/rejected": 0.035438332706689835, - "logps/chosen": -317.64404296875, - "logps/rejected": -343.2810974121094, - "loss": 0.6491, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.0927032008767128, - "rewards/margins": 0.4850231111049652, - "rewards/rejected": -0.577726423740387, + "logits/chosen": -2.839742422103882, + "logits/rejected": -2.7990095615386963, + "logps/chosen": -292.06414794921875, + "logps/rejected": -301.084228515625, + "loss": 0.8299, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.16309592127799988, + "rewards/margins": 0.31885361671447754, + "rewards/rejected": -0.15575766563415527, "step": 324 }, { "epoch": 0.68, "learning_rate": 1.3789110486146468e-07, - "logits/chosen": 0.01489407941699028, - "logits/rejected": 0.62424635887146, - "logps/chosen": -293.1672668457031, - "logps/rejected": -329.47479248046875, - "loss": 0.6931, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.16148582100868225, - "rewards/margins": 0.5662094950675964, - "rewards/rejected": -0.7276952862739563, + "logits/chosen": -2.8153977394104004, + "logits/rejected": -2.809354305267334, + "logps/chosen": -268.2123718261719, + "logps/rejected": -278.0255126953125, + "loss": 0.9075, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.08806290477514267, + "rewards/margins": 0.301265150308609, + "rewards/rejected": -0.21320223808288574, "step": 326 }, { "epoch": 0.69, "learning_rate": 1.3463051491159093e-07, - "logits/chosen": 0.06474697589874268, - "logits/rejected": -0.06373190879821777, - "logps/chosen": -314.8403015136719, - "logps/rejected": -331.58941650390625, - "loss": 0.6393, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.08766806125640869, - "rewards/margins": 0.6701959371566772, - "rewards/rejected": -0.7578639984130859, + "logits/chosen": -2.726783037185669, + "logits/rejected": -2.814533233642578, + "logps/chosen": -292.0963439941406, + "logps/rejected": -286.7141418457031, + "loss": 0.8571, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1397712677717209, + "rewards/margins": 0.44888272881507874, + "rewards/rejected": -0.30911147594451904, "step": 328 }, { "epoch": 0.69, "learning_rate": 1.3139467229135998e-07, - "logits/chosen": -0.7029250860214233, - "logits/rejected": -0.01946442574262619, - "logps/chosen": -334.6876220703125, - "logps/rejected": -347.177734375, - "loss": 0.63, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.16892662644386292, - "rewards/margins": 0.5833321809768677, - "rewards/rejected": -0.752258837223053, + "logits/chosen": -2.8347132205963135, + "logits/rejected": -2.8469901084899902, + "logps/chosen": -309.94439697265625, + "logps/rejected": -298.09735107421875, + "loss": 0.8232, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07850594073534012, + "rewards/margins": 0.3399609625339508, + "rewards/rejected": -0.2614549696445465, "step": 330 }, { "epoch": 0.69, "learning_rate": 1.281842711051438e-07, - "logits/chosen": -0.23442219197750092, - "logits/rejected": 0.7187111973762512, - "logps/chosen": -295.4104309082031, - "logps/rejected": -344.3884582519531, - "loss": 0.5594, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.11131522059440613, - "rewards/margins": 0.7158193588256836, - "rewards/rejected": -0.8271346092224121, + "logits/chosen": -2.7002525329589844, + "logits/rejected": -2.6935536861419678, + "logps/chosen": -268.6197204589844, + "logps/rejected": -288.15814208984375, + "loss": 0.7435, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.1565917432308197, + "rewards/margins": 0.4214229881763458, + "rewards/rejected": -0.26483121514320374, "step": 332 }, { "epoch": 0.7, "learning_rate": 1.2500000000000005e-07, - "logits/chosen": -0.23394931852817535, - "logits/rejected": 0.5850412845611572, - "logps/chosen": -316.489990234375, - "logps/rejected": -380.6622009277344, - "loss": 0.695, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.21047508716583252, - "rewards/margins": 0.6145602464675903, - "rewards/rejected": -0.8250353336334229, + "logits/chosen": -2.711907148361206, + "logits/rejected": -2.747971773147583, + "logps/chosen": -293.81781005859375, + "logps/rejected": -326.8099060058594, + "loss": 0.9277, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.01624665968120098, + "rewards/margins": 0.3027590811252594, + "rewards/rejected": -0.2865124046802521, "step": 334 }, { "epoch": 0.7, "learning_rate": 1.2184254201795363e-07, - "logits/chosen": 0.016806162893772125, - "logits/rejected": 0.5278490781784058, - "logps/chosen": -292.6138916015625, - "logps/rejected": -314.90130615234375, - "loss": 0.6083, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.21455985307693481, - "rewards/margins": 0.4614453911781311, - "rewards/rejected": -0.6760051846504211, + "logits/chosen": -2.6891090869903564, + "logits/rejected": -2.706904411315918, + "logps/chosen": -261.0979919433594, + "logps/rejected": -264.4306945800781, + "loss": 0.781, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.10059913992881775, + "rewards/margins": 0.27189815044403076, + "rewards/rejected": -0.171299010515213, "step": 336 }, { "epoch": 0.71, "learning_rate": 1.1871257444948096e-07, - "logits/chosen": 0.26967114210128784, - "logits/rejected": 0.47079068422317505, - "logps/chosen": -297.1998291015625, - "logps/rejected": -293.2596435546875, - "loss": 0.744, - "rewards/accuracies": 0.59375, - "rewards/chosen": -0.2197016477584839, - "rewards/margins": 0.3860716223716736, - "rewards/rejected": -0.6057732105255127, + "logits/chosen": -2.7001423835754395, + "logits/rejected": -2.7412095069885254, + "logps/chosen": -267.71435546875, + "logps/rejected": -240.58407592773438, + "loss": 0.9423, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.07515294849872589, + "rewards/margins": 0.15417048335075378, + "rewards/rejected": -0.0790175348520279, "step": 338 }, { "epoch": 0.71, "learning_rate": 1.1561076868822755e-07, - "logits/chosen": 0.01600978896021843, - "logits/rejected": 0.6568925380706787, - "logps/chosen": -254.8614044189453, - "logps/rejected": -302.18023681640625, - "loss": 0.6576, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.19405773282051086, - "rewards/margins": 0.5532448887825012, - "rewards/rejected": -0.7473027110099792, + "logits/chosen": -2.7272467613220215, + "logits/rejected": -2.740358352661133, + "logps/chosen": -227.6375732421875, + "logps/rejected": -245.42372131347656, + "loss": 0.8961, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07818037271499634, + "rewards/margins": 0.25791800022125244, + "rewards/rejected": -0.1797376275062561, "step": 340 }, { "epoch": 0.72, "learning_rate": 1.125377900869913e-07, - "logits/chosen": -0.40245479345321655, - "logits/rejected": 0.1633588671684265, - "logps/chosen": -301.2410888671875, - "logps/rejected": -386.735107421875, - "loss": 0.6539, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.09816699475049973, - "rewards/margins": 0.6523302793502808, - "rewards/rejected": -0.7504973411560059, + "logits/chosen": -2.735792636871338, + "logits/rejected": -2.749459981918335, + "logps/chosen": -278.9033203125, + "logps/rejected": -330.1114196777344, + "loss": 0.9268, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12521089613437653, + "rewards/margins": 0.3094714283943176, + "rewards/rejected": -0.1842605471611023, "step": 342 }, { "epoch": 0.72, "learning_rate": 1.09494297815e-07, - "logits/chosen": -0.19187738001346588, - "logits/rejected": 1.2669118642807007, - "logps/chosen": -223.18153381347656, - "logps/rejected": -265.8275146484375, - "loss": 0.6018, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.05469350516796112, - "rewards/margins": 0.8093068599700928, - "rewards/rejected": -0.8640003800392151, + "logits/chosen": -2.6738338470458984, + "logits/rejected": -2.6385083198547363, + "logps/chosen": -203.19024658203125, + "logps/rejected": -210.32115173339844, + "loss": 0.7715, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.14521953463554382, + "rewards/margins": 0.45415645837783813, + "rewards/rejected": -0.3089369535446167, "step": 344 }, { "epoch": 0.72, "learning_rate": 1.0648094471651722e-07, - "logits/chosen": 0.06107745319604874, - "logits/rejected": 1.0187910795211792, - "logps/chosen": -298.7318420410156, - "logps/rejected": -298.97607421875, - "loss": 0.615, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.29442766308784485, - "rewards/margins": 0.5528799295425415, - "rewards/rejected": -0.8473076820373535, + "logits/chosen": -2.6865973472595215, + "logits/rejected": -2.639554977416992, + "logps/chosen": -264.8190612792969, + "logps/rejected": -241.10523986816406, + "loss": 0.8059, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.04470006376504898, + "rewards/margins": 0.3132995069026947, + "rewards/rejected": -0.2685994505882263, "step": 346 }, { "epoch": 0.73, "learning_rate": 1.0349837717080347e-07, - "logits/chosen": 0.2968660891056061, - "logits/rejected": 1.533956527709961, - "logps/chosen": -308.34906005859375, - "logps/rejected": -352.53582763671875, - "loss": 0.7653, + "logits/chosen": -2.6760525703430176, + "logits/rejected": -2.6360294818878174, + "logps/chosen": -277.3511657714844, + "logps/rejected": -289.6516418457031, + "loss": 0.9878, "rewards/accuracies": 0.734375, - "rewards/chosen": -0.2456754446029663, - "rewards/margins": 0.691182553768158, - "rewards/rejected": -0.9368579983711243, + "rewards/chosen": 0.06430340558290482, + "rewards/margins": 0.3723197877407074, + "rewards/rejected": -0.3080163300037384, "step": 348 }, { "epoch": 0.73, "learning_rate": 1.0054723495346482e-07, - "logits/chosen": 0.6172692775726318, - "logits/rejected": 1.4260393381118774, - "logps/chosen": -255.27175903320312, - "logps/rejected": -352.9705505371094, - "loss": 0.5945, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.2834804654121399, - "rewards/margins": 0.7214651107788086, - "rewards/rejected": -1.0049455165863037, + "logits/chosen": -2.7421512603759766, + "logits/rejected": -2.7008419036865234, + "logps/chosen": -223.1668701171875, + "logps/rejected": -292.0777587890625, + "loss": 0.8066, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.037568360567092896, + "rewards/margins": 0.43358632922172546, + "rewards/rejected": -0.39601796865463257, "step": 350 }, { "epoch": 0.74, "learning_rate": 9.76281510992176e-08, - "logits/chosen": 0.1523429900407791, - "logits/rejected": 1.0697251558303833, - "logps/chosen": -341.0992736816406, - "logps/rejected": -314.7637023925781, - "loss": 0.5933, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.2510679364204407, - "rewards/margins": 0.7420140504837036, - "rewards/rejected": -0.9930820465087891, + "logits/chosen": -2.7077085971832275, + "logits/rejected": -2.693835496902466, + "logps/chosen": -309.0074157714844, + "logps/rejected": -239.2669219970703, + "loss": 0.7919, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06985090672969818, + "rewards/margins": 0.30796530842781067, + "rewards/rejected": -0.2381144016981125, "step": 352 }, { "epoch": 0.74, "learning_rate": 9.474175176609956e-08, - "logits/chosen": -0.027563797309994698, - "logits/rejected": 0.730286717414856, - "logps/chosen": -306.8202209472656, - "logps/rejected": -328.6500244140625, - "loss": 0.6264, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.21240682899951935, - "rewards/margins": 0.526252269744873, - "rewards/rejected": -0.7386590838432312, + "logits/chosen": -2.7741637229919434, + "logits/rejected": -2.757633924484253, + "logps/chosen": -273.8724060058594, + "logps/rejected": -275.011474609375, + "loss": 0.8173, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11707128584384918, + "rewards/margins": 0.3193451762199402, + "rewards/rejected": -0.202273890376091, "step": 354 }, { "epoch": 0.75, "learning_rate": 9.18886561011557e-08, - "logits/chosen": 0.010053783655166626, - "logits/rejected": 0.9134060144424438, - "logps/chosen": -343.5555419921875, - "logps/rejected": -344.8569030761719, - "loss": 0.5614, + "logits/chosen": -2.749138116836548, + "logits/rejected": -2.7531909942626953, + "logps/chosen": -313.9000549316406, + "logps/rejected": -275.1875305175781, + "loss": 0.7741, "rewards/accuracies": 0.78125, - "rewards/chosen": -0.1430027186870575, - "rewards/margins": 0.8304474949836731, - "rewards/rejected": -0.9734501838684082, + "rewards/chosen": 0.15355215966701508, + "rewards/margins": 0.4303089678287506, + "rewards/rejected": -0.27675676345825195, "step": 356 }, { "epoch": 0.75, "learning_rate": 8.906947610762825e-08, - "logits/chosen": 0.48395290970802307, - "logits/rejected": 0.8562638759613037, - "logps/chosen": -303.1999816894531, - "logps/rejected": -394.3343505859375, - "loss": 0.6914, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.22828936576843262, - "rewards/margins": 0.8299635648727417, - "rewards/rejected": -1.0582529306411743, + "logits/chosen": -2.780381917953491, + "logits/rejected": -2.7906088829040527, + "logps/chosen": -271.66485595703125, + "logps/rejected": -318.3462219238281, + "loss": 0.9219, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08706194907426834, + "rewards/margins": 0.38543349504470825, + "rewards/rejected": -0.2983715236186981, "step": 358 }, { "epoch": 0.75, "learning_rate": 8.628481651367875e-08, - "logits/chosen": 0.9650541543960571, - "logits/rejected": 1.530713438987732, - "logps/chosen": -317.75604248046875, - "logps/rejected": -357.28875732421875, - "loss": 0.569, + "logits/chosen": -2.730201482772827, + "logits/rejected": -2.7436330318450928, + "logps/chosen": -274.79412841796875, + "logps/rejected": -288.38702392578125, + "loss": 0.7784, "rewards/accuracies": 0.703125, - "rewards/chosen": -0.3810655474662781, - "rewards/margins": 0.5778117775917053, - "rewards/rejected": -0.9588773250579834, + "rewards/chosen": 0.048553235828876495, + "rewards/margins": 0.31841325759887695, + "rewards/rejected": -0.26986002922058105, "step": 360 }, { "epoch": 0.76, "learning_rate": 8.353527464267104e-08, - "logits/chosen": 0.648531436920166, - "logits/rejected": 0.5499491095542908, - "logps/chosen": -314.2027282714844, - "logps/rejected": -314.53021240234375, - "loss": 0.6003, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2263791263103485, - "rewards/margins": 0.5237205624580383, - "rewards/rejected": -0.7500997185707092, + "logits/chosen": -2.7122316360473633, + "logits/rejected": -2.728078603744507, + "logps/chosen": -277.984130859375, + "logps/rejected": -246.37118530273438, + "loss": 0.8088, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.1358068883419037, + "rewards/margins": 0.20431655645370483, + "rewards/rejected": -0.06850966066122055, "step": 362 }, { "epoch": 0.76, "learning_rate": 8.082144028504231e-08, - "logits/chosen": 0.57253497838974, - "logits/rejected": 1.7951874732971191, - "logps/chosen": -329.2058410644531, - "logps/rejected": -354.29132080078125, - "loss": 0.6177, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.18844135105609894, - "rewards/margins": 0.7768537402153015, - "rewards/rejected": -0.9652950763702393, + "logits/chosen": -2.7570886611938477, + "logits/rejected": -2.7342653274536133, + "logps/chosen": -293.0337829589844, + "logps/rejected": -281.64093017578125, + "loss": 0.8193, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.17327940464019775, + "rewards/margins": 0.4120709300041199, + "rewards/rejected": -0.2387915551662445, "step": 364 }, { "epoch": 0.77, "learning_rate": 7.814389557179016e-08, - "logits/chosen": 0.21237725019454956, - "logits/rejected": 1.0711424350738525, - "logps/chosen": -355.3842468261719, - "logps/rejected": -369.0294494628906, - "loss": 0.6145, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.2611796259880066, - "rewards/margins": 0.7826185822486877, - "rewards/rejected": -1.0437982082366943, + "logits/chosen": -2.8120195865631104, + "logits/rejected": -2.7900400161743164, + "logps/chosen": -321.8678283691406, + "logps/rejected": -288.43896484375, + "loss": 0.8336, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.0739845484495163, + "rewards/margins": 0.311877965927124, + "rewards/rejected": -0.23789341747760773, "step": 366 }, { "epoch": 0.77, "learning_rate": 7.550321484960251e-08, - "logits/chosen": 0.7930352687835693, - "logits/rejected": 1.6684021949768066, - "logps/chosen": -339.7545471191406, - "logps/rejected": -346.7519226074219, - "loss": 0.6387, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.27698060870170593, - "rewards/margins": 0.8290520906448364, - "rewards/rejected": -1.1060327291488647, + "logits/chosen": -2.6915907859802246, + "logits/rejected": -2.7190332412719727, + "logps/chosen": -301.5814208984375, + "logps/rejected": -273.3837890625, + "loss": 0.8438, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.10475097596645355, + "rewards/margins": 0.4771023392677307, + "rewards/rejected": -0.37235134840011597, "step": 368 }, { "epoch": 0.77, "learning_rate": 7.289996455765748e-08, - "logits/chosen": 0.9145855903625488, - "logits/rejected": 1.413263201713562, - "logps/chosen": -331.789794921875, - "logps/rejected": -378.784912109375, - "loss": 0.7024, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.29554906487464905, - "rewards/margins": 0.6969149708747864, - "rewards/rejected": -0.9924640655517578, + "logits/chosen": -2.700855016708374, + "logits/rejected": -2.761075019836426, + "logps/chosen": -289.8067321777344, + "logps/rejected": -302.69891357421875, + "loss": 0.9268, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1242816224694252, + "rewards/margins": 0.35588526725769043, + "rewards/rejected": -0.23160363733768463, "step": 370 }, { "epoch": 0.78, "learning_rate": 7.033470310611945e-08, - "logits/chosen": 0.3435116112232208, - "logits/rejected": 0.8178513050079346, - "logps/chosen": -300.52178955078125, - "logps/rejected": -318.1594543457031, - "loss": 0.5883, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.3312419652938843, - "rewards/margins": 0.560550332069397, - "rewards/rejected": -0.8917922973632812, + "logits/chosen": -2.6806559562683105, + "logits/rejected": -2.7227230072021484, + "logps/chosen": -257.09869384765625, + "logps/rejected": -249.40573120117188, + "loss": 0.7621, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.10298891365528107, + "rewards/margins": 0.3072441816329956, + "rewards/rejected": -0.20425525307655334, "step": 372 }, { "epoch": 0.78, "learning_rate": 6.780798075635675e-08, - "logits/chosen": 0.1530955582857132, - "logits/rejected": 1.2445425987243652, - "logps/chosen": -338.9671936035156, - "logps/rejected": -394.33905029296875, - "loss": 0.6703, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.10936611145734787, - "rewards/margins": 0.7992250919342041, - "rewards/rejected": -0.9085911512374878, + "logits/chosen": -2.765962839126587, + "logits/rejected": -2.7616312503814697, + "logps/chosen": -306.30718994140625, + "logps/rejected": -328.2559814453125, + "loss": 0.8588, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.2172340750694275, + "rewards/margins": 0.4649943709373474, + "rewards/rejected": -0.24776031076908112, "step": 374 }, { "epoch": 0.79, "learning_rate": 6.532033950290885e-08, - "logits/chosen": 0.3828258514404297, - "logits/rejected": -0.013105906546115875, - "logps/chosen": -247.27374267578125, - "logps/rejected": -330.0480041503906, - "loss": 0.7178, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.16955241560935974, - "rewards/margins": 0.5796515941619873, - "rewards/rejected": -0.7492039799690247, + "logits/chosen": -2.6854770183563232, + "logits/rejected": -2.7639622688293457, + "logps/chosen": -223.91505432128906, + "logps/rejected": -275.1647644042969, + "loss": 0.9594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06403464078903198, + "rewards/margins": 0.2644060552120209, + "rewards/rejected": -0.2003713995218277, "step": 376 }, { "epoch": 0.79, "learning_rate": 6.28723129572247e-08, - "logits/chosen": -0.008059903979301453, - "logits/rejected": 0.4361591339111328, - "logps/chosen": -293.2403259277344, - "logps/rejected": -323.6603698730469, - "loss": 0.6125, - "rewards/accuracies": 0.65625, - "rewards/chosen": -0.1709737777709961, - "rewards/margins": 0.5192134380340576, - "rewards/rejected": -0.6901871562004089, + "logits/chosen": -2.7621538639068604, + "logits/rejected": -2.7511823177337646, + "logps/chosen": -263.3729553222656, + "logps/rejected": -269.253173828125, + "loss": 0.8127, + "rewards/accuracies": 0.609375, + "rewards/chosen": 0.12769976258277893, + "rewards/margins": 0.2738149166107178, + "rewards/rejected": -0.14611512422561646, "step": 378 }, { "epoch": 0.8, "learning_rate": 6.046442623320145e-08, - "logits/chosen": 0.3876532316207886, - "logits/rejected": 0.8840723037719727, - "logps/chosen": -317.1882019042969, - "logps/rejected": -368.5491943359375, - "loss": 0.5847, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.15435317158699036, - "rewards/margins": 0.7234021425247192, - "rewards/rejected": -0.8777552843093872, + "logits/chosen": -2.7075910568237305, + "logits/rejected": -2.719918727874756, + "logps/chosen": -281.3136291503906, + "logps/rejected": -309.2221984863281, + "loss": 0.801, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.20439280569553375, + "rewards/margins": 0.48887795209884644, + "rewards/rejected": -0.2844851315021515, "step": 380 }, { "epoch": 0.8, "learning_rate": 5.809719583454414e-08, - "logits/chosen": 0.1544257402420044, - "logits/rejected": 0.5524401664733887, - "logps/chosen": -320.5746154785156, - "logps/rejected": -355.8383483886719, - "loss": 0.6153, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.30984005331993103, - "rewards/margins": 0.5343888998031616, - "rewards/rejected": -0.844228982925415, + "logits/chosen": -2.728285312652588, + "logits/rejected": -2.7402377128601074, + "logps/chosen": -285.9422912597656, + "logps/rejected": -295.9526062011719, + "loss": 0.8346, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.03648320585489273, + "rewards/margins": 0.28185516595840454, + "rewards/rejected": -0.2453719675540924, "step": 382 }, { "epoch": 0.8, "learning_rate": 5.57711295439732e-08, - "logits/chosen": -0.23453757166862488, - "logits/rejected": 0.108016237616539, - "logps/chosen": -317.7830810546875, - "logps/rejected": -392.8485107421875, - "loss": 0.7964, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.26463451981544495, - "rewards/margins": 0.3382280468940735, - "rewards/rejected": -0.6028625965118408, + "logits/chosen": -2.829235792160034, + "logits/rejected": -2.816831588745117, + "logps/chosen": -285.594482421875, + "logps/rejected": -342.944091796875, + "loss": 1.0512, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.05725131556391716, + "rewards/margins": 0.1610693335533142, + "rewards/rejected": -0.10381801426410675, "step": 384 }, { "epoch": 0.81, "learning_rate": 5.3486726314303175e-08, - "logits/chosen": -0.37305283546447754, - "logits/rejected": 0.704928994178772, - "logps/chosen": -298.06561279296875, - "logps/rejected": -325.4785461425781, - "loss": 0.6078, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.00762226153165102, - "rewards/margins": 0.6488328576087952, - "rewards/rejected": -0.6564551591873169, + "logits/chosen": -2.760784864425659, + "logits/rejected": -2.7432379722595215, + "logps/chosen": -274.1730041503906, + "logps/rejected": -278.40673828125, + "loss": 0.8335, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.23130366206169128, + "rewards/margins": 0.41704025864601135, + "rewards/rejected": -0.18573659658432007, "step": 386 }, { "epoch": 0.81, "learning_rate": 5.1244476161413806e-08, - "logits/chosen": 0.41815757751464844, - "logits/rejected": 1.4270755052566528, - "logps/chosen": -233.82427978515625, - "logps/rejected": -301.65826416015625, - "loss": 0.6295, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.19570133090019226, - "rewards/margins": 0.6876392960548401, - "rewards/rejected": -0.8833405375480652, + "logits/chosen": -2.650850772857666, + "logits/rejected": -2.661970853805542, + "logps/chosen": -207.67056274414062, + "logps/rejected": -246.8564453125, + "loss": 0.8388, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.06583596765995026, + "rewards/margins": 0.4011579751968384, + "rewards/rejected": -0.3353220224380493, "step": 388 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-08, - "logits/chosen": -0.3861064612865448, - "logits/rejected": 0.04176158830523491, - "logps/chosen": -294.285400390625, - "logps/rejected": -333.8177185058594, - "loss": 0.6621, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.11257711052894592, - "rewards/margins": 0.5992448329925537, - "rewards/rejected": -0.7118219137191772, + "logits/chosen": -2.7880287170410156, + "logits/rejected": -2.8512425422668457, + "logps/chosen": -269.3569030761719, + "logps/rejected": -282.790771484375, + "loss": 0.8904, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.13670799136161804, + "rewards/margins": 0.3382605314254761, + "rewards/rejected": -0.20155256986618042, "step": 390 }, { "epoch": 0.82, "learning_rate": 4.688834983610082e-08, - "logits/chosen": 0.12746843695640564, - "logits/rejected": -0.0012746751308441162, - "logps/chosen": -273.5867919921875, - "logps/rejected": -360.38873291015625, - "loss": 0.6068, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.17792265117168427, - "rewards/margins": 0.6180278658866882, - "rewards/rejected": -0.7959505915641785, + "logits/chosen": -2.73187518119812, + "logits/rejected": -2.7440197467803955, + "logps/chosen": -248.7688751220703, + "logps/rejected": -302.49566650390625, + "loss": 0.8427, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07025647163391113, + "rewards/margins": 0.2872760593891144, + "rewards/rejected": -0.21701961755752563, "step": 392 }, { "epoch": 0.82, "learning_rate": 4.477540807448832e-08, - "logits/chosen": -0.5536242127418518, - "logits/rejected": 0.9800190925598145, - "logps/chosen": -342.70782470703125, - "logps/rejected": -373.4602355957031, - "loss": 0.6496, + "logits/chosen": -2.7831361293792725, + "logits/rejected": -2.736912965774536, + "logps/chosen": -306.8554992675781, + "logps/rejected": -311.9383239746094, + "loss": 0.8699, "rewards/accuracies": 0.734375, - "rewards/chosen": -0.16559717059135437, - "rewards/margins": 0.689116358757019, - "rewards/rejected": -0.8547135591506958, + "rewards/chosen": 0.19292616844177246, + "rewards/margins": 0.43242016434669495, + "rewards/rejected": -0.2394939661026001, "step": 394 }, { "epoch": 0.83, "learning_rate": 4.270648801084295e-08, - "logits/chosen": 0.006685473024845123, - "logits/rejected": 0.7440766096115112, - "logps/chosen": -366.8838195800781, - "logps/rejected": -356.4771728515625, - "loss": 0.628, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.31455904245376587, - "rewards/margins": 0.559075653553009, - "rewards/rejected": -0.8736346364021301, + "logits/chosen": -2.7754673957824707, + "logits/rejected": -2.7401957511901855, + "logps/chosen": -333.12847900390625, + "logps/rejected": -291.1416320800781, + "loss": 0.8529, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.022994879633188248, + "rewards/margins": 0.2432737946510315, + "rewards/rejected": -0.22027893364429474, "step": 396 }, { "epoch": 0.83, "learning_rate": 4.0682033438831584e-08, - "logits/chosen": -0.7373438477516174, - "logits/rejected": 0.6611166000366211, - "logps/chosen": -362.8245849609375, - "logps/rejected": -376.4985046386719, - "loss": 0.603, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.004348123446106911, - "rewards/margins": 0.7439255714416504, - "rewards/rejected": -0.7482736706733704, + "logits/chosen": -2.713646650314331, + "logits/rejected": -2.6474950313568115, + "logps/chosen": -336.6706237792969, + "logps/rejected": -316.4405212402344, + "loss": 0.8742, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.25719165802001953, + "rewards/margins": 0.4048852324485779, + "rewards/rejected": -0.14769358932971954, "step": 398 }, { "epoch": 0.84, "learning_rate": 3.8702478614051345e-08, - "logits/chosen": -0.5010265707969666, - "logits/rejected": 0.7126686573028564, - "logps/chosen": -333.6928405761719, - "logps/rejected": -362.8348388671875, - "loss": 0.5581, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.09581595659255981, - "rewards/margins": 0.7684364914894104, - "rewards/rejected": -0.8642523884773254, + "logits/chosen": -2.645653486251831, + "logits/rejected": -2.635227680206299, + "logps/chosen": -305.938232421875, + "logps/rejected": -297.98956298828125, + "loss": 0.7668, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.1817300170660019, + "rewards/margins": 0.3975300192832947, + "rewards/rejected": -0.2157999873161316, "step": 400 }, { "epoch": 0.84, "learning_rate": 3.676824816087978e-08, - "logits/chosen": 0.4850161671638489, - "logits/rejected": 1.3615456819534302, - "logps/chosen": -282.8792419433594, - "logps/rejected": -272.2681579589844, - "loss": 0.628, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2777165174484253, - "rewards/margins": 0.5231444835662842, - "rewards/rejected": -0.8008609414100647, + "logits/chosen": -2.790530204772949, + "logits/rejected": -2.7565648555755615, + "logps/chosen": -252.03407287597656, + "logps/rejected": -218.005859375, + "loss": 0.7934, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.030734974890947342, + "rewards/margins": 0.2889730930328369, + "rewards/rejected": -0.25823813676834106, "step": 402 }, { "epoch": 0.85, "learning_rate": 3.487975698139084e-08, - "logits/chosen": 0.2012457251548767, - "logits/rejected": 0.999991774559021, - "logps/chosen": -308.87738037109375, - "logps/rejected": -351.0428466796875, - "loss": 0.6757, - "rewards/accuracies": 0.6875, - "rewards/chosen": -0.27788233757019043, - "rewards/margins": 0.4390586316585541, - "rewards/rejected": -0.7169409394264221, + "logits/chosen": -2.7400898933410645, + "logits/rejected": -2.7303409576416016, + "logps/chosen": -269.85601806640625, + "logps/rejected": -294.58831787109375, + "loss": 0.92, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.11233149468898773, + "rewards/margins": 0.2647273540496826, + "rewards/rejected": -0.15239585936069489, "step": 404 }, { "epoch": 0.85, "learning_rate": 3.303741016635614e-08, - "logits/chosen": 0.05412872135639191, - "logits/rejected": 0.9229797124862671, - "logps/chosen": -312.1148681640625, - "logps/rejected": -305.1954650878906, - "loss": 0.7495, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.04841495305299759, - "rewards/margins": 0.6764097809791565, - "rewards/rejected": -0.7248247861862183, + "logits/chosen": -2.691680908203125, + "logits/rejected": -2.6881818771362305, + "logps/chosen": -286.71990966796875, + "logps/rejected": -245.76287841796875, + "loss": 0.9557, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20553439855575562, + "rewards/margins": 0.33603304624557495, + "rewards/rejected": -0.13049863278865814, "step": 406 }, { "epoch": 0.85, "learning_rate": 3.12416029083514e-08, - "logits/chosen": 0.9064224362373352, - "logits/rejected": 0.9631800651550293, - "logps/chosen": -251.423583984375, - "logps/rejected": -331.3157958984375, - "loss": 0.6625, + "logits/chosen": -2.7589359283447266, + "logits/rejected": -2.8019297122955322, + "logps/chosen": -216.32762145996094, + "logps/rejected": -272.4498596191406, + "loss": 0.8453, "rewards/accuracies": 0.765625, - "rewards/chosen": -0.3316621482372284, - "rewards/margins": 0.5902231335639954, - "rewards/rejected": -0.9218851923942566, + "rewards/chosen": 0.01929749734699726, + "rewards/margins": 0.3525233268737793, + "rewards/rejected": -0.333225816488266, "step": 408 }, { "epoch": 0.86, "learning_rate": 2.9492720416985e-08, - "logits/chosen": 0.768115222454071, - "logits/rejected": 1.219833493232727, - "logps/chosen": -290.3539733886719, - "logps/rejected": -339.0583801269531, - "loss": 0.6143, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.2172953188419342, - "rewards/margins": 0.7515923380851746, - "rewards/rejected": -0.9688878059387207, + "logits/chosen": -2.7147293090820312, + "logits/rejected": -2.766231060028076, + "logps/chosen": -259.3979797363281, + "logps/rejected": -272.8369140625, + "loss": 0.8153, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.09226459264755249, + "rewards/margins": 0.3989376425743103, + "rewards/rejected": -0.3066730499267578, "step": 410 }, { "epoch": 0.86, "learning_rate": 2.7791137836269158e-08, - "logits/chosen": -0.08727936446666718, - "logits/rejected": 1.5329773426055908, - "logps/chosen": -255.40463256835938, - "logps/rejected": -344.9212951660156, - "loss": 0.6984, + "logits/chosen": -2.704505443572998, + "logits/rejected": -2.676281690597534, + "logps/chosen": -220.72763061523438, + "logps/rejected": -279.1423645019531, + "loss": 0.9302, "rewards/accuracies": 0.71875, - "rewards/chosen": -0.2787282168865204, - "rewards/margins": 0.8177152276039124, - "rewards/rejected": -1.0964434146881104, + "rewards/chosen": 0.06804195791482925, + "rewards/margins": 0.5066962838172913, + "rewards/rejected": -0.4386543333530426, "step": 412 }, { "epoch": 0.87, "learning_rate": 2.613722016414943e-08, - "logits/chosen": 0.10900969803333282, - "logits/rejected": 1.1554454565048218, - "logps/chosen": -302.16650390625, - "logps/rejected": -325.09332275390625, - "loss": 0.6676, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.15435893833637238, - "rewards/margins": 0.6768224835395813, - "rewards/rejected": -0.8311813473701477, + "logits/chosen": -2.7758872509002686, + "logits/rejected": -2.744346857070923, + "logps/chosen": -274.24212646484375, + "logps/rejected": -269.83892822265625, + "loss": 0.8688, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.12488481402397156, + "rewards/margins": 0.403522253036499, + "rewards/rejected": -0.27863743901252747, "step": 414 }, { "epoch": 0.87, "learning_rate": 2.4531322174210973e-08, - "logits/chosen": 0.2667168378829956, - "logits/rejected": 0.06283846497535706, - "logps/chosen": -270.20196533203125, - "logps/rejected": -340.9342041015625, - "loss": 0.6368, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2986586093902588, - "rewards/margins": 0.41698285937309265, - "rewards/rejected": -0.7156413793563843, + "logits/chosen": -2.7819604873657227, + "logits/rejected": -2.868227481842041, + "logps/chosen": -233.44618225097656, + "logps/rejected": -277.4327087402344, + "loss": 0.8291, + "rewards/accuracies": 0.640625, + "rewards/chosen": 0.06889940798282623, + "rewards/margins": 0.14952558279037476, + "rewards/rejected": -0.08062617480754852, "step": 416 }, { "epoch": 0.87, "learning_rate": 2.297378833957761e-08, - "logits/chosen": 0.11617850512266159, - "logits/rejected": 1.1254775524139404, - "logps/chosen": -359.0841064453125, - "logps/rejected": -370.67474365234375, - "loss": 0.7271, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.12873072922229767, - "rewards/margins": 0.7082916498184204, - "rewards/rejected": -0.8370224237442017, + "logits/chosen": -2.681041955947876, + "logits/rejected": -2.7060351371765137, + "logps/chosen": -327.63848876953125, + "logps/rejected": -309.4879455566406, + "loss": 0.9851, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.18572543561458588, + "rewards/margins": 0.4108802378177643, + "rewards/rejected": -0.2251548022031784, "step": 418 }, { "epoch": 0.88, "learning_rate": 2.1464952759020856e-08, - "logits/chosen": 0.8191357851028442, - "logits/rejected": 0.4740172028541565, - "logps/chosen": -314.5917053222656, - "logps/rejected": -338.27239990234375, - "loss": 0.6364, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.10390536487102509, - "rewards/margins": 0.45885345339775085, - "rewards/rejected": -0.5627588033676147, + "logits/chosen": -2.6037652492523193, + "logits/rejected": -2.6691200733184814, + "logps/chosen": -283.7677001953125, + "logps/rejected": -282.0677490234375, + "loss": 0.8572, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20433439314365387, + "rewards/margins": 0.20504654943943024, + "rewards/rejected": -0.0007121749222278595, "step": 420 }, { "epoch": 0.88, "learning_rate": 2.0005139085293942e-08, - "logits/chosen": -0.050498366355895996, - "logits/rejected": 1.2883732318878174, - "logps/chosen": -322.22113037109375, - "logps/rejected": -332.8543701171875, - "loss": 0.6629, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.1322225034236908, - "rewards/margins": 0.6378248333930969, - "rewards/rejected": -0.7700474262237549, + "logits/chosen": -2.8397231101989746, + "logits/rejected": -2.8188626766204834, + "logps/chosen": -294.79412841796875, + "logps/rejected": -280.51043701171875, + "loss": 0.8636, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.14204713702201843, + "rewards/margins": 0.38865533471107483, + "rewards/rejected": -0.2466081976890564, "step": 422 }, { "epoch": 0.89, "learning_rate": 1.8594660455706763e-08, - "logits/chosen": 0.42545390129089355, - "logits/rejected": 0.9098062515258789, - "logps/chosen": -289.212646484375, - "logps/rejected": -291.0245666503906, - "loss": 0.6363, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.17995241284370422, - "rewards/margins": 0.5931330323219299, - "rewards/rejected": -0.7730855345726013, + "logits/chosen": -2.7326340675354004, + "logits/rejected": -2.806819438934326, + "logps/chosen": -261.22613525390625, + "logps/rejected": -233.63914489746094, + "loss": 0.7965, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.09991291910409927, + "rewards/margins": 0.29914426803588867, + "rewards/rejected": -0.1992313265800476, "step": 424 }, { "epoch": 0.89, "learning_rate": 1.7233819424956247e-08, - "logits/chosen": 0.331084281206131, - "logits/rejected": 0.931097686290741, - "logps/chosen": -289.7149353027344, - "logps/rejected": -335.7164306640625, - "loss": 0.5931, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.14608880877494812, - "rewards/margins": 0.6044370532035828, - "rewards/rejected": -0.7505258321762085, + "logits/chosen": -2.786564826965332, + "logits/rejected": -2.7827227115631104, + "logps/chosen": -264.6268005371094, + "logps/rejected": -277.969482421875, + "loss": 0.8283, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.1047925055027008, + "rewards/margins": 0.27784889936447144, + "rewards/rejected": -0.17305639386177063, "step": 426 }, { "epoch": 0.9, "learning_rate": 1.5922907900227017e-08, - "logits/chosen": -0.002596009522676468, - "logits/rejected": 1.004823088645935, - "logps/chosen": -328.34918212890625, - "logps/rejected": -338.5020751953125, - "loss": 0.6564, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.28392377495765686, - "rewards/margins": 0.8925819993019104, - "rewards/rejected": -1.1765056848526, + "logits/chosen": -2.7327969074249268, + "logits/rejected": -2.766997814178467, + "logps/chosen": -292.2518615722656, + "logps/rejected": -258.28839111328125, + "loss": 0.8505, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.07704988121986389, + "rewards/margins": 0.45141905546188354, + "rewards/rejected": -0.37436920404434204, "step": 428 }, { "epoch": 0.9, "learning_rate": 1.4662207078575684e-08, - "logits/chosen": 0.22775022685527802, - "logits/rejected": 0.9491074681282043, - "logps/chosen": -307.7417907714844, - "logps/rejected": -377.443115234375, - "loss": 0.6774, + "logits/chosen": -2.723634719848633, + "logits/rejected": -2.73431658744812, + "logps/chosen": -277.9277038574219, + "logps/rejected": -315.11688232421875, + "loss": 0.9181, "rewards/accuracies": 0.828125, - "rewards/chosen": -0.16252580285072327, - "rewards/margins": 0.724989116191864, - "rewards/rejected": -0.8875149488449097, + "rewards/chosen": 0.13561531901359558, + "rewards/margins": 0.39986780285835266, + "rewards/rejected": -0.2642524838447571, "step": 430 }, { "epoch": 0.9, "learning_rate": 1.345198738661285e-08, - "logits/chosen": 0.4222959280014038, - "logits/rejected": 1.0754032135009766, - "logps/chosen": -275.5315856933594, - "logps/rejected": -338.4194030761719, - "loss": 0.607, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.19549164175987244, - "rewards/margins": 0.6255277991294861, - "rewards/rejected": -0.8210194110870361, + "logits/chosen": -2.7748377323150635, + "logits/rejected": -2.8037893772125244, + "logps/chosen": -248.05618286132812, + "logps/rejected": -276.68695068359375, + "loss": 0.8116, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.07926252484321594, + "rewards/margins": 0.282957524061203, + "rewards/rejected": -0.20369501411914825, "step": 432 }, { "epoch": 0.91, "learning_rate": 1.2292508422495157e-08, - "logits/chosen": 0.35431140661239624, - "logits/rejected": 0.6342911124229431, - "logps/chosen": -245.69967651367188, - "logps/rejected": -307.9548034667969, - "loss": 0.5931, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.08900564908981323, - "rewards/margins": 0.7763758897781372, - "rewards/rejected": -0.8653814792633057, + "logits/chosen": -2.7421977519989014, + "logits/rejected": -2.783344030380249, + "logps/chosen": -220.06707763671875, + "logps/rejected": -245.82522583007812, + "loss": 0.7779, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.16732043027877808, + "rewards/margins": 0.4114060401916504, + "rewards/rejected": -0.2440856546163559, "step": 434 }, { "epoch": 0.91, "learning_rate": 1.118401890024001e-08, - "logits/chosen": -0.6884260773658752, - "logits/rejected": 0.5942091941833496, - "logps/chosen": -309.41900634765625, - "logps/rejected": -392.107421875, - "loss": 0.8082, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.12636291980743408, - "rewards/margins": 0.7045049667358398, - "rewards/rejected": -0.8308678865432739, + "logits/chosen": -2.818490743637085, + "logits/rejected": -2.7636194229125977, + "logps/chosen": -281.3802795410156, + "logps/rejected": -330.90875244140625, + "loss": 1.0341, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.1540243923664093, + "rewards/margins": 0.372905969619751, + "rewards/rejected": -0.21888157725334167, "step": 436 }, { "epoch": 0.92, "learning_rate": 1.0126756596375685e-08, - "logits/chosen": 0.28693687915802, - "logits/rejected": 0.19982758164405823, - "logps/chosen": -284.8650207519531, - "logps/rejected": -322.2666015625, - "loss": 0.7014, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.29686933755874634, - "rewards/margins": 0.5921163558959961, - "rewards/rejected": -0.8889857530593872, + "logits/chosen": -2.7073822021484375, + "logits/rejected": -2.723830461502075, + "logps/chosen": -244.02633666992188, + "logps/rejected": -255.3850860595703, + "loss": 0.9397, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.11151732504367828, + "rewards/margins": 0.33168792724609375, + "rewards/rejected": -0.22017060220241547, "step": 438 }, { "epoch": 0.92, "learning_rate": 9.12094829893642e-09, - "logits/chosen": -0.22270016372203827, - "logits/rejected": 1.3697696924209595, - "logps/chosen": -282.0764465332031, - "logps/rejected": -308.25970458984375, - "loss": 0.7014, - "rewards/accuracies": 0.671875, - "rewards/chosen": -0.24073350429534912, - "rewards/margins": 0.5527843236923218, - "rewards/rejected": -0.7935178279876709, + "logits/chosen": -2.78114652633667, + "logits/rejected": -2.7229230403900146, + "logps/chosen": -249.22332763671875, + "logps/rejected": -253.51535034179688, + "loss": 0.908, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.08779719471931458, + "rewards/margins": 0.33387118577957153, + "rewards/rejected": -0.24607399106025696, "step": 440 }, { "epoch": 0.93, "learning_rate": 8.166809758815895e-09, - "logits/chosen": -0.42256081104278564, - "logits/rejected": 0.24574501812458038, - "logps/chosen": -287.09100341796875, - "logps/rejected": -344.0694580078125, - "loss": 0.6457, + "logits/chosen": -2.7489495277404785, + "logits/rejected": -2.744652271270752, + "logps/chosen": -252.46728515625, + "logps/rejected": -290.2929382324219, + "loss": 0.849, "rewards/accuracies": 0.6875, - "rewards/chosen": -0.2197842299938202, - "rewards/margins": 0.44125646352767944, - "rewards/rejected": -0.661040723323822, + "rewards/chosen": 0.1264531910419464, + "rewards/margins": 0.24972890317440033, + "rewards/rejected": -0.12327572703361511, "step": 442 }, { "epoch": 0.93, "learning_rate": 7.2645456434869965e-09, - "logits/chosen": -0.25060874223709106, - "logits/rejected": 0.9384374618530273, - "logps/chosen": -278.2327880859375, - "logps/rejected": -328.0905456542969, - "loss": 0.6304, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.1905202716588974, - "rewards/margins": 0.6406797766685486, - "rewards/rejected": -0.8312000036239624, + "logits/chosen": -2.70603084564209, + "logits/rejected": -2.676625967025757, + "logps/chosen": -248.60690307617188, + "logps/rejected": -260.7947998046875, + "loss": 0.8489, + "rewards/accuracies": 0.671875, + "rewards/chosen": 0.10573847591876984, + "rewards/margins": 0.2639809846878052, + "rewards/rejected": -0.15824246406555176, "step": 444 }, { "epoch": 0.93, "learning_rate": 6.414349493100129e-09, - "logits/chosen": 0.5399929881095886, - "logits/rejected": 1.2095367908477783, - "logps/chosen": -292.74786376953125, - "logps/rejected": -348.2053527832031, - "loss": 0.6079, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.0946485847234726, - "rewards/margins": 0.6198921799659729, - "rewards/rejected": -0.7145407795906067, + "logits/chosen": -2.6382882595062256, + "logits/rejected": -2.611945152282715, + "logps/chosen": -267.7255859375, + "logps/rejected": -293.4171142578125, + "loss": 0.7966, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.15557408332824707, + "rewards/margins": 0.32223251461982727, + "rewards/rejected": -0.1666584312915802, "step": 446 }, { "epoch": 0.94, "learning_rate": 5.616403678967624e-09, - "logits/chosen": -0.07524357736110687, - "logits/rejected": 1.0452533960342407, - "logps/chosen": -356.8501281738281, - "logps/rejected": -411.29931640625, - "loss": 0.707, - "rewards/accuracies": 0.609375, - "rewards/chosen": -0.12035229802131653, - "rewards/margins": 0.5983991622924805, - "rewards/rejected": -0.7187514901161194, + "logits/chosen": -2.796721935272217, + "logits/rejected": -2.7096996307373047, + "logps/chosen": -328.8945617675781, + "logps/rejected": -354.02105712890625, + "loss": 0.9644, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.1592036485671997, + "rewards/margins": 0.30517300963401794, + "rewards/rejected": -0.14596937596797943, "step": 448 }, { "epoch": 0.94, "learning_rate": 4.8708793644441086e-09, - "logits/chosen": -0.13995477557182312, - "logits/rejected": 0.6635825634002686, - "logps/chosen": -278.5004577636719, - "logps/rejected": -310.28857421875, - "loss": 0.6305, - "rewards/accuracies": 0.71875, - "rewards/chosen": -0.18752513825893402, - "rewards/margins": 0.5632314085960388, - "rewards/rejected": -0.750756561756134, + "logits/chosen": -2.6981637477874756, + "logits/rejected": -2.680389165878296, + "logps/chosen": -244.60003662109375, + "logps/rejected": -252.7685546875, + "loss": 0.7989, + "rewards/accuracies": 0.734375, + "rewards/chosen": 0.15147946774959564, + "rewards/margins": 0.32703569531440735, + "rewards/rejected": -0.1755562573671341, "step": 450 }, { "epoch": 0.95, "learning_rate": 4.1779364682113794e-09, - "logits/chosen": 0.5096856951713562, - "logits/rejected": 1.3543939590454102, - "logps/chosen": -241.704833984375, - "logps/rejected": -314.39495849609375, - "loss": 0.6309, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.21836420893669128, - "rewards/margins": 0.6935679912567139, - "rewards/rejected": -0.911932110786438, + "logits/chosen": -2.7574119567871094, + "logits/rejected": -2.7250611782073975, + "logps/chosen": -214.03793334960938, + "logps/rejected": -251.9590606689453, + "loss": 0.8664, + "rewards/accuracies": 0.71875, + "rewards/chosen": 0.05830486863851547, + "rewards/margins": 0.3458782136440277, + "rewards/rejected": -0.28757330775260925, "step": 452 }, { "epoch": 0.95, "learning_rate": 3.5377236299748147e-09, - "logits/chosen": 0.5073993802070618, - "logits/rejected": 0.9775093197822571, - "logps/chosen": -270.1071472167969, - "logps/rejected": -317.7763977050781, - "loss": 0.6822, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.27697494626045227, - "rewards/margins": 0.5462560653686523, - "rewards/rejected": -0.8232309818267822, + "logits/chosen": -2.722461700439453, + "logits/rejected": -2.728785514831543, + "logps/chosen": -238.22683715820312, + "logps/rejected": -260.7715148925781, + "loss": 0.8829, + "rewards/accuracies": 0.765625, + "rewards/chosen": 0.04182838648557663, + "rewards/margins": 0.29501089453697205, + "rewards/rejected": -0.2531825304031372, "step": 454 }, { "epoch": 0.95, "learning_rate": 2.9503781785795713e-09, - "logits/chosen": -0.3081734776496887, - "logits/rejected": 0.7983647584915161, - "logps/chosen": -324.9632568359375, - "logps/rejected": -354.402099609375, - "loss": 0.5723, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.0758206844329834, - "rewards/margins": 0.8021882772445679, - "rewards/rejected": -0.878009021282196, + "logits/chosen": -2.8074533939361572, + "logits/rejected": -2.7784385681152344, + "logps/chosen": -296.22344970703125, + "logps/rejected": -288.3941345214844, + "loss": 0.8096, + "rewards/accuracies": 0.828125, + "rewards/chosen": 0.2115776538848877, + "rewards/margins": 0.4295070469379425, + "rewards/rejected": -0.21792933344841003, "step": 456 }, { "epoch": 0.96, "learning_rate": 2.416026102552732e-09, - "logits/chosen": 0.3920813202857971, - "logits/rejected": 1.4421265125274658, - "logps/chosen": -302.9536437988281, - "logps/rejected": -352.2631530761719, - "loss": 0.6248, + "logits/chosen": -2.6836681365966797, + "logits/rejected": -2.679197311401367, + "logps/chosen": -267.4215393066406, + "logps/rejected": -276.3209228515625, + "loss": 0.7877, "rewards/accuracies": 0.765625, - "rewards/chosen": -0.22983548045158386, - "rewards/margins": 0.8996194005012512, - "rewards/rejected": -1.1294549703598022, + "rewards/chosen": 0.12548568844795227, + "rewards/margins": 0.4955182373523712, + "rewards/rejected": -0.37003248929977417, "step": 458 }, { "epoch": 0.96, "learning_rate": 1.9347820230782295e-09, - "logits/chosen": 0.027735590934753418, - "logits/rejected": 0.36365261673927307, - "logps/chosen": -318.6025390625, - "logps/rejected": -359.25799560546875, - "loss": 0.6865, - "rewards/accuracies": 0.703125, - "rewards/chosen": -0.2018052041530609, - "rewards/margins": 0.5466940402984619, - "rewards/rejected": -0.7484991550445557, + "logits/chosen": -2.79423451423645, + "logits/rejected": -2.776221990585327, + "logps/chosen": -285.2467956542969, + "logps/rejected": -300.2720947265625, + "loss": 0.9234, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13175205886363983, + "rewards/margins": 0.2903917729854584, + "rewards/rejected": -0.15863972902297974, "step": 460 }, { "epoch": 0.97, "learning_rate": 1.5067491694100153e-09, - "logits/chosen": -0.1223829835653305, - "logits/rejected": 0.2881450355052948, - "logps/chosen": -289.8230285644531, - "logps/rejected": -360.09906005859375, - "loss": 0.6747, - "rewards/accuracies": 0.84375, - "rewards/chosen": -0.1440747082233429, - "rewards/margins": 0.8571825623512268, - "rewards/rejected": -1.0012574195861816, + "logits/chosen": -2.794790029525757, + "logits/rejected": -2.8092966079711914, + "logps/chosen": -263.51153564453125, + "logps/rejected": -287.7908935546875, + "loss": 0.9312, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.11904040724039078, + "rewards/margins": 0.3972160816192627, + "rewards/rejected": -0.2781756818294525, "step": 462 }, { "epoch": 0.97, "learning_rate": 1.1320193567288527e-09, - "logits/chosen": 0.2805168032646179, - "logits/rejected": 0.7300337553024292, - "logps/chosen": -256.3475341796875, - "logps/rejected": -297.0839538574219, - "loss": 0.564, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.2623736560344696, - "rewards/margins": 0.49820947647094727, - "rewards/rejected": -0.7605831027030945, + "logits/chosen": -2.689584493637085, + "logits/rejected": -2.6893630027770996, + "logps/chosen": -221.8512420654297, + "logps/rejected": -238.0108642578125, + "loss": 0.7463, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.08258925378322601, + "rewards/margins": 0.25244125723838806, + "rewards/rejected": -0.16985198855400085, "step": 464 }, { "epoch": 0.98, "learning_rate": 8.106729664475176e-10, - "logits/chosen": -0.15276291966438293, - "logits/rejected": 0.6538082957267761, - "logps/chosen": -352.9091796875, - "logps/rejected": -393.73797607421875, - "loss": 0.6624, - "rewards/accuracies": 0.8125, - "rewards/chosen": -0.07146578282117844, - "rewards/margins": 0.7627599239349365, - "rewards/rejected": -0.8342257738113403, + "logits/chosen": -2.7731821537017822, + "logits/rejected": -2.767963171005249, + "logps/chosen": -327.0430603027344, + "logps/rejected": -333.2215576171875, + "loss": 0.8902, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.18719592690467834, + "rewards/margins": 0.41625767946243286, + "rewards/rejected": -0.22906175255775452, "step": 466 }, { "epoch": 0.98, "learning_rate": 5.427789289685347e-10, - "logits/chosen": 0.2313707172870636, - "logits/rejected": 1.0570056438446045, - "logps/chosen": -367.5287170410156, - "logps/rejected": -360.8330078125, - "loss": 0.6087, - "rewards/accuracies": 0.75, - "rewards/chosen": -0.035427868366241455, - "rewards/margins": 0.8285888433456421, - "rewards/rejected": -0.8640167713165283, + "logits/chosen": -2.6973464488983154, + "logits/rejected": -2.7609875202178955, + "logps/chosen": -330.8072509765625, + "logps/rejected": -303.5555419921875, + "loss": 0.8152, + "rewards/accuracies": 0.796875, + "rewards/chosen": 0.3317866325378418, + "rewards/margins": 0.6230289936065674, + "rewards/rejected": -0.2912423312664032, "step": 468 }, { "epoch": 0.98, "learning_rate": 3.2839470889836627e-10, - "logits/chosen": -0.49842333793640137, - "logits/rejected": 0.5836061239242554, - "logps/chosen": -309.240234375, - "logps/rejected": -367.9534912109375, - "loss": 0.5799, - "rewards/accuracies": 0.796875, - "rewards/chosen": -0.18272840976715088, - "rewards/margins": 0.7705845236778259, - "rewards/rejected": -0.9533129930496216, + "logits/chosen": -2.8350577354431152, + "logits/rejected": -2.8280062675476074, + "logps/chosen": -278.7225341796875, + "logps/rejected": -298.595947265625, + "loss": 0.7965, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12244854122400284, + "rewards/margins": 0.38218602538108826, + "rewards/rejected": -0.259737491607666, "step": 470 }, { "epoch": 0.99, "learning_rate": 1.6756629272085544e-10, - "logits/chosen": -0.1911655068397522, - "logits/rejected": 0.9592544436454773, - "logps/chosen": -297.9132385253906, - "logps/rejected": -368.43060302734375, - "loss": 0.6029, - "rewards/accuracies": 0.765625, - "rewards/chosen": -0.12253910303115845, - "rewards/margins": 0.7276279926300049, - "rewards/rejected": -0.8501670956611633, + "logits/chosen": -2.7701029777526855, + "logits/rejected": -2.774332284927368, + "logps/chosen": -268.000244140625, + "logps/rejected": -307.9557189941406, + "loss": 0.8169, + "rewards/accuracies": 0.78125, + "rewards/chosen": 0.17659083008766174, + "rewards/margins": 0.4220091700553894, + "rewards/rejected": -0.24541833996772766, "step": 472 }, { "epoch": 0.99, "learning_rate": 6.032817893297793e-11, - "logits/chosen": -0.25915825366973877, - "logits/rejected": 1.385116696357727, - "logps/chosen": -289.7322998046875, - "logps/rejected": -346.7291564941406, - "loss": 0.685, - "rewards/accuracies": 0.78125, - "rewards/chosen": -0.21489854156970978, - "rewards/margins": 0.8257070779800415, - "rewards/rejected": -1.0406055450439453, + "logits/chosen": -2.7753193378448486, + "logits/rejected": -2.7097368240356445, + "logps/chosen": -253.8528594970703, + "logps/rejected": -277.52337646484375, + "loss": 0.9081, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.14389578998088837, + "rewards/margins": 0.4924442172050476, + "rewards/rejected": -0.34854838252067566, "step": 474 }, { "epoch": 1.0, "learning_rate": 6.7033706447061635e-12, - "logits/chosen": 0.4867172837257385, - "logits/rejected": 1.1571818590164185, - "logps/chosen": -337.714599609375, - "logps/rejected": -358.3683166503906, - "loss": 0.6851, - "rewards/accuracies": 0.734375, - "rewards/chosen": -0.18006440997123718, - "rewards/margins": 0.681747317314148, - "rewards/rejected": -0.8618118166923523, + "logits/chosen": -2.740011692047119, + "logits/rejected": -2.744830369949341, + "logps/chosen": -303.83245849609375, + "logps/rejected": -293.7603759765625, + "loss": 0.9008, + "rewards/accuracies": 0.703125, + "rewards/chosen": 0.1587567925453186, + "rewards/margins": 0.3744887411594391, + "rewards/rejected": -0.21573196351528168, "step": 476 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, - "train_loss": 0.6847315603082285, - "train_runtime": 17806.8146, - "train_samples_per_second": 3.433, + "train_loss": 0.8900128602981567, + "train_runtime": 17762.6435, + "train_samples_per_second": 3.442, "train_steps_per_second": 0.027 } ],