diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -655,3537 +655,3537 @@ }, { "epoch": 0.30979827089337175, - "grad_norm": 2.2838289737701416, + "grad_norm": 2.2838587760925293, "learning_rate": 4.95446040808959e-08, - "logits/chosen": -1.4970475435256958, - "logits/rejected": -1.4972736835479736, - "logps/chosen": -46.018821716308594, - "logps/rejected": -46.19584655761719, - "loss": 0.6929, + "logits/chosen": -1.4970625638961792, + "logits/rejected": -1.4972014427185059, + "logps/chosen": -46.028480529785156, + "logps/rejected": -46.18268585205078, + "loss": 0.693, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0007925600511953235, - "rewards/margins": 0.0005588697385974228, - "rewards/rejected": 0.00023369032714981586, + "rewards/chosen": 0.0006959561142139137, + "rewards/margins": 0.0003306520520709455, + "rewards/rejected": 0.0003653041203506291, "step": 430 }, { "epoch": 0.3170028818443804, - "grad_norm": 2.4645979404449463, + "grad_norm": 2.447035312652588, "learning_rate": 4.948292668010676e-08, - "logits/chosen": -1.4980700016021729, - "logits/rejected": -1.4899036884307861, - "logps/chosen": -45.96923828125, - "logps/rejected": -49.418914794921875, - "loss": 0.6929, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.0007791355019435287, - "rewards/margins": 0.00043947863741777837, - "rewards/rejected": 0.0003396569227334112, + "logits/chosen": -1.4979121685028076, + "logits/rejected": -1.4898037910461426, + "logps/chosen": -45.97195053100586, + "logps/rejected": -49.404659271240234, + "loss": 0.693, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0007520461804233491, + "rewards/margins": 0.0002698205644264817, + "rewards/rejected": 0.0004822257033083588, "step": 440 }, { "epoch": 0.3242074927953891, - "grad_norm": 3.1768293380737305, + "grad_norm": 3.175678014755249, "learning_rate": 4.941737694820975e-08, - "logits/chosen": -1.5109517574310303, - "logits/rejected": -1.489824891090393, - "logps/chosen": -53.17173385620117, - "logps/rejected": -51.28825759887695, - "loss": 0.6929, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.0005129527417011559, - "rewards/margins": 0.00046978183672763407, - "rewards/rejected": 4.3170934077352285e-05, + "logits/chosen": -1.5109035968780518, + "logits/rejected": -1.4897724390029907, + "logps/chosen": -53.14085006713867, + "logps/rejected": -51.29607391357422, + "loss": 0.6927, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0008217478170990944, + "rewards/margins": 0.0008567500626668334, + "rewards/rejected": -3.50023437931668e-05, "step": 450 }, { "epoch": 0.3314121037463977, - "grad_norm": 3.7954866886138916, + "grad_norm": 3.8300483226776123, "learning_rate": 4.93479652528488e-08, - "logits/chosen": -1.5158827304840088, - "logits/rejected": -1.4983150959014893, - "logps/chosen": -53.763702392578125, - "logps/rejected": -55.667442321777344, - "loss": 0.6928, + "logits/chosen": -1.5159543752670288, + "logits/rejected": -1.4984338283538818, + "logps/chosen": -53.76072311401367, + "logps/rejected": -55.672523498535156, + "loss": 0.6927, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0007644235738553107, - "rewards/margins": 0.0007272266084328294, - "rewards/rejected": 3.719698725035414e-05, + "rewards/chosen": 0.000794197607319802, + "rewards/margins": 0.0008078647661022842, + "rewards/rejected": -1.3667243365489412e-05, "step": 460 }, { "epoch": 0.33861671469740634, - "grad_norm": 3.7215628623962402, + "grad_norm": 3.7393746376037598, "learning_rate": 4.9274702572493555e-08, - "logits/chosen": -1.5778518915176392, - "logits/rejected": -1.5484634637832642, - "logps/chosen": -52.62053680419922, - "logps/rejected": -53.18864822387695, + "logits/chosen": -1.5780770778656006, + "logits/rejected": -1.54873788356781, + "logps/chosen": -52.60649490356445, + "logps/rejected": -53.17595672607422, "loss": 0.6929, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0006313440389931202, - "rewards/margins": 0.00042634853161871433, - "rewards/rejected": 0.00020499550737440586, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0007717165863141418, + "rewards/margins": 0.00043986932723782957, + "rewards/rejected": 0.00033184728818014264, "step": 470 }, { "epoch": 0.345821325648415, - "grad_norm": 3.663480758666992, + "grad_norm": 3.6585752964019775, "learning_rate": 4.9197600494702955e-08, - "logits/chosen": -1.6467891931533813, - "logits/rejected": -1.6328201293945312, - "logps/chosen": -42.48698425292969, - "logps/rejected": -45.419456481933594, + "logits/chosen": -1.6469532251358032, + "logits/rejected": -1.6328893899917603, + "logps/chosen": -42.46988296508789, + "logps/rejected": -45.399417877197266, "loss": 0.6929, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0006600015331059694, - "rewards/margins": 0.0005476917722262442, - "rewards/rejected": 0.00011230977543164045, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0008309917757287621, + "rewards/margins": 0.0005182669847272336, + "rewards/rejected": 0.0003127247327938676, "step": 480 }, { "epoch": 0.3530259365994236, - "grad_norm": 4.272515773773193, + "grad_norm": 4.246161460876465, "learning_rate": 4.9116671214292526e-08, - "logits/chosen": -1.5244684219360352, - "logits/rejected": -1.512758493423462, - "logps/chosen": -46.40390396118164, - "logps/rejected": -49.49402618408203, - "loss": 0.693, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0008591124787926674, - "rewards/margins": 0.0003861512814182788, - "rewards/rejected": 0.00047296128468587995, + "logits/chosen": -1.5239448547363281, + "logits/rejected": -1.5122658014297485, + "logps/chosen": -46.399169921875, + "logps/rejected": -49.506656646728516, + "loss": 0.6929, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0009064410114660859, + "rewards/margins": 0.0005597518174909055, + "rewards/rejected": 0.00034668916487134993, "step": 490 }, { "epoch": 0.36023054755043227, - "grad_norm": 2.831533670425415, + "grad_norm": 2.821146249771118, "learning_rate": 4.903192753140557e-08, - "logits/chosen": -1.5279796123504639, - "logits/rejected": -1.5046515464782715, - "logps/chosen": -42.62705612182617, - "logps/rejected": -45.91027069091797, + "logits/chosen": -1.5277873277664185, + "logits/rejected": -1.504441499710083, + "logps/chosen": -42.61452865600586, + "logps/rejected": -45.90108108520508, "loss": 0.6927, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.000864860019646585, - "rewards/margins": 0.0008514861692674458, - "rewards/rejected": 1.3373884939937852e-05, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0009901354787871242, + "rewards/margins": 0.0008848762372508645, + "rewards/rejected": 0.00010525914694881067, "step": 500 }, { "epoch": 0.36743515850144093, - "grad_norm": 2.8894739151000977, + "grad_norm": 2.9027976989746094, "learning_rate": 4.894338284948866e-08, - "logits/chosen": -1.6386358737945557, - "logits/rejected": -1.6209615468978882, - "logps/chosen": -45.98933792114258, - "logps/rejected": -47.95410919189453, + "logits/chosen": -1.6388967037200928, + "logits/rejected": -1.6212489604949951, + "logps/chosen": -45.9903564453125, + "logps/rejected": -47.963653564453125, "loss": 0.6929, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0008489834144711494, - "rewards/margins": 0.000439401192124933, - "rewards/rejected": 0.00040958222234621644, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0008388383430428803, + "rewards/margins": 0.0005247647641226649, + "rewards/rejected": 0.00031407366623170674, "step": 510 }, { "epoch": 0.3746397694524496, - "grad_norm": 3.5497148036956787, + "grad_norm": 3.551910400390625, "learning_rate": 4.8851051173171656e-08, - "logits/chosen": -1.5345745086669922, - "logits/rejected": -1.5294183492660522, - "logps/chosen": -53.89246368408203, - "logps/rejected": -56.03412628173828, + "logits/chosen": -1.5345404148101807, + "logits/rejected": -1.5293302536010742, + "logps/chosen": -53.89219284057617, + "logps/rejected": -56.03322219848633, "loss": 0.6929, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.000699670345056802, - "rewards/margins": 0.00042429380118846893, - "rewards/rejected": 0.00027537651476450264, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.000702341552823782, + "rewards/margins": 0.00041793621494434774, + "rewards/rejected": 0.00028440530877560377, "step": 520 }, { "epoch": 0.3818443804034582, - "grad_norm": 3.226824998855591, + "grad_norm": 3.20746111869812, "learning_rate": 4.8754947106052696e-08, - "logits/chosen": -1.4582703113555908, - "logits/rejected": -1.4374693632125854, - "logps/chosen": -46.44105911254883, - "logps/rejected": -47.530235290527344, - "loss": 0.6931, - "rewards/accuracies": 0.46875, - "rewards/chosen": 0.0007159336237236857, - "rewards/margins": 3.299415766377933e-05, - "rewards/rejected": 0.0006829394842498004, + "logits/chosen": -1.4583015441894531, + "logits/rejected": -1.4375460147857666, + "logps/chosen": -46.40522384643555, + "logps/rejected": -47.5439567565918, + "loss": 0.6929, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0010742919985204935, + "rewards/margins": 0.0005286088562570512, + "rewards/rejected": 0.0005456830258481205, "step": 530 }, { "epoch": 0.38904899135446686, - "grad_norm": 3.1804444789886475, + "grad_norm": 3.1715567111968994, "learning_rate": 4.865508584838841e-08, - "logits/chosen": -1.4848048686981201, - "logits/rejected": -1.4638136625289917, - "logps/chosen": -45.8503303527832, - "logps/rejected": -47.795982360839844, + "logits/chosen": -1.4847066402435303, + "logits/rejected": -1.463714361190796, + "logps/chosen": -45.841705322265625, + "logps/rejected": -47.79716491699219, "loss": 0.6927, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.001187361660413444, - "rewards/margins": 0.0008513416978530586, - "rewards/rejected": 0.00033602004987187684, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0012736220378428698, + "rewards/margins": 0.00094941770657897, + "rewards/rejected": 0.0003242043312638998, "step": 540 }, { "epoch": 0.3962536023054755, - "grad_norm": 3.0861072540283203, + "grad_norm": 3.076672315597534, "learning_rate": 4.855148319468979e-08, - "logits/chosen": -1.4294873476028442, - "logits/rejected": -1.421155571937561, - "logps/chosen": -48.331214904785156, - "logps/rejected": -48.36426544189453, + "logits/chosen": -1.4295966625213623, + "logits/rejected": -1.4212697744369507, + "logps/chosen": -48.33202362060547, + "logps/rejected": -48.36079406738281, "loss": 0.6927, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.001134269405156374, - "rewards/margins": 0.000912741175852716, - "rewards/rejected": 0.00022152825840748847, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0011261259205639362, + "rewards/margins": 0.0008698878809809685, + "rewards/rejected": 0.00025623809779062867, "step": 550 }, { "epoch": 0.4034582132564842, - "grad_norm": 3.239006519317627, + "grad_norm": 3.240483522415161, "learning_rate": 4.8444155531224065e-08, - "logits/chosen": -1.5731785297393799, - "logits/rejected": -1.5627200603485107, - "logps/chosen": -48.466590881347656, - "logps/rejected": -49.91735076904297, + "logits/chosen": -1.573085069656372, + "logits/rejected": -1.5627367496490479, + "logps/chosen": -48.46537399291992, + "logps/rejected": -49.903297424316406, "loss": 0.6929, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.000884662673342973, - "rewards/margins": 0.0005383921670727432, - "rewards/rejected": 0.00034627056447789073, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0008968374459072948, + "rewards/margins": 0.00041013729060068727, + "rewards/rejected": 0.0004867001553066075, "step": 560 }, { "epoch": 0.4106628242074928, - "grad_norm": 2.9216575622558594, + "grad_norm": 2.9320995807647705, "learning_rate": 4.833311983342292e-08, - "logits/chosen": -1.5521368980407715, - "logits/rejected": -1.522883653640747, - "logps/chosen": -52.464149475097656, - "logps/rejected": -54.49372482299805, + "logits/chosen": -1.5522406101226807, + "logits/rejected": -1.5230294466018677, + "logps/chosen": -52.45341873168945, + "logps/rejected": -54.48564529418945, "loss": 0.6927, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.001176183926872909, - "rewards/margins": 0.0009485279442742467, - "rewards/rejected": 0.00022765605535823852, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0012835357338190079, + "rewards/margins": 0.0009750700555741787, + "rewards/rejected": 0.00030846576555632055, "step": 570 }, { "epoch": 0.41786743515850144, - "grad_norm": 3.1774847507476807, + "grad_norm": 3.174853563308716, "learning_rate": 4.821839366319768e-08, - "logits/chosen": -1.5763927698135376, - "logits/rejected": -1.5627062320709229, - "logps/chosen": -42.62623977661133, - "logps/rejected": -44.08416748046875, + "logits/chosen": -1.576537847518921, + "logits/rejected": -1.5629103183746338, + "logps/chosen": -42.617637634277344, + "logps/rejected": -44.07463073730469, "loss": 0.6929, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.00070603983476758, - "rewards/margins": 0.0005500231636688113, - "rewards/rejected": 0.00015601668565068394, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0007920503849163651, + "rewards/margins": 0.0005406917771324515, + "rewards/rejected": 0.0002513585495762527, "step": 580 }, { "epoch": 0.4250720461095101, - "grad_norm": 2.739443302154541, + "grad_norm": 2.7447383403778076, "learning_rate": 4.8099995166161536e-08, - "logits/chosen": -1.5268305540084839, - "logits/rejected": -1.5254180431365967, - "logps/chosen": -47.49382019042969, - "logps/rejected": -53.12456512451172, - "loss": 0.6927, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0012865675380453467, - "rewards/margins": 0.0009040467557497323, - "rewards/rejected": 0.00038252072408795357, + "logits/chosen": -1.526511788368225, + "logits/rejected": -1.5251224040985107, + "logps/chosen": -47.496360778808594, + "logps/rejected": -53.13859939575195, + "loss": 0.6926, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0012611480196937919, + "rewards/margins": 0.0010188872693106532, + "rewards/rejected": 0.00024226067762356251, "step": 590 }, { "epoch": 0.4322766570605187, - "grad_norm": 2.7929279804229736, + "grad_norm": 2.7920360565185547, "learning_rate": 4.797794306875963e-08, - "logits/chosen": -1.6296379566192627, - "logits/rejected": -1.619217872619629, - "logps/chosen": -45.383460998535156, - "logps/rejected": -49.08354949951172, + "logits/chosen": -1.6294994354248047, + "logits/rejected": -1.6191202402114868, + "logps/chosen": -45.38005065917969, + "logps/rejected": -49.074134826660156, "loss": 0.6926, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0011386250844225287, - "rewards/margins": 0.0011278244201093912, - "rewards/rejected": 1.0800594282045495e-05, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0011727013625204563, + "rewards/margins": 0.0010677037062123418, + "rewards/rejected": 0.00010499786731088534, "step": 600 }, { "epoch": 0.43948126801152737, - "grad_norm": 3.9306447505950928, + "grad_norm": 3.9166364669799805, "learning_rate": 4.785225667530716e-08, - "logits/chosen": -1.5340135097503662, - "logits/rejected": -1.516903281211853, - "logps/chosen": -50.405311584472656, - "logps/rejected": -51.097049713134766, - "loss": 0.6926, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0011954871006309986, - "rewards/margins": 0.00101277616340667, - "rewards/rejected": 0.00018271090812049806, + "logits/chosen": -1.5335214138031006, + "logits/rejected": -1.5165165662765503, + "logps/chosen": -50.41100311279297, + "logps/rejected": -51.082786560058594, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0011385614052414894, + "rewards/margins": 0.0008131733047775924, + "rewards/rejected": 0.000325388100463897, "step": 610 }, { "epoch": 0.44668587896253603, - "grad_norm": 2.6901683807373047, + "grad_norm": 2.6957144737243652, "learning_rate": 4.772295586493613e-08, - "logits/chosen": -1.5675427913665771, - "logits/rejected": -1.5532591342926025, - "logps/chosen": -42.37627029418945, - "logps/rejected": -44.42975616455078, + "logits/chosen": -1.5677138566970825, + "logits/rejected": -1.5532875061035156, + "logps/chosen": -42.37538146972656, + "logps/rejected": -44.42705154418945, "loss": 0.6929, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0010986726265400648, - "rewards/margins": 0.0004203086718916893, - "rewards/rejected": 0.0006783640128560364, + "rewards/accuracies": 0.53125, + "rewards/chosen": 0.0011075434740632772, + "rewards/margins": 0.0004021703207399696, + "rewards/rejected": 0.0007053731824271381, "step": 620 }, { "epoch": 0.4538904899135447, - "grad_norm": 2.359860897064209, + "grad_norm": 2.359483003616333, "learning_rate": 4.759006108845116e-08, - "logits/chosen": -1.5706075429916382, - "logits/rejected": -1.5634089708328247, - "logps/chosen": -43.01512145996094, - "logps/rejected": -46.5160026550293, - "loss": 0.6927, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.001413361169397831, - "rewards/margins": 0.0008302571368403733, - "rewards/rejected": 0.0005831040907651186, + "logits/chosen": -1.5706963539123535, + "logits/rejected": -1.5633833408355713, + "logps/chosen": -43.02092742919922, + "logps/rejected": -46.545799255371094, + "loss": 0.6926, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0013553181197494268, + "rewards/margins": 0.0010701713617891073, + "rewards/rejected": 0.0002851466997526586, "step": 630 }, { "epoch": 0.4610951008645533, - "grad_norm": 2.925004482269287, + "grad_norm": 2.9229588508605957, "learning_rate": 4.7453593365094926e-08, - "logits/chosen": -1.4282965660095215, - "logits/rejected": -1.423585295677185, - "logps/chosen": -45.34966278076172, - "logps/rejected": -48.02306365966797, + "logits/chosen": -1.4285109043121338, + "logits/rejected": -1.4237844944000244, + "logps/chosen": -45.349342346191406, + "logps/rejected": -48.016780853271484, "loss": 0.6925, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.0014253833796828985, - "rewards/margins": 0.0012716830242425203, - "rewards/rejected": 0.00015370050095953047, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.00142852368298918, + "rewards/margins": 0.0012120843166485429, + "rewards/rejected": 0.00021643943910021335, "step": 640 }, { "epoch": 0.46829971181556196, - "grad_norm": 3.485377788543701, + "grad_norm": 3.4733026027679443, "learning_rate": 4.731357427922361e-08, - "logits/chosen": -1.6126524209976196, - "logits/rejected": -1.583707571029663, - "logps/chosen": -45.416481018066406, - "logps/rejected": -45.403465270996094, - "loss": 0.6926, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0017271407414227724, - "rewards/margins": 0.0011107297614216805, - "rewards/rejected": 0.000616410921793431, + "logits/chosen": -1.6126518249511719, + "logits/rejected": -1.5838146209716797, + "logps/chosen": -45.41937255859375, + "logps/rejected": -45.38434600830078, + "loss": 0.6927, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0016982409870252013, + "rewards/margins": 0.0008906273869797587, + "rewards/rejected": 0.0008076136000454426, "step": 650 }, { "epoch": 0.4755043227665706, - "grad_norm": 3.4235525131225586, + "grad_norm": 3.42234468460083, "learning_rate": 4.71700259768931e-08, - "logits/chosen": -1.54206120967865, - "logits/rejected": -1.539907693862915, - "logps/chosen": -47.119869232177734, - "logps/rejected": -50.016571044921875, - "loss": 0.6925, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0014607865596190095, - "rewards/margins": 0.001290248241275549, - "rewards/rejected": 0.00017053820192813873, + "logits/chosen": -1.5418545007705688, + "logits/rejected": -1.539717435836792, + "logps/chosen": -47.135189056396484, + "logps/rejected": -50.01400375366211, + "loss": 0.6926, + "rewards/accuracies": 0.518750011920929, + "rewards/chosen": 0.0013075959868729115, + "rewards/margins": 0.0011113332584500313, + "rewards/rejected": 0.00019626265566330403, "step": 660 }, { "epoch": 0.4827089337175792, - "grad_norm": 3.424257516860962, + "grad_norm": 3.422145366668701, "learning_rate": 4.7022971162356176e-08, - "logits/chosen": -1.5014058351516724, - "logits/rejected": -1.4740307331085205, - "logps/chosen": -49.815711975097656, - "logps/rejected": -51.14215087890625, - "loss": 0.6929, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0011415343033149838, - "rewards/margins": 0.0005702447961084545, - "rewards/rejected": 0.0005712894489988685, + "logits/chosen": -1.5012540817260742, + "logits/rejected": -1.4739376306533813, + "logps/chosen": -49.80141067504883, + "logps/rejected": -51.135658264160156, + "loss": 0.6928, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0012845945311710238, + "rewards/margins": 0.0006483677425421774, + "rewards/rejected": 0.0006362267886288464, "step": 670 }, { "epoch": 0.4899135446685879, - "grad_norm": 4.627213478088379, + "grad_norm": 4.703243255615234, "learning_rate": 4.6872433094471577e-08, - "logits/chosen": -1.4441626071929932, - "logits/rejected": -1.4307535886764526, - "logps/chosen": -49.690391540527344, - "logps/rejected": -49.590946197509766, - "loss": 0.6925, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0019428128143772483, - "rewards/margins": 0.0013735650572925806, - "rewards/rejected": 0.0005692478152923286, + "logits/chosen": -1.4440491199493408, + "logits/rejected": -1.4305951595306396, + "logps/chosen": -49.694740295410156, + "logps/rejected": -49.61138153076172, + "loss": 0.6924, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0018993699923157692, + "rewards/margins": 0.0015344747807830572, + "rewards/rejected": 0.0003648948040790856, "step": 680 }, { "epoch": 0.49711815561959655, - "grad_norm": 3.3332901000976562, + "grad_norm": 3.327789545059204, "learning_rate": 4.671843558302522e-08, - "logits/chosen": -1.5309851169586182, - "logits/rejected": -1.5203081369400024, - "logps/chosen": -47.79151153564453, - "logps/rejected": -50.61272430419922, + "logits/chosen": -1.5309746265411377, + "logits/rejected": -1.5202220678329468, + "logps/chosen": -47.79454040527344, + "logps/rejected": -50.61468505859375, "loss": 0.6926, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0015973392874002457, - "rewards/margins": 0.001105111907236278, - "rewards/rejected": 0.0004922273219563067, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0015670496504753828, + "rewards/margins": 0.0010944500099867582, + "rewards/rejected": 0.00047259964048862457, "step": 690 }, { "epoch": 0.5043227665706052, - "grad_norm": 2.7896227836608887, + "grad_norm": 2.800119400024414, "learning_rate": 4.656100298496439e-08, - "logits/chosen": -1.5440318584442139, - "logits/rejected": -1.5306475162506104, - "logps/chosen": -46.43749237060547, - "logps/rejected": -49.2497444152832, - "loss": 0.6926, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.001613436033949256, - "rewards/margins": 0.0011261611944064498, - "rewards/rejected": 0.0004872747231274843, + "logits/chosen": -1.5437812805175781, + "logits/rejected": -1.5303466320037842, + "logps/chosen": -46.43258285522461, + "logps/rejected": -49.254676818847656, + "loss": 0.6925, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0016625222051516175, + "rewards/margins": 0.0012245414545759559, + "rewards/rejected": 0.0004379806632641703, "step": 700 }, { "epoch": 0.5115273775216138, - "grad_norm": 2.8206114768981934, + "grad_norm": 2.8147964477539062, "learning_rate": 4.640016020054527e-08, - "logits/chosen": -1.5245901346206665, - "logits/rejected": -1.50834059715271, - "logps/chosen": -41.334625244140625, - "logps/rejected": -43.386146545410156, - "loss": 0.6927, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0014407543931156397, - "rewards/margins": 0.0009830340277403593, - "rewards/rejected": 0.0004577203653752804, + "logits/chosen": -1.5244632959365845, + "logits/rejected": -1.508224606513977, + "logps/chosen": -41.31523895263672, + "logps/rejected": -43.39879608154297, + "loss": 0.6925, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0016346697229892015, + "rewards/margins": 0.0013034967705607414, + "rewards/rejected": 0.00033117301063612103, "step": 710 }, { "epoch": 0.5187319884726225, - "grad_norm": 3.338675022125244, + "grad_norm": 3.334763288497925, "learning_rate": 4.6235932669394676e-08, - "logits/chosen": -1.4957685470581055, - "logits/rejected": -1.4806830883026123, - "logps/chosen": -49.26667785644531, - "logps/rejected": -52.455284118652344, + "logits/chosen": -1.49542236328125, + "logits/rejected": -1.480480432510376, + "logps/chosen": -49.24640655517578, + "logps/rejected": -52.440582275390625, "loss": 0.6925, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0016621215036138892, - "rewards/margins": 0.0012910037767142057, - "rewards/rejected": 0.00037111755227670074, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0018648200202733278, + "rewards/margins": 0.001346664153970778, + "rewards/rejected": 0.000518155749887228, "step": 720 }, { "epoch": 0.5259365994236311, - "grad_norm": 2.878810405731201, + "grad_norm": 2.8706724643707275, "learning_rate": 4.6068346366486325e-08, - "logits/chosen": -1.5098023414611816, - "logits/rejected": -1.4903719425201416, - "logps/chosen": -48.89864730834961, - "logps/rejected": -51.05956268310547, + "logits/chosen": -1.5098488330841064, + "logits/rejected": -1.4903903007507324, + "logps/chosen": -48.90100860595703, + "logps/rejected": -51.054744720458984, "loss": 0.6926, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0018040034919977188, - "rewards/margins": 0.0011706488439813256, - "rewards/rejected": 0.0006333546480163932, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.001780451275408268, + "rewards/margins": 0.0010989690199494362, + "rewards/rejected": 0.0006814823136664927, "step": 730 }, { "epoch": 0.5331412103746398, - "grad_norm": 2.9037601947784424, + "grad_norm": 2.898585557937622, "learning_rate": 4.589742779803259e-08, - "logits/chosen": -1.5034666061401367, - "logits/rejected": -1.4996031522750854, - "logps/chosen": -46.70368576049805, - "logps/rejected": -46.2753791809082, - "loss": 0.6928, - "rewards/accuracies": 0.5062500238418579, - "rewards/chosen": 0.0013754296815022826, - "rewards/margins": 0.0007380875176750124, - "rewards/rejected": 0.000637342338450253, + "logits/chosen": -1.503379464149475, + "logits/rejected": -1.4997670650482178, + "logps/chosen": -46.70905303955078, + "logps/rejected": -46.263973236083984, + "loss": 0.6929, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0013217134401202202, + "rewards/margins": 0.0005703223869204521, + "rewards/rejected": 0.0007513910531997681, "step": 740 }, { "epoch": 0.5403458213256485, - "grad_norm": 2.8892266750335693, + "grad_norm": 2.892930269241333, "learning_rate": 4.5723203997292146e-08, - "logits/chosen": -1.5262401103973389, - "logits/rejected": -1.5095927715301514, - "logps/chosen": -48.874473571777344, - "logps/rejected": -50.08967971801758, + "logits/chosen": -1.5265824794769287, + "logits/rejected": -1.509937047958374, + "logps/chosen": -48.86448669433594, + "logps/rejected": -50.092742919921875, "loss": 0.6923, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.001853222376666963, - "rewards/margins": 0.001641176058910787, - "rewards/rejected": 0.00021204639051575214, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.001953024766407907, + "rewards/margins": 0.0017716301372274756, + "rewards/rejected": 0.00018139451276510954, "step": 750 }, { "epoch": 0.547550432276657, - "grad_norm": 3.330249309539795, + "grad_norm": 3.3281779289245605, "learning_rate": 4.554570252029421e-08, - "logits/chosen": -1.4528664350509644, - "logits/rejected": -1.451348900794983, - "logps/chosen": -45.50419235229492, - "logps/rejected": -48.61125946044922, + "logits/chosen": -1.452885627746582, + "logits/rejected": -1.4512499570846558, + "logps/chosen": -45.503047943115234, + "logps/rejected": -48.603092193603516, "loss": 0.6926, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0016944361850619316, - "rewards/margins": 0.0011575535172596574, - "rewards/rejected": 0.0005368827260099351, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0017059330129995942, + "rewards/margins": 0.0010874137515202165, + "rewards/rejected": 0.0006185191450640559, "step": 760 }, { "epoch": 0.5547550432276657, - "grad_norm": 3.2376348972320557, + "grad_norm": 3.245059013366699, "learning_rate": 4.536495144148021e-08, - "logits/chosen": -1.4735352993011475, - "logits/rejected": -1.467444658279419, - "logps/chosen": -43.1876335144043, - "logps/rejected": -45.92963409423828, - "loss": 0.692, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.002239152090623975, - "rewards/margins": 0.002211696235463023, - "rewards/rejected": 2.7455598683445714e-05, + "logits/chosen": -1.4730937480926514, + "logits/rejected": -1.4670097827911377, + "logps/chosen": -43.183677673339844, + "logps/rejected": -45.912662506103516, + "loss": 0.6921, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.002278746571391821, + "rewards/margins": 0.0020815269090235233, + "rewards/rejected": 0.00019722021534107625, "step": 770 }, { "epoch": 0.5619596541786743, - "grad_norm": 2.817673444747925, + "grad_norm": 2.81795072555542, "learning_rate": 4.518097934926339e-08, - "logits/chosen": -1.498214602470398, - "logits/rejected": -1.4887255430221558, - "logps/chosen": -48.37374496459961, - "logps/rejected": -50.18889617919922, - "loss": 0.6924, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.00200117495842278, - "rewards/margins": 0.0015044829342514277, - "rewards/rejected": 0.0004966917913407087, + "logits/chosen": -1.4980876445770264, + "logits/rejected": -1.4887163639068604, + "logps/chosen": -48.368553161621094, + "logps/rejected": -50.15790557861328, + "loss": 0.6925, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.0020531173795461655, + "rewards/margins": 0.0012465236941352487, + "rewards/rejected": 0.0008065939764492214, "step": 780 }, { "epoch": 0.569164265129683, - "grad_norm": 2.894794464111328, + "grad_norm": 2.900557279586792, "learning_rate": 4.499381534150714e-08, - "logits/chosen": -1.5464510917663574, - "logits/rejected": -1.5364391803741455, - "logps/chosen": -49.842140197753906, - "logps/rejected": -53.442283630371094, - "loss": 0.6922, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.0023777992464601994, - "rewards/margins": 0.0019426383078098297, - "rewards/rejected": 0.0004351611132733524, + "logits/chosen": -1.546007752418518, + "logits/rejected": -1.5360281467437744, + "logps/chosen": -49.854331970214844, + "logps/rejected": -53.4351806640625, + "loss": 0.6923, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0022559103090316057, + "rewards/margins": 0.0017496871296316385, + "rewards/rejected": 0.000506223295815289, "step": 790 }, { "epoch": 0.5763688760806917, - "grad_norm": 3.3290724754333496, + "grad_norm": 3.331489086151123, "learning_rate": 4.48034890209227e-08, - "logits/chosen": -1.5245379209518433, - "logits/rejected": -1.5132036209106445, - "logps/chosen": -46.85020446777344, - "logps/rejected": -47.90043640136719, - "loss": 0.6923, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.0021142957266420126, - "rewards/margins": 0.0016745930770412087, - "rewards/rejected": 0.0004397027660161257, + "logits/chosen": -1.5247681140899658, + "logits/rejected": -1.513277292251587, + "logps/chosen": -46.856727600097656, + "logps/rejected": -47.8914794921875, + "loss": 0.6924, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0020490610040724277, + "rewards/margins": 0.001519771059975028, + "rewards/rejected": 0.0005292895366437733, "step": 800 }, { "epoch": 0.5835734870317003, - "grad_norm": 2.6531589031219482, + "grad_norm": 2.6944401264190674, "learning_rate": 4.4610030490387154e-08, - "logits/chosen": -1.5167392492294312, - "logits/rejected": -1.5201002359390259, - "logps/chosen": -44.721466064453125, - "logps/rejected": -47.87586975097656, + "logits/chosen": -1.5168665647506714, + "logits/rejected": -1.520288348197937, + "logps/chosen": -44.71699142456055, + "logps/rejected": -47.871925354003906, "loss": 0.6925, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0016299128765240312, - "rewards/margins": 0.0012777007650583982, - "rewards/rejected": 0.0003522119950503111, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0016746418550610542, + "rewards/margins": 0.0012830138439312577, + "rewards/rejected": 0.0003916279529221356, "step": 810 }, { "epoch": 0.590778097982709, - "grad_norm": 2.716808319091797, + "grad_norm": 2.7250638008117676, "learning_rate": 4.4413470348182124e-08, - "logits/chosen": -1.5516841411590576, - "logits/rejected": -1.5297892093658447, - "logps/chosen": -46.904747009277344, - "logps/rejected": -47.53886795043945, + "logits/chosen": -1.551769495010376, + "logits/rejected": -1.5299403667449951, + "logps/chosen": -46.902976989746094, + "logps/rejected": -47.55158615112305, "loss": 0.6923, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0022296695969998837, - "rewards/margins": 0.0016077695181593299, - "rewards/rejected": 0.0006219002534635365, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.002247415017336607, + "rewards/margins": 0.0017527195159345865, + "rewards/rejected": 0.0004946957342326641, "step": 820 }, { "epoch": 0.5979827089337176, - "grad_norm": 2.626673936843872, + "grad_norm": 2.6315906047821045, "learning_rate": 4.421383968315427e-08, - "logits/chosen": -1.5095025300979614, - "logits/rejected": -1.502556562423706, - "logps/chosen": -41.78215408325195, - "logps/rejected": -44.652320861816406, + "logits/chosen": -1.5096698999404907, + "logits/rejected": -1.502686619758606, + "logps/chosen": -41.75879669189453, + "logps/rejected": -44.64236068725586, "loss": 0.6922, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.0024126016069203615, - "rewards/margins": 0.0018264114623889327, - "rewards/rejected": 0.0005861902609467506, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0026461954694241285, + "rewards/margins": 0.001960416790097952, + "rewards/rejected": 0.0006857783882878721, "step": 830 }, { "epoch": 0.6051873198847262, - "grad_norm": 2.260462760925293, + "grad_norm": 2.2645647525787354, "learning_rate": 4.4011170069798126e-08, - "logits/chosen": -1.5185630321502686, - "logits/rejected": -1.5112112760543823, - "logps/chosen": -47.178226470947266, - "logps/rejected": -47.4826545715332, - "loss": 0.6922, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.0025229831226170063, - "rewards/margins": 0.0018927056808024645, - "rewards/rejected": 0.0006302774418145418, + "logits/chosen": -1.518868327140808, + "logits/rejected": -1.5114505290985107, + "logps/chosen": -47.18276596069336, + "logps/rejected": -47.49988555908203, + "loss": 0.6921, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.002477550646290183, + "rewards/margins": 0.0020195599645376205, + "rewards/rejected": 0.0004579908272717148, "step": 840 }, { "epoch": 0.6123919308357348, - "grad_norm": 3.8307180404663086, + "grad_norm": 3.829859495162964, "learning_rate": 4.380549356326208e-08, - "logits/chosen": -1.5277329683303833, - "logits/rejected": -1.5167248249053955, - "logps/chosen": -47.15378189086914, - "logps/rejected": -50.47490692138672, - "loss": 0.6919, + "logits/chosen": -1.5276497602462769, + "logits/rejected": -1.516660451889038, + "logps/chosen": -47.155147552490234, + "logps/rejected": -50.468345642089844, + "loss": 0.692, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.002481653820723295, - "rewards/margins": 0.0024097152054309845, - "rewards/rejected": 7.193867349997163e-05, + "rewards/chosen": 0.00246800621971488, + "rewards/margins": 0.0023304035421460867, + "rewards/rejected": 0.00013760270667262375, "step": 850 }, { "epoch": 0.6195965417867435, - "grad_norm": 2.7984201908111572, + "grad_norm": 2.80873441696167, "learning_rate": 4.359684269427848e-08, - "logits/chosen": -1.533808946609497, - "logits/rejected": -1.5186866521835327, - "logps/chosen": -56.82160186767578, - "logps/rejected": -56.765541076660156, + "logits/chosen": -1.5340362787246704, + "logits/rejected": -1.5188828706741333, + "logps/chosen": -56.821044921875, + "logps/rejected": -56.762481689453125, "loss": 0.6921, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.0025614206679165363, - "rewards/margins": 0.00205902848392725, - "rewards/rejected": 0.0005023921839892864, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.00256694620475173, + "rewards/margins": 0.002033988945186138, + "rewards/rejected": 0.0005329570267349482, "step": 860 }, { "epoch": 0.6268011527377522, - "grad_norm": 3.104840040206909, + "grad_norm": 3.1013128757476807, "learning_rate": 4.3385250464018355e-08, - "logits/chosen": -1.5737254619598389, - "logits/rejected": -1.5628758668899536, - "logps/chosen": -47.565834045410156, - "logps/rejected": -50.6081657409668, + "logits/chosen": -1.5736215114593506, + "logits/rejected": -1.5625803470611572, + "logps/chosen": -47.568138122558594, + "logps/rejected": -50.600791931152344, "loss": 0.6921, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.0022534371819347143, - "rewards/margins": 0.0021057892590761185, - "rewards/rejected": 0.00014764793741051108, + "rewards/chosen": 0.0022304158192127943, + "rewards/margins": 0.0020090844482183456, + "rewards/rejected": 0.00022133109450805932, "step": 870 }, { "epoch": 0.6340057636887608, - "grad_norm": 3.066314220428467, + "grad_norm": 3.047192096710205, "learning_rate": 4.3170750338871806e-08, - "logits/chosen": -1.5945173501968384, - "logits/rejected": -1.5865932703018188, - "logps/chosen": -46.46044921875, - "logps/rejected": -47.598365783691406, - "loss": 0.6924, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0018970193341374397, - "rewards/margins": 0.0015174715081229806, - "rewards/rejected": 0.00037954788422212005, + "logits/chosen": -1.594675898551941, + "logits/rejected": -1.5868117809295654, + "logps/chosen": -46.44190979003906, + "logps/rejected": -47.60022735595703, + "loss": 0.6923, + "rewards/accuracies": 0.543749988079071, + "rewards/chosen": 0.0020824489183723927, + "rewards/margins": 0.0017215529223904014, + "rewards/rejected": 0.00036089582135900855, "step": 880 }, { "epoch": 0.6412103746397695, - "grad_norm": 2.9591355323791504, + "grad_norm": 2.9602956771850586, "learning_rate": 4.295337624515485e-08, - "logits/chosen": -1.5930372476577759, - "logits/rejected": -1.5826869010925293, - "logps/chosen": -44.152645111083984, - "logps/rejected": -45.86223602294922, - "loss": 0.6919, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0027129610534757376, - "rewards/margins": 0.002449572551995516, - "rewards/rejected": 0.00026338855968788266, + "logits/chosen": -1.5927708148956299, + "logits/rejected": -1.582419514656067, + "logps/chosen": -44.159637451171875, + "logps/rejected": -45.84920120239258, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.002643002662807703, + "rewards/margins": 0.0022493128199130297, + "rewards/rejected": 0.00039368978468701243, "step": 890 }, { "epoch": 0.6484149855907781, - "grad_norm": 3.188676118850708, + "grad_norm": 3.1930832862854004, "learning_rate": 4.273316256374342e-08, - "logits/chosen": -1.44186270236969, - "logits/rejected": -1.4450018405914307, - "logps/chosen": -52.880889892578125, - "logps/rejected": -56.0131721496582, + "logits/chosen": -1.4421499967575073, + "logits/rejected": -1.4453353881835938, + "logps/chosen": -52.881622314453125, + "logps/rejected": -56.02192306518555, "loss": 0.6924, - "rewards/accuracies": 0.53125, - "rewards/chosen": 0.0019604223780333996, - "rewards/margins": 0.001505883177742362, - "rewards/rejected": 0.00045453928760252893, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0019530628342181444, + "rewards/margins": 0.001586082042194903, + "rewards/rejected": 0.000366980821127072, "step": 900 }, { "epoch": 0.6556195965417867, - "grad_norm": 3.1296677589416504, + "grad_norm": 3.1409542560577393, "learning_rate": 4.2510144124635605e-08, - "logits/chosen": -1.4937114715576172, - "logits/rejected": -1.4861652851104736, - "logps/chosen": -50.3962287902832, - "logps/rejected": -51.939613342285156, - "loss": 0.6924, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.002028070855885744, - "rewards/margins": 0.001589615480042994, - "rewards/rejected": 0.0004384555504657328, + "logits/chosen": -1.4935252666473389, + "logits/rejected": -1.4859027862548828, + "logps/chosen": -50.40468215942383, + "logps/rejected": -51.92905807495117, + "loss": 0.6925, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.001943575800396502, + "rewards/margins": 0.001399615197442472, + "rewards/rejected": 0.000543960661161691, "step": 910 }, { "epoch": 0.6628242074927954, - "grad_norm": 2.5078420639038086, + "grad_norm": 2.506747245788574, "learning_rate": 4.22843562014427e-08, - "logits/chosen": -1.5258592367172241, - "logits/rejected": -1.5139418840408325, - "logps/chosen": -42.02109909057617, - "logps/rejected": -43.982444763183594, + "logits/chosen": -1.5257174968719482, + "logits/rejected": -1.513899803161621, + "logps/chosen": -42.02476501464844, + "logps/rejected": -43.99268341064453, "loss": 0.6918, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.002846998395398259, - "rewards/margins": 0.002694095950573683, - "rewards/rejected": 0.00015290200826711953, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0028103392105549574, + "rewards/margins": 0.002759791212156415, + "rewards/rejected": 5.0548336730571464e-05, "step": 920 }, { "epoch": 0.670028818443804, - "grad_norm": 3.376978635787964, + "grad_norm": 3.2415852546691895, "learning_rate": 4.205583450581023e-08, - "logits/chosen": -1.5917527675628662, - "logits/rejected": -1.5782153606414795, - "logps/chosen": -46.341373443603516, - "logps/rejected": -48.814178466796875, + "logits/chosen": -1.5915582180023193, + "logits/rejected": -1.5780293941497803, + "logps/chosen": -46.33761215209961, + "logps/rejected": -48.80952453613281, "loss": 0.692, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.002764966571703553, - "rewards/margins": 0.0022214148193597794, - "rewards/rejected": 0.0005435518105514348, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.002802562899887562, + "rewards/margins": 0.002212467370554805, + "rewards/rejected": 0.0005900953547097743, "step": 930 }, { "epoch": 0.6772334293948127, - "grad_norm": 2.3079779148101807, + "grad_norm": 2.30546498298645, "learning_rate": 4.1824615181769577e-08, - "logits/chosen": -1.47239351272583, - "logits/rejected": -1.467908501625061, - "logps/chosen": -54.866912841796875, - "logps/rejected": -55.613975524902344, - "loss": 0.6922, - "rewards/accuracies": 0.4937500059604645, - "rewards/chosen": 0.0026164217852056026, - "rewards/margins": 0.0020035277120769024, - "rewards/rejected": 0.000612894247751683, + "logits/chosen": -1.472581148147583, + "logits/rejected": -1.468040943145752, + "logps/chosen": -54.85009002685547, + "logps/rejected": -55.63652420043945, + "loss": 0.692, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.002784591168165207, + "rewards/margins": 0.0023971297778189182, + "rewards/rejected": 0.0003874614485539496, "step": 940 }, { "epoch": 0.6844380403458213, - "grad_norm": 2.9210896492004395, + "grad_norm": 2.9255244731903076, "learning_rate": 4.1590734800021354e-08, - "logits/chosen": -1.4288716316223145, - "logits/rejected": -1.4427287578582764, - "logps/chosen": -46.20374298095703, - "logps/rejected": -50.84699630737305, + "logits/chosen": -1.428789734840393, + "logits/rejected": -1.4426469802856445, + "logps/chosen": -46.2214469909668, + "logps/rejected": -50.858089447021484, "loss": 0.6924, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0032922211103141308, - "rewards/margins": 0.0015979878371581435, - "rewards/rejected": 0.0016942331567406654, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0031152009032666683, + "rewards/margins": 0.0015318902442231774, + "rewards/rejected": 0.001583310542628169, "step": 950 }, { "epoch": 0.69164265129683, - "grad_norm": 2.980557680130005, + "grad_norm": 2.974820137023926, "learning_rate": 4.1354230352151143e-08, - "logits/chosen": -1.564298391342163, - "logits/rejected": -1.5560590028762817, - "logps/chosen": -48.91004943847656, - "logps/rejected": -51.33696746826172, + "logits/chosen": -1.5642731189727783, + "logits/rejected": -1.5561494827270508, + "logps/chosen": -48.91220474243164, + "logps/rejected": -51.33967971801758, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.002130536362528801, - "rewards/margins": 0.0019005474168807268, - "rewards/rejected": 0.000229989382205531, + "rewards/chosen": 0.002109041903167963, + "rewards/margins": 0.0019061642233282328, + "rewards/rejected": 0.00020287782535888255, "step": 960 }, { "epoch": 0.6988472622478387, - "grad_norm": 3.662576198577881, + "grad_norm": 3.665234327316284, "learning_rate": 4.111513924477878e-08, - "logits/chosen": -1.5793085098266602, - "logits/rejected": -1.570892095565796, - "logps/chosen": -44.40299987792969, - "logps/rejected": -47.72153854370117, + "logits/chosen": -1.5790517330169678, + "logits/rejected": -1.5706714391708374, + "logps/chosen": -44.39840316772461, + "logps/rejected": -47.70134735107422, "loss": 0.6917, - "rewards/accuracies": 0.6625000238418579, - "rewards/chosen": 0.003438382176682353, - "rewards/margins": 0.002996251452714205, - "rewards/rejected": 0.0004421306657604873, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.003484348300844431, + "rewards/margins": 0.002840267028659582, + "rewards/rejected": 0.000644081097561866, "step": 970 }, { "epoch": 0.7060518731988472, - "grad_norm": 2.7718052864074707, + "grad_norm": 2.770003080368042, "learning_rate": 4.087349929364192e-08, - "logits/chosen": -1.4303061962127686, - "logits/rejected": -1.4330605268478394, - "logps/chosen": -50.70553207397461, - "logps/rejected": -55.8089599609375, + "logits/chosen": -1.4302839040756226, + "logits/rejected": -1.4329025745391846, + "logps/chosen": -50.67039489746094, + "logps/rejected": -55.78020477294922, "loss": 0.6924, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0023039651568979025, - "rewards/margins": 0.00151240814011544, - "rewards/rejected": 0.0007915569585748017, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.0026553962379693985, + "rewards/margins": 0.0015762934926897287, + "rewards/rejected": 0.001079102512449026, "step": 980 }, { "epoch": 0.7132564841498559, - "grad_norm": 3.8964684009552, + "grad_norm": 3.872164726257324, "learning_rate": 4.062934871761497e-08, - "logits/chosen": -1.538272500038147, - "logits/rejected": -1.5321686267852783, - "logps/chosen": -50.46923065185547, - "logps/rejected": -51.98360061645508, + "logits/chosen": -1.5383307933807373, + "logits/rejected": -1.5321803092956543, + "logps/chosen": -50.46217727661133, + "logps/rejected": -51.976783752441406, "loss": 0.6922, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.002601128537207842, - "rewards/margins": 0.0018721583765000105, - "rewards/rejected": 0.0007289702189154923, + "rewards/accuracies": 0.48750001192092896, + "rewards/chosen": 0.0026717365253716707, + "rewards/margins": 0.001874624053016305, + "rewards/rejected": 0.0007971125887706876, "step": 990 }, { "epoch": 0.7204610951008645, - "grad_norm": 2.786881923675537, + "grad_norm": 2.7889599800109863, "learning_rate": 4.038272613266419e-08, - "logits/chosen": -1.5223407745361328, - "logits/rejected": -1.510818362236023, - "logps/chosen": -47.009864807128906, - "logps/rejected": -49.04174041748047, - "loss": 0.692, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0034912824630737305, - "rewards/margins": 0.002240509260445833, - "rewards/rejected": 0.00125077273696661, + "logits/chosen": -1.5222680568695068, + "logits/rejected": -1.5106613636016846, + "logps/chosen": -46.987491607666016, + "logps/rejected": -49.00891876220703, + "loss": 0.6921, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0037150229327380657, + "rewards/margins": 0.002136030001565814, + "rewards/rejected": 0.0015789925819262862, "step": 1000 }, { "epoch": 0.7276657060518732, - "grad_norm": 3.942155599594116, + "grad_norm": 3.937760353088379, "learning_rate": 4.0133670545740014e-08, - "logits/chosen": -1.5300023555755615, - "logits/rejected": -1.5186035633087158, - "logps/chosen": -43.023597717285156, - "logps/rejected": -44.827213287353516, - "loss": 0.692, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.002812603721395135, - "rewards/margins": 0.002308101858943701, - "rewards/rejected": 0.0005045018042437732, + "logits/chosen": -1.529769778251648, + "logits/rejected": -1.5183862447738647, + "logps/chosen": -42.99585723876953, + "logps/rejected": -44.8293571472168, + "loss": 0.6919, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0030899574048817158, + "rewards/margins": 0.0026069427840411663, + "rewards/rejected": 0.0004830144753213972, "step": 1010 }, { "epoch": 0.7348703170028819, - "grad_norm": 2.838587760925293, + "grad_norm": 2.8411173820495605, "learning_rate": 3.988222134860755e-08, - "logits/chosen": -1.5480142831802368, - "logits/rejected": -1.5353991985321045, - "logps/chosen": -46.54960250854492, - "logps/rejected": -48.426612854003906, - "loss": 0.6923, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.002769311424344778, - "rewards/margins": 0.001765635795891285, - "rewards/rejected": 0.0010036755120381713, + "logits/chosen": -1.547809362411499, + "logits/rejected": -1.5350381135940552, + "logps/chosen": -46.547340393066406, + "logps/rejected": -48.436302185058594, + "loss": 0.6922, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.002791911829262972, + "rewards/margins": 0.0018851269269362092, + "rewards/rejected": 0.0009067848441191018, "step": 1020 }, { "epoch": 0.7420749279538905, - "grad_norm": 2.6795196533203125, + "grad_norm": 2.6739847660064697, "learning_rate": 3.962841831161617e-08, - "logits/chosen": -1.479621171951294, - "logits/rejected": -1.4719586372375488, - "logps/chosen": -43.465667724609375, - "logps/rejected": -46.79264831542969, + "logits/chosen": -1.479472279548645, + "logits/rejected": -1.471895694732666, + "logps/chosen": -43.468360900878906, + "logps/rejected": -46.796627044677734, "loss": 0.6916, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.003748814109712839, - "rewards/margins": 0.003088985104113817, - "rewards/rejected": 0.0006598292966373265, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0037218802608549595, + "rewards/margins": 0.0031018408481031656, + "rewards/rejected": 0.0006200397037900984, "step": 1030 }, { "epoch": 0.7492795389048992, - "grad_norm": 2.4216113090515137, + "grad_norm": 2.4207677841186523, "learning_rate": 3.937230157740931e-08, - "logits/chosen": -1.5232858657836914, - "logits/rejected": -1.50690495967865, - "logps/chosen": -46.25413131713867, - "logps/rejected": -48.194854736328125, - "loss": 0.6921, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.0025470464024692774, - "rewards/margins": 0.0020666818600147963, - "rewards/rejected": 0.00048036445514298975, + "logits/chosen": -1.5236645936965942, + "logits/rejected": -1.5073776245117188, + "logps/chosen": -46.229156494140625, + "logps/rejected": -48.211395263671875, + "loss": 0.6919, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0027968082576990128, + "rewards/margins": 0.002481859177350998, + "rewards/rejected": 0.00031494878930971026, "step": 1040 }, { "epoch": 0.7564841498559077, - "grad_norm": 2.367373466491699, + "grad_norm": 2.3701324462890625, "learning_rate": 3.9113911654575246e-08, - "logits/chosen": -1.432803988456726, - "logits/rejected": -1.419306993484497, - "logps/chosen": -41.23280334472656, - "logps/rejected": -44.33710861206055, + "logits/chosen": -1.4330904483795166, + "logits/rejected": -1.4194681644439697, + "logps/chosen": -41.221595764160156, + "logps/rejected": -44.33475112915039, "loss": 0.6915, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.003690010402351618, - "rewards/margins": 0.0032567516900599003, - "rewards/rejected": 0.0004332589451223612, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.003802055027335882, + "rewards/margins": 0.0033451938070356846, + "rewards/rejected": 0.00045686113298870623, "step": 1050 }, { "epoch": 0.7636887608069164, - "grad_norm": 3.087181806564331, + "grad_norm": 3.088486909866333, "learning_rate": 3.885328941124014e-08, - "logits/chosen": -1.5099000930786133, - "logits/rejected": -1.4841463565826416, - "logps/chosen": -47.18888473510742, - "logps/rejected": -48.68304443359375, - "loss": 0.6916, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0034715167712420225, - "rewards/margins": 0.0031684015411883593, - "rewards/rejected": 0.0003031149972230196, + "logits/chosen": -1.5096882581710815, + "logits/rejected": -1.4839286804199219, + "logps/chosen": -47.197532653808594, + "logps/rejected": -48.702110290527344, + "loss": 0.6915, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.0033850923646241426, + "rewards/margins": 0.0032726675271987915, + "rewards/rejected": 0.00011242493928875774, "step": 1060 }, { "epoch": 0.770893371757925, - "grad_norm": 3.8396615982055664, + "grad_norm": 3.8530242443084717, "learning_rate": 3.8590476068604106e-08, - "logits/chosen": -1.4987900257110596, - "logits/rejected": -1.4933557510375977, - "logps/chosen": -53.66324996948242, - "logps/rejected": -57.7510986328125, - "loss": 0.6914, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.004215725697577, - "rewards/margins": 0.0035787061788141727, - "rewards/rejected": 0.0006370203336700797, + "logits/chosen": -1.498840093612671, + "logits/rejected": -1.4932810068130493, + "logps/chosen": -53.6661491394043, + "logps/rejected": -57.72388458251953, + "loss": 0.6915, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0041867331601679325, + "rewards/margins": 0.003277536015957594, + "rewards/rejected": 0.0009091972606256604, "step": 1070 }, { "epoch": 0.7780979827089337, - "grad_norm": 3.060645818710327, + "grad_norm": 3.0547661781311035, "learning_rate": 3.832551319442151e-08, - "logits/chosen": -1.5067791938781738, - "logits/rejected": -1.498543620109558, - "logps/chosen": -48.05634307861328, - "logps/rejected": -51.09278869628906, - "loss": 0.6913, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.004450359381735325, - "rewards/margins": 0.003629028797149658, - "rewards/rejected": 0.0008213302935473621, + "logits/chosen": -1.507131576538086, + "logits/rejected": -1.4989385604858398, + "logps/chosen": -48.07558822631836, + "logps/rejected": -51.0708122253418, + "loss": 0.6915, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004257823806256056, + "rewards/margins": 0.0032167278695851564, + "rewards/rejected": 0.0010410962859168649, "step": 1080 }, { "epoch": 0.7853025936599424, - "grad_norm": 3.566991090774536, + "grad_norm": 3.5525083541870117, "learning_rate": 3.8058442696426404e-08, - "logits/chosen": -1.5294276475906372, - "logits/rejected": -1.5239439010620117, - "logps/chosen": -52.47978973388672, - "logps/rejected": -56.07960891723633, - "loss": 0.6914, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0028939805924892426, - "rewards/margins": 0.0036067981272935867, - "rewards/rejected": -0.000712817651219666, + "logits/chosen": -1.5296133756637573, + "logits/rejected": -1.5241445302963257, + "logps/chosen": -52.46879196166992, + "logps/rejected": -56.08189010620117, + "loss": 0.6913, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.003003983059898019, + "rewards/margins": 0.0037396536208689213, + "rewards/rejected": -0.0007356697460636497, "step": 1090 }, { "epoch": 0.792507204610951, - "grad_norm": 3.160165786743164, + "grad_norm": 3.173449993133545, "learning_rate": 3.7789306815704216e-08, - "logits/chosen": -1.5380399227142334, - "logits/rejected": -1.5153748989105225, - "logps/chosen": -47.30868148803711, - "logps/rejected": -48.814453125, - "loss": 0.6917, + "logits/chosen": -1.5380277633666992, + "logits/rejected": -1.5155049562454224, + "logps/chosen": -47.31706237792969, + "logps/rejected": -48.80278778076172, + "loss": 0.6918, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.003151174634695053, - "rewards/margins": 0.0028507737442851067, - "rewards/rejected": 0.0003004012396559119, + "rewards/chosen": 0.003067363053560257, + "rewards/margins": 0.0026503056287765503, + "rewards/rejected": 0.000417057191953063, "step": 1100 }, { "epoch": 0.7997118155619597, - "grad_norm": 2.2915351390838623, + "grad_norm": 2.299166679382324, "learning_rate": 3.7518148120010705e-08, - "logits/chosen": -1.5468862056732178, - "logits/rejected": -1.5344375371932983, - "logps/chosen": -46.47789764404297, - "logps/rejected": -48.65725326538086, - "loss": 0.6918, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0027878470718860626, - "rewards/margins": 0.002646528882905841, - "rewards/rejected": 0.00014131757779978216, + "logits/chosen": -1.546809434890747, + "logits/rejected": -1.5342543125152588, + "logps/chosen": -46.491127014160156, + "logps/rejected": -48.6580924987793, + "loss": 0.6919, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.002655484713613987, + "rewards/margins": 0.0025225512217730284, + "rewards/rejected": 0.00013293321535456926, "step": 1110 }, { "epoch": 0.8069164265129684, - "grad_norm": 2.6376588344573975, + "grad_norm": 2.6358752250671387, "learning_rate": 3.7245009497039244e-08, - "logits/chosen": -1.4868565797805786, - "logits/rejected": -1.4767712354660034, - "logps/chosen": -45.49077606201172, - "logps/rejected": -46.50215530395508, + "logits/chosen": -1.4867825508117676, + "logits/rejected": -1.4767963886260986, + "logps/chosen": -45.50969696044922, + "logps/rejected": -46.510292053222656, "loss": 0.6914, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004422894213348627, - "rewards/margins": 0.003566063242033124, - "rewards/rejected": 0.0008568307384848595, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.004233731422573328, + "rewards/margins": 0.0034582801163196564, + "rewards/rejected": 0.0007754509570077062, "step": 1120 }, { "epoch": 0.8141210374639769, - "grad_norm": 2.846630573272705, + "grad_norm": 2.844066619873047, "learning_rate": 3.696993414763753e-08, - "logits/chosen": -1.5217278003692627, - "logits/rejected": -1.5155227184295654, - "logps/chosen": -43.4204216003418, - "logps/rejected": -45.222496032714844, + "logits/chosen": -1.5220118761062622, + "logits/rejected": -1.5158849954605103, + "logps/chosen": -43.408538818359375, + "logps/rejected": -45.21761703491211, "loss": 0.6921, "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0024362634867429733, - "rewards/margins": 0.0021375538781285286, - "rewards/rejected": 0.000298709433991462, + "rewards/chosen": 0.0025550839491188526, + "rewards/margins": 0.0022075737360864878, + "rewards/rejected": 0.0003475099219940603, "step": 1130 }, { "epoch": 0.8213256484149856, - "grad_norm": 2.912266492843628, + "grad_norm": 2.9131383895874023, "learning_rate": 3.66929655789747e-08, - "logits/chosen": -1.5701639652252197, - "logits/rejected": -1.5613019466400146, - "logps/chosen": -46.80807113647461, - "logps/rejected": -49.12160110473633, - "loss": 0.6911, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.00439231563359499, - "rewards/margins": 0.004190822131931782, - "rewards/rejected": 0.00020149415649939328, + "logits/chosen": -1.5703356266021729, + "logits/rejected": -1.5614650249481201, + "logps/chosen": -46.78563690185547, + "logps/rejected": -49.125823974609375, + "loss": 0.6909, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004616620950400829, + "rewards/margins": 0.004457393195480108, + "rewards/rejected": 0.0001592273183632642, "step": 1140 }, { "epoch": 0.8285302593659942, - "grad_norm": 2.6701982021331787, + "grad_norm": 2.677063226699829, "learning_rate": 3.64141475976601e-08, - "logits/chosen": -1.565177321434021, - "logits/rejected": -1.553778886795044, - "logps/chosen": -48.714622497558594, - "logps/rejected": -50.90757751464844, - "loss": 0.6916, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.0032081424724310637, - "rewards/margins": 0.003137503983452916, - "rewards/rejected": 7.063868542900309e-05, + "logits/chosen": -1.5651849508285522, + "logits/rejected": -1.5538979768753052, + "logps/chosen": -48.73290252685547, + "logps/rejected": -50.91090774536133, + "loss": 0.6917, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0030253741424530745, + "rewards/margins": 0.0029880281072109938, + "rewards/rejected": 3.734655911102891e-05, "step": 1150 }, { "epoch": 0.8357348703170029, - "grad_norm": 3.0760040283203125, + "grad_norm": 3.094961166381836, "learning_rate": 3.61335243028146e-08, - "logits/chosen": -1.5410432815551758, - "logits/rejected": -1.5293331146240234, - "logps/chosen": -50.61613845825195, - "logps/rejected": -51.36149978637695, - "loss": 0.6917, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0031296256929636, - "rewards/margins": 0.0029243044555187225, - "rewards/rejected": 0.000205320815439336, + "logits/chosen": -1.5411107540130615, + "logits/rejected": -1.5292822122573853, + "logps/chosen": -50.64885711669922, + "logps/rejected": -51.355140686035156, + "loss": 0.6919, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.002802414819598198, + "rewards/margins": 0.0025334556121379137, + "rewards/rejected": 0.0002689594402909279, "step": 1160 }, { "epoch": 0.8429394812680115, - "grad_norm": 3.5490238666534424, + "grad_norm": 3.541774034500122, "learning_rate": 3.585114007909562e-08, - "logits/chosen": -1.4593126773834229, - "logits/rejected": -1.4339640140533447, - "logps/chosen": -46.77470779418945, - "logps/rejected": -46.867835998535156, - "loss": 0.6916, + "logits/chosen": -1.459811806678772, + "logits/rejected": -1.4343186616897583, + "logps/chosen": -46.77803039550781, + "logps/rejected": -46.88262176513672, + "loss": 0.6915, "rewards/accuracies": 0.5625, - "rewards/chosen": 0.003284228267148137, - "rewards/margins": 0.0031310878694057465, - "rewards/rejected": 0.00015314055781345814, + "rewards/chosen": 0.003251022193580866, + "rewards/margins": 0.003245703410357237, + "rewards/rejected": 5.318806415743893e-06, "step": 1170 }, { "epoch": 0.8501440922190202, - "grad_norm": 3.8909618854522705, + "grad_norm": 3.8750853538513184, "learning_rate": 3.556703958967716e-08, - "logits/chosen": -1.4243078231811523, - "logits/rejected": -1.4103549718856812, - "logps/chosen": -48.663448333740234, - "logps/rejected": -49.696876525878906, - "loss": 0.6916, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.003148261923342943, - "rewards/margins": 0.0031150388531386852, - "rewards/rejected": 3.3223019272554666e-05, + "logits/chosen": -1.4243541955947876, + "logits/rejected": -1.4104390144348145, + "logps/chosen": -48.67729949951172, + "logps/rejected": -49.68004608154297, + "loss": 0.6918, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0030097700655460358, + "rewards/margins": 0.0028082337230443954, + "rewards/rejected": 0.00020153654622845352, "step": 1180 }, { "epoch": 0.8573487031700289, - "grad_norm": 2.9232029914855957, + "grad_norm": 2.9351320266723633, "learning_rate": 3.528126776918559e-08, - "logits/chosen": -1.589402437210083, - "logits/rejected": -1.5662322044372559, - "logps/chosen": -49.0584716796875, - "logps/rejected": -50.383140563964844, - "loss": 0.6915, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0031496137380599976, - "rewards/margins": 0.003298001829534769, - "rewards/rejected": -0.00014838797505944967, + "logits/chosen": -1.5893309116363525, + "logits/rejected": -1.5662416219711304, + "logps/chosen": -49.06155014038086, + "logps/rejected": -50.372955322265625, + "loss": 0.6916, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0031187927816063166, + "rewards/margins": 0.0031653507612645626, + "rewards/rejected": -4.655800148611888e-05, "step": 1190 }, { "epoch": 0.8645533141210374, - "grad_norm": 3.276780843734741, + "grad_norm": 3.2886970043182373, "learning_rate": 3.499386981659262e-08, - "logits/chosen": -1.465123176574707, - "logits/rejected": -1.4452338218688965, - "logps/chosen": -51.65974807739258, - "logps/rejected": -53.6181755065918, + "logits/chosen": -1.4647929668426514, + "logits/rejected": -1.4450578689575195, + "logps/chosen": -51.65665817260742, + "logps/rejected": -53.59879684448242, "loss": 0.6915, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0028546880930662155, - "rewards/margins": 0.0033967546187341213, - "rewards/rejected": -0.0005420667002908885, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.0028855870477855206, + "rewards/margins": 0.003233908908441663, + "rewards/rejected": -0.0003483216860331595, "step": 1200 }, { "epoch": 0.8717579250720461, - "grad_norm": 2.6365585327148438, + "grad_norm": 2.6401572227478027, "learning_rate": 3.47048911880664e-08, - "logits/chosen": -1.446416974067688, - "logits/rejected": -1.4437949657440186, - "logps/chosen": -43.36547088623047, - "logps/rejected": -46.496070861816406, - "loss": 0.6916, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.003637858433648944, - "rewards/margins": 0.003204674692824483, - "rewards/rejected": 0.00043318397365510464, + "logits/chosen": -1.4464818239212036, + "logits/rejected": -1.4437991380691528, + "logps/chosen": -43.36724853515625, + "logps/rejected": -46.509620666503906, + "loss": 0.6915, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003620029194280505, + "rewards/margins": 0.003322354983538389, + "rewards/rejected": 0.00029767383239232004, "step": 1210 }, { "epoch": 0.8789625360230547, - "grad_norm": 3.4537570476531982, + "grad_norm": 3.4509706497192383, "learning_rate": 3.4414377589782e-08, - "logits/chosen": -1.4948800802230835, - "logits/rejected": -1.4891208410263062, - "logps/chosen": -46.580223083496094, - "logps/rejected": -50.13518524169922, - "loss": 0.6911, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.0040110033005476, - "rewards/margins": 0.004163045436143875, - "rewards/rejected": -0.00015204254304990172, + "logits/chosen": -1.495086908340454, + "logits/rejected": -1.489436388015747, + "logps/chosen": -46.590518951416016, + "logps/rejected": -50.121578216552734, + "loss": 0.6912, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.003908050246536732, + "rewards/margins": 0.003924064338207245, + "rewards/rejected": -1.6013882486731745e-05, "step": 1220 }, { "epoch": 0.8861671469740634, - "grad_norm": 2.9817545413970947, + "grad_norm": 2.975177526473999, "learning_rate": 3.412237497069226e-08, - "logits/chosen": -1.446361780166626, - "logits/rejected": -1.4230271577835083, - "logps/chosen": -48.76630401611328, - "logps/rejected": -50.79680252075195, - "loss": 0.6914, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.003571272362023592, - "rewards/margins": 0.0035423249937593937, - "rewards/rejected": 2.8947553801117465e-05, + "logits/chosen": -1.4464085102081299, + "logits/rejected": -1.4230797290802002, + "logps/chosen": -48.742740631103516, + "logps/rejected": -50.80247116088867, + "loss": 0.6912, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.003806891618296504, + "rewards/margins": 0.0038346692454069853, + "rewards/rejected": -2.777800000330899e-05, "step": 1230 }, { "epoch": 0.8933717579250721, - "grad_norm": 3.617110013961792, + "grad_norm": 3.605241298675537, "learning_rate": 3.382892951526036e-08, - "logits/chosen": -1.50287926197052, - "logits/rejected": -1.4904611110687256, - "logps/chosen": -42.8762092590332, - "logps/rejected": -45.95240783691406, + "logits/chosen": -1.5027892589569092, + "logits/rejected": -1.4902957677841187, + "logps/chosen": -42.880409240722656, + "logps/rejected": -45.94416046142578, "loss": 0.691, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0038196600507944822, - "rewards/margins": 0.0043813083320856094, - "rewards/rejected": -0.0005616483394987881, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.003777655540034175, + "rewards/margins": 0.004256793763488531, + "rewards/rejected": -0.0004791382234543562, "step": 1240 }, { "epoch": 0.9005763688760807, - "grad_norm": 2.5366759300231934, + "grad_norm": 2.5285606384277344, "learning_rate": 3.353408763615502e-08, - "logits/chosen": -1.5510237216949463, - "logits/rejected": -1.5464321374893188, - "logps/chosen": -50.646915435791016, - "logps/rejected": -54.0255012512207, - "loss": 0.6916, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0030564251355826855, - "rewards/margins": 0.003073544707149267, - "rewards/rejected": -1.711988079478033e-05, + "logits/chosen": -1.550816535949707, + "logits/rejected": -1.5462706089019775, + "logps/chosen": -50.65374755859375, + "logps/rejected": -54.01652145385742, + "loss": 0.6917, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.002988115418702364, + "rewards/margins": 0.0029154701624065638, + "rewards/rejected": 7.264531450346112e-05, "step": 1250 }, { "epoch": 0.9077809798270894, - "grad_norm": 3.127284049987793, + "grad_norm": 3.12884259223938, "learning_rate": 3.323789596690971e-08, - "logits/chosen": -1.5054200887680054, - "logits/rejected": -1.5216195583343506, - "logps/chosen": -46.50788116455078, - "logps/rejected": -53.43675994873047, - "loss": 0.6918, - "rewards/accuracies": 0.518750011920929, - "rewards/chosen": 0.0026815244928002357, - "rewards/margins": 0.0028147255070507526, - "rewards/rejected": -0.0001332012179773301, + "logits/chosen": -1.5052629709243774, + "logits/rejected": -1.5213134288787842, + "logps/chosen": -46.520626068115234, + "logps/rejected": -53.43013381958008, + "loss": 0.6919, + "rewards/accuracies": 0.5062500238418579, + "rewards/chosen": 0.0025540876667946577, + "rewards/margins": 0.002621028106659651, + "rewards/rejected": -6.69404907966964e-05, "step": 1260 }, { "epoch": 0.9149855907780979, - "grad_norm": 3.190725564956665, + "grad_norm": 3.214010000228882, "learning_rate": 3.294040135454681e-08, - "logits/chosen": -1.4547207355499268, - "logits/rejected": -1.4413490295410156, - "logps/chosen": -45.393150329589844, - "logps/rejected": -47.96538162231445, + "logits/chosen": -1.4547159671783447, + "logits/rejected": -1.441368818283081, + "logps/chosen": -45.38474655151367, + "logps/rejected": -47.95299530029297, "loss": 0.6908, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.004647851921617985, - "rewards/margins": 0.004829541314393282, - "rewards/rejected": -0.00018168911628890783, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004731893539428711, + "rewards/margins": 0.004789763130247593, + "rewards/rejected": -5.786996189272031e-05, "step": 1270 }, { "epoch": 0.9221902017291066, - "grad_norm": 3.4384658336639404, + "grad_norm": 3.44657039642334, "learning_rate": 3.264165085216817e-08, - "logits/chosen": -1.6091865301132202, - "logits/rejected": -1.5993707180023193, - "logps/chosen": -46.1981086730957, - "logps/rejected": -49.691402435302734, - "loss": 0.6906, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.004781835246831179, - "rewards/margins": 0.005211901850998402, - "rewards/rejected": -0.00043006654595956206, + "logits/chosen": -1.6092770099639893, + "logits/rejected": -1.599332332611084, + "logps/chosen": -46.198097229003906, + "logps/rejected": -49.7007942199707, + "loss": 0.6905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004781954921782017, + "rewards/margins": 0.005305943079292774, + "rewards/rejected": -0.0005239887977950275, "step": 1280 }, { "epoch": 0.9293948126801153, - "grad_norm": 2.493252992630005, + "grad_norm": 2.4775938987731934, "learning_rate": 3.2341691711512854e-08, - "logits/chosen": -1.5666477680206299, - "logits/rejected": -1.5637898445129395, - "logps/chosen": -45.591976165771484, - "logps/rejected": -49.11730194091797, - "loss": 0.6911, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0044815982691943645, - "rewards/margins": 0.004124378319829702, - "rewards/rejected": 0.000357220065779984, + "logits/chosen": -1.5666391849517822, + "logits/rejected": -1.5636540651321411, + "logps/chosen": -45.602577209472656, + "logps/rejected": -49.0974006652832, + "loss": 0.6913, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004375613294541836, + "rewards/margins": 0.0038193650543689728, + "rewards/rejected": 0.0005562485312111676, "step": 1290 }, { "epoch": 0.9365994236311239, - "grad_norm": 3.0198280811309814, + "grad_norm": 3.021411180496216, "learning_rate": 3.204057137548371e-08, - "logits/chosen": -1.6098960638046265, - "logits/rejected": -1.6085681915283203, - "logps/chosen": -47.30309295654297, - "logps/rejected": -48.908607482910156, - "loss": 0.6922, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0020401510410010815, - "rewards/margins": 0.001850422821007669, - "rewards/rejected": 0.00018972805992234498, + "logits/chosen": -1.6098123788833618, + "logits/rejected": -1.608615517616272, + "logps/chosen": -47.298789978027344, + "logps/rejected": -48.89512634277344, + "loss": 0.6923, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0020831464789807796, + "rewards/margins": 0.0017586341127753258, + "rewards/rejected": 0.0003245121333748102, "step": 1300 }, { "epoch": 0.9438040345821326, - "grad_norm": 2.6708743572235107, + "grad_norm": 2.660128593444824, "learning_rate": 3.173833747064351e-08, - "logits/chosen": -1.5706069469451904, - "logits/rejected": -1.5725312232971191, - "logps/chosen": -41.88125991821289, - "logps/rejected": -45.090187072753906, + "logits/chosen": -1.5708482265472412, + "logits/rejected": -1.572715401649475, + "logps/chosen": -41.87781524658203, + "logps/rejected": -45.07353591918945, "loss": 0.6913, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0036051557399332523, - "rewards/margins": 0.003785437438637018, - "rewards/rejected": -0.00018028108752332628, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.0036396575160324574, + "rewards/margins": 0.003653382882475853, + "rewards/rejected": -1.372521546727512e-05, "step": 1310 }, { "epoch": 0.9510086455331412, - "grad_norm": 3.0949208736419678, + "grad_norm": 3.0910470485687256, "learning_rate": 3.143503779968213e-08, - "logits/chosen": -1.5067790746688843, - "logits/rejected": -1.4917875528335571, - "logps/chosen": -46.52467727661133, - "logps/rejected": -49.682655334472656, + "logits/chosen": -1.5068211555480957, + "logits/rejected": -1.491811990737915, + "logps/chosen": -46.52091979980469, + "logps/rejected": -49.677032470703125, "loss": 0.6909, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.004719844087958336, - "rewards/margins": 0.004467535298317671, - "rewards/rejected": 0.0002523087023291737, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004757388960570097, + "rewards/margins": 0.004448880907148123, + "rewards/rejected": 0.00030850752955302596, "step": 1320 }, { "epoch": 0.9582132564841499, - "grad_norm": 3.0085012912750244, + "grad_norm": 3.005345106124878, "learning_rate": 3.113072033385589e-08, - "logits/chosen": -1.549525499343872, - "logits/rejected": -1.5283381938934326, - "logps/chosen": -51.49726486206055, - "logps/rejected": -52.88813018798828, + "logits/chosen": -1.5497840642929077, + "logits/rejected": -1.5285706520080566, + "logps/chosen": -51.490211486816406, + "logps/rejected": -52.88207244873047, "loss": 0.6909, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.003991552162915468, - "rewards/margins": 0.004448592662811279, - "rewards/rejected": -0.00045704032527282834, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004062139429152012, + "rewards/margins": 0.004458623938262463, + "rewards/rejected": -0.0003964842180721462, "step": 1330 }, { "epoch": 0.9654178674351584, - "grad_norm": 3.5239176750183105, + "grad_norm": 3.5104808807373047, "learning_rate": 3.082543320540015e-08, - "logits/chosen": -1.534172773361206, - "logits/rejected": -1.531167984008789, - "logps/chosen": -45.726051330566406, - "logps/rejected": -49.008514404296875, + "logits/chosen": -1.5339133739471436, + "logits/rejected": -1.5309906005859375, + "logps/chosen": -45.72144317626953, + "logps/rejected": -49.00925064086914, "loss": 0.6913, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0037339492700994015, - "rewards/margins": 0.0037639313377439976, - "rewards/rejected": -2.9981707484694198e-05, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.003779984312132001, + "rewards/margins": 0.003817369695752859, + "rewards/rejected": -3.738547820830718e-05, "step": 1340 }, { "epoch": 0.9726224783861671, - "grad_norm": 2.747431755065918, + "grad_norm": 2.7417361736297607, "learning_rate": 3.051922469991655e-08, - "logits/chosen": -1.4042211771011353, - "logits/rejected": -1.399253249168396, - "logps/chosen": -52.229103088378906, - "logps/rejected": -53.1360969543457, - "loss": 0.6912, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.004444047808647156, - "rewards/margins": 0.00401811208575964, - "rewards/rejected": 0.0004259358684066683, + "logits/chosen": -1.4038527011871338, + "logits/rejected": -1.3989447355270386, + "logps/chosen": -52.2333869934082, + "logps/rejected": -53.14558792114258, + "loss": 0.6911, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.004401214420795441, + "rewards/margins": 0.00407016184180975, + "rewards/rejected": 0.0003310522879473865, "step": 1350 }, { "epoch": 0.9798270893371758, - "grad_norm": 3.373246192932129, + "grad_norm": 3.360459327697754, "learning_rate": 3.0212143248735886e-08, - "logits/chosen": -1.5010387897491455, - "logits/rejected": -1.4960435628890991, - "logps/chosen": -43.80424118041992, - "logps/rejected": -45.68942642211914, + "logits/chosen": -1.5011059045791626, + "logits/rejected": -1.4959580898284912, + "logps/chosen": -43.794273376464844, + "logps/rejected": -45.68433380126953, "loss": 0.6905, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.005180574022233486, - "rewards/margins": 0.0053636059165000916, - "rewards/rejected": -0.00018303189426660538, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.0052802106365561485, + "rewards/margins": 0.005412275902926922, + "rewards/rejected": -0.00013206522271502763, "step": 1360 }, { "epoch": 0.9870317002881844, - "grad_norm": 2.73590350151062, + "grad_norm": 2.7504827976226807, "learning_rate": 2.9904237421258046e-08, - "logits/chosen": -1.504531979560852, - "logits/rejected": -1.5007575750350952, - "logps/chosen": -43.289154052734375, - "logps/rejected": -47.094947814941406, - "loss": 0.6911, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.003871799912303686, - "rewards/margins": 0.004103526007384062, - "rewards/rejected": -0.00023172618239186704, + "logits/chosen": -1.5047662258148193, + "logits/rejected": -1.5008184909820557, + "logps/chosen": -43.2752799987793, + "logps/rejected": -47.1126823425293, + "loss": 0.691, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004010520875453949, + "rewards/margins": 0.004419704433530569, + "rewards/rejected": -0.000409183616284281, "step": 1370 }, { "epoch": 0.9942363112391931, - "grad_norm": 3.3988845348358154, + "grad_norm": 3.4040732383728027, "learning_rate": 2.9595555917269997e-08, - "logits/chosen": -1.4505603313446045, - "logits/rejected": -1.4407179355621338, - "logps/chosen": -46.89836502075195, - "logps/rejected": -49.03162384033203, - "loss": 0.6919, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0038405396044254303, - "rewards/margins": 0.002565765753388405, - "rewards/rejected": 0.0012747733853757381, + "logits/chosen": -1.450474500656128, + "logits/rejected": -1.4406346082687378, + "logps/chosen": -46.895957946777344, + "logps/rejected": -49.04015350341797, + "loss": 0.6918, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0038645744789391756, + "rewards/margins": 0.0026750778779387474, + "rewards/rejected": 0.0011894966010004282, "step": 1380 }, { "epoch": 1.0014409221902016, - "grad_norm": 3.138370990753174, + "grad_norm": 3.0235538482666016, "learning_rate": 2.928614755924327e-08, - "logits/chosen": -1.56797194480896, - "logits/rejected": -1.5627766847610474, - "logps/chosen": -43.79928970336914, - "logps/rejected": -46.12070846557617, + "logits/chosen": -1.56806218624115, + "logits/rejected": -1.5628299713134766, + "logps/chosen": -43.78453826904297, + "logps/rejected": -46.11414337158203, "loss": 0.6915, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.00478459894657135, - "rewards/margins": 0.003308654297143221, - "rewards/rejected": 0.0014759448822587729, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004932035692036152, + "rewards/margins": 0.0033904393203556538, + "rewards/rejected": 0.00154159648809582, "step": 1390 }, { "epoch": 1.0086455331412103, - "grad_norm": 2.4705088138580322, + "grad_norm": 2.480149745941162, "learning_rate": 2.8976061284611908e-08, - "logits/chosen": -1.4321318864822388, - "logits/rejected": -1.4220823049545288, - "logps/chosen": -47.27979278564453, - "logps/rejected": -49.95292282104492, + "logits/chosen": -1.432166576385498, + "logits/rejected": -1.4222097396850586, + "logps/chosen": -47.26256561279297, + "logps/rejected": -49.93214797973633, "loss": 0.6897, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.00496502872556448, - "rewards/margins": 0.006896062288433313, - "rewards/rejected": -0.0019310333300381899, + "rewards/accuracies": 0.6812499761581421, + "rewards/chosen": 0.005137256812304258, + "rewards/margins": 0.0068604955449700356, + "rewards/rejected": -0.0017232384998351336, "step": 1400 }, { "epoch": 1.015850144092219, - "grad_norm": 2.7129950523376465, + "grad_norm": 2.717216968536377, "learning_rate": 2.8665346138032327e-08, - "logits/chosen": -1.4868696928024292, - "logits/rejected": -1.4936091899871826, - "logps/chosen": -43.82860565185547, - "logps/rejected": -47.73162078857422, + "logits/chosen": -1.4869023561477661, + "logits/rejected": -1.4936742782592773, + "logps/chosen": -43.82477569580078, + "logps/rejected": -47.72769546508789, "loss": 0.6903, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.004813327919691801, - "rewards/margins": 0.005714214872568846, - "rewards/rejected": -0.0009008872439153492, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.004851721692830324, + "rewards/margins": 0.005713304504752159, + "rewards/rejected": -0.000861583452206105, "step": 1410 }, { "epoch": 1.0230547550432276, - "grad_norm": 3.119001865386963, + "grad_norm": 3.103109121322632, "learning_rate": 2.8354051263626227e-08, - "logits/chosen": -1.5246412754058838, - "logits/rejected": -1.5142338275909424, - "logps/chosen": -49.44842529296875, - "logps/rejected": -52.086151123046875, - "loss": 0.6902, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.004633410833775997, - "rewards/margins": 0.0059117949567735195, - "rewards/rejected": -0.001278384355828166, + "logits/chosen": -1.524235486984253, + "logits/rejected": -1.5138403177261353, + "logps/chosen": -49.43414306640625, + "logps/rejected": -52.10200881958008, + "loss": 0.6901, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004776235204190016, + "rewards/margins": 0.006213142536580563, + "rewards/rejected": -0.0014369066338986158, "step": 1420 }, { "epoch": 1.0302593659942363, - "grad_norm": 3.002915859222412, + "grad_norm": 2.993903875350952, "learning_rate": 2.8042225897207648e-08, - "logits/chosen": -1.574763536453247, - "logits/rejected": -1.5653074979782104, - "logps/chosen": -38.265506744384766, - "logps/rejected": -39.41289520263672, - "loss": 0.6909, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0049251071177423, - "rewards/margins": 0.00451985327526927, - "rewards/rejected": 0.00040525413351133466, + "logits/chosen": -1.57471764087677, + "logits/rejected": -1.5653092861175537, + "logps/chosen": -38.26995086669922, + "logps/rejected": -39.39617156982422, + "loss": 0.691, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004880656488239765, + "rewards/margins": 0.0043081482872366905, + "rewards/rejected": 0.0005725078517571092, "step": 1430 }, { "epoch": 1.037463976945245, - "grad_norm": 3.6086034774780273, + "grad_norm": 3.5944411754608154, "learning_rate": 2.7729919358495728e-08, - "logits/chosen": -1.5048730373382568, - "logits/rejected": -1.4918932914733887, - "logps/chosen": -56.471527099609375, - "logps/rejected": -56.62952423095703, - "loss": 0.6909, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0038827345706522465, - "rewards/margins": 0.00448782229796052, - "rewards/rejected": -0.0006050873198546469, + "logits/chosen": -1.5047539472579956, + "logits/rejected": -1.4917666912078857, + "logps/chosen": -56.480064392089844, + "logps/rejected": -56.622596740722656, + "loss": 0.691, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.003797374665737152, + "rewards/margins": 0.004333194810897112, + "rewards/rejected": -0.0005358201451599598, "step": 1440 }, { "epoch": 1.0446685878962536, - "grad_norm": 2.5716030597686768, + "grad_norm": 2.5727920532226562, "learning_rate": 2.741718104331393e-08, - "logits/chosen": -1.609317421913147, - "logits/rejected": -1.6218010187149048, - "logps/chosen": -42.27556228637695, - "logps/rejected": -48.39263153076172, - "loss": 0.69, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.004354503937065601, - "rewards/margins": 0.006454105488955975, - "rewards/rejected": -0.002099601784721017, + "logits/chosen": -1.6095669269561768, + "logits/rejected": -1.6219494342803955, + "logps/chosen": -42.31049728393555, + "logps/rejected": -48.38723373413086, + "loss": 0.6902, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.004005103372037411, + "rewards/margins": 0.006050781812518835, + "rewards/rejected": -0.0020456784404814243, "step": 1450 }, { "epoch": 1.0518731988472623, - "grad_norm": 3.471620798110962, + "grad_norm": 3.475553035736084, "learning_rate": 2.710406041577751e-08, - "logits/chosen": -1.4766095876693726, - "logits/rejected": -1.4579746723175049, - "logps/chosen": -46.40749740600586, - "logps/rejected": -50.96078872680664, - "loss": 0.6893, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.005362418945878744, - "rewards/margins": 0.007757273968309164, - "rewards/rejected": -0.00239485502243042, + "logits/chosen": -1.4763418436050415, + "logits/rejected": -1.4576597213745117, + "logps/chosen": -46.408714294433594, + "logps/rejected": -50.95368957519531, + "loss": 0.6894, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.005350281484425068, + "rewards/margins": 0.007674089167267084, + "rewards/rejected": -0.0023238074500113726, "step": 1460 }, { "epoch": 1.059077809798271, - "grad_norm": 2.8946073055267334, + "grad_norm": 2.9119138717651367, "learning_rate": 2.679060700046994e-08, - "logits/chosen": -1.5664584636688232, - "logits/rejected": -1.5454753637313843, - "logps/chosen": -42.57251739501953, - "logps/rejected": -45.81456756591797, - "loss": 0.6901, + "logits/chosen": -1.5665452480316162, + "logits/rejected": -1.5455653667449951, + "logps/chosen": -42.573341369628906, + "logps/rejected": -45.83182907104492, + "loss": 0.69, "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.00448919553309679, - "rewards/margins": 0.006237962283194065, - "rewards/rejected": -0.0017487674485892057, + "rewards/chosen": 0.004481030162423849, + "rewards/margins": 0.00640241801738739, + "rewards/rejected": -0.0019213876221328974, "step": 1470 }, { "epoch": 1.0662824207492796, - "grad_norm": 2.410533905029297, + "grad_norm": 2.404226064682007, "learning_rate": 2.647687037460996e-08, - "logits/chosen": -1.5056918859481812, - "logits/rejected": -1.486359715461731, - "logps/chosen": -47.32781219482422, - "logps/rejected": -50.14433670043945, - "loss": 0.6906, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.00441827904433012, - "rewards/margins": 0.005131185986101627, - "rewards/rejected": -0.0007129069417715073, + "logits/chosen": -1.5061237812042236, + "logits/rejected": -1.4868037700653076, + "logps/chosen": -47.349422454833984, + "logps/rejected": -50.14873504638672, + "loss": 0.6907, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.004202242009341717, + "rewards/margins": 0.004959183745086193, + "rewards/rejected": -0.0007569411536678672, "step": 1480 }, { "epoch": 1.0734870317002883, - "grad_norm": 3.537882089614868, + "grad_norm": 3.527313232421875, "learning_rate": 2.616290016021016e-08, - "logits/chosen": -1.476985216140747, - "logits/rejected": -1.4656414985656738, - "logps/chosen": -54.070762634277344, - "logps/rejected": -55.63530731201172, + "logits/chosen": -1.4768452644348145, + "logits/rejected": -1.4656083583831787, + "logps/chosen": -54.07750701904297, + "logps/rejected": -55.63515090942383, "loss": 0.6906, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.004396974574774504, - "rewards/margins": 0.0052247559651732445, - "rewards/rejected": -0.0008277808083221316, + "rewards/accuracies": 0.5687500238418579, + "rewards/chosen": 0.004329579882323742, + "rewards/margins": 0.005155793856829405, + "rewards/rejected": -0.0008262138580903411, "step": 1490 }, { "epoch": 1.080691642651297, - "grad_norm": 2.9404425621032715, + "grad_norm": 2.9487035274505615, "learning_rate": 2.584874601622854e-08, - "logits/chosen": -1.546706199645996, - "logits/rejected": -1.5226829051971436, - "logps/chosen": -44.90399932861328, - "logps/rejected": -47.32870101928711, - "loss": 0.6906, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.004803264979273081, - "rewards/margins": 0.0052206553518772125, - "rewards/rejected": -0.0004173902270849794, + "logits/chosen": -1.546918511390686, + "logits/rejected": -1.5229146480560303, + "logps/chosen": -44.89658737182617, + "logps/rejected": -47.34569549560547, + "loss": 0.6904, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.004877328872680664, + "rewards/margins": 0.005464649759232998, + "rewards/rejected": -0.0005873211775906384, "step": 1500 }, { "epoch": 1.0878962536023056, - "grad_norm": 3.3834643363952637, + "grad_norm": 3.3687028884887695, "learning_rate": 2.5534457630714267e-08, - "logits/chosen": -1.5554708242416382, - "logits/rejected": -1.551318883895874, - "logps/chosen": -42.92151641845703, - "logps/rejected": -45.517974853515625, - "loss": 0.69, + "logits/chosen": -1.5555428266525269, + "logits/rejected": -1.5512815713882446, + "logps/chosen": -42.9159049987793, + "logps/rejected": -45.5041389465332, + "loss": 0.6901, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.004870655946433544, - "rewards/margins": 0.006291358731687069, - "rewards/rejected": -0.0014207030180841684, + "rewards/chosen": 0.0049267979338765144, + "rewards/margins": 0.006209177430719137, + "rewards/rejected": -0.0012823803117498755, "step": 1510 }, { "epoch": 1.0951008645533142, - "grad_norm": 3.087625741958618, + "grad_norm": 3.1003010272979736, "learning_rate": 2.5220084712948764e-08, - "logits/chosen": -1.5192267894744873, - "logits/rejected": -1.511206865310669, - "logps/chosen": -53.39455032348633, - "logps/rejected": -52.092124938964844, - "loss": 0.6922, - "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.003288760082796216, - "rewards/margins": 0.0019624028354883194, - "rewards/rejected": 0.0013263572473078966, + "logits/chosen": -1.5193350315093994, + "logits/rejected": -1.5114047527313232, + "logps/chosen": -53.41497039794922, + "logps/rejected": -52.077911376953125, + "loss": 0.6924, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.003084565047174692, + "rewards/margins": 0.001615988090634346, + "rewards/rejected": 0.0014685768401250243, "step": 1520 }, { "epoch": 1.1023054755043227, - "grad_norm": 3.0726020336151123, + "grad_norm": 3.07667875289917, "learning_rate": 2.490567698558343e-08, - "logits/chosen": -1.5649652481079102, - "logits/rejected": -1.551971673965454, - "logps/chosen": -47.31943130493164, - "logps/rejected": -51.5905647277832, - "loss": 0.6904, + "logits/chosen": -1.565250039100647, + "logits/rejected": -1.552339792251587, + "logps/chosen": -47.303611755371094, + "logps/rejected": -51.61354446411133, + "loss": 0.6902, "rewards/accuracies": 0.625, - "rewards/chosen": 0.004729762207716703, - "rewards/margins": 0.005521063692867756, - "rewards/rejected": -0.000791301135905087, + "rewards/chosen": 0.004887987859547138, + "rewards/margins": 0.00590903777629137, + "rewards/rejected": -0.0010210506152361631, "step": 1530 }, { "epoch": 1.1095100864553313, - "grad_norm": 3.0820343494415283, + "grad_norm": 3.0826640129089355, "learning_rate": 2.4591284176775326e-08, - "logits/chosen": -1.5759637355804443, - "logits/rejected": -1.5641340017318726, - "logps/chosen": -45.97845458984375, - "logps/rejected": -49.1536865234375, + "logits/chosen": -1.5758742094039917, + "logits/rejected": -1.5641416311264038, + "logps/chosen": -45.96913146972656, + "logps/rejected": -49.14014434814453, "loss": 0.6907, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.004768095910549164, - "rewards/margins": 0.0049921320751309395, - "rewards/rejected": -0.0002240361354779452, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004861322231590748, + "rewards/margins": 0.004949974361807108, + "rewards/rejected": -8.865230483934283e-05, "step": 1540 }, { "epoch": 1.11671469740634, - "grad_norm": 2.4528286457061768, + "grad_norm": 2.455200433731079, "learning_rate": 2.4276956012321926e-08, - "logits/chosen": -1.5444703102111816, - "logits/rejected": -1.5342377424240112, - "logps/chosen": -46.77397537231445, - "logps/rejected": -48.802772521972656, - "loss": 0.6901, - "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.0045394268818199635, - "rewards/margins": 0.006133201532065868, - "rewards/rejected": -0.0015937744174152613, + "logits/chosen": -1.5445371866226196, + "logits/rejected": -1.534186840057373, + "logps/chosen": -46.76959991455078, + "logps/rejected": -48.75522994995117, + "loss": 0.6903, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.004583186469972134, + "rewards/margins": 0.0057015675120055676, + "rewards/rejected": -0.0011183806927874684, "step": 1550 }, { "epoch": 1.1239193083573487, - "grad_norm": 3.9852285385131836, + "grad_norm": 4.026498317718506, "learning_rate": 2.3962742207796268e-08, - "logits/chosen": -1.5929399728775024, - "logits/rejected": -1.5741552114486694, - "logps/chosen": -47.95702362060547, - "logps/rejected": -51.39387130737305, - "loss": 0.6899, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.005778255872428417, - "rewards/margins": 0.006543435156345367, - "rewards/rejected": -0.0007651798659935594, + "logits/chosen": -1.5931525230407715, + "logits/rejected": -1.5743935108184814, + "logps/chosen": -47.93376541137695, + "logps/rejected": -51.3961067199707, + "loss": 0.6898, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.006010809447616339, + "rewards/margins": 0.006798262242227793, + "rewards/rejected": -0.0007874539005570114, "step": 1560 }, { "epoch": 1.1311239193083573, - "grad_norm": 2.3503243923187256, + "grad_norm": 2.341965913772583, "learning_rate": 2.364869246068368e-08, - "logits/chosen": -1.5574367046356201, - "logits/rejected": -1.5443198680877686, - "logps/chosen": -48.76219940185547, - "logps/rejected": -51.48505401611328, + "logits/chosen": -1.5574229955673218, + "logits/rejected": -1.5441659688949585, + "logps/chosen": -48.76102828979492, + "logps/rejected": -51.49226760864258, "loss": 0.6907, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004482048097997904, - "rewards/margins": 0.004908232484012842, - "rewards/rejected": -0.0004261844733264297, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004493670538067818, + "rewards/margins": 0.004992063157260418, + "rewards/rejected": -0.0004983923281542957, "step": 1570 }, { "epoch": 1.138328530259366, - "grad_norm": 3.942777395248413, + "grad_norm": 3.937645435333252, "learning_rate": 2.3334856442521435e-08, - "logits/chosen": -1.5212857723236084, - "logits/rejected": -1.5089150667190552, - "logps/chosen": -49.635498046875, - "logps/rejected": -52.68400955200195, - "loss": 0.6893, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.004903840832412243, - "rewards/margins": 0.007788983639329672, - "rewards/rejected": -0.0028851437382400036, + "logits/chosen": -1.5212982892990112, + "logits/rejected": -1.5090210437774658, + "logps/chosen": -49.635223388671875, + "logps/rejected": -52.65364456176758, + "loss": 0.6894, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.004906609188765287, + "rewards/margins": 0.007488124072551727, + "rewards/rejected": -0.002581514185294509, "step": 1580 }, { "epoch": 1.1455331412103746, - "grad_norm": 3.9626173973083496, + "grad_norm": 3.9511215686798096, "learning_rate": 2.3021283791042474e-08, - "logits/chosen": -1.5031888484954834, - "logits/rejected": -1.490180492401123, - "logps/chosen": -45.95370864868164, - "logps/rejected": -50.583778381347656, - "loss": 0.6899, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.005041131284087896, - "rewards/margins": 0.006472212728112936, - "rewards/rejected": -0.0014310817932710052, + "logits/chosen": -1.5035020112991333, + "logits/rejected": -1.4904110431671143, + "logps/chosen": -45.936710357666016, + "logps/rejected": -50.556278228759766, + "loss": 0.69, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.005211076699197292, + "rewards/margins": 0.00636711623519659, + "rewards/rejected": -0.0011560400016605854, "step": 1590 }, { "epoch": 1.1527377521613833, - "grad_norm": 3.0199642181396484, + "grad_norm": 3.0166871547698975, "learning_rate": 2.2708024102324454e-08, - "logits/chosen": -1.5501774549484253, - "logits/rejected": -1.5308644771575928, - "logps/chosen": -49.10248565673828, - "logps/rejected": -51.190670013427734, + "logits/chosen": -1.5508172512054443, + "logits/rejected": -1.5315322875976562, + "logps/chosen": -49.09033966064453, + "logps/rejected": -51.193824768066406, "loss": 0.6904, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.00435130437836051, - "rewards/margins": 0.0055001284927129745, - "rewards/rejected": -0.0011488242307677865, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004472781904041767, + "rewards/margins": 0.005653180181980133, + "rewards/rejected": -0.001180398277938366, "step": 1600 }, { "epoch": 1.159942363112392, - "grad_norm": 3.394359827041626, + "grad_norm": 3.386165142059326, "learning_rate": 2.23951269229454e-08, - "logits/chosen": -1.4638254642486572, - "logits/rejected": -1.460058569908142, - "logps/chosen": -46.797508239746094, - "logps/rejected": -49.03670120239258, - "loss": 0.6906, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.004273005295544863, - "rewards/margins": 0.005153841804713011, - "rewards/rejected": -0.0008808368002064526, + "logits/chosen": -1.4639875888824463, + "logits/rejected": -1.4601199626922607, + "logps/chosen": -46.766456604003906, + "logps/rejected": -49.0338134765625, + "loss": 0.6905, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.004583518020808697, + "rewards/margins": 0.00543550169095397, + "rewards/rejected": -0.0008519830880686641, "step": 1610 }, { "epoch": 1.1671469740634006, - "grad_norm": 3.544788360595703, + "grad_norm": 3.5503532886505127, "learning_rate": 2.2082641742147238e-08, - "logits/chosen": -1.5878477096557617, - "logits/rejected": -1.5844405889511108, - "logps/chosen": -49.61848068237305, - "logps/rejected": -53.7255744934082, - "loss": 0.6904, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.005005150567740202, - "rewards/margins": 0.005550033412873745, - "rewards/rejected": -0.0005448829615488648, + "logits/chosen": -1.5878784656524658, + "logits/rejected": -1.584558129310608, + "logps/chosen": -49.621742248535156, + "logps/rejected": -53.74200439453125, + "loss": 0.6903, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.004972456954419613, + "rewards/margins": 0.005681641399860382, + "rewards/rejected": -0.0007091845036484301, "step": 1620 }, { "epoch": 1.1743515850144093, - "grad_norm": 4.231856822967529, + "grad_norm": 4.222747802734375, "learning_rate": 2.177061798400832e-08, - "logits/chosen": -1.4017666578292847, - "logits/rejected": -1.3872358798980713, - "logps/chosen": -51.326171875, - "logps/rejected": -50.59434127807617, + "logits/chosen": -1.4017961025238037, + "logits/rejected": -1.387158751487732, + "logps/chosen": -51.322509765625, + "logps/rejected": -50.58330535888672, "loss": 0.6906, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004057625774294138, - "rewards/margins": 0.005242443643510342, - "rewards/rejected": -0.0011848185677081347, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.004094275180250406, + "rewards/margins": 0.005168738774955273, + "rewards/rejected": -0.0010744637111201882, "step": 1630 }, { "epoch": 1.181556195965418, - "grad_norm": 2.3840255737304688, + "grad_norm": 2.3746259212493896, "learning_rate": 2.145910499962628e-08, - "logits/chosen": -1.504612684249878, - "logits/rejected": -1.494834065437317, - "logps/chosen": -45.80976867675781, - "logps/rejected": -47.15070343017578, - "loss": 0.6901, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.005578846205025911, - "rewards/margins": 0.006252855062484741, - "rewards/rejected": -0.0006740099051967263, + "logits/chosen": -1.5043513774871826, + "logits/rejected": -1.4944355487823486, + "logps/chosen": -45.81401824951172, + "logps/rejected": -47.165225982666016, + "loss": 0.69, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0055363597348332405, + "rewards/margins": 0.00635557109490037, + "rewards/rejected": -0.0008192112436518073, "step": 1640 }, { "epoch": 1.1887608069164266, - "grad_norm": 3.0488662719726562, + "grad_norm": 3.0583455562591553, "learning_rate": 2.1148152059312437e-08, - "logits/chosen": -1.5291849374771118, - "logits/rejected": -1.518646478652954, - "logps/chosen": -40.93765640258789, - "logps/rejected": -41.940635681152344, - "loss": 0.6913, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.0034720220137387514, - "rewards/margins": 0.003840423421934247, - "rewards/rejected": -0.0003684009425342083, + "logits/chosen": -1.5294198989868164, + "logits/rejected": -1.5188114643096924, + "logps/chosen": -40.923824310302734, + "logps/rejected": -41.93095779418945, + "loss": 0.6912, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.003610325511544943, + "rewards/margins": 0.003881955984979868, + "rewards/rejected": -0.00027163056074641645, "step": 1650 }, { "epoch": 1.195965417867435, - "grad_norm": 2.6120803356170654, + "grad_norm": 2.615760564804077, "learning_rate": 2.0837808344799028e-08, - "logits/chosen": -1.398280382156372, - "logits/rejected": -1.3995027542114258, - "logps/chosen": -46.10521697998047, - "logps/rejected": -48.297630310058594, - "loss": 0.6902, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.003890200052410364, - "rewards/margins": 0.006055611185729504, - "rewards/rejected": -0.002165411366149783, + "logits/chosen": -1.3983901739120483, + "logits/rejected": -1.3996646404266357, + "logps/chosen": -46.13004684448242, + "logps/rejected": -48.2944450378418, + "loss": 0.6903, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0036418340168893337, + "rewards/margins": 0.0057754674926400185, + "rewards/rejected": -0.002133633242920041, "step": 1660 }, { "epoch": 1.2031700288184437, - "grad_norm": 3.599088430404663, + "grad_norm": 3.5800857543945312, "learning_rate": 2.052812294146033e-08, - "logits/chosen": -1.5415016412734985, - "logits/rejected": -1.5342748165130615, - "logps/chosen": -45.235252380371094, - "logps/rejected": -49.03083038330078, + "logits/chosen": -1.541508436203003, + "logits/rejected": -1.534239649772644, + "logps/chosen": -45.248985290527344, + "logps/rejected": -49.04714584350586, "loss": 0.6901, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.005832507275044918, - "rewards/margins": 0.00619142223149538, - "rewards/rejected": -0.0003589147236198187, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005695226602256298, + "rewards/margins": 0.006217294372618198, + "rewards/rejected": -0.0005220678867772222, "step": 1670 }, { "epoch": 1.2103746397694524, - "grad_norm": 3.644432306289673, + "grad_norm": 3.644780397415161, "learning_rate": 2.0219144830549163e-08, - "logits/chosen": -1.438042402267456, - "logits/rejected": -1.4225962162017822, - "logps/chosen": -45.40910339355469, - "logps/rejected": -49.33518981933594, - "loss": 0.6899, + "logits/chosen": -1.437660813331604, + "logits/rejected": -1.422384262084961, + "logps/chosen": -45.428367614746094, + "logps/rejected": -49.343223571777344, + "loss": 0.69, "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.004086838103830814, - "rewards/margins": 0.0065740495920181274, - "rewards/rejected": -0.0024872112553566694, + "rewards/chosen": 0.0038942252285778522, + "rewards/margins": 0.006461729761213064, + "rewards/rejected": -0.0025675049982964993, "step": 1680 }, { "epoch": 1.217579250720461, - "grad_norm": 2.6360511779785156, + "grad_norm": 2.636898994445801, "learning_rate": 1.9910922881449716e-08, - "logits/chosen": -1.5140334367752075, - "logits/rejected": -1.5046603679656982, - "logps/chosen": -50.067481994628906, - "logps/rejected": -52.06745529174805, - "loss": 0.6891, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.005797156598418951, - "rewards/margins": 0.008195227012038231, - "rewards/rejected": -0.002398070180788636, + "logits/chosen": -1.514047622680664, + "logits/rejected": -1.5047409534454346, + "logps/chosen": -50.08318328857422, + "logps/rejected": -52.0395622253418, + "loss": 0.6893, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.00564007181674242, + "rewards/margins": 0.007759124971926212, + "rewards/rejected": -0.002119052689522505, "step": 1690 }, { "epoch": 1.2247838616714697, - "grad_norm": 2.971135377883911, + "grad_norm": 2.9671974182128906, "learning_rate": 1.9603505843948214e-08, - "logits/chosen": -1.4751417636871338, - "logits/rejected": -1.4692978858947754, - "logps/chosen": -50.14417266845703, - "logps/rejected": -53.16925811767578, - "loss": 0.6888, + "logits/chosen": -1.4749956130981445, + "logits/rejected": -1.469154715538025, + "logps/chosen": -50.12143325805664, + "logps/rejected": -53.1866340637207, + "loss": 0.6886, "rewards/accuracies": 0.6875, - "rewards/chosen": 0.007443590555340052, - "rewards/margins": 0.008724456652998924, - "rewards/rejected": -0.001280865166336298, + "rewards/chosen": 0.007670970167964697, + "rewards/margins": 0.009125592187047005, + "rewards/rejected": -0.0014546227175742388, "step": 1700 }, { "epoch": 1.2319884726224783, - "grad_norm": 3.588036060333252, + "grad_norm": 3.5755786895751953, "learning_rate": 1.929694234052239e-08, - "logits/chosen": -1.573609471321106, - "logits/rejected": -1.5557610988616943, - "logps/chosen": -41.997642517089844, - "logps/rejected": -46.378440856933594, - "loss": 0.6898, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.00520675303414464, - "rewards/margins": 0.006725759245455265, - "rewards/rejected": -0.0015190057456493378, + "logits/chosen": -1.573656439781189, + "logits/rejected": -1.555607795715332, + "logps/chosen": -41.991127014160156, + "logps/rejected": -46.354522705078125, + "loss": 0.6899, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.005271919537335634, + "rewards/margins": 0.0065517732873559, + "rewards/rejected": -0.0012798536336049438, "step": 1710 }, { "epoch": 1.239193083573487, - "grad_norm": 2.2766144275665283, + "grad_norm": 2.281280517578125, "learning_rate": 1.8991280858651157e-08, - "logits/chosen": -1.4746538400650024, - "logits/rejected": -1.4580004215240479, - "logps/chosen": -43.533973693847656, - "logps/rejected": -45.88447189331055, - "loss": 0.6899, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.005346877966076136, - "rewards/margins": 0.0066922446712851524, - "rewards/rejected": -0.001345366588793695, + "logits/chosen": -1.4749398231506348, + "logits/rejected": -1.4583933353424072, + "logps/chosen": -43.545719146728516, + "logps/rejected": -45.899620056152344, + "loss": 0.6898, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.005229421891272068, + "rewards/margins": 0.00672622537240386, + "rewards/rejected": -0.0014968044124543667, "step": 1720 }, { "epoch": 1.2463976945244957, - "grad_norm": 3.984792947769165, + "grad_norm": 4.007260322570801, "learning_rate": 1.868656974314557e-08, - "logits/chosen": -1.5194143056869507, - "logits/rejected": -1.514233946800232, - "logps/chosen": -45.3614616394043, - "logps/rejected": -48.93018341064453, - "loss": 0.6897, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.003951947204768658, - "rewards/margins": 0.0070055266842246056, - "rewards/rejected": -0.003053579479455948, + "logits/chosen": -1.519423484802246, + "logits/rejected": -1.5143511295318604, + "logps/chosen": -45.357017517089844, + "logps/rejected": -48.9483757019043, + "loss": 0.6896, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.003996414598077536, + "rewards/margins": 0.007231863681226969, + "rewards/rejected": -0.0032354488503187895, "step": 1730 }, { "epoch": 1.2536023054755043, - "grad_norm": 2.624852180480957, + "grad_norm": 2.6294591426849365, "learning_rate": 1.8382857188502422e-08, - "logits/chosen": -1.4986703395843506, - "logits/rejected": -1.4926235675811768, - "logps/chosen": -48.85844802856445, - "logps/rejected": -51.434288024902344, + "logits/chosen": -1.4989508390426636, + "logits/rejected": -1.4928483963012695, + "logps/chosen": -48.86175537109375, + "logps/rejected": -51.449058532714844, "loss": 0.6899, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.004724487196654081, - "rewards/margins": 0.006543878465890884, - "rewards/rejected": -0.0018193913856521249, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004691471811383963, + "rewards/margins": 0.006658635102212429, + "rewards/rejected": -0.0019671639893203974, "step": 1740 }, { "epoch": 1.260806916426513, - "grad_norm": 3.2458348274230957, + "grad_norm": 3.2440133094787598, "learning_rate": 1.8080191231281594e-08, - "logits/chosen": -1.4420843124389648, - "logits/rejected": -1.4108572006225586, - "logps/chosen": -50.80805587768555, - "logps/rejected": -49.44476318359375, - "loss": 0.6885, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.006172743625938892, - "rewards/margins": 0.009407153353095055, - "rewards/rejected": -0.003234410658478737, + "logits/chosen": -1.4419151544570923, + "logits/rejected": -1.410692572593689, + "logps/chosen": -50.820106506347656, + "logps/rejected": -49.444854736328125, + "loss": 0.6886, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.006052200682461262, + "rewards/margins": 0.009287551045417786, + "rewards/rejected": -0.0032353501301258802, "step": 1750 }, { "epoch": 1.2680115273775217, - "grad_norm": 2.894810199737549, + "grad_norm": 2.9025306701660156, "learning_rate": 1.7778619742508345e-08, - "logits/chosen": -1.5622589588165283, - "logits/rejected": -1.561194896697998, - "logps/chosen": -51.19914627075195, - "logps/rejected": -53.47156524658203, - "loss": 0.6894, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.004230267833918333, - "rewards/margins": 0.00759546086192131, - "rewards/rejected": -0.0033651920966804028, + "logits/chosen": -1.5624260902404785, + "logits/rejected": -1.5614995956420898, + "logps/chosen": -51.2219352722168, + "logps/rejected": -53.461517333984375, + "loss": 0.6896, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00400232058018446, + "rewards/margins": 0.007267022039741278, + "rewards/rejected": -0.003264702158048749, "step": 1760 }, { "epoch": 1.2752161383285303, - "grad_norm": 2.6120336055755615, + "grad_norm": 2.6166088581085205, "learning_rate": 1.7478190420101796e-08, - "logits/chosen": -1.5609530210494995, - "logits/rejected": -1.5471351146697998, - "logps/chosen": -44.31365203857422, - "logps/rejected": -47.786956787109375, + "logits/chosen": -1.561137080192566, + "logits/rejected": -1.5472664833068848, + "logps/chosen": -44.327239990234375, + "logps/rejected": -47.79932403564453, "loss": 0.6904, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.004004472866654396, - "rewards/margins": 0.005543733946979046, - "rewards/rejected": -0.0015392610803246498, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003868586616590619, + "rewards/margins": 0.005531606264412403, + "rewards/rejected": -0.0016630191821604967, "step": 1770 }, { "epoch": 1.282420749279539, - "grad_norm": 3.9775009155273438, + "grad_norm": 3.9649178981781006, "learning_rate": 1.717895078133088e-08, - "logits/chosen": -1.535461187362671, - "logits/rejected": -1.5190433263778687, - "logps/chosen": -53.3277702331543, - "logps/rejected": -53.350120544433594, + "logits/chosen": -1.535312533378601, + "logits/rejected": -1.5189718008041382, + "logps/chosen": -53.335899353027344, + "logps/rejected": -53.346397399902344, "loss": 0.6911, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.004517709836363792, - "rewards/margins": 0.004209564998745918, - "rewards/rejected": 0.00030814393539913, + "rewards/chosen": 0.004436426796019077, + "rewards/margins": 0.004091030452400446, + "rewards/rejected": 0.0003453959652688354, "step": 1780 }, { "epoch": 1.2896253602305476, - "grad_norm": 2.3989152908325195, + "grad_norm": 2.4041812419891357, "learning_rate": 1.688094815529873e-08, - "logits/chosen": -1.495939016342163, - "logits/rejected": -1.4992210865020752, - "logps/chosen": -46.378273010253906, - "logps/rejected": -48.57247543334961, + "logits/chosen": -1.4961040019989014, + "logits/rejected": -1.4994142055511475, + "logps/chosen": -46.37284851074219, + "logps/rejected": -48.573429107666016, "loss": 0.6917, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0043221996165812016, - "rewards/margins": 0.002974593546241522, - "rewards/rejected": 0.0013476064195856452, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.004376430995762348, + "rewards/margins": 0.0030383667908608913, + "rewards/rejected": 0.0013380638556554914, "step": 1790 }, { "epoch": 1.2968299711815563, - "grad_norm": 3.332216739654541, + "grad_norm": 3.323512077331543, "learning_rate": 1.658422967545693e-08, - "logits/chosen": -1.5825788974761963, - "logits/rejected": -1.5739555358886719, - "logps/chosen": -45.478031158447266, - "logps/rejected": -51.53306198120117, + "logits/chosen": -1.5825564861297607, + "logits/rejected": -1.573864221572876, + "logps/chosen": -45.47216033935547, + "logps/rejected": -51.52876663208008, "loss": 0.6898, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.006268317811191082, - "rewards/margins": 0.006844964809715748, - "rewards/rejected": -0.0005766463582403958, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.006327009294182062, + "rewards/margins": 0.006860643625259399, + "rewards/rejected": -0.0005336340982466936, "step": 1800 }, { "epoch": 1.304034582132565, - "grad_norm": 2.483397960662842, + "grad_norm": 2.4792699813842773, "learning_rate": 1.6288842272150614e-08, - "logits/chosen": -1.466790795326233, - "logits/rejected": -1.4666392803192139, - "logps/chosen": -46.46950149536133, - "logps/rejected": -49.743324279785156, + "logits/chosen": -1.4666478633880615, + "logits/rejected": -1.4664814472198486, + "logps/chosen": -46.45840835571289, + "logps/rejected": -49.744529724121094, "loss": 0.6904, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.004139263648539782, - "rewards/margins": 0.005469338037073612, - "rewards/rejected": -0.0013300742721185088, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0042501529678702354, + "rewards/margins": 0.005592338740825653, + "rewards/rejected": -0.001342184841632843, "step": 1810 }, { "epoch": 1.3112391930835736, - "grad_norm": 4.057764530181885, + "grad_norm": 4.067249774932861, "learning_rate": 1.5994832665195853e-08, - "logits/chosen": -1.5589174032211304, - "logits/rejected": -1.5488518476486206, - "logps/chosen": -40.69142532348633, - "logps/rejected": -45.08420944213867, - "loss": 0.689, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0055547840893268585, - "rewards/margins": 0.008430869318544865, - "rewards/rejected": -0.002876085927709937, + "logits/chosen": -1.5588191747665405, + "logits/rejected": -1.548767328262329, + "logps/chosen": -40.6784782409668, + "logps/rejected": -45.09257507324219, + "loss": 0.6889, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.005684255622327328, + "rewards/margins": 0.008644016459584236, + "rewards/rejected": -0.002959761070087552, "step": 1820 }, { "epoch": 1.318443804034582, - "grad_norm": 2.2705423831939697, + "grad_norm": 2.269655466079712, "learning_rate": 1.5702247356490134e-08, - "logits/chosen": -1.4942680597305298, - "logits/rejected": -1.4937645196914673, - "logps/chosen": -44.22240447998047, - "logps/rejected": -46.42625427246094, + "logits/chosen": -1.4939970970153809, + "logits/rejected": -1.4934114217758179, + "logps/chosen": -44.22581100463867, + "logps/rejected": -46.432308197021484, "loss": 0.69, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.003690245794132352, - "rewards/margins": 0.006414071656763554, - "rewards/rejected": -0.0027238260954618454, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003656134009361267, + "rewards/margins": 0.006440469529479742, + "rewards/rejected": -0.0027843350544571877, "step": 1830 }, { "epoch": 1.3256484149855907, - "grad_norm": 2.201984405517578, + "grad_norm": 2.2065982818603516, "learning_rate": 1.541113262265748e-08, - "logits/chosen": -1.5093882083892822, - "logits/rejected": -1.4849798679351807, - "logps/chosen": -48.22920227050781, - "logps/rejected": -49.40846633911133, - "loss": 0.69, - "rewards/accuracies": 0.5562499761581421, - "rewards/chosen": 0.004583511967211962, - "rewards/margins": 0.006392877548933029, - "rewards/rejected": -0.001809365814551711, + "logits/chosen": -1.5094202756881714, + "logits/rejected": -1.4850237369537354, + "logps/chosen": -48.20293426513672, + "logps/rejected": -49.39777755737305, + "loss": 0.6899, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004846207797527313, + "rewards/margins": 0.006548731122165918, + "rewards/rejected": -0.0017025221604853868, "step": 1840 }, { "epoch": 1.3328530259365994, - "grad_norm": 3.0725996494293213, + "grad_norm": 3.0674479007720947, "learning_rate": 1.5121534507729073e-08, - "logits/chosen": -1.55897057056427, - "logits/rejected": -1.5454485416412354, - "logps/chosen": -43.795204162597656, - "logps/rejected": -46.801849365234375, - "loss": 0.6896, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.003769079688936472, - "rewards/margins": 0.007118572946637869, - "rewards/rejected": -0.0033494927920401096, + "logits/chosen": -1.5589674711227417, + "logits/rejected": -1.5453927516937256, + "logps/chosen": -43.794151306152344, + "logps/rejected": -46.793357849121094, + "loss": 0.6897, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.0037796609103679657, + "rewards/margins": 0.0070442743599414825, + "rewards/rejected": -0.003264613449573517, "step": 1850 }, { "epoch": 1.340057636887608, - "grad_norm": 3.1964995861053467, + "grad_norm": 3.1985769271850586, "learning_rate": 1.4833498815860756e-08, - "logits/chosen": -1.5131951570510864, - "logits/rejected": -1.503108024597168, - "logps/chosen": -48.543392181396484, - "logps/rejected": -53.26386260986328, - "loss": 0.6892, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.004869317635893822, - "rewards/margins": 0.007982860319316387, - "rewards/rejected": -0.003113542450591922, + "logits/chosen": -1.5135055780410767, + "logits/rejected": -1.503368616104126, + "logps/chosen": -48.541114807128906, + "logps/rejected": -53.24763870239258, + "loss": 0.6893, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004892065189778805, + "rewards/margins": 0.007843365892767906, + "rewards/rejected": -0.0029513011686503887, "step": 1860 }, { "epoch": 1.3472622478386167, - "grad_norm": 3.0271787643432617, + "grad_norm": 3.0174834728240967, "learning_rate": 1.4547071104088443e-08, - "logits/chosen": -1.482999324798584, - "logits/rejected": -1.4558321237564087, - "logps/chosen": -42.19821548461914, - "logps/rejected": -46.049476623535156, + "logits/chosen": -1.482691764831543, + "logits/rejected": -1.4554970264434814, + "logps/chosen": -42.20122528076172, + "logps/rejected": -46.06108856201172, "loss": 0.6895, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.005044402554631233, - "rewards/margins": 0.0073828669264912605, - "rewards/rejected": -0.00233846390619874, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.005014275666326284, + "rewards/margins": 0.007468868978321552, + "rewards/rejected": -0.0024545923806726933, "step": 1870 }, { "epoch": 1.3544668587896254, - "grad_norm": 3.3923115730285645, + "grad_norm": 3.380967617034912, "learning_rate": 1.4262296675122592e-08, - "logits/chosen": -1.5146949291229248, - "logits/rejected": -1.497996211051941, - "logps/chosen": -49.10752487182617, - "logps/rejected": -50.08454513549805, - "loss": 0.6902, + "logits/chosen": -1.5148619413375854, + "logits/rejected": -1.498252511024475, + "logps/chosen": -49.090728759765625, + "logps/rejected": -50.08750534057617, + "loss": 0.6901, "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.003856577444821596, - "rewards/margins": 0.006001911126077175, - "rewards/rejected": -0.0021453332155942917, + "rewards/chosen": 0.004024534951895475, + "rewards/margins": 0.0061994679272174835, + "rewards/rejected": -0.0021749339066445827, "step": 1880 }, { "epoch": 1.361671469740634, - "grad_norm": 3.2024686336517334, + "grad_norm": 3.214054584503174, "learning_rate": 1.3979220570182902e-08, - "logits/chosen": -1.449581503868103, - "logits/rejected": -1.4461772441864014, - "logps/chosen": -45.997764587402344, - "logps/rejected": -50.069419860839844, - "loss": 0.6898, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.004631740506738424, - "rewards/margins": 0.006712678819894791, - "rewards/rejected": -0.002080937847495079, + "logits/chosen": -1.4493000507354736, + "logits/rejected": -1.4460159540176392, + "logps/chosen": -46.005516052246094, + "logps/rejected": -50.10773468017578, + "loss": 0.6897, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.004554239567369223, + "rewards/margins": 0.0070183975622057915, + "rewards/rejected": -0.002464158460497856, "step": 1890 }, { "epoch": 1.3688760806916427, - "grad_norm": 2.0080878734588623, + "grad_norm": 2.0138704776763916, "learning_rate": 1.369788756187445e-08, - "logits/chosen": -1.531224012374878, - "logits/rejected": -1.5335500240325928, - "logps/chosen": -45.338050842285156, - "logps/rejected": -48.8311653137207, - "loss": 0.6901, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.0053773848339915276, - "rewards/margins": 0.006197996437549591, - "rewards/rejected": -0.000820611952804029, + "logits/chosen": -1.5314304828643799, + "logits/rejected": -1.5336986780166626, + "logps/chosen": -45.31755447387695, + "logps/rejected": -48.831336975097656, + "loss": 0.69, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005582300014793873, + "rewards/margins": 0.0064046382904052734, + "rewards/rejected": -0.000822339323349297, "step": 1900 }, { "epoch": 1.3760806916426513, - "grad_norm": 2.963144540786743, + "grad_norm": 2.9555187225341797, "learning_rate": 1.3418342147106212e-08, - "logits/chosen": -1.5691850185394287, - "logits/rejected": -1.568887710571289, - "logps/chosen": -45.99410629272461, - "logps/rejected": -50.148170471191406, - "loss": 0.689, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.005367639474570751, - "rewards/margins": 0.00848575122654438, - "rewards/rejected": -0.0031181129161268473, + "logits/chosen": -1.5689207315444946, + "logits/rejected": -1.5687485933303833, + "logps/chosen": -45.9853515625, + "logps/rejected": -50.116615295410156, + "loss": 0.6891, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.005455161444842815, + "rewards/margins": 0.008257756009697914, + "rewards/rejected": -0.0028025947976857424, "step": 1910 }, { "epoch": 1.38328530259366, - "grad_norm": 2.71224308013916, + "grad_norm": 2.710223913192749, "learning_rate": 1.3140628540053218e-08, - "logits/chosen": -1.5857388973236084, - "logits/rejected": -1.5784494876861572, - "logps/chosen": -38.50310516357422, - "logps/rejected": -43.5479850769043, + "logits/chosen": -1.5859081745147705, + "logits/rejected": -1.5784118175506592, + "logps/chosen": -38.49979782104492, + "logps/rejected": -43.55414962768555, "loss": 0.6899, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0047732992097735405, - "rewards/margins": 0.0065245418809354305, - "rewards/rejected": -0.0017512429039925337, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004806336015462875, + "rewards/margins": 0.0066192434169352055, + "rewards/rejected": -0.0018129062373191118, "step": 1920 }, { "epoch": 1.3904899135446687, - "grad_norm": 4.161129474639893, + "grad_norm": 4.167338848114014, "learning_rate": 1.286479066516345e-08, - "logits/chosen": -1.450089931488037, - "logits/rejected": -1.4414006471633911, - "logps/chosen": -52.12537384033203, - "logps/rejected": -53.66926956176758, + "logits/chosen": -1.4498648643493652, + "logits/rejected": -1.4411438703536987, + "logps/chosen": -52.12342071533203, + "logps/rejected": -53.66259002685547, "loss": 0.6906, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.004887779243290424, - "rewards/margins": 0.0051414938643574715, - "rewards/rejected": -0.0002537147665861994, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.004907308612018824, + "rewards/margins": 0.005094234831631184, + "rewards/rejected": -0.00018692630692385137, "step": 1930 }, { "epoch": 1.397694524495677, - "grad_norm": 2.4571914672851562, + "grad_norm": 2.4612481594085693, "learning_rate": 1.2590872150210574e-08, - "logits/chosen": -1.5860296487808228, - "logits/rejected": -1.5841039419174194, - "logps/chosen": -43.57406234741211, - "logps/rejected": -45.4689826965332, - "loss": 0.6902, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.004368237219750881, - "rewards/margins": 0.006046769674867392, - "rewards/rejected": -0.0016785322222858667, + "logits/chosen": -1.5858898162841797, + "logits/rejected": -1.584014654159546, + "logps/chosen": -43.59282302856445, + "logps/rejected": -45.45196533203125, + "loss": 0.6903, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004180621821433306, + "rewards/margins": 0.005688945297151804, + "rewards/rejected": -0.0015083241742104292, "step": 1940 }, { "epoch": 1.4048991354466858, - "grad_norm": 2.796229362487793, + "grad_norm": 2.8043339252471924, "learning_rate": 1.2318916319393555e-08, - "logits/chosen": -1.5375984907150269, - "logits/rejected": -1.5296894311904907, - "logps/chosen": -43.640052795410156, - "logps/rejected": -46.94563293457031, + "logits/chosen": -1.5378026962280273, + "logits/rejected": -1.529823660850525, + "logps/chosen": -43.639686584472656, + "logps/rejected": -46.94390869140625, "loss": 0.6892, "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.004670538939535618, - "rewards/margins": 0.00806342251598835, - "rewards/rejected": -0.0033928831107914448, + "rewards/chosen": 0.004674214404076338, + "rewards/margins": 0.008049890398979187, + "rewards/rejected": -0.003375676227733493, "step": 1950 }, { "epoch": 1.4121037463976944, - "grad_norm": 3.647590160369873, + "grad_norm": 3.6563119888305664, "learning_rate": 1.2048966186484282e-08, - "logits/chosen": -1.4963290691375732, - "logits/rejected": -1.4878935813903809, - "logps/chosen": -45.24595260620117, - "logps/rejected": -49.62815475463867, - "loss": 0.6886, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.005302284844219685, - "rewards/margins": 0.009141536429524422, - "rewards/rejected": -0.0038392518181353807, + "logits/chosen": -1.4962561130523682, + "logits/rejected": -1.487799882888794, + "logps/chosen": -45.25653076171875, + "logps/rejected": -49.612525939941406, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005196507088840008, + "rewards/margins": 0.008879442699253559, + "rewards/rejected": -0.0036829360760748386, "step": 1960 }, { "epoch": 1.419308357348703, - "grad_norm": 3.734166383743286, + "grad_norm": 3.7385265827178955, "learning_rate": 1.1781064448024333e-08, - "logits/chosen": -1.5457031726837158, - "logits/rejected": -1.5351985692977905, - "logps/chosen": -41.0999755859375, - "logps/rejected": -45.25709533691406, + "logits/chosen": -1.54585599899292, + "logits/rejected": -1.5353103876113892, + "logps/chosen": -41.12052917480469, + "logps/rejected": -45.282588958740234, "loss": 0.6884, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.00485195079818368, - "rewards/margins": 0.009582359343767166, - "rewards/rejected": -0.004730407148599625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004646482411772013, + "rewards/margins": 0.009631778113543987, + "rewards/rejected": -0.004985296167433262, "step": 1970 }, { "epoch": 1.4265129682997117, - "grad_norm": 3.188124179840088, + "grad_norm": 3.1890530586242676, "learning_rate": 1.1515253476571923e-08, - "logits/chosen": -1.51368248462677, - "logits/rejected": -1.5137616395950317, - "logps/chosen": -45.28264617919922, - "logps/rejected": -49.37968826293945, + "logits/chosen": -1.5136247873306274, + "logits/rejected": -1.513709545135498, + "logps/chosen": -45.29556655883789, + "logps/rejected": -49.37841033935547, "loss": 0.6902, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.0027642191853374243, - "rewards/margins": 0.006152432877570391, - "rewards/rejected": -0.0033882136922329664, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0026350452098995447, + "rewards/margins": 0.0060105351731181145, + "rewards/rejected": -0.003375489264726639, "step": 1980 }, { "epoch": 1.4337175792507204, - "grad_norm": 3.298959732055664, + "grad_norm": 3.2632498741149902, "learning_rate": 1.1251575314000034e-08, - "logits/chosen": -1.5250017642974854, - "logits/rejected": -1.5145986080169678, - "logps/chosen": -43.598915100097656, - "logps/rejected": -45.834327697753906, + "logits/chosen": -1.524692416191101, + "logits/rejected": -1.5142878293991089, + "logps/chosen": -43.59644317626953, + "logps/rejected": -45.814125061035156, "loss": 0.69, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0049310047179460526, - "rewards/margins": 0.0064821490086615086, - "rewards/rejected": -0.0015511433593928814, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004955722484737635, + "rewards/margins": 0.006304902024567127, + "rewards/rejected": -0.0013491790741682053, "step": 1990 }, { "epoch": 1.440922190201729, - "grad_norm": 2.1519386768341064, + "grad_norm": 2.1573173999786377, "learning_rate": 1.0990071664846861e-08, - "logits/chosen": -1.56293785572052, - "logits/rejected": -1.5495407581329346, - "logps/chosen": -41.893795013427734, - "logps/rejected": -47.79245376586914, - "loss": 0.6887, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.005727686919271946, - "rewards/margins": 0.0090394988656044, - "rewards/rejected": -0.0033118128776550293, + "logits/chosen": -1.5630563497543335, + "logits/rejected": -1.5495526790618896, + "logps/chosen": -41.898902893066406, + "logps/rejected": -47.80742645263672, + "loss": 0.6886, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.005676587112247944, + "rewards/margins": 0.009138097986578941, + "rewards/rejected": -0.0034615113399922848, "step": 2000 }, { "epoch": 1.4481268011527377, - "grad_norm": 3.128014326095581, + "grad_norm": 3.135481357574463, "learning_rate": 1.0730783889719711e-08, - "logits/chosen": -1.477405309677124, - "logits/rejected": -1.4622482061386108, - "logps/chosen": -43.723175048828125, - "logps/rejected": -47.11913299560547, - "loss": 0.6895, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.003931211773306131, - "rewards/margins": 0.007391474209725857, - "rewards/rejected": -0.0034602631349116564, + "logits/chosen": -1.477289080619812, + "logits/rejected": -1.4621999263763428, + "logps/chosen": -43.74903106689453, + "logps/rejected": -47.121009826660156, + "loss": 0.6896, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.0036726822145283222, + "rewards/margins": 0.007151623722165823, + "rewards/rejected": -0.003478941274806857, "step": 2010 }, { "epoch": 1.4553314121037464, - "grad_norm": 4.031602382659912, + "grad_norm": 4.046953201293945, "learning_rate": 1.0473752998753114e-08, - "logits/chosen": -1.565809965133667, - "logits/rejected": -1.5617783069610596, - "logps/chosen": -46.38797378540039, - "logps/rejected": -48.52936553955078, - "loss": 0.6897, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004219281952828169, - "rewards/margins": 0.007104185409843922, - "rewards/rejected": -0.0028849041555076838, + "logits/chosen": -1.565807580947876, + "logits/rejected": -1.561715841293335, + "logps/chosen": -46.399658203125, + "logps/rejected": -48.55376052856445, + "loss": 0.6896, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.00410244707018137, + "rewards/margins": 0.007231240160763264, + "rewards/rejected": -0.003128792392089963, "step": 2020 }, { "epoch": 1.462536023054755, - "grad_norm": 3.348253011703491, + "grad_norm": 3.342499017715454, "learning_rate": 1.0219019645122575e-08, - "logits/chosen": -1.5837256908416748, - "logits/rejected": -1.5753594636917114, - "logps/chosen": -46.2445068359375, - "logps/rejected": -50.02815246582031, + "logits/chosen": -1.5836737155914307, + "logits/rejected": -1.5750820636749268, + "logps/chosen": -46.25938415527344, + "logps/rejected": -50.03252410888672, "loss": 0.691, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004541327711194754, - "rewards/margins": 0.004451502580195665, - "rewards/rejected": 8.9825494796969e-05, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.004392546601593494, + "rewards/margins": 0.004346476402133703, + "rewards/rejected": 4.6069595555309206e-05, "step": 2030 }, { "epoch": 1.4697406340057637, - "grad_norm": 3.961947441101074, + "grad_norm": 3.9536612033843994, "learning_rate": 9.966624118614611e-09, - "logits/chosen": -1.5389362573623657, - "logits/rejected": -1.5367815494537354, - "logps/chosen": -49.683441162109375, - "logps/rejected": -54.005958557128906, - "loss": 0.6893, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.005118837580084801, - "rewards/margins": 0.007952864281833172, - "rewards/rejected": -0.0028340264689177275, + "logits/chosen": -1.5390533208847046, + "logits/rejected": -1.5368832349777222, + "logps/chosen": -49.673789978027344, + "logps/rejected": -53.99889373779297, + "loss": 0.6892, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.005215344484895468, + "rewards/margins": 0.007978727109730244, + "rewards/rejected": -0.0027633821591734886, "step": 2040 }, { "epoch": 1.4769452449567724, - "grad_norm": 3.100484848022461, + "grad_norm": 3.108199119567871, "learning_rate": 9.71660633925438e-09, - "logits/chosen": -1.529512643814087, - "logits/rejected": -1.5044424533843994, - "logps/chosen": -50.613685607910156, - "logps/rejected": -52.775421142578125, + "logits/chosen": -1.529578447341919, + "logits/rejected": -1.5046679973602295, + "logps/chosen": -50.6198844909668, + "logps/rejected": -52.77753829956055, "loss": 0.6892, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.0055261170491576195, - "rewards/margins": 0.007974695414304733, - "rewards/rejected": -0.0024485790636390448, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.005464083980768919, + "rewards/margins": 0.007933822460472584, + "rewards/rejected": -0.0024697366170585155, "step": 2050 }, { "epoch": 1.484149855907781, - "grad_norm": 2.6049001216888428, + "grad_norm": 2.6042842864990234, "learning_rate": 9.469005850991705e-09, - "logits/chosen": -1.5558052062988281, - "logits/rejected": -1.5315583944320679, - "logps/chosen": -44.12029266357422, - "logps/rejected": -44.07444763183594, + "logits/chosen": -1.5559239387512207, + "logits/rejected": -1.531647801399231, + "logps/chosen": -44.11097717285156, + "logps/rejected": -44.06501007080078, "loss": 0.6899, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.0032895379699766636, - "rewards/margins": 0.0066891806200146675, - "rewards/rejected": -0.0033996417187154293, + "rewards/chosen": 0.0033826634753495455, + "rewards/margins": 0.0066879005171358585, + "rewards/rejected": -0.0033052365761250257, "step": 2060 }, { "epoch": 1.4913544668587897, - "grad_norm": 4.039497375488281, + "grad_norm": 4.049923896789551, "learning_rate": 9.223861815446682e-09, - "logits/chosen": -1.565125823020935, - "logits/rejected": -1.537915587425232, - "logps/chosen": -51.43004608154297, - "logps/rejected": -52.8193359375, + "logits/chosen": -1.5651435852050781, + "logits/rejected": -1.5379374027252197, + "logps/chosen": -51.427284240722656, + "logps/rejected": -52.82805633544922, "loss": 0.69, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0033548108767718077, - "rewards/margins": 0.0063339159823954105, - "rewards/rejected": -0.002979104872792959, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0033824308775365353, + "rewards/margins": 0.006448785308748484, + "rewards/rejected": -0.003066354664042592, "step": 2070 }, { "epoch": 1.4985590778097984, - "grad_norm": 3.0772712230682373, + "grad_norm": 3.0854642391204834, "learning_rate": 8.981213005715627e-09, - "logits/chosen": -1.4433072805404663, - "logits/rejected": -1.440057396888733, - "logps/chosen": -49.11186218261719, - "logps/rejected": -53.07099533081055, - "loss": 0.689, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.005112348590046167, - "rewards/margins": 0.008395684882998466, - "rewards/rejected": -0.003283336292952299, + "logits/chosen": -1.4433515071868896, + "logits/rejected": -1.4401018619537354, + "logps/chosen": -49.125553131103516, + "logps/rejected": -53.04938507080078, + "loss": 0.6892, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.004975461401045322, + "rewards/margins": 0.008042706176638603, + "rewards/rejected": -0.003067243145778775, "step": 2080 }, { "epoch": 1.505763688760807, - "grad_norm": 2.5140655040740967, + "grad_norm": 2.5064876079559326, "learning_rate": 8.741097800238617e-09, - "logits/chosen": -1.5396144390106201, - "logits/rejected": -1.5253915786743164, - "logps/chosen": -42.4496955871582, - "logps/rejected": -45.593421936035156, - "loss": 0.6886, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.004491289611905813, - "rewards/margins": 0.009266827255487442, - "rewards/rejected": -0.004775536712259054, + "logits/chosen": -1.5399099588394165, + "logits/rejected": -1.5254058837890625, + "logps/chosen": -42.423248291015625, + "logps/rejected": -45.59009552001953, + "loss": 0.6885, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.004755737259984016, + "rewards/margins": 0.009497955441474915, + "rewards/rejected": -0.0047422172501683235, "step": 2090 }, { "epoch": 1.5129682997118157, - "grad_norm": 3.587307929992676, + "grad_norm": 3.562002420425415, "learning_rate": 8.503554176729341e-09, - "logits/chosen": -1.4787709712982178, - "logits/rejected": -1.4897699356079102, - "logps/chosen": -41.53155517578125, - "logps/rejected": -44.84314727783203, - "loss": 0.6895, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.005696765147149563, - "rewards/margins": 0.007413689978420734, - "rewards/rejected": -0.001716924598440528, + "logits/chosen": -1.4786937236785889, + "logits/rejected": -1.4896752834320068, + "logps/chosen": -41.526512145996094, + "logps/rejected": -44.81072235107422, + "loss": 0.6896, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.005747204180806875, + "rewards/margins": 0.007139878813177347, + "rewards/rejected": -0.0013926750980317593, "step": 2100 }, { "epoch": 1.5201729106628243, - "grad_norm": 3.186201333999634, + "grad_norm": 3.1862659454345703, "learning_rate": 8.268619706168376e-09, - "logits/chosen": -1.4888498783111572, - "logits/rejected": -1.472393274307251, - "logps/chosen": -44.67079544067383, - "logps/rejected": -47.441776275634766, - "loss": 0.6887, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.0060030934400856495, - "rewards/margins": 0.009135951288044453, - "rewards/rejected": -0.003132858779281378, + "logits/chosen": -1.4893500804901123, + "logits/rejected": -1.472717523574829, + "logps/chosen": -44.6899299621582, + "logps/rejected": -47.43818283081055, + "loss": 0.6888, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005811712704598904, + "rewards/margins": 0.008908652700483799, + "rewards/rejected": -0.003096939530223608, "step": 2110 }, { "epoch": 1.527377521613833, - "grad_norm": 3.1603381633758545, + "grad_norm": 3.151075601577759, "learning_rate": 8.036331546860777e-09, - "logits/chosen": -1.3437327146530151, - "logits/rejected": -1.327552318572998, - "logps/chosen": -51.8477897644043, - "logps/rejected": -53.872764587402344, + "logits/chosen": -1.3434933423995972, + "logits/rejected": -1.3273756504058838, + "logps/chosen": -51.86799240112305, + "logps/rejected": -53.89594650268555, "loss": 0.6902, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.005064574535936117, - "rewards/margins": 0.005968200508505106, - "rewards/rejected": -0.0009036259725689888, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.004862475208938122, + "rewards/margins": 0.005997861735522747, + "rewards/rejected": -0.0011353869922459126, "step": 2120 }, { "epoch": 1.5345821325648417, - "grad_norm": 3.1406702995300293, + "grad_norm": 3.147087574005127, "learning_rate": 7.806726438559003e-09, - "logits/chosen": -1.4764467477798462, - "logits/rejected": -1.4766347408294678, - "logps/chosen": -50.17755889892578, - "logps/rejected": -52.37862014770508, - "loss": 0.6902, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.004243126604706049, - "rewards/margins": 0.0059885298833251, - "rewards/rejected": -0.0017454035114496946, + "logits/chosen": -1.4763046503067017, + "logits/rejected": -1.476601243019104, + "logps/chosen": -50.17803955078125, + "logps/rejected": -52.410606384277344, + "loss": 0.69, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.004238266963511705, + "rewards/margins": 0.006303568370640278, + "rewards/rejected": -0.0020653014071285725, "step": 2130 }, { "epoch": 1.54178674351585, - "grad_norm": 4.606470108032227, + "grad_norm": 4.610652446746826, "learning_rate": 7.579840696651938e-09, - "logits/chosen": -1.4668611288070679, - "logits/rejected": -1.4538590908050537, - "logps/chosen": -51.94190216064453, - "logps/rejected": -54.42673873901367, + "logits/chosen": -1.4667813777923584, + "logits/rejected": -1.4536288976669312, + "logps/chosen": -51.929771423339844, + "logps/rejected": -54.413536071777344, "loss": 0.6891, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.00513820443302393, - "rewards/margins": 0.008161871694028378, - "rewards/rejected": -0.003023667959496379, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005259462166577578, + "rewards/margins": 0.008151163347065449, + "rewards/rejected": -0.0028917009476572275, "step": 2140 }, { "epoch": 1.5489913544668588, - "grad_norm": 3.249380111694336, + "grad_norm": 3.2517471313476562, "learning_rate": 7.355710206421098e-09, - "logits/chosen": -1.371519923210144, - "logits/rejected": -1.3645904064178467, - "logps/chosen": -51.36472702026367, - "logps/rejected": -54.906837463378906, + "logits/chosen": -1.3716877698898315, + "logits/rejected": -1.3648245334625244, + "logps/chosen": -51.35541915893555, + "logps/rejected": -54.90302658081055, "loss": 0.6896, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.004159058444201946, - "rewards/margins": 0.007202534936368465, - "rewards/rejected": -0.00304347719065845, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.004252185113728046, + "rewards/margins": 0.0072575947269797325, + "rewards/rejected": -0.003005408914759755, "step": 2150 }, { "epoch": 1.5561959654178674, - "grad_norm": 2.9742538928985596, + "grad_norm": 2.9784481525421143, "learning_rate": 7.134370417364849e-09, - "logits/chosen": -1.5147724151611328, - "logits/rejected": -1.5050067901611328, - "logps/chosen": -52.23655319213867, - "logps/rejected": -52.967933654785156, - "loss": 0.6892, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.004608969669789076, - "rewards/margins": 0.007931066676974297, - "rewards/rejected": -0.0033220970071852207, + "logits/chosen": -1.5145814418792725, + "logits/rejected": -1.5049586296081543, + "logps/chosen": -52.248756408691406, + "logps/rejected": -52.96476364135742, + "loss": 0.6893, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.00448699900880456, + "rewards/margins": 0.007777456194162369, + "rewards/rejected": -0.0032904576510190964, "step": 2160 }, { "epoch": 1.563400576368876, - "grad_norm": 3.727450132369995, + "grad_norm": 3.7061970233917236, "learning_rate": 6.915856337591572e-09, - "logits/chosen": -1.482490062713623, - "logits/rejected": -1.4760053157806396, - "logps/chosen": -43.63593292236328, - "logps/rejected": -47.05614471435547, + "logits/chosen": -1.48256516456604, + "logits/rejected": -1.476099967956543, + "logps/chosen": -43.65003204345703, + "logps/rejected": -47.065677642822266, "loss": 0.69, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.006135334726423025, - "rewards/margins": 0.006472317036241293, - "rewards/rejected": -0.0003369826590642333, + "rewards/chosen": 0.005994272883981466, + "rewards/margins": 0.006426584906876087, + "rewards/rejected": -0.0004323108587414026, "step": 2170 }, { "epoch": 1.5706051873198847, - "grad_norm": 3.832752227783203, + "grad_norm": 3.8327274322509766, "learning_rate": 6.700202528282603e-09, - "logits/chosen": -1.4922187328338623, - "logits/rejected": -1.485725998878479, - "logps/chosen": -52.761619567871094, - "logps/rejected": -56.29583740234375, + "logits/chosen": -1.4924890995025635, + "logits/rejected": -1.4859017133712769, + "logps/chosen": -52.759056091308594, + "logps/rejected": -56.2874641418457, "loss": 0.6905, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.002379755023866892, - "rewards/margins": 0.005406736861914396, - "rewards/rejected": -0.003026981372386217, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.00240537291392684, + "rewards/margins": 0.005348607897758484, + "rewards/rejected": -0.002943234983831644, "step": 2180 }, { "epoch": 1.5778097982708934, - "grad_norm": 3.990762710571289, + "grad_norm": 3.9853765964508057, "learning_rate": 6.487443098225892e-09, - "logits/chosen": -1.5615607500076294, - "logits/rejected": -1.5581114292144775, - "logps/chosen": -47.892555236816406, - "logps/rejected": -53.33452606201172, - "loss": 0.6899, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.005321321543306112, - "rewards/margins": 0.00668514147400856, - "rewards/rejected": -0.0013638200471177697, + "logits/chosen": -1.5614805221557617, + "logits/rejected": -1.5581691265106201, + "logps/chosen": -47.90576934814453, + "logps/rejected": -53.327911376953125, + "loss": 0.69, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.005189166404306889, + "rewards/margins": 0.006486810743808746, + "rewards/rejected": -0.001297644106671214, "step": 2190 }, { "epoch": 1.585014409221902, - "grad_norm": 3.2427289485931396, + "grad_norm": 3.2553822994232178, "learning_rate": 6.277611698421179e-09, - "logits/chosen": -1.5482763051986694, - "logits/rejected": -1.5331636667251587, - "logps/chosen": -48.45262145996094, - "logps/rejected": -52.01759719848633, + "logits/chosen": -1.5480514764785767, + "logits/rejected": -1.5331087112426758, + "logps/chosen": -48.43512725830078, + "logps/rejected": -51.991416931152344, "loss": 0.6881, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.007097711320966482, - "rewards/margins": 0.010285461321473122, - "rewards/rejected": -0.0031877500005066395, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.00727265328168869, + "rewards/margins": 0.010198639705777168, + "rewards/rejected": -0.002925986424088478, "step": 2200 }, { "epoch": 1.5922190201729105, - "grad_norm": 3.0866363048553467, + "grad_norm": 3.103288412094116, "learning_rate": 6.070741516757608e-09, - "logits/chosen": -1.5173919200897217, - "logits/rejected": -1.5092884302139282, - "logps/chosen": -48.39284896850586, - "logps/rejected": -49.55265808105469, - "loss": 0.6898, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.0061082784086465836, - "rewards/margins": 0.0069038597866892815, - "rewards/rejected": -0.0007955812034197152, + "logits/chosen": -1.5171834230422974, + "logits/rejected": -1.5092236995697021, + "logps/chosen": -48.38337326049805, + "logps/rejected": -49.54729461669922, + "loss": 0.6897, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006203086115419865, + "rewards/margins": 0.0069450633600354195, + "rewards/rejected": -0.0007419771864078939, "step": 2210 }, { "epoch": 1.5994236311239192, - "grad_norm": 3.538295269012451, + "grad_norm": 3.5335028171539307, "learning_rate": 5.866865272764607e-09, - "logits/chosen": -1.4962482452392578, - "logits/rejected": -1.4885507822036743, - "logps/chosen": -52.85791778564453, - "logps/rejected": -57.89457321166992, - "loss": 0.6889, + "logits/chosen": -1.4960315227508545, + "logits/rejected": -1.4884825944900513, + "logps/chosen": -52.839698791503906, + "logps/rejected": -57.9176025390625, + "loss": 0.6886, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.006442956626415253, - "rewards/margins": 0.008721071295440197, - "rewards/rejected": -0.0022781144361943007, + "rewards/chosen": 0.006625134497880936, + "rewards/margins": 0.00913353729993105, + "rewards/rejected": -0.002508401870727539, "step": 2220 }, { "epoch": 1.6066282420749278, - "grad_norm": 3.118870258331299, + "grad_norm": 3.121880054473877, "learning_rate": 5.666015212436795e-09, - "logits/chosen": -1.517188310623169, - "logits/rejected": -1.5100383758544922, - "logps/chosen": -48.561012268066406, - "logps/rejected": -51.84938430786133, + "logits/chosen": -1.5170773267745972, + "logits/rejected": -1.5097806453704834, + "logps/chosen": -48.54191207885742, + "logps/rejected": -51.84587478637695, "loss": 0.6903, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.0028922080527991056, - "rewards/margins": 0.00573217635974288, - "rewards/rejected": -0.0028399676084518433, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0030832444317638874, + "rewards/margins": 0.005888159386813641, + "rewards/rejected": -0.002804915653541684, "step": 2230 }, { "epoch": 1.6138328530259365, - "grad_norm": 2.9040238857269287, + "grad_norm": 2.897308111190796, "learning_rate": 5.46822310313379e-09, - "logits/chosen": -1.517345666885376, - "logits/rejected": -1.5175951719284058, - "logps/chosen": -40.73456954956055, - "logps/rejected": -45.976905822753906, - "loss": 0.6881, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.005813897587358952, - "rewards/margins": 0.010332506150007248, - "rewards/rejected": -0.004518609028309584, + "logits/chosen": -1.5174100399017334, + "logits/rejected": -1.5176904201507568, + "logps/chosen": -40.74285125732422, + "logps/rejected": -45.95573806762695, + "loss": 0.6882, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.005731114652007818, + "rewards/margins": 0.010038026608526707, + "rewards/rejected": -0.004306911490857601, "step": 2240 }, { "epoch": 1.6210374639769451, - "grad_norm": 3.120763063430786, + "grad_norm": 3.124610662460327, "learning_rate": 5.273520228555767e-09, - "logits/chosen": -1.57765793800354, - "logits/rejected": -1.5605663061141968, - "logps/chosen": -49.17769241333008, - "logps/rejected": -52.6965217590332, - "loss": 0.6905, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.004476577043533325, - "rewards/margins": 0.0055053262040019035, - "rewards/rejected": -0.0010287497425451875, + "logits/chosen": -1.577868103981018, + "logits/rejected": -1.560633897781372, + "logps/chosen": -49.160133361816406, + "logps/rejected": -52.690643310546875, + "loss": 0.6904, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004652169533073902, + "rewards/margins": 0.005622128956019878, + "rewards/rejected": -0.0009699600050225854, "step": 2250 }, { "epoch": 1.6282420749279538, - "grad_norm": 2.858696937561035, + "grad_norm": 2.8722281455993652, "learning_rate": 5.081937383795484e-09, - "logits/chosen": -1.4714362621307373, - "logits/rejected": -1.451599359512329, - "logps/chosen": -44.16965103149414, - "logps/rejected": -47.10548782348633, - "loss": 0.6896, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.005831424612551928, - "rewards/margins": 0.0071947164833545685, - "rewards/rejected": -0.0013632916379719973, + "logits/chosen": -1.4711134433746338, + "logits/rejected": -1.4512357711791992, + "logps/chosen": -44.158172607421875, + "logps/rejected": -47.07408905029297, + "loss": 0.6897, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005946185905486345, + "rewards/margins": 0.006995551288127899, + "rewards/rejected": -0.0010493656154721975, "step": 2260 }, { "epoch": 1.6354466858789625, - "grad_norm": 3.424889087677002, + "grad_norm": 3.4327311515808105, "learning_rate": 4.893504870467588e-09, - "logits/chosen": -1.5349056720733643, - "logits/rejected": -1.5201621055603027, - "logps/chosen": -45.561912536621094, - "logps/rejected": -47.927860260009766, - "loss": 0.6897, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004046425689011812, - "rewards/margins": 0.006989126093685627, - "rewards/rejected": -0.002942700171843171, + "logits/chosen": -1.5347981452941895, + "logits/rejected": -1.5201083421707153, + "logps/chosen": -45.572601318359375, + "logps/rejected": -47.918853759765625, + "loss": 0.6898, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.003939527552574873, + "rewards/margins": 0.006792210042476654, + "rewards/rejected": -0.0028526827227324247, "step": 2270 }, { "epoch": 1.6426512968299711, - "grad_norm": 3.203747510910034, + "grad_norm": 3.184828519821167, "learning_rate": 4.708252491915951e-09, - "logits/chosen": -1.4700814485549927, - "logits/rejected": -1.4582784175872803, - "logps/chosen": -51.875648498535156, - "logps/rejected": -54.74699020385742, - "loss": 0.6909, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.005088013596832752, - "rewards/margins": 0.00451476639136672, - "rewards/rejected": 0.0005732477875426412, + "logits/chosen": -1.4699965715408325, + "logits/rejected": -1.4581400156021118, + "logps/chosen": -51.87919998168945, + "logps/rejected": -54.724205017089844, + "loss": 0.6911, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.005052558146417141, + "rewards/margins": 0.004251400474458933, + "rewards/rejected": 0.0008011573809199035, "step": 2280 }, { "epoch": 1.6498559077809798, - "grad_norm": 3.683656930923462, + "grad_norm": 3.679774284362793, "learning_rate": 4.526209548499877e-09, - "logits/chosen": -1.4772391319274902, - "logits/rejected": -1.4647594690322876, - "logps/chosen": -49.298465728759766, - "logps/rejected": -48.70001983642578, - "loss": 0.6889, + "logits/chosen": -1.4771262407302856, + "logits/rejected": -1.4646440744400024, + "logps/chosen": -49.27619552612305, + "logps/rejected": -48.700233459472656, + "loss": 0.6888, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.005325310863554478, - "rewards/margins": 0.00872163288295269, - "rewards/rejected": -0.0033963224850594997, + "rewards/chosen": 0.005548023618757725, + "rewards/margins": 0.008946443907916546, + "rewards/rejected": -0.003398419823497534, "step": 2290 }, { "epoch": 1.6570605187319885, - "grad_norm": 2.8685970306396484, + "grad_norm": 2.867825746536255, "learning_rate": 4.347404832959775e-09, - "logits/chosen": -1.5159680843353271, - "logits/rejected": -1.4964356422424316, - "logps/chosen": -43.20624542236328, - "logps/rejected": -44.70439147949219, + "logits/chosen": -1.5160801410675049, + "logits/rejected": -1.4965603351593018, + "logps/chosen": -43.20734405517578, + "logps/rejected": -44.69267654418945, "loss": 0.6888, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.005432508885860443, - "rewards/margins": 0.00892320554703474, - "rewards/rejected": -0.003490696894004941, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005421455018222332, + "rewards/margins": 0.008794995956122875, + "rewards/rejected": -0.0033735414035618305, "step": 2300 }, { "epoch": 1.6642651296829971, - "grad_norm": 3.4407453536987305, + "grad_norm": 3.4203250408172607, "learning_rate": 4.171866625863229e-09, - "logits/chosen": -1.4605551958084106, - "logits/rejected": -1.4485323429107666, - "logps/chosen": -54.96281814575195, - "logps/rejected": -56.01666259765625, - "loss": 0.6907, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.004884735215455294, - "rewards/margins": 0.0049371724016964436, - "rewards/rejected": -5.2436884288908914e-05, + "logits/chosen": -1.460514783859253, + "logits/rejected": -1.4484317302703857, + "logps/chosen": -54.94541549682617, + "logps/rejected": -56.01485061645508, + "loss": 0.6906, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.005058721173554659, + "rewards/margins": 0.005093052517622709, + "rewards/rejected": -3.4331507777096704e-05, "step": 2310 }, { "epoch": 1.6714697406340058, - "grad_norm": 2.6767680644989014, + "grad_norm": 2.676339864730835, "learning_rate": 3.9996226911319546e-09, - "logits/chosen": -1.5319734811782837, - "logits/rejected": -1.5255602598190308, - "logps/chosen": -42.955421447753906, - "logps/rejected": -45.6214599609375, + "logits/chosen": -1.531790018081665, + "logits/rejected": -1.5253288745880127, + "logps/chosen": -42.960140228271484, + "logps/rejected": -45.631595611572266, "loss": 0.6894, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.004288064781576395, - "rewards/margins": 0.007574597839266062, - "rewards/rejected": -0.0032865318935364485, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.004240875598043203, + "rewards/margins": 0.007628757506608963, + "rewards/rejected": -0.0033878819085657597, "step": 2320 }, { "epoch": 1.6786743515850144, - "grad_norm": 2.9439940452575684, + "grad_norm": 2.9492862224578857, "learning_rate": 3.830700271650567e-09, - "logits/chosen": -1.5762637853622437, - "logits/rejected": -1.567180871963501, - "logps/chosen": -42.681846618652344, - "logps/rejected": -46.93994903564453, - "loss": 0.6887, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.004170938860625029, - "rewards/margins": 0.009164386428892612, - "rewards/rejected": -0.0049934471026062965, + "logits/chosen": -1.5764167308807373, + "logits/rejected": -1.5673660039901733, + "logps/chosen": -42.68858337402344, + "logps/rejected": -46.930320739746094, + "loss": 0.6888, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004103544168174267, + "rewards/margins": 0.009000726044178009, + "rewards/rejected": -0.004897181876003742, "step": 2330 }, { "epoch": 1.685878962536023, - "grad_norm": 3.5754525661468506, + "grad_norm": 3.574375629425049, "learning_rate": 3.665126084957723e-09, - "logits/chosen": -1.4605201482772827, - "logits/rejected": -1.4505817890167236, - "logps/chosen": -41.482200622558594, - "logps/rejected": -44.76730728149414, - "loss": 0.6883, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.006525760050863028, - "rewards/margins": 0.00990013312548399, - "rewards/rejected": -0.0033743735402822495, + "logits/chosen": -1.460824728012085, + "logits/rejected": -1.4508405923843384, + "logps/chosen": -41.46354293823242, + "logps/rejected": -44.764930725097656, + "loss": 0.6882, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.006712308619171381, + "rewards/margins": 0.0100629311054945, + "rewards/rejected": -0.003350622486323118, "step": 2340 }, { "epoch": 1.6930835734870318, - "grad_norm": 3.5225203037261963, + "grad_norm": 3.5213210582733154, "learning_rate": 3.502926319020327e-09, - "logits/chosen": -1.4367626905441284, - "logits/rejected": -1.4138025045394897, - "logps/chosen": -53.16743087768555, - "logps/rejected": -54.37062454223633, + "logits/chosen": -1.4368828535079956, + "logits/rejected": -1.4139349460601807, + "logps/chosen": -53.15620803833008, + "logps/rejected": -54.36595916748047, "loss": 0.6894, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.004174394998699427, - "rewards/margins": 0.007619527634233236, - "rewards/rejected": -0.0034451335668563843, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.004286657087504864, + "rewards/margins": 0.007685109041631222, + "rewards/rejected": -0.0033984524197876453, "step": 2350 }, { "epoch": 1.7002881844380404, - "grad_norm": 3.2806501388549805, + "grad_norm": 3.2753190994262695, "learning_rate": 3.3441266280915427e-09, - "logits/chosen": -1.4779942035675049, - "logits/rejected": -1.458083987236023, - "logps/chosen": -42.71442413330078, - "logps/rejected": -47.51856994628906, - "loss": 0.6878, + "logits/chosen": -1.4781744480133057, + "logits/rejected": -1.4581515789031982, + "logps/chosen": -42.72399139404297, + "logps/rejected": -47.5009880065918, + "loss": 0.688, "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0056925928220152855, - "rewards/margins": 0.010809649713337421, - "rewards/rejected": -0.005117055959999561, + "rewards/chosen": 0.005596922244876623, + "rewards/margins": 0.010538162663578987, + "rewards/rejected": -0.004941239953041077, "step": 2360 }, { "epoch": 1.707492795389049, - "grad_norm": 3.2662885189056396, + "grad_norm": 3.2596471309661865, "learning_rate": 3.1887521286532023e-09, - "logits/chosen": -1.5734524726867676, - "logits/rejected": -1.555147409439087, - "logps/chosen": -51.01714324951172, - "logps/rejected": -50.69014358520508, + "logits/chosen": -1.5734946727752686, + "logits/rejected": -1.5553158521652222, + "logps/chosen": -51.01832962036133, + "logps/rejected": -50.7015266418457, "loss": 0.6902, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004882704466581345, - "rewards/margins": 0.00589717784896493, - "rewards/rejected": -0.0010144734987989068, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.004870889242738485, + "rewards/margins": 0.005999116692692041, + "rewards/rejected": -0.0011282269842922688, "step": 2370 }, { "epoch": 1.7146974063400577, - "grad_norm": 3.4008705615997314, + "grad_norm": 3.2845699787139893, "learning_rate": 3.0368273954432698e-09, - "logits/chosen": -1.5671641826629639, - "logits/rejected": -1.5657932758331299, - "logps/chosen": -49.652183532714844, - "logps/rejected": -52.856849670410156, - "loss": 0.689, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.005631798878312111, - "rewards/margins": 0.00844245683401823, - "rewards/rejected": -0.0028106574900448322, + "logits/chosen": -1.5669500827789307, + "logits/rejected": -1.5655710697174072, + "logps/chosen": -49.607826232910156, + "logps/rejected": -52.859046936035156, + "loss": 0.6888, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.006075365003198385, + "rewards/margins": 0.008908048272132874, + "rewards/rejected": -0.0028326823376119137, "step": 2380 }, { "epoch": 1.7219020172910664, - "grad_norm": 2.6368513107299805, + "grad_norm": 2.6468191146850586, "learning_rate": 2.888376457568964e-09, - "logits/chosen": -1.628116250038147, - "logits/rejected": -1.6141830682754517, - "logps/chosen": -46.15139389038086, - "logps/rejected": -48.91469955444336, - "loss": 0.6905, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.00384289538487792, - "rewards/margins": 0.005390810314565897, - "rewards/rejected": -0.0015479145804420114, + "logits/chosen": -1.6281824111938477, + "logits/rejected": -1.614386796951294, + "logps/chosen": -46.13481903076172, + "logps/rejected": -48.9344367980957, + "loss": 0.6903, + "rewards/accuracies": 0.65625, + "rewards/chosen": 0.004008620046079159, + "rewards/margins": 0.005753874778747559, + "rewards/rejected": -0.0017452544998377562, "step": 2390 }, { "epoch": 1.729106628242075, - "grad_norm": 2.6690101623535156, + "grad_norm": 2.675931930541992, "learning_rate": 2.7434227947062324e-09, - "logits/chosen": -1.5447126626968384, - "logits/rejected": -1.5395575761795044, - "logps/chosen": -46.42256546020508, - "logps/rejected": -50.7569465637207, + "logits/chosen": -1.5447221994400024, + "logits/rejected": -1.5395848751068115, + "logps/chosen": -46.4262809753418, + "logps/rejected": -50.77505874633789, "loss": 0.6884, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.00619752099737525, - "rewards/margins": 0.009579489007592201, - "rewards/rejected": -0.003381968941539526, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.006160321645438671, + "rewards/margins": 0.009723430499434471, + "rewards/rejected": -0.0035631093196570873, "step": 2400 }, { "epoch": 1.7363112391930837, - "grad_norm": 3.5768134593963623, + "grad_norm": 3.5658557415008545, "learning_rate": 2.6019893333860954e-09, - "logits/chosen": -1.5103172063827515, - "logits/rejected": -1.5113203525543213, - "logps/chosen": -48.074466705322266, - "logps/rejected": -51.149784088134766, + "logits/chosen": -1.5105868577957153, + "logits/rejected": -1.511648178100586, + "logps/chosen": -48.063568115234375, + "logps/rejected": -51.142642974853516, "loss": 0.6899, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.003455445636063814, - "rewards/margins": 0.006496204528957605, - "rewards/rejected": -0.003040758427232504, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.003564387559890747, + "rewards/margins": 0.0065337494015693665, + "rewards/rejected": -0.002969362074509263, "step": 2410 }, { "epoch": 1.7435158501440924, - "grad_norm": 2.7053143978118896, + "grad_norm": 2.702556610107422, "learning_rate": 2.4640984433684758e-09, - "logits/chosen": -1.4912617206573486, - "logits/rejected": -1.4768751859664917, - "logps/chosen": -43.323394775390625, - "logps/rejected": -45.09511947631836, + "logits/chosen": -1.4914367198944092, + "logits/rejected": -1.4769344329833984, + "logps/chosen": -43.33830642700195, + "logps/rejected": -45.11254119873047, "loss": 0.6891, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.0036243596114218235, - "rewards/margins": 0.008290117606520653, - "rewards/rejected": -0.004665757529437542, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.003475245786830783, + "rewards/margins": 0.008315211161971092, + "rewards/rejected": -0.004839965142309666, "step": 2420 }, { "epoch": 1.7507204610951008, - "grad_norm": 3.432485342025757, + "grad_norm": 3.4406282901763916, "learning_rate": 2.3297719341040856e-09, - "logits/chosen": -1.4857823848724365, - "logits/rejected": -1.4767965078353882, - "logps/chosen": -45.32086944580078, - "logps/rejected": -50.63701629638672, + "logits/chosen": -1.485995888710022, + "logits/rejected": -1.477012038230896, + "logps/chosen": -45.341087341308594, + "logps/rejected": -50.648109436035156, "loss": 0.6894, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.004546423442661762, - "rewards/margins": 0.007703344337642193, - "rewards/rejected": -0.0031569204293191433, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.004344302229583263, + "rewards/margins": 0.007612115703523159, + "rewards/rejected": -0.003267813939601183, "step": 2430 }, { "epoch": 1.7579250720461095, - "grad_norm": 3.0087802410125732, + "grad_norm": 3.0044658184051514, "learning_rate": 2.199031051284972e-09, - "logits/chosen": -1.561683177947998, - "logits/rejected": -1.549645185470581, - "logps/chosen": -45.22322082519531, - "logps/rejected": -47.55016326904297, + "logits/chosen": -1.561757206916809, + "logits/rejected": -1.549767255783081, + "logps/chosen": -45.233734130859375, + "logps/rejected": -47.549617767333984, "loss": 0.6898, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.004354492295533419, - "rewards/margins": 0.006802760995924473, - "rewards/rejected": -0.002448268933221698, + "rewards/accuracies": 0.6312500238418579, + "rewards/chosen": 0.004249324090778828, + "rewards/margins": 0.006692181341350079, + "rewards/rejected": -0.0024428577162325382, "step": 2440 }, { "epoch": 1.7651296829971181, - "grad_norm": 2.272958517074585, + "grad_norm": 2.2655446529388428, "learning_rate": 2.0718964734841667e-09, - "logits/chosen": -1.5301949977874756, - "logits/rejected": -1.5162990093231201, - "logps/chosen": -49.53049850463867, - "logps/rejected": -50.499881744384766, + "logits/chosen": -1.5306880474090576, + "logits/rejected": -1.516817331314087, + "logps/chosen": -49.53384017944336, + "logps/rejected": -50.49539566040039, "loss": 0.6898, - "rewards/accuracies": 0.5687500238418579, - "rewards/chosen": 0.004863867070525885, - "rewards/margins": 0.006819572299718857, - "rewards/rejected": -0.0019557056948542595, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.004830458201467991, + "rewards/margins": 0.006741275545209646, + "rewards/rejected": -0.001910816179588437, "step": 2450 }, { "epoch": 1.7723342939481268, - "grad_norm": 2.498459577560425, + "grad_norm": 2.505718231201172, "learning_rate": 1.948388308885102e-09, - "logits/chosen": -1.5911833047866821, - "logits/rejected": -1.5699782371520996, - "logps/chosen": -43.71312713623047, - "logps/rejected": -45.14199447631836, - "loss": 0.6881, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.006634755991399288, - "rewards/margins": 0.010198970325291157, - "rewards/rejected": -0.0035642138682305813, + "logits/chosen": -1.5914843082427979, + "logits/rejected": -1.5702216625213623, + "logps/chosen": -43.73151397705078, + "logps/rejected": -45.120140075683594, + "loss": 0.6883, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.006450912915170193, + "rewards/margins": 0.009796546772122383, + "rewards/rejected": -0.0033456324599683285, "step": 2460 }, { "epoch": 1.7795389048991355, - "grad_norm": 3.781324863433838, + "grad_norm": 3.7846803665161133, "learning_rate": 1.8285260921011846e-09, - "logits/chosen": -1.5634214878082275, - "logits/rejected": -1.5540778636932373, - "logps/chosen": -52.9296875, - "logps/rejected": -54.504615783691406, + "logits/chosen": -1.5637753009796143, + "logits/rejected": -1.5544298887252808, + "logps/chosen": -52.91522216796875, + "logps/rejected": -54.493995666503906, "loss": 0.6898, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.0047055198810994625, - "rewards/margins": 0.0067476751282811165, - "rewards/rejected": -0.0020421550143510103, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.004850171972066164, + "rewards/margins": 0.00678609311580658, + "rewards/rejected": -0.001935920910909772, "step": 2470 }, { "epoch": 1.7867435158501441, - "grad_norm": 2.4721181392669678, + "grad_norm": 2.4653208255767822, "learning_rate": 1.712328781086131e-09, - "logits/chosen": -1.4916369915008545, - "logits/rejected": -1.470900058746338, - "logps/chosen": -48.653865814208984, - "logps/rejected": -50.2443733215332, - "loss": 0.6894, + "logits/chosen": -1.4916441440582275, + "logits/rejected": -1.470869779586792, + "logps/chosen": -48.64797592163086, + "logps/rejected": -50.219051361083984, + "loss": 0.6895, "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.004430476576089859, - "rewards/margins": 0.007640810217708349, - "rewards/rejected": -0.003210334572941065, + "rewards/chosen": 0.004489334765821695, + "rewards/margins": 0.007446461822837591, + "rewards/rejected": -0.002957127522677183, "step": 2480 }, { "epoch": 1.7939481268011528, - "grad_norm": 2.553084373474121, + "grad_norm": 2.550013303756714, "learning_rate": 1.59981475413547e-09, - "logits/chosen": -1.4686026573181152, - "logits/rejected": -1.4523656368255615, - "logps/chosen": -43.7984504699707, - "logps/rejected": -46.64140701293945, + "logits/chosen": -1.4687654972076416, + "logits/rejected": -1.4527361392974854, + "logps/chosen": -43.778564453125, + "logps/rejected": -46.61829376220703, "loss": 0.6879, - "rewards/accuracies": 0.643750011920929, - "rewards/chosen": 0.0074505493976175785, - "rewards/margins": 0.010650457814335823, - "rewards/rejected": -0.00319990748539567, + "rewards/accuracies": 0.668749988079071, + "rewards/chosen": 0.0076493555679917336, + "rewards/margins": 0.010618172585964203, + "rewards/rejected": -0.0029688174836337566, "step": 2490 }, { "epoch": 1.8011527377521612, - "grad_norm": 2.9727623462677, + "grad_norm": 2.973017692565918, "learning_rate": 1.491001806979772e-09, - "logits/chosen": -1.4443368911743164, - "logits/rejected": -1.443589687347412, - "logps/chosen": -44.56245803833008, - "logps/rejected": -50.14686965942383, - "loss": 0.6877, - "rewards/accuracies": 0.706250011920929, - "rewards/chosen": 0.0059210616163909435, - "rewards/margins": 0.010932967066764832, - "rewards/rejected": -0.005011905916035175, + "logits/chosen": -1.4441421031951904, + "logits/rejected": -1.4434623718261719, + "logps/chosen": -44.5540657043457, + "logps/rejected": -50.13270950317383, + "loss": 0.6878, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.006004899740219116, + "rewards/margins": 0.010875212959945202, + "rewards/rejected": -0.004870312754064798, "step": 2500 }, { "epoch": 1.8083573487031699, - "grad_norm": 3.477095365524292, + "grad_norm": 3.489056348800659, "learning_rate": 1.3859071499699698e-09, - "logits/chosen": -1.5023186206817627, - "logits/rejected": -1.4870012998580933, - "logps/chosen": -48.71327590942383, - "logps/rejected": -50.696590423583984, - "loss": 0.6894, - "rewards/accuracies": 0.543749988079071, - "rewards/chosen": 0.004189325962215662, - "rewards/margins": 0.0075593143701553345, - "rewards/rejected": -0.003369989339262247, + "logits/chosen": -1.502044677734375, + "logits/rejected": -1.4867156744003296, + "logps/chosen": -48.71193313598633, + "logps/rejected": -50.691951751708984, + "loss": 0.6895, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.004202738404273987, + "rewards/margins": 0.007526367902755737, + "rewards/rejected": -0.0033236306626349688, "step": 2510 }, { "epoch": 1.8155619596541785, - "grad_norm": 2.4839766025543213, + "grad_norm": 2.489485740661621, "learning_rate": 1.2845474053553156e-09, - "logits/chosen": -1.4774976968765259, - "logits/rejected": -1.467456579208374, - "logps/chosen": -48.630348205566406, - "logps/rejected": -51.53857421875, + "logits/chosen": -1.4774166345596313, + "logits/rejected": -1.4674955606460571, + "logps/chosen": -48.631839752197266, + "logps/rejected": -51.5322151184082, "loss": 0.6885, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.004527250304818153, - "rewards/margins": 0.009543434716761112, - "rewards/rejected": -0.005016184411942959, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.0045123351737856865, + "rewards/margins": 0.009464847855269909, + "rewards/rejected": -0.004952512681484222, "step": 2520 }, { "epoch": 1.8227665706051872, - "grad_norm": 2.7578563690185547, + "grad_norm": 2.7608137130737305, "learning_rate": 1.1869386046543222e-09, - "logits/chosen": -1.4523786306381226, - "logits/rejected": -1.4426023960113525, - "logps/chosen": -44.634124755859375, - "logps/rejected": -47.72616958618164, - "loss": 0.687, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.007956079207360744, - "rewards/margins": 0.012530913576483727, - "rewards/rejected": -0.004574834369122982, + "logits/chosen": -1.452405571937561, + "logits/rejected": -1.4426206350326538, + "logps/chosen": -44.629940032958984, + "logps/rejected": -47.741859436035156, + "loss": 0.6869, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.007997962646186352, + "rewards/margins": 0.01272977888584137, + "rewards/rejected": -0.0047318171709775925, "step": 2530 }, { "epoch": 1.8299711815561959, - "grad_norm": 3.3305163383483887, + "grad_norm": 3.311514377593994, "learning_rate": 1.0930961861191302e-09, - "logits/chosen": -1.477717638015747, - "logits/rejected": -1.4776172637939453, - "logps/chosen": -39.495826721191406, - "logps/rejected": -43.933837890625, - "loss": 0.6888, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.004767412785440683, - "rewards/margins": 0.008766481652855873, - "rewards/rejected": -0.0039990684017539024, + "logits/chosen": -1.4774367809295654, + "logits/rejected": -1.4772757291793823, + "logps/chosen": -39.503108978271484, + "logps/rejected": -43.93723678588867, + "loss": 0.6889, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.004694581031799316, + "rewards/margins": 0.00872765015810728, + "rewards/rejected": -0.004033069126307964, "step": 2540 }, { "epoch": 1.8371757925072045, - "grad_norm": 2.9037258625030518, + "grad_norm": 2.9069316387176514, "learning_rate": 1.003034992293733e-09, - "logits/chosen": -1.5053445100784302, - "logits/rejected": -1.4848747253417969, - "logps/chosen": -40.590179443359375, - "logps/rejected": -45.24515914916992, - "loss": 0.6894, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.005578812211751938, - "rewards/margins": 0.00761597603559494, - "rewards/rejected": -0.0020371642895042896, + "logits/chosen": -1.5049939155578613, + "logits/rejected": -1.4844647645950317, + "logps/chosen": -40.570735931396484, + "logps/rejected": -45.23885726928711, + "loss": 0.6893, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.0057731932029128075, + "rewards/margins": 0.007747343275696039, + "rewards/rejected": -0.0019741503056138754, "step": 2550 }, { "epoch": 1.8443804034582132, - "grad_norm": 2.814815044403076, + "grad_norm": 2.824398994445801, "learning_rate": 9.16769267666434e-10, - "logits/chosen": -1.478011131286621, - "logits/rejected": -1.4578216075897217, - "logps/chosen": -43.7270622253418, - "logps/rejected": -45.00766372680664, + "logits/chosen": -1.4777934551239014, + "logits/rejected": -1.457552194595337, + "logps/chosen": -43.71975326538086, + "logps/rejected": -45.001991271972656, "loss": 0.6896, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.006640469189733267, - "rewards/margins": 0.0071569690480828285, - "rewards/rejected": -0.0005164999747648835, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.006713517941534519, + "rewards/margins": 0.007173291407525539, + "rewards/rejected": -0.00045977457193657756, "step": 2560 }, { "epoch": 1.8515850144092219, - "grad_norm": 3.5298328399658203, + "grad_norm": 3.5243492126464844, "learning_rate": 8.343126564168412e-10, - "logits/chosen": -1.5062114000320435, - "logits/rejected": -1.4929311275482178, - "logps/chosen": -48.96141815185547, - "logps/rejected": -51.6135139465332, - "loss": 0.6892, + "logits/chosen": -1.5060440301895142, + "logits/rejected": -1.49294912815094, + "logps/chosen": -48.975364685058594, + "logps/rejected": -51.62299728393555, + "loss": 0.6893, "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.005637262016534805, - "rewards/margins": 0.007979677058756351, - "rewards/rejected": -0.002342414576560259, + "rewards/chosen": 0.005497785285115242, + "rewards/margins": 0.007935033179819584, + "rewards/rejected": -0.0024372474290430546, "step": 2570 }, { "epoch": 1.8587896253602305, - "grad_norm": 2.955904483795166, + "grad_norm": 2.9600613117218018, "learning_rate": 7.55678200257856e-10, - "logits/chosen": -1.4822179079055786, - "logits/rejected": -1.4612315893173218, - "logps/chosen": -47.652366638183594, - "logps/rejected": -48.669776916503906, - "loss": 0.6893, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.00491288211196661, - "rewards/margins": 0.007909296080470085, - "rewards/rejected": -0.002996413502842188, + "logits/chosen": -1.4818129539489746, + "logits/rejected": -1.4608432054519653, + "logps/chosen": -47.65571975708008, + "logps/rejected": -48.67729187011719, + "loss": 0.6892, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004879404790699482, + "rewards/margins": 0.007951008155941963, + "rewards/rejected": -0.0030716031324118376, "step": 2580 }, { "epoch": 1.8659942363112392, - "grad_norm": 3.5581696033477783, + "grad_norm": 3.5499563217163086, "learning_rate": 6.808783363729364e-10, - "logits/chosen": -1.4488227367401123, - "logits/rejected": -1.4353034496307373, - "logps/chosen": -42.95090866088867, - "logps/rejected": -45.97223663330078, + "logits/chosen": -1.4484130144119263, + "logits/rejected": -1.4349002838134766, + "logps/chosen": -42.954776763916016, + "logps/rejected": -45.976585388183594, "loss": 0.6888, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.005257806740701199, - "rewards/margins": 0.008940257132053375, - "rewards/rejected": -0.003682450158521533, + "rewards/accuracies": 0.5562499761581421, + "rewards/chosen": 0.005219182465225458, + "rewards/margins": 0.0089451028034091, + "rewards/rejected": -0.0037259210366755724, "step": 2590 }, { "epoch": 1.8731988472622478, - "grad_norm": 3.371162176132202, + "grad_norm": 3.3686113357543945, "learning_rate": 6.099248954489794e-10, - "logits/chosen": -1.4984956979751587, - "logits/rejected": -1.4886690378189087, - "logps/chosen": -44.46028137207031, - "logps/rejected": -46.45746994018555, + "logits/chosen": -1.4986984729766846, + "logits/rejected": -1.488825798034668, + "logps/chosen": -44.43115997314453, + "logps/rejected": -46.438255310058594, "loss": 0.6901, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.0052803074941039085, - "rewards/margins": 0.006142514757812023, - "rewards/rejected": -0.0008622069726698101, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005571546033024788, + "rewards/margins": 0.006241600029170513, + "rewards/rejected": -0.0006700534140691161, "step": 2600 }, { "epoch": 1.8804034582132565, - "grad_norm": 2.8961639404296875, + "grad_norm": 2.901690721511841, "learning_rate": 5.428290998051116e-10, - "logits/chosen": -1.496824026107788, - "logits/rejected": -1.4815888404846191, - "logps/chosen": -43.019920349121094, - "logps/rejected": -45.08091354370117, - "loss": 0.6893, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.004790132399648428, - "rewards/margins": 0.007879992015659809, - "rewards/rejected": -0.003089859616011381, + "logits/chosen": -1.49661386013031, + "logits/rejected": -1.4813730716705322, + "logps/chosen": -43.02548599243164, + "logps/rejected": -45.05482482910156, + "loss": 0.6895, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.00473454874008894, + "rewards/margins": 0.007563503924757242, + "rewards/rejected": -0.002828955417498946, "step": 2610 }, { "epoch": 1.8876080691642652, - "grad_norm": 2.850574493408203, + "grad_norm": 2.8460912704467773, "learning_rate": 4.796015616177401e-10, - "logits/chosen": -1.4512312412261963, - "logits/rejected": -1.4404473304748535, - "logps/chosen": -44.99188232421875, - "logps/rejected": -47.4771842956543, - "loss": 0.6884, - "rewards/accuracies": 0.59375, - "rewards/chosen": 0.005510573275387287, - "rewards/margins": 0.009593435563147068, - "rewards/rejected": -0.004082861822098494, + "logits/chosen": -1.4509621858596802, + "logits/rejected": -1.4402557611465454, + "logps/chosen": -44.99142837524414, + "logps/rejected": -47.46977996826172, + "loss": 0.6885, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.005515123251825571, + "rewards/margins": 0.00952393189072609, + "rewards/rejected": -0.004008809570223093, "step": 2620 }, { "epoch": 1.8948126801152738, - "grad_norm": 2.4907619953155518, + "grad_norm": 2.4846298694610596, "learning_rate": 4.2025228124205335e-10, - "logits/chosen": -1.533969759941101, - "logits/rejected": -1.529395580291748, - "logps/chosen": -53.5245361328125, - "logps/rejected": -54.80263137817383, + "logits/chosen": -1.5341758728027344, + "logits/rejected": -1.5296345949172974, + "logps/chosen": -53.49615478515625, + "logps/rejected": -54.78078079223633, "loss": 0.6904, - "rewards/accuracies": 0.606249988079071, - "rewards/chosen": 0.003422043053433299, - "rewards/margins": 0.005617387592792511, - "rewards/rejected": -0.002195344539359212, + "rewards/accuracies": 0.581250011920929, + "rewards/chosen": 0.0037058573216199875, + "rewards/margins": 0.005682673770934343, + "rewards/rejected": -0.001976816216483712, "step": 2630 }, { "epoch": 1.9020172910662825, - "grad_norm": 2.9207513332366943, + "grad_norm": 2.923110008239746, "learning_rate": 3.64790645630339e-10, - "logits/chosen": -1.5180240869522095, - "logits/rejected": -1.4972988367080688, - "logps/chosen": -48.23350143432617, - "logps/rejected": -49.28852844238281, - "loss": 0.6896, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.004124370403587818, - "rewards/margins": 0.007322290446609259, - "rewards/rejected": -0.0031979219056665897, + "logits/chosen": -1.5179532766342163, + "logits/rejected": -1.4971638917922974, + "logps/chosen": -48.21674346923828, + "logps/rejected": -49.32129669189453, + "loss": 0.6893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.004291985183954239, + "rewards/margins": 0.00781762134283781, + "rewards/rejected": -0.0035256363917142153, "step": 2640 }, { "epoch": 1.9092219020172911, - "grad_norm": 5.275201320648193, + "grad_norm": 5.278362274169922, "learning_rate": 3.1322542684729945e-10, - "logits/chosen": -1.4674465656280518, - "logits/rejected": -1.4508670568466187, - "logps/chosen": -55.194297790527344, - "logps/rejected": -59.635154724121094, + "logits/chosen": -1.4670631885528564, + "logits/rejected": -1.450452208518982, + "logps/chosen": -55.23088455200195, + "logps/rejected": -59.66814041137695, "loss": 0.6891, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.004097995813935995, - "rewards/margins": 0.008317622356116772, - "rewards/rejected": -0.004219626076519489, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.003732159035280347, + "rewards/margins": 0.008281553164124489, + "rewards/rejected": -0.0045493957586586475, "step": 2650 }, { "epoch": 1.9164265129682998, - "grad_norm": 2.4065980911254883, + "grad_norm": 2.41750431060791, "learning_rate": 2.6556478068261447e-10, - "logits/chosen": -1.5389297008514404, - "logits/rejected": -1.5416805744171143, - "logps/chosen": -47.71001434326172, - "logps/rejected": -50.589969635009766, - "loss": 0.6894, - "rewards/accuracies": 0.6312500238418579, - "rewards/chosen": 0.0047785029746592045, - "rewards/margins": 0.0075127137824893, - "rewards/rejected": -0.0027342115063220263, + "logits/chosen": -1.538832426071167, + "logits/rejected": -1.5415027141571045, + "logps/chosen": -47.72152328491211, + "logps/rejected": -50.597503662109375, + "loss": 0.6895, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.004663361236453056, + "rewards/margins": 0.007472933270037174, + "rewards/rejected": -0.002809572499245405, "step": 2660 }, { "epoch": 1.9236311239193085, - "grad_norm": 2.941249132156372, + "grad_norm": 2.938985586166382, "learning_rate": 2.2181624536098952e-10, - "logits/chosen": -1.5548467636108398, - "logits/rejected": -1.544862151145935, - "logps/chosen": -45.30964279174805, - "logps/rejected": -49.62782287597656, + "logits/chosen": -1.5549607276916504, + "logits/rejected": -1.545019507408142, + "logps/chosen": -45.30247497558594, + "logps/rejected": -49.602821350097656, "loss": 0.6886, "rewards/accuracies": 0.668749988079071, - "rewards/chosen": 0.005453969351947308, - "rewards/margins": 0.009312251582741737, - "rewards/rejected": -0.003858281997963786, + "rewards/chosen": 0.005525644402951002, + "rewards/margins": 0.009133870713412762, + "rewards/rejected": -0.0036082263104617596, "step": 2670 }, { "epoch": 1.9308357348703171, - "grad_norm": 2.6823368072509766, + "grad_norm": 2.6823463439941406, "learning_rate": 1.819867403498737e-10, - "logits/chosen": -1.530578851699829, - "logits/rejected": -1.5335153341293335, - "logps/chosen": -48.06808090209961, - "logps/rejected": -52.33460235595703, - "loss": 0.6912, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.004878942854702473, - "rewards/margins": 0.0039011756889522076, - "rewards/rejected": 0.0009777669329196215, + "logits/chosen": -1.5306833982467651, + "logits/rejected": -1.533712387084961, + "logps/chosen": -48.063419342041016, + "logps/rejected": -52.36247634887695, + "loss": 0.6911, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.004925573710352182, + "rewards/margins": 0.004226507619023323, + "rewards/rejected": 0.000699066324159503, "step": 2680 }, { "epoch": 1.9380403458213258, - "grad_norm": 2.720761775970459, + "grad_norm": 2.731215715408325, "learning_rate": 1.4608256526505157e-10, - "logits/chosen": -1.4372056722640991, - "logits/rejected": -1.4277093410491943, - "logps/chosen": -53.2573356628418, - "logps/rejected": -54.631309509277344, - "loss": 0.69, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.004901220556348562, - "rewards/margins": 0.006378164980560541, - "rewards/rejected": -0.001476944424211979, + "logits/chosen": -1.437555193901062, + "logits/rejected": -1.4281179904937744, + "logps/chosen": -53.27381134033203, + "logps/rejected": -54.61333465576172, + "loss": 0.6902, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.0047364565543830395, + "rewards/margins": 0.006033606827259064, + "rewards/rejected": -0.0012971509713679552, "step": 2690 }, { "epoch": 1.9452449567723344, - "grad_norm": 3.5336971282958984, + "grad_norm": 3.514517068862915, "learning_rate": 1.1410939887425141e-10, - "logits/chosen": -1.5654321908950806, - "logits/rejected": -1.543419599533081, - "logps/chosen": -45.95741653442383, - "logps/rejected": -47.627723693847656, + "logits/chosen": -1.5650951862335205, + "logits/rejected": -1.54314386844635, + "logps/chosen": -45.952232360839844, + "logps/rejected": -47.62507629394531, "loss": 0.6892, - "rewards/accuracies": 0.581250011920929, - "rewards/chosen": 0.004066399298608303, - "rewards/margins": 0.007989974692463875, - "rewards/rejected": -0.003923576325178146, + "rewards/accuracies": 0.59375, + "rewards/chosen": 0.004118201322853565, + "rewards/margins": 0.008015300147235394, + "rewards/rejected": -0.0038970999885350466, "step": 2700 }, { "epoch": 1.952449567723343, - "grad_norm": 3.1100754737854004, + "grad_norm": 3.1143033504486084, "learning_rate": 8.607229819898865e-11, - "logits/chosen": -1.5358569622039795, - "logits/rejected": -1.5192859172821045, - "logps/chosen": -47.22208023071289, - "logps/rejected": -49.05561065673828, - "loss": 0.6903, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0036678563337773085, - "rewards/margins": 0.0058050090447068214, - "rewards/rejected": -0.002137152012437582, + "logits/chosen": -1.536029577255249, + "logits/rejected": -1.5196514129638672, + "logps/chosen": -47.20793914794922, + "logps/rejected": -49.070213317871094, + "loss": 0.6902, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0038091919850558043, + "rewards/margins": 0.00609241146594286, + "rewards/rejected": -0.0022832187823951244, "step": 2710 }, { "epoch": 1.9596541786743515, - "grad_norm": 3.285006046295166, + "grad_norm": 3.2864060401916504, "learning_rate": 6.19756977147029e-11, - "logits/chosen": -1.521823763847351, - "logits/rejected": -1.5114644765853882, - "logps/chosen": -41.895545959472656, - "logps/rejected": -46.61229705810547, - "loss": 0.6892, - "rewards/accuracies": 0.6187499761581421, - "rewards/chosen": 0.0053446595557034016, - "rewards/margins": 0.00811308529227972, - "rewards/rejected": -0.0027684266678988934, + "logits/chosen": -1.522200584411621, + "logits/rejected": -1.511781930923462, + "logps/chosen": -41.904945373535156, + "logps/rejected": -46.63688278198242, + "loss": 0.6891, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.005250618327409029, + "rewards/margins": 0.008264884352684021, + "rewards/rejected": -0.0030142655596137047, "step": 2720 }, { "epoch": 1.9668587896253602, - "grad_norm": 3.2000555992126465, + "grad_norm": 3.1982884407043457, "learning_rate": 4.1823408649391265e-11, - "logits/chosen": -1.4542639255523682, - "logits/rejected": -1.4478908777236938, - "logps/chosen": -45.98225784301758, - "logps/rejected": -47.78563690185547, - "loss": 0.6901, - "rewards/accuracies": 0.65625, - "rewards/chosen": 0.004633779171854258, - "rewards/margins": 0.0061810980550944805, - "rewards/rejected": -0.0015473200473934412, + "logits/chosen": -1.4544475078582764, + "logits/rejected": -1.4480139017105103, + "logps/chosen": -45.997161865234375, + "logps/rejected": -47.78651809692383, + "loss": 0.6902, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.004484730307012796, + "rewards/margins": 0.0060409121215343475, + "rewards/rejected": -0.0015561815816909075, "step": 2730 }, { "epoch": 1.9740634005763689, - "grad_norm": 3.026240587234497, + "grad_norm": 3.038734197616577, "learning_rate": 2.5618618380812694e-11, - "logits/chosen": -1.5805197954177856, - "logits/rejected": -1.5754799842834473, - "logps/chosen": -41.95084762573242, - "logps/rejected": -45.55485916137695, + "logits/chosen": -1.5807474851608276, + "logits/rejected": -1.5756494998931885, + "logps/chosen": -41.955406188964844, + "logps/rejected": -45.568946838378906, "loss": 0.6894, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.005279941018670797, - "rewards/margins": 0.007611162960529327, - "rewards/rejected": -0.00233122194185853, + "rewards/accuracies": 0.606249988079071, + "rewards/chosen": 0.005234319716691971, + "rewards/margins": 0.007706484291702509, + "rewards/rejected": -0.0024721643421798944, "step": 2740 }, { "epoch": 1.9812680115273775, - "grad_norm": 2.677151918411255, + "grad_norm": 2.673811912536621, "learning_rate": 1.3363889932338501e-11, - "logits/chosen": -1.477747917175293, - "logits/rejected": -1.4804916381835938, - "logps/chosen": -47.20847702026367, - "logps/rejected": -52.0623893737793, - "loss": 0.6896, - "rewards/accuracies": 0.6812499761581421, - "rewards/chosen": 0.005432409234344959, - "rewards/margins": 0.007217896170914173, - "rewards/rejected": -0.0017854865873232484, + "logits/chosen": -1.4779480695724487, + "logits/rejected": -1.480553150177002, + "logps/chosen": -47.22270202636719, + "logps/rejected": -52.02531814575195, + "loss": 0.6898, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0052901157177984715, + "rewards/margins": 0.006704888306558132, + "rewards/rejected": -0.001414772355929017, "step": 2750 }, { "epoch": 1.9884726224783862, - "grad_norm": 2.5347254276275635, + "grad_norm": 2.533945083618164, "learning_rate": 5.061161567596061e-12, - "logits/chosen": -1.5783098936080933, - "logits/rejected": -1.5731806755065918, - "logps/chosen": -47.280582427978516, - "logps/rejected": -50.71805953979492, + "logits/chosen": -1.5787914991378784, + "logits/rejected": -1.573646903038025, + "logps/chosen": -47.29462432861328, + "logps/rejected": -50.718727111816406, "loss": 0.6892, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.006104443222284317, - "rewards/margins": 0.0080792885273695, - "rewards/rejected": -0.001974845305085182, + "rewards/accuracies": 0.6187499761581421, + "rewards/chosen": 0.005964122246950865, + "rewards/margins": 0.007945626974105835, + "rewards/rejected": -0.001981505658477545, "step": 2760 }, { "epoch": 1.9956772334293948, - "grad_norm": 2.5474231243133545, + "grad_norm": 2.536482334136963, "learning_rate": 7.11746483889053e-13, - "logits/chosen": -1.499261736869812, - "logits/rejected": -1.4874109029769897, - "logps/chosen": -48.48490524291992, - "logps/rejected": -50.36320877075195, + "logits/chosen": -1.4995654821395874, + "logits/rejected": -1.4878028631210327, + "logps/chosen": -48.493412017822266, + "logps/rejected": -50.375938415527344, "loss": 0.6892, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.005374493543058634, - "rewards/margins": 0.007963933981955051, - "rewards/rejected": -0.0025894399732351303, + "rewards/accuracies": 0.643750011920929, + "rewards/chosen": 0.005289463326334953, + "rewards/margins": 0.00800617691129446, + "rewards/rejected": -0.0027167133521288633, "step": 2770 }, { "epoch": 2.0, "step": 2776, "total_flos": 0.0, - "train_loss": 0.6910306938443472, - "train_runtime": 4984.329, - "train_samples_per_second": 8.909, - "train_steps_per_second": 0.557 + "train_loss": 0.691034586010474, + "train_runtime": 5011.5072, + "train_samples_per_second": 8.86, + "train_steps_per_second": 0.554 } ], "logging_steps": 10,