{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02092050209205021, "grad_norm": 3.4774240176403444, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.6303696632385254, "logits/rejected": -2.576477289199829, "logps/chosen": -288.6245422363281, "logps/rejected": -275.9084167480469, "loss": 0.0582, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0006722577381879091, "rewards/margins": 0.001247679116204381, "rewards/rejected": -0.0005754214362241328, "step": 10 }, { "epoch": 0.04184100418410042, "grad_norm": 3.1260461200672642, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.6448445320129395, "logits/rejected": -2.6134140491485596, "logps/chosen": -293.53070068359375, "logps/rejected": -259.20135498046875, "loss": 0.0585, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0029079406522214413, "rewards/margins": 0.001445380854420364, "rewards/rejected": 0.0014625597978010774, "step": 20 }, { "epoch": 0.06276150627615062, "grad_norm": 2.8396874218169663, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.666360855102539, "logits/rejected": -2.589587688446045, "logps/chosen": -294.760986328125, "logps/rejected": -287.2416076660156, "loss": 0.0569, "rewards/accuracies": 0.59375, "rewards/chosen": 0.01464819349348545, "rewards/margins": 0.007496826350688934, "rewards/rejected": 0.007151367608457804, "step": 30 }, { "epoch": 0.08368200836820083, "grad_norm": 2.618292822796603, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.6354165077209473, "logits/rejected": -2.552605152130127, "logps/chosen": -270.3968200683594, "logps/rejected": -240.0804443359375, "loss": 0.0544, "rewards/accuracies": 0.625, "rewards/chosen": 0.03499267250299454, "rewards/margins": 0.025343021377921104, "rewards/rejected": 0.009649652987718582, "step": 40 }, { "epoch": 0.10460251046025104, "grad_norm": 3.012648304181472, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.5784354209899902, "logits/rejected": -2.5706632137298584, "logps/chosen": -262.97918701171875, "logps/rejected": -244.66690063476562, "loss": 0.0497, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0244884192943573, "rewards/margins": 0.06963478773832321, "rewards/rejected": -0.04514636844396591, "step": 50 }, { "epoch": 0.10460251046025104, "eval_logits/chosen": -2.6117944717407227, "eval_logits/rejected": -2.5747056007385254, "eval_logps/chosen": -259.21795654296875, "eval_logps/rejected": -269.0505065917969, "eval_loss": 0.047118376940488815, "eval_rewards/accuracies": 0.6953125, "eval_rewards/chosen": 0.034119848161935806, "eval_rewards/margins": 0.09800390154123306, "eval_rewards/rejected": -0.06388404965400696, "eval_runtime": 104.2768, "eval_samples_per_second": 19.18, "eval_steps_per_second": 0.307, "step": 50 }, { "epoch": 0.12552301255230125, "grad_norm": 3.174321476078958, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.537804126739502, "logits/rejected": -2.49711012840271, "logps/chosen": -261.7084045410156, "logps/rejected": -251.53125, "loss": 0.0464, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0611313059926033, "rewards/margins": 0.11455783993005753, "rewards/rejected": -0.053426533937454224, "step": 60 }, { "epoch": 0.14644351464435146, "grad_norm": 3.5406121572748104, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.5840494632720947, "logits/rejected": -2.512150764465332, "logps/chosen": -288.1974792480469, "logps/rejected": -287.8002014160156, "loss": 0.0445, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.03117268718779087, "rewards/margins": 0.14650143682956696, "rewards/rejected": -0.17767412960529327, "step": 70 }, { "epoch": 0.16736401673640167, "grad_norm": 3.2391048621845417, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.4763026237487793, "logits/rejected": -2.464881658554077, "logps/chosen": -260.2668762207031, "logps/rejected": -276.872314453125, "loss": 0.0414, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.011183048598468304, "rewards/margins": 0.13343258202075958, "rewards/rejected": -0.14461562037467957, "step": 80 }, { "epoch": 0.18828451882845187, "grad_norm": 3.794320711197701, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.135620355606079, "logits/rejected": -2.0978758335113525, "logps/chosen": -281.79473876953125, "logps/rejected": -288.814697265625, "loss": 0.041, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.10697118937969208, "rewards/margins": 0.17307965457439423, "rewards/rejected": -0.2800508439540863, "step": 90 }, { "epoch": 0.20920502092050208, "grad_norm": 4.607538199967544, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.0615501403808594, "logits/rejected": -1.8890082836151123, "logps/chosen": -300.027587890625, "logps/rejected": -253.22848510742188, "loss": 0.0399, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.06046273559331894, "rewards/margins": 0.20909741520881653, "rewards/rejected": -0.26956015825271606, "step": 100 }, { "epoch": 0.20920502092050208, "eval_logits/chosen": -2.279555320739746, "eval_logits/rejected": -2.226322650909424, "eval_logps/chosen": -269.3652648925781, "eval_logps/rejected": -293.0491638183594, "eval_loss": 0.03997171297669411, "eval_rewards/accuracies": 0.765625, "eval_rewards/chosen": -0.06735337525606155, "eval_rewards/margins": 0.23651722073554993, "eval_rewards/rejected": -0.30387061834335327, "eval_runtime": 102.9716, "eval_samples_per_second": 19.423, "eval_steps_per_second": 0.311, "step": 100 }, { "epoch": 0.2301255230125523, "grad_norm": 3.018182648285734, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.2557241916656494, "logits/rejected": -2.202664375305176, "logps/chosen": -309.6082458496094, "logps/rejected": -311.09197998046875, "loss": 0.0395, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.14871956408023834, "rewards/margins": 0.2059335708618164, "rewards/rejected": -0.35465317964553833, "step": 110 }, { "epoch": 0.2510460251046025, "grad_norm": 3.013542760925059, "learning_rate": 4.662012913161997e-07, "logits/chosen": -2.4675145149230957, "logits/rejected": -2.377410411834717, "logps/chosen": -294.02093505859375, "logps/rejected": -280.5357971191406, "loss": 0.0383, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.17444203794002533, "rewards/margins": 0.2464580088853836, "rewards/rejected": -0.42090004682540894, "step": 120 }, { "epoch": 0.2719665271966527, "grad_norm": 3.0191370619156745, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -2.4819345474243164, "logits/rejected": -2.4598195552825928, "logps/chosen": -303.03009033203125, "logps/rejected": -293.29705810546875, "loss": 0.037, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.05563073605298996, "rewards/margins": 0.2324099987745285, "rewards/rejected": -0.28804072737693787, "step": 130 }, { "epoch": 0.2928870292887029, "grad_norm": 3.626977578421201, "learning_rate": 4.456204510851956e-07, "logits/chosen": -2.478722095489502, "logits/rejected": -2.3737692832946777, "logps/chosen": -311.2557067871094, "logps/rejected": -308.92388916015625, "loss": 0.0365, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.10953263938426971, "rewards/margins": 0.28694862127304077, "rewards/rejected": -0.3964812457561493, "step": 140 }, { "epoch": 0.3138075313807531, "grad_norm": 4.252526149590865, "learning_rate": 4.337355301007335e-07, "logits/chosen": -2.488487482070923, "logits/rejected": -2.339409351348877, "logps/chosen": -307.0581970214844, "logps/rejected": -306.7461853027344, "loss": 0.0384, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.16012495756149292, "rewards/margins": 0.21382689476013184, "rewards/rejected": -0.37395185232162476, "step": 150 }, { "epoch": 0.3138075313807531, "eval_logits/chosen": -2.501673460006714, "eval_logits/rejected": -2.4575157165527344, "eval_logps/chosen": -277.8395690917969, "eval_logps/rejected": -303.1760559082031, "eval_loss": 0.03684209659695625, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -0.1520964801311493, "eval_rewards/margins": 0.2530430555343628, "eval_rewards/rejected": -0.4051395058631897, "eval_runtime": 102.7099, "eval_samples_per_second": 19.472, "eval_steps_per_second": 0.312, "step": 150 }, { "epoch": 0.33472803347280333, "grad_norm": 3.258158347527928, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -2.4619104862213135, "logits/rejected": -2.3939900398254395, "logps/chosen": -278.5140686035156, "logps/rejected": -272.5802307128906, "loss": 0.0367, "rewards/accuracies": 0.75, "rewards/chosen": -0.07991068065166473, "rewards/margins": 0.2492825984954834, "rewards/rejected": -0.3291932940483093, "step": 160 }, { "epoch": 0.35564853556485354, "grad_norm": 5.659535165307617, "learning_rate": 4.070934040463998e-07, "logits/chosen": -2.3733975887298584, "logits/rejected": -2.274611711502075, "logps/chosen": -280.55621337890625, "logps/rejected": -291.4908142089844, "loss": 0.036, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10827045142650604, "rewards/margins": 0.2207660973072052, "rewards/rejected": -0.32903656363487244, "step": 170 }, { "epoch": 0.37656903765690375, "grad_norm": 3.2906223684736924, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -2.429305076599121, "logits/rejected": -2.3288915157318115, "logps/chosen": -327.5008239746094, "logps/rejected": -317.72308349609375, "loss": 0.0369, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.23194444179534912, "rewards/margins": 0.2526394724845886, "rewards/rejected": -0.48458394408226013, "step": 180 }, { "epoch": 0.39748953974895396, "grad_norm": 3.8317278665105072, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -2.531461000442505, "logits/rejected": -2.459372043609619, "logps/chosen": -271.7269287109375, "logps/rejected": -283.0235595703125, "loss": 0.0349, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.1794736236333847, "rewards/margins": 0.21512684226036072, "rewards/rejected": -0.39460045099258423, "step": 190 }, { "epoch": 0.41841004184100417, "grad_norm": 3.3372690013710624, "learning_rate": 3.610497133404795e-07, "logits/chosen": -2.586259365081787, "logits/rejected": -2.531468391418457, "logps/chosen": -277.77606201171875, "logps/rejected": -270.3326721191406, "loss": 0.0354, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.17447002232074738, "rewards/margins": 0.22541049122810364, "rewards/rejected": -0.39988046884536743, "step": 200 }, { "epoch": 0.41841004184100417, "eval_logits/chosen": -2.6785476207733154, "eval_logits/rejected": -2.6354880332946777, "eval_logps/chosen": -278.7134094238281, "eval_logps/rejected": -306.79486083984375, "eval_loss": 0.036840494722127914, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -0.16083484888076782, "eval_rewards/margins": 0.28049272298812866, "eval_rewards/rejected": -0.4413275718688965, "eval_runtime": 102.7138, "eval_samples_per_second": 19.472, "eval_steps_per_second": 0.312, "step": 200 }, { "epoch": 0.4393305439330544, "grad_norm": 3.528281930925339, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -2.6202356815338135, "logits/rejected": -2.496006727218628, "logps/chosen": -295.03680419921875, "logps/rejected": -302.34271240234375, "loss": 0.0372, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.23145723342895508, "rewards/margins": 0.2113940268754959, "rewards/rejected": -0.44285130500793457, "step": 210 }, { "epoch": 0.4602510460251046, "grad_norm": 3.160346667914988, "learning_rate": 3.272542485937368e-07, "logits/chosen": -2.634601593017578, "logits/rejected": -2.576793670654297, "logps/chosen": -294.7489318847656, "logps/rejected": -305.5950012207031, "loss": 0.0366, "rewards/accuracies": 0.75, "rewards/chosen": -0.10495420545339584, "rewards/margins": 0.23910054564476013, "rewards/rejected": -0.3440547585487366, "step": 220 }, { "epoch": 0.4811715481171548, "grad_norm": 3.1995588594588478, "learning_rate": 3.096924887558854e-07, "logits/chosen": -2.62168288230896, "logits/rejected": -2.5223376750946045, "logps/chosen": -295.1716003417969, "logps/rejected": -300.92059326171875, "loss": 0.0332, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.1310446560382843, "rewards/margins": 0.28820374608039856, "rewards/rejected": -0.41924840211868286, "step": 230 }, { "epoch": 0.502092050209205, "grad_norm": 4.3013803617394055, "learning_rate": 2.9181224366319943e-07, "logits/chosen": -2.3905320167541504, "logits/rejected": -2.253870725631714, "logps/chosen": -276.6922912597656, "logps/rejected": -267.25909423828125, "loss": 0.0359, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.22041603922843933, "rewards/margins": 0.2721412181854248, "rewards/rejected": -0.49255722761154175, "step": 240 }, { "epoch": 0.5230125523012552, "grad_norm": 2.9945903323401217, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -2.575193166732788, "logits/rejected": -2.45214581489563, "logps/chosen": -307.0121154785156, "logps/rejected": -283.763916015625, "loss": 0.035, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.1271018236875534, "rewards/margins": 0.24608302116394043, "rewards/rejected": -0.37318480014801025, "step": 250 }, { "epoch": 0.5230125523012552, "eval_logits/chosen": -2.5931034088134766, "eval_logits/rejected": -2.536372184753418, "eval_logps/chosen": -265.3905029296875, "eval_logps/rejected": -292.68170166015625, "eval_loss": 0.03588278219103813, "eval_rewards/accuracies": 0.78125, "eval_rewards/chosen": -0.027605721727013588, "eval_rewards/margins": 0.27259036898612976, "eval_rewards/rejected": -0.3001960813999176, "eval_runtime": 105.5375, "eval_samples_per_second": 18.951, "eval_steps_per_second": 0.303, "step": 250 }, { "epoch": 0.5439330543933054, "grad_norm": 3.8379218102661117, "learning_rate": 2.55479083351317e-07, "logits/chosen": -2.571600914001465, "logits/rejected": -2.4651172161102295, "logps/chosen": -318.83074951171875, "logps/rejected": -297.157470703125, "loss": 0.0347, "rewards/accuracies": 0.78125, "rewards/chosen": -0.043129194527864456, "rewards/margins": 0.2787154018878937, "rewards/rejected": -0.32184460759162903, "step": 260 }, { "epoch": 0.5648535564853556, "grad_norm": 2.652257544306236, "learning_rate": 2.3722002126275822e-07, "logits/chosen": -2.4981651306152344, "logits/rejected": -2.3565025329589844, "logps/chosen": -312.01361083984375, "logps/rejected": -299.30072021484375, "loss": 0.0342, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -0.18602418899536133, "rewards/margins": 0.28917396068573, "rewards/rejected": -0.4751981794834137, "step": 270 }, { "epoch": 0.5857740585774058, "grad_norm": 3.8427126345253204, "learning_rate": 2.19029145890313e-07, "logits/chosen": -2.4173197746276855, "logits/rejected": -2.354979991912842, "logps/chosen": -276.7501525878906, "logps/rejected": -297.5990295410156, "loss": 0.035, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.19395743310451508, "rewards/margins": 0.2593483030796051, "rewards/rejected": -0.45330578088760376, "step": 280 }, { "epoch": 0.606694560669456, "grad_norm": 3.3214057827825507, "learning_rate": 2.0100351342479216e-07, "logits/chosen": -2.464021921157837, "logits/rejected": -2.39694881439209, "logps/chosen": -300.59075927734375, "logps/rejected": -315.4048156738281, "loss": 0.0356, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09249688684940338, "rewards/margins": 0.27210110425949097, "rewards/rejected": -0.36459797620773315, "step": 290 }, { "epoch": 0.6276150627615062, "grad_norm": 3.077238623321376, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -2.3226752281188965, "logits/rejected": -2.219759225845337, "logps/chosen": -283.2825622558594, "logps/rejected": -318.3642578125, "loss": 0.0336, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.13956645131111145, "rewards/margins": 0.2606472074985504, "rewards/rejected": -0.4002136290073395, "step": 300 }, { "epoch": 0.6276150627615062, "eval_logits/chosen": -2.4060091972351074, "eval_logits/rejected": -2.3179163932800293, "eval_logps/chosen": -278.71954345703125, "eval_logps/rejected": -307.55657958984375, "eval_loss": 0.0350707545876503, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -0.16089613735675812, "eval_rewards/margins": 0.28804877400398254, "eval_rewards/rejected": -0.44894489645957947, "eval_runtime": 102.9876, "eval_samples_per_second": 19.42, "eval_steps_per_second": 0.311, "step": 300 }, { "epoch": 0.6485355648535565, "grad_norm": 3.9837593350930987, "learning_rate": 1.6583128063291573e-07, "logits/chosen": -2.4029016494750977, "logits/rejected": -2.2795987129211426, "logps/chosen": -311.8950500488281, "logps/rejected": -322.6264953613281, "loss": 0.0338, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.17933650314807892, "rewards/margins": 0.2761048972606659, "rewards/rejected": -0.455441415309906, "step": 310 }, { "epoch": 0.6694560669456067, "grad_norm": 3.564824868243538, "learning_rate": 1.488723393865766e-07, "logits/chosen": -2.4168436527252197, "logits/rejected": -2.22420072555542, "logps/chosen": -310.73724365234375, "logps/rejected": -295.3639831542969, "loss": 0.0331, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.18507453799247742, "rewards/margins": 0.24890851974487305, "rewards/rejected": -0.43398308753967285, "step": 320 }, { "epoch": 0.6903765690376569, "grad_norm": 3.612448436540657, "learning_rate": 1.3245295796480788e-07, "logits/chosen": -2.4088501930236816, "logits/rejected": -2.2718417644500732, "logps/chosen": -300.2972717285156, "logps/rejected": -283.8664245605469, "loss": 0.0331, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.1666984260082245, "rewards/margins": 0.2534824311733246, "rewards/rejected": -0.42018088698387146, "step": 330 }, { "epoch": 0.7112970711297071, "grad_norm": 3.369937066189534, "learning_rate": 1.1666074087171627e-07, "logits/chosen": -2.3149194717407227, "logits/rejected": -2.21246337890625, "logps/chosen": -259.3164978027344, "logps/rejected": -299.25360107421875, "loss": 0.0344, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.1287737488746643, "rewards/margins": 0.26817721128463745, "rewards/rejected": -0.39695096015930176, "step": 340 }, { "epoch": 0.7322175732217573, "grad_norm": 3.0588084178439083, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -2.3560991287231445, "logits/rejected": -2.2018752098083496, "logps/chosen": -296.5240478515625, "logps/rejected": -308.5161437988281, "loss": 0.0338, "rewards/accuracies": 0.75, "rewards/chosen": -0.14297232031822205, "rewards/margins": 0.2560271918773651, "rewards/rejected": -0.39899951219558716, "step": 350 }, { "epoch": 0.7322175732217573, "eval_logits/chosen": -2.432863712310791, "eval_logits/rejected": -2.3603291511535645, "eval_logps/chosen": -274.0787353515625, "eval_logps/rejected": -302.0604248046875, "eval_loss": 0.03481597825884819, "eval_rewards/accuracies": 0.76953125, "eval_rewards/chosen": -0.11448819935321808, "eval_rewards/margins": 0.27949514985084534, "eval_rewards/rejected": -0.3939833641052246, "eval_runtime": 104.1739, "eval_samples_per_second": 19.199, "eval_steps_per_second": 0.307, "step": 350 }, { "epoch": 0.7531380753138075, "grad_norm": 3.2773134868984153, "learning_rate": 8.729103716819111e-08, "logits/chosen": -2.4333927631378174, "logits/rejected": -2.364302158355713, "logps/chosen": -261.3931579589844, "logps/rejected": -285.6098937988281, "loss": 0.0353, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.14534708857536316, "rewards/margins": 0.2542579770088196, "rewards/rejected": -0.39960503578186035, "step": 360 }, { "epoch": 0.7740585774058577, "grad_norm": 4.682836628884491, "learning_rate": 7.387025063449081e-08, "logits/chosen": -2.5038342475891113, "logits/rejected": -2.4199085235595703, "logps/chosen": -296.49896240234375, "logps/rejected": -309.2629699707031, "loss": 0.0336, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0865456834435463, "rewards/margins": 0.28183671832084656, "rewards/rejected": -0.36838242411613464, "step": 370 }, { "epoch": 0.7949790794979079, "grad_norm": 3.127130773812614, "learning_rate": 6.138919252022435e-08, "logits/chosen": -2.5170111656188965, "logits/rejected": -2.4522595405578613, "logps/chosen": -325.26849365234375, "logps/rejected": -310.82513427734375, "loss": 0.0341, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.08804645389318466, "rewards/margins": 0.19630609452724457, "rewards/rejected": -0.28435254096984863, "step": 380 }, { "epoch": 0.8158995815899581, "grad_norm": 3.5675081888384224, "learning_rate": 4.991445467064689e-08, "logits/chosen": -2.4555935859680176, "logits/rejected": -2.34989595413208, "logps/chosen": -316.98394775390625, "logps/rejected": -330.12835693359375, "loss": 0.0353, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.10423789918422699, "rewards/margins": 0.29529544711112976, "rewards/rejected": -0.39953336119651794, "step": 390 }, { "epoch": 0.8368200836820083, "grad_norm": 3.22998220338436, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -2.4702091217041016, "logits/rejected": -2.4040184020996094, "logps/chosen": -308.82708740234375, "logps/rejected": -316.28070068359375, "loss": 0.0352, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.14892233908176422, "rewards/margins": 0.22808849811553955, "rewards/rejected": -0.37701085209846497, "step": 400 }, { "epoch": 0.8368200836820083, "eval_logits/chosen": -2.511061429977417, "eval_logits/rejected": -2.437195062637329, "eval_logps/chosen": -275.127685546875, "eval_logps/rejected": -303.78619384765625, "eval_loss": 0.034544143825769424, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -0.1249774694442749, "eval_rewards/margins": 0.28626346588134766, "eval_rewards/rejected": -0.41124093532562256, "eval_runtime": 104.7045, "eval_samples_per_second": 19.101, "eval_steps_per_second": 0.306, "step": 400 }, { "epoch": 0.8577405857740585, "grad_norm": 3.3828994704557025, "learning_rate": 3.022313472693447e-08, "logits/chosen": -2.5727744102478027, "logits/rejected": -2.472245931625366, "logps/chosen": -329.69915771484375, "logps/rejected": -336.42913818359375, "loss": 0.0333, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1269669234752655, "rewards/margins": 0.2546684741973877, "rewards/rejected": -0.3816354274749756, "step": 410 }, { "epoch": 0.8786610878661087, "grad_norm": 3.1926804873738943, "learning_rate": 2.2111614344599684e-08, "logits/chosen": -2.4479329586029053, "logits/rejected": -2.3029239177703857, "logps/chosen": -310.84954833984375, "logps/rejected": -311.79693603515625, "loss": 0.0314, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.14661455154418945, "rewards/margins": 0.2614172101020813, "rewards/rejected": -0.40803179144859314, "step": 420 }, { "epoch": 0.899581589958159, "grad_norm": 2.8296496249212364, "learning_rate": 1.521597710086439e-08, "logits/chosen": -2.5343031883239746, "logits/rejected": -2.38383412361145, "logps/chosen": -310.29693603515625, "logps/rejected": -298.6971130371094, "loss": 0.0342, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.14315804839134216, "rewards/margins": 0.250461608171463, "rewards/rejected": -0.39361968636512756, "step": 430 }, { "epoch": 0.9205020920502092, "grad_norm": 2.966812625966253, "learning_rate": 9.57301420397924e-09, "logits/chosen": -2.479175329208374, "logits/rejected": -2.299555778503418, "logps/chosen": -298.52862548828125, "logps/rejected": -290.23345947265625, "loss": 0.0336, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16254766285419464, "rewards/margins": 0.2794143557548523, "rewards/rejected": -0.4419620633125305, "step": 440 }, { "epoch": 0.9414225941422594, "grad_norm": 3.0686097215961046, "learning_rate": 5.212833302556258e-09, "logits/chosen": -2.508073091506958, "logits/rejected": -2.4404401779174805, "logps/chosen": -299.69134521484375, "logps/rejected": -320.4561767578125, "loss": 0.0342, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.14945265650749207, "rewards/margins": 0.2798132002353668, "rewards/rejected": -0.4292658269405365, "step": 450 }, { "epoch": 0.9414225941422594, "eval_logits/chosen": -2.500336170196533, "eval_logits/rejected": -2.419475793838501, "eval_logps/chosen": -275.03082275390625, "eval_logps/rejected": -304.2409362792969, "eval_loss": 0.03452227637171745, "eval_rewards/accuracies": 0.7734375, "eval_rewards/chosen": -0.12400892376899719, "eval_rewards/margins": 0.29177966713905334, "eval_rewards/rejected": -0.41578859090805054, "eval_runtime": 102.9541, "eval_samples_per_second": 19.426, "eval_steps_per_second": 0.311, "step": 450 }, { "epoch": 0.9623430962343096, "grad_norm": 3.2166545676628093, "learning_rate": 2.158697848236607e-09, "logits/chosen": -2.4364752769470215, "logits/rejected": -2.3313584327697754, "logps/chosen": -281.92254638671875, "logps/rejected": -301.19110107421875, "loss": 0.0341, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.18644972145557404, "rewards/margins": 0.2685782313346863, "rewards/rejected": -0.4550279676914215, "step": 460 }, { "epoch": 0.9832635983263598, "grad_norm": 2.5305189796072156, "learning_rate": 4.269029751107489e-10, "logits/chosen": -2.399737596511841, "logits/rejected": -2.354271650314331, "logps/chosen": -275.69482421875, "logps/rejected": -305.2413635253906, "loss": 0.0347, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.12569288909435272, "rewards/margins": 0.26325756311416626, "rewards/rejected": -0.38895049691200256, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.037969033089641745, "train_runtime": 12745.9722, "train_samples_per_second": 4.796, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }