{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.998919113673212, "eval_steps": 100, "global_step": 2774, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 0.30078125, "learning_rate": 1.798561151079137e-08, "logits/chosen": -2.5878467559814453, "logits/rejected": -2.596919059753418, "logps/chosen": -50.55097579956055, "logps/rejected": -53.270023345947266, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 0.33203125, "learning_rate": 1.7985611510791368e-07, "logits/chosen": -2.6599929332733154, "logits/rejected": -2.6492068767547607, "logps/chosen": -58.52377700805664, "logps/rejected": -61.61543273925781, "loss": 0.6931, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": -4.6036697312956676e-05, "rewards/margins": 4.705908213509247e-05, "rewards/rejected": -9.309577581007034e-05, "step": 10 }, { "epoch": 0.01, "grad_norm": 0.357421875, "learning_rate": 3.5971223021582736e-07, "logits/chosen": -2.65588641166687, "logits/rejected": -2.661142110824585, "logps/chosen": -60.95711135864258, "logps/rejected": -63.73247146606445, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.00015254078607540578, "rewards/margins": -0.00013396346184890717, "rewards/rejected": -1.857726601883769e-05, "step": 20 }, { "epoch": 0.02, "grad_norm": 0.435546875, "learning_rate": 5.39568345323741e-07, "logits/chosen": -2.626067876815796, "logits/rejected": -2.6205759048461914, "logps/chosen": -65.40022277832031, "logps/rejected": -68.29045104980469, "loss": 0.6933, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -3.5934197512688115e-05, "rewards/margins": -0.00024712778395041823, "rewards/rejected": 0.0002111935755237937, "step": 30 }, { "epoch": 0.03, "grad_norm": 0.3515625, "learning_rate": 7.194244604316547e-07, "logits/chosen": -2.6541905403137207, "logits/rejected": -2.6613316535949707, "logps/chosen": -58.868675231933594, "logps/rejected": -62.767356872558594, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00014814280439168215, "rewards/margins": 0.00017737274174578488, "rewards/rejected": -2.922994281107094e-05, "step": 40 }, { "epoch": 0.04, "grad_norm": 0.37109375, "learning_rate": 8.992805755395684e-07, "logits/chosen": -2.614741802215576, "logits/rejected": -2.617932081222534, "logps/chosen": -59.7147216796875, "logps/rejected": -61.980995178222656, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.00018150641699321568, "rewards/margins": 0.00013201191904954612, "rewards/rejected": 4.949455615133047e-05, "step": 50 }, { "epoch": 0.04, "grad_norm": 0.50390625, "learning_rate": 1.079136690647482e-06, "logits/chosen": -2.6651856899261475, "logits/rejected": -2.6654398441314697, "logps/chosen": -68.95173645019531, "logps/rejected": -71.27698516845703, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.00047902195365168154, "rewards/margins": 0.0001825519575504586, "rewards/rejected": 0.0002964699815493077, "step": 60 }, { "epoch": 0.05, "grad_norm": 0.291015625, "learning_rate": 1.2589928057553958e-06, "logits/chosen": -2.6852972507476807, "logits/rejected": -2.6725258827209473, "logps/chosen": -68.01790618896484, "logps/rejected": -72.10859680175781, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.00035084557021036744, "rewards/margins": 0.00020036422938574106, "rewards/rejected": 0.00015048135537654161, "step": 70 }, { "epoch": 0.06, "grad_norm": 0.45703125, "learning_rate": 1.4388489208633094e-06, "logits/chosen": -2.6697287559509277, "logits/rejected": -2.668147563934326, "logps/chosen": -70.40176391601562, "logps/rejected": -73.07498931884766, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00043141478090547025, "rewards/margins": 0.00031604542164132, "rewards/rejected": 0.00011536936654010788, "step": 80 }, { "epoch": 0.06, "grad_norm": 0.39453125, "learning_rate": 1.618705035971223e-06, "logits/chosen": -2.670775890350342, "logits/rejected": -2.674410343170166, "logps/chosen": -66.90149688720703, "logps/rejected": -69.80754089355469, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0009530371753498912, "rewards/margins": 0.0005468233721330762, "rewards/rejected": 0.0004062137159053236, "step": 90 }, { "epoch": 0.07, "grad_norm": 0.380859375, "learning_rate": 1.7985611510791368e-06, "logits/chosen": -2.657923460006714, "logits/rejected": -2.658536911010742, "logps/chosen": -62.22175979614258, "logps/rejected": -66.25755310058594, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": 0.0009646881371736526, "rewards/margins": 0.00016965254326350987, "rewards/rejected": 0.0007950355065986514, "step": 100 }, { "epoch": 0.08, "grad_norm": 0.3125, "learning_rate": 1.9784172661870504e-06, "logits/chosen": -2.6612608432769775, "logits/rejected": -2.6600637435913086, "logps/chosen": -66.11808013916016, "logps/rejected": -69.09329986572266, "loss": 0.693, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0011165592586621642, "rewards/margins": 0.00037375936517491937, "rewards/rejected": 0.0007428000681102276, "step": 110 }, { "epoch": 0.09, "grad_norm": 0.322265625, "learning_rate": 2.158273381294964e-06, "logits/chosen": -2.6269524097442627, "logits/rejected": -2.627486228942871, "logps/chosen": -61.392478942871094, "logps/rejected": -64.30213165283203, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.001123746857047081, "rewards/margins": 0.0006171964341774583, "rewards/rejected": 0.0005065504228696227, "step": 120 }, { "epoch": 0.09, "grad_norm": 0.359375, "learning_rate": 2.3381294964028776e-06, "logits/chosen": -2.7004921436309814, "logits/rejected": -2.7044999599456787, "logps/chosen": -68.17378234863281, "logps/rejected": -70.49411010742188, "loss": 0.6929, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0015111321117728949, "rewards/margins": 0.0004890409181825817, "rewards/rejected": 0.0010220912517979741, "step": 130 }, { "epoch": 0.1, "grad_norm": 0.37890625, "learning_rate": 2.5179856115107916e-06, "logits/chosen": -2.646902084350586, "logits/rejected": -2.6411759853363037, "logps/chosen": -63.913551330566406, "logps/rejected": -68.77268981933594, "loss": 0.6925, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002318110316991806, "rewards/margins": 0.0013834238052368164, "rewards/rejected": 0.0009346865117549896, "step": 140 }, { "epoch": 0.11, "grad_norm": 0.453125, "learning_rate": 2.6978417266187052e-06, "logits/chosen": -2.6741390228271484, "logits/rejected": -2.6727261543273926, "logps/chosen": -65.45186614990234, "logps/rejected": -69.5618667602539, "loss": 0.6922, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0026420415379107, "rewards/margins": 0.001897258684039116, "rewards/rejected": 0.0007447830284945667, "step": 150 }, { "epoch": 0.12, "grad_norm": 0.314453125, "learning_rate": 2.877697841726619e-06, "logits/chosen": -2.645352602005005, "logits/rejected": -2.6511170864105225, "logps/chosen": -57.53055953979492, "logps/rejected": -61.90361404418945, "loss": 0.692, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.003107175463810563, "rewards/margins": 0.0023571993224322796, "rewards/rejected": 0.0007499762577936053, "step": 160 }, { "epoch": 0.12, "grad_norm": 0.42578125, "learning_rate": 3.0575539568345324e-06, "logits/chosen": -2.650054454803467, "logits/rejected": -2.6488354206085205, "logps/chosen": -61.54291534423828, "logps/rejected": -64.30693054199219, "loss": 0.6917, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0038940298836678267, "rewards/margins": 0.0028370567597448826, "rewards/rejected": 0.001056973123922944, "step": 170 }, { "epoch": 0.13, "grad_norm": 0.33203125, "learning_rate": 3.237410071942446e-06, "logits/chosen": -2.645240306854248, "logits/rejected": -2.642111301422119, "logps/chosen": -61.097076416015625, "logps/rejected": -64.71502685546875, "loss": 0.6909, "rewards/accuracies": 0.625, "rewards/chosen": 0.004625825677067041, "rewards/margins": 0.004492693580687046, "rewards/rejected": 0.00013313218369148672, "step": 180 }, { "epoch": 0.14, "grad_norm": 0.3515625, "learning_rate": 3.4172661870503596e-06, "logits/chosen": -2.7127881050109863, "logits/rejected": -2.710603713989258, "logps/chosen": -57.48582077026367, "logps/rejected": -62.37571334838867, "loss": 0.6913, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004764446523040533, "rewards/margins": 0.003814270021393895, "rewards/rejected": 0.0009501769091002643, "step": 190 }, { "epoch": 0.14, "grad_norm": 0.515625, "learning_rate": 3.5971223021582737e-06, "logits/chosen": -2.6896915435791016, "logits/rejected": -2.684767246246338, "logps/chosen": -59.758766174316406, "logps/rejected": -64.19367980957031, "loss": 0.6917, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0045052021741867065, "rewards/margins": 0.003016799222677946, "rewards/rejected": 0.0014884021366015077, "step": 200 }, { "epoch": 0.15, "grad_norm": 0.392578125, "learning_rate": 3.7769784172661873e-06, "logits/chosen": -2.6605515480041504, "logits/rejected": -2.6634459495544434, "logps/chosen": -58.80467987060547, "logps/rejected": -60.49141311645508, "loss": 0.691, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0028493874706327915, "rewards/margins": 0.004452340304851532, "rewards/rejected": -0.0016029527178034186, "step": 210 }, { "epoch": 0.16, "grad_norm": 0.41796875, "learning_rate": 3.956834532374101e-06, "logits/chosen": -2.6214749813079834, "logits/rejected": -2.6205554008483887, "logps/chosen": -63.977142333984375, "logps/rejected": -71.72235870361328, "loss": 0.6886, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0048708124086260796, "rewards/margins": 0.009143907576799393, "rewards/rejected": -0.0042730942368507385, "step": 220 }, { "epoch": 0.17, "grad_norm": 0.390625, "learning_rate": 4.1366906474820145e-06, "logits/chosen": -2.663078784942627, "logits/rejected": -2.667092800140381, "logps/chosen": -61.06050491333008, "logps/rejected": -66.15110778808594, "loss": 0.6897, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.005165424197912216, "rewards/margins": 0.007069968618452549, "rewards/rejected": -0.0019045437220484018, "step": 230 }, { "epoch": 0.17, "grad_norm": 0.453125, "learning_rate": 4.316546762589928e-06, "logits/chosen": -2.675718069076538, "logits/rejected": -2.6735589504241943, "logps/chosen": -65.82478332519531, "logps/rejected": -69.08268737792969, "loss": 0.6891, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.00702512264251709, "rewards/margins": 0.008214818313717842, "rewards/rejected": -0.0011896961368620396, "step": 240 }, { "epoch": 0.18, "grad_norm": 0.58984375, "learning_rate": 4.496402877697842e-06, "logits/chosen": -2.6274218559265137, "logits/rejected": -2.6306469440460205, "logps/chosen": -67.89946746826172, "logps/rejected": -71.547119140625, "loss": 0.6877, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003348552156239748, "rewards/margins": 0.011096605099737644, "rewards/rejected": -0.0077480534091591835, "step": 250 }, { "epoch": 0.19, "grad_norm": 0.462890625, "learning_rate": 4.676258992805755e-06, "logits/chosen": -2.6246440410614014, "logits/rejected": -2.643188238143921, "logps/chosen": -67.15058135986328, "logps/rejected": -71.12448120117188, "loss": 0.6875, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0009993333369493484, "rewards/margins": 0.011576562188565731, "rewards/rejected": -0.012575894594192505, "step": 260 }, { "epoch": 0.19, "grad_norm": 0.51171875, "learning_rate": 4.856115107913669e-06, "logits/chosen": -2.6977336406707764, "logits/rejected": -2.6968212127685547, "logps/chosen": -65.34959411621094, "logps/rejected": -68.08098602294922, "loss": 0.6882, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0017364125233143568, "rewards/margins": 0.010187914595007896, "rewards/rejected": -0.011924326419830322, "step": 270 }, { "epoch": 0.2, "grad_norm": 0.41796875, "learning_rate": 4.999992078993707e-06, "logits/chosen": -2.6335489749908447, "logits/rejected": -2.640903949737549, "logps/chosen": -58.345176696777344, "logps/rejected": -61.308982849121094, "loss": 0.6871, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.000765187491197139, "rewards/margins": 0.012545737437903881, "rewards/rejected": -0.01178054977208376, "step": 280 }, { "epoch": 0.21, "grad_norm": 0.44921875, "learning_rate": 4.999714849043746e-06, "logits/chosen": -2.662158489227295, "logits/rejected": -2.674367904663086, "logps/chosen": -62.21772003173828, "logps/rejected": -65.60545349121094, "loss": 0.6867, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.007329708430916071, "rewards/margins": 0.013346971943974495, "rewards/rejected": -0.020676681771874428, "step": 290 }, { "epoch": 0.22, "grad_norm": 0.46484375, "learning_rate": 4.999041618971537e-06, "logits/chosen": -2.6512532234191895, "logits/rejected": -2.6503214836120605, "logps/chosen": -67.29080963134766, "logps/rejected": -72.53589630126953, "loss": 0.6861, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.010256023146212101, "rewards/margins": 0.014919854700565338, "rewards/rejected": -0.025175878778100014, "step": 300 }, { "epoch": 0.22, "grad_norm": 0.56640625, "learning_rate": 4.997972495428924e-06, "logits/chosen": -2.615621328353882, "logits/rejected": -2.6233325004577637, "logps/chosen": -66.02967071533203, "logps/rejected": -70.49574279785156, "loss": 0.6852, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.007946187630295753, "rewards/margins": 0.016535501927137375, "rewards/rejected": -0.02448168769478798, "step": 310 }, { "epoch": 0.23, "grad_norm": 0.439453125, "learning_rate": 4.996507647784446e-06, "logits/chosen": -2.638176441192627, "logits/rejected": -2.6347122192382812, "logps/chosen": -67.33381652832031, "logps/rejected": -73.75712585449219, "loss": 0.6856, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.01815110817551613, "rewards/margins": 0.016093209385871887, "rewards/rejected": -0.034244317561388016, "step": 320 }, { "epoch": 0.24, "grad_norm": 0.427734375, "learning_rate": 4.994647308096509e-06, "logits/chosen": -2.629110813140869, "logits/rejected": -2.6443512439727783, "logps/chosen": -69.91134643554688, "logps/rejected": -69.85363006591797, "loss": 0.6876, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.014942710287868977, "rewards/margins": 0.012206320650875568, "rewards/rejected": -0.027149027213454247, "step": 330 }, { "epoch": 0.25, "grad_norm": 0.4765625, "learning_rate": 4.9923917710766266e-06, "logits/chosen": -2.6785271167755127, "logits/rejected": -2.6757400035858154, "logps/chosen": -71.02973937988281, "logps/rejected": -75.72981262207031, "loss": 0.6807, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.014097088947892189, "rewards/margins": 0.026118427515029907, "rewards/rejected": -0.04021551460027695, "step": 340 }, { "epoch": 0.25, "grad_norm": 0.66015625, "learning_rate": 4.989741394042728e-06, "logits/chosen": -2.598215103149414, "logits/rejected": -2.5950300693511963, "logps/chosen": -65.64091491699219, "logps/rejected": -70.74314880371094, "loss": 0.6838, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.028698483482003212, "rewards/margins": 0.019990913569927216, "rewards/rejected": -0.04868939518928528, "step": 350 }, { "epoch": 0.26, "grad_norm": 0.5625, "learning_rate": 4.986696596862556e-06, "logits/chosen": -2.625063180923462, "logits/rejected": -2.631725788116455, "logps/chosen": -78.42835998535156, "logps/rejected": -84.2737045288086, "loss": 0.6802, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03507710248231888, "rewards/margins": 0.027645844966173172, "rewards/rejected": -0.06272295117378235, "step": 360 }, { "epoch": 0.27, "grad_norm": 0.76953125, "learning_rate": 4.983257861887148e-06, "logits/chosen": -2.6487419605255127, "logits/rejected": -2.6524715423583984, "logps/chosen": -71.53236389160156, "logps/rejected": -81.15141296386719, "loss": 0.6738, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05367087572813034, "rewards/margins": 0.04164598509669304, "rewards/rejected": -0.09531687200069427, "step": 370 }, { "epoch": 0.27, "grad_norm": 0.56640625, "learning_rate": 4.979425733874431e-06, "logits/chosen": -2.575629472732544, "logits/rejected": -2.5949313640594482, "logps/chosen": -71.41996765136719, "logps/rejected": -75.95075225830078, "loss": 0.6794, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07148171961307526, "rewards/margins": 0.03022712469100952, "rewards/rejected": -0.10170884430408478, "step": 380 }, { "epoch": 0.28, "grad_norm": 0.75390625, "learning_rate": 4.975200819902911e-06, "logits/chosen": -2.608182430267334, "logits/rejected": -2.613959550857544, "logps/chosen": -77.80644226074219, "logps/rejected": -86.1133804321289, "loss": 0.6738, "rewards/accuracies": 0.625, "rewards/chosen": -0.10805950313806534, "rewards/margins": 0.04201812297105789, "rewards/rejected": -0.15007762610912323, "step": 390 }, { "epoch": 0.29, "grad_norm": 0.7109375, "learning_rate": 4.970583789275508e-06, "logits/chosen": -2.565563440322876, "logits/rejected": -2.575218677520752, "logps/chosen": -72.14826965332031, "logps/rejected": -76.73294830322266, "loss": 0.6828, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.09949363768100739, "rewards/margins": 0.02443886548280716, "rewards/rejected": -0.12393250316381454, "step": 400 }, { "epoch": 0.3, "grad_norm": 1.046875, "learning_rate": 4.965575373413527e-06, "logits/chosen": -2.5901551246643066, "logits/rejected": -2.592224359512329, "logps/chosen": -78.75377655029297, "logps/rejected": -87.20631408691406, "loss": 0.6708, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1401161104440689, "rewards/margins": 0.05296441912651062, "rewards/rejected": -0.19308052957057953, "step": 410 }, { "epoch": 0.3, "grad_norm": 0.8359375, "learning_rate": 4.960176365740783e-06, "logits/chosen": -2.568718671798706, "logits/rejected": -2.5703847408294678, "logps/chosen": -82.48625183105469, "logps/rejected": -91.03981018066406, "loss": 0.6763, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1931959092617035, "rewards/margins": 0.04230925068259239, "rewards/rejected": -0.2355051338672638, "step": 420 }, { "epoch": 0.31, "grad_norm": 0.96875, "learning_rate": 4.954387621557911e-06, "logits/chosen": -2.472228527069092, "logits/rejected": -2.4818115234375, "logps/chosen": -83.677978515625, "logps/rejected": -90.09959411621094, "loss": 0.6676, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.20174559950828552, "rewards/margins": 0.060646455734968185, "rewards/rejected": -0.2623920440673828, "step": 430 }, { "epoch": 0.32, "grad_norm": 1.046875, "learning_rate": 4.948210057906871e-06, "logits/chosen": -2.424100637435913, "logits/rejected": -2.4418275356292725, "logps/chosen": -88.38543701171875, "logps/rejected": -100.091552734375, "loss": 0.6649, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.25682324171066284, "rewards/margins": 0.07104425877332687, "rewards/rejected": -0.3278675079345703, "step": 440 }, { "epoch": 0.32, "grad_norm": 0.78515625, "learning_rate": 4.941644653425671e-06, "logits/chosen": -2.452075481414795, "logits/rejected": -2.4671432971954346, "logps/chosen": -100.57665252685547, "logps/rejected": -104.40872955322266, "loss": 0.6732, "rewards/accuracies": 0.53125, "rewards/chosen": -0.2847747802734375, "rewards/margins": 0.05320798233151436, "rewards/rejected": -0.33798274397850037, "step": 450 }, { "epoch": 0.33, "grad_norm": 1.0234375, "learning_rate": 4.9346924481933345e-06, "logits/chosen": -2.459083318710327, "logits/rejected": -2.4748520851135254, "logps/chosen": -96.22517395019531, "logps/rejected": -105.0258560180664, "loss": 0.6636, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.23906302452087402, "rewards/margins": 0.06904648244380951, "rewards/rejected": -0.3081095516681671, "step": 460 }, { "epoch": 0.34, "grad_norm": 1.4140625, "learning_rate": 4.927354543565131e-06, "logits/chosen": -2.404327630996704, "logits/rejected": -2.4193339347839355, "logps/chosen": -101.07810974121094, "logps/rejected": -109.60540771484375, "loss": 0.6601, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.2962692379951477, "rewards/margins": 0.07886885851621628, "rewards/rejected": -0.3751381039619446, "step": 470 }, { "epoch": 0.35, "grad_norm": 1.15625, "learning_rate": 4.919632101998101e-06, "logits/chosen": -2.4055585861206055, "logits/rejected": -2.4047584533691406, "logps/chosen": -83.26808166503906, "logps/rejected": -96.07670593261719, "loss": 0.6557, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.25196436047554016, "rewards/margins": 0.09068725258111954, "rewards/rejected": -0.3426516056060791, "step": 480 }, { "epoch": 0.35, "grad_norm": 1.2109375, "learning_rate": 4.911526346866907e-06, "logits/chosen": -2.3670365810394287, "logits/rejected": -2.380223512649536, "logps/chosen": -96.45356750488281, "logps/rejected": -111.1182861328125, "loss": 0.6479, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.32315492630004883, "rewards/margins": 0.11133052408695221, "rewards/rejected": -0.43448543548583984, "step": 490 }, { "epoch": 0.36, "grad_norm": 0.953125, "learning_rate": 4.9030385622700225e-06, "logits/chosen": -2.3522255420684814, "logits/rejected": -2.358100414276123, "logps/chosen": -96.55814361572266, "logps/rejected": -112.30567932128906, "loss": 0.6517, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.37474876642227173, "rewards/margins": 0.10861654579639435, "rewards/rejected": -0.4833652973175049, "step": 500 }, { "epoch": 0.37, "grad_norm": 1.125, "learning_rate": 4.89417009282631e-06, "logits/chosen": -2.3778271675109863, "logits/rejected": -2.390409469604492, "logps/chosen": -98.19575500488281, "logps/rejected": -111.83695983886719, "loss": 0.6495, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3482569754123688, "rewards/margins": 0.11011794954538345, "rewards/rejected": -0.45837491750717163, "step": 510 }, { "epoch": 0.37, "grad_norm": 1.0703125, "learning_rate": 4.88492234346201e-06, "logits/chosen": -2.3503499031066895, "logits/rejected": -2.3607373237609863, "logps/chosen": -109.85440826416016, "logps/rejected": -122.345703125, "loss": 0.6576, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.37264811992645264, "rewards/margins": 0.09832003712654114, "rewards/rejected": -0.47096818685531616, "step": 520 }, { "epoch": 0.38, "grad_norm": 1.1328125, "learning_rate": 4.8752967791881735e-06, "logits/chosen": -2.356555461883545, "logits/rejected": -2.362435817718506, "logps/chosen": -101.36775970458984, "logps/rejected": -111.54450988769531, "loss": 0.6626, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.3653712868690491, "rewards/margins": 0.08574860543012619, "rewards/rejected": -0.45111989974975586, "step": 530 }, { "epoch": 0.39, "grad_norm": 1.5390625, "learning_rate": 4.865294924868578e-06, "logits/chosen": -2.3726258277893066, "logits/rejected": -2.3774704933166504, "logps/chosen": -95.63130950927734, "logps/rejected": -108.5069580078125, "loss": 0.6528, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.3163696825504303, "rewards/margins": 0.10675134509801865, "rewards/rejected": -0.42312103509902954, "step": 540 }, { "epoch": 0.4, "grad_norm": 1.03125, "learning_rate": 4.854918364978163e-06, "logits/chosen": -2.318713665008545, "logits/rejected": -2.3387556076049805, "logps/chosen": -92.5940933227539, "logps/rejected": -102.9334945678711, "loss": 0.6572, "rewards/accuracies": 0.53125, "rewards/chosen": -0.27377182245254517, "rewards/margins": 0.09678633511066437, "rewards/rejected": -0.3705581724643707, "step": 550 }, { "epoch": 0.4, "grad_norm": 1.0625, "learning_rate": 4.844168743352019e-06, "logits/chosen": -2.3034849166870117, "logits/rejected": -2.322415828704834, "logps/chosen": -93.90568542480469, "logps/rejected": -103.6268081665039, "loss": 0.6716, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28542959690093994, "rewards/margins": 0.07337291538715363, "rewards/rejected": -0.35880252718925476, "step": 560 }, { "epoch": 0.41, "grad_norm": 1.21875, "learning_rate": 4.833047762924975e-06, "logits/chosen": -2.3396031856536865, "logits/rejected": -2.3490686416625977, "logps/chosen": -106.96073913574219, "logps/rejected": -120.58781433105469, "loss": 0.6514, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.35568124055862427, "rewards/margins": 0.10962893068790436, "rewards/rejected": -0.4653101861476898, "step": 570 }, { "epoch": 0.42, "grad_norm": 0.78515625, "learning_rate": 4.8215571854618216e-06, "logits/chosen": -2.2915313243865967, "logits/rejected": -2.3102214336395264, "logps/chosen": -95.60445404052734, "logps/rejected": -107.88221740722656, "loss": 0.6512, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3615376055240631, "rewards/margins": 0.10991451889276505, "rewards/rejected": -0.47145208716392517, "step": 580 }, { "epoch": 0.43, "grad_norm": 1.9765625, "learning_rate": 4.809698831278217e-06, "logits/chosen": -2.359297513961792, "logits/rejected": -2.364837884902954, "logps/chosen": -97.43984985351562, "logps/rejected": -118.7339096069336, "loss": 0.6343, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3123398721218109, "rewards/margins": 0.1503853052854538, "rewards/rejected": -0.4627251625061035, "step": 590 }, { "epoch": 0.43, "grad_norm": 1.015625, "learning_rate": 4.797474578952315e-06, "logits/chosen": -2.364551544189453, "logits/rejected": -2.368478536605835, "logps/chosen": -97.71867370605469, "logps/rejected": -116.11451721191406, "loss": 0.6416, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.3463926315307617, "rewards/margins": 0.13709910213947296, "rewards/rejected": -0.4834917485713959, "step": 600 }, { "epoch": 0.44, "grad_norm": 1.171875, "learning_rate": 4.7848863650271645e-06, "logits/chosen": -2.346735954284668, "logits/rejected": -2.349565029144287, "logps/chosen": -99.36726379394531, "logps/rejected": -108.9990463256836, "loss": 0.6641, "rewards/accuracies": 0.5625, "rewards/chosen": -0.31063222885131836, "rewards/margins": 0.08253936469554901, "rewards/rejected": -0.39317160844802856, "step": 610 }, { "epoch": 0.45, "grad_norm": 1.2421875, "learning_rate": 4.771936183703927e-06, "logits/chosen": -2.2801272869110107, "logits/rejected": -2.286823034286499, "logps/chosen": -90.63265228271484, "logps/rejected": -99.37889099121094, "loss": 0.6713, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.30601394176483154, "rewards/margins": 0.0690702348947525, "rewards/rejected": -0.37508416175842285, "step": 620 }, { "epoch": 0.45, "grad_norm": 1.40625, "learning_rate": 4.758626086525956e-06, "logits/chosen": -2.3465566635131836, "logits/rejected": -2.3557307720184326, "logps/chosen": -91.6519546508789, "logps/rejected": -107.96331787109375, "loss": 0.6483, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.32235291600227356, "rewards/margins": 0.11441938579082489, "rewards/rejected": -0.43677228689193726, "step": 630 }, { "epoch": 0.46, "grad_norm": 1.9453125, "learning_rate": 4.7449581820538e-06, "logits/chosen": -2.3313632011413574, "logits/rejected": -2.3418033123016357, "logps/chosen": -95.18330383300781, "logps/rejected": -111.15785217285156, "loss": 0.6478, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.32686877250671387, "rewards/margins": 0.12550675868988037, "rewards/rejected": -0.45237550139427185, "step": 640 }, { "epoch": 0.47, "grad_norm": 1.46875, "learning_rate": 4.730934635531161e-06, "logits/chosen": -2.3043503761291504, "logits/rejected": -2.310375690460205, "logps/chosen": -97.12528228759766, "logps/rejected": -108.67928314208984, "loss": 0.6521, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.34178540110588074, "rewards/margins": 0.11280516535043716, "rewards/rejected": -0.4545906186103821, "step": 650 }, { "epoch": 0.48, "grad_norm": 1.7578125, "learning_rate": 4.716557668541893e-06, "logits/chosen": -2.343346118927002, "logits/rejected": -2.3510937690734863, "logps/chosen": -97.1328125, "logps/rejected": -114.3864974975586, "loss": 0.6387, "rewards/accuracies": 0.625, "rewards/chosen": -0.31538838148117065, "rewards/margins": 0.13619980216026306, "rewards/rejected": -0.4515882134437561, "step": 660 }, { "epoch": 0.48, "grad_norm": 1.328125, "learning_rate": 4.701829558658047e-06, "logits/chosen": -2.3206913471221924, "logits/rejected": -2.3359267711639404, "logps/chosen": -102.4328384399414, "logps/rejected": -114.6532211303711, "loss": 0.6502, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.343585729598999, "rewards/margins": 0.11280594021081924, "rewards/rejected": -0.4563916325569153, "step": 670 }, { "epoch": 0.49, "grad_norm": 1.21875, "learning_rate": 4.686752639079076e-06, "logits/chosen": -2.28320050239563, "logits/rejected": -2.2843213081359863, "logps/chosen": -101.29241180419922, "logps/rejected": -113.52595520019531, "loss": 0.649, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.33359086513519287, "rewards/margins": 0.11714836210012436, "rewards/rejected": -0.45073920488357544, "step": 680 }, { "epoch": 0.5, "grad_norm": 1.7734375, "learning_rate": 4.671329298262208e-06, "logits/chosen": -2.351982593536377, "logits/rejected": -2.357144832611084, "logps/chosen": -102.9426498413086, "logps/rejected": -118.25224304199219, "loss": 0.6478, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.3722483217716217, "rewards/margins": 0.11313033103942871, "rewards/rejected": -0.4853786528110504, "step": 690 }, { "epoch": 0.5, "grad_norm": 1.15625, "learning_rate": 4.655561979544069e-06, "logits/chosen": -2.2974140644073486, "logits/rejected": -2.307819366455078, "logps/chosen": -101.06309509277344, "logps/rejected": -117.390625, "loss": 0.6451, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.3607991635799408, "rewards/margins": 0.1265007108449936, "rewards/rejected": -0.4872998595237732, "step": 700 }, { "epoch": 0.51, "grad_norm": 1.4765625, "learning_rate": 4.639453180753619e-06, "logits/chosen": -2.248704433441162, "logits/rejected": -2.257744789123535, "logps/chosen": -100.77429962158203, "logps/rejected": -117.0492935180664, "loss": 0.647, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.43067851662635803, "rewards/margins": 0.1362551599740982, "rewards/rejected": -0.5669336318969727, "step": 710 }, { "epoch": 0.52, "grad_norm": 1.2890625, "learning_rate": 4.623005453816447e-06, "logits/chosen": -2.3472437858581543, "logits/rejected": -2.352238655090332, "logps/chosen": -115.71247863769531, "logps/rejected": -131.8263702392578, "loss": 0.647, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.47270336747169495, "rewards/margins": 0.12508396804332733, "rewards/rejected": -0.5977872610092163, "step": 720 }, { "epoch": 0.53, "grad_norm": 1.6328125, "learning_rate": 4.606221404350504e-06, "logits/chosen": -2.28971529006958, "logits/rejected": -2.29419207572937, "logps/chosen": -109.21917724609375, "logps/rejected": -124.91645812988281, "loss": 0.6463, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.4223889708518982, "rewards/margins": 0.13507941365242004, "rewards/rejected": -0.5574684143066406, "step": 730 }, { "epoch": 0.53, "grad_norm": 2.171875, "learning_rate": 4.589103691253317e-06, "logits/chosen": -2.250274658203125, "logits/rejected": -2.2717387676239014, "logps/chosen": -112.26399230957031, "logps/rejected": -119.15122985839844, "loss": 0.6655, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.47238197922706604, "rewards/margins": 0.08317569643259048, "rewards/rejected": -0.5555577278137207, "step": 740 }, { "epoch": 0.54, "grad_norm": 1.921875, "learning_rate": 4.571655026280785e-06, "logits/chosen": -2.2718663215637207, "logits/rejected": -2.284795045852661, "logps/chosen": -112.97920227050781, "logps/rejected": -127.8006591796875, "loss": 0.6484, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.4664887487888336, "rewards/margins": 0.13087505102157593, "rewards/rejected": -0.5973638296127319, "step": 750 }, { "epoch": 0.55, "grad_norm": 1.234375, "learning_rate": 4.553878173617576e-06, "logits/chosen": -2.28155517578125, "logits/rejected": -2.289883613586426, "logps/chosen": -99.67669677734375, "logps/rejected": -116.39395904541016, "loss": 0.6423, "rewards/accuracies": 0.625, "rewards/chosen": -0.3680870532989502, "rewards/margins": 0.13191227614879608, "rewards/rejected": -0.49999934434890747, "step": 760 }, { "epoch": 0.55, "grad_norm": 1.4453125, "learning_rate": 4.5357759494392354e-06, "logits/chosen": -2.2865400314331055, "logits/rejected": -2.301579475402832, "logps/chosen": -103.4640884399414, "logps/rejected": -120.11143493652344, "loss": 0.6447, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.42991527915000916, "rewards/margins": 0.13045726716518402, "rewards/rejected": -0.560372531414032, "step": 770 }, { "epoch": 0.56, "grad_norm": 1.8828125, "learning_rate": 4.5173512214660495e-06, "logits/chosen": -2.290435314178467, "logits/rejected": -2.3016152381896973, "logps/chosen": -104.1209716796875, "logps/rejected": -120.07255554199219, "loss": 0.6424, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.3812081217765808, "rewards/margins": 0.13323888182640076, "rewards/rejected": -0.5144469738006592, "step": 780 }, { "epoch": 0.57, "grad_norm": 1.5546875, "learning_rate": 4.498606908508754e-06, "logits/chosen": -2.281541109085083, "logits/rejected": -2.2845287322998047, "logps/chosen": -108.74382019042969, "logps/rejected": -127.21275329589844, "loss": 0.6438, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.4051045775413513, "rewards/margins": 0.14055274426937103, "rewards/rejected": -0.5456573367118835, "step": 790 }, { "epoch": 0.58, "grad_norm": 1.2578125, "learning_rate": 4.47954598000613e-06, "logits/chosen": -2.3543689250946045, "logits/rejected": -2.363257646560669, "logps/chosen": -96.17681884765625, "logps/rejected": -110.3593521118164, "loss": 0.6472, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.319545716047287, "rewards/margins": 0.120999276638031, "rewards/rejected": -0.440544992685318, "step": 800 }, { "epoch": 0.58, "grad_norm": 1.578125, "learning_rate": 4.460171455554603e-06, "logits/chosen": -2.2809572219848633, "logits/rejected": -2.2786245346069336, "logps/chosen": -99.40967559814453, "logps/rejected": -117.13094329833984, "loss": 0.6423, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.37397512793540955, "rewards/margins": 0.14184913039207458, "rewards/rejected": -0.5158242583274841, "step": 810 }, { "epoch": 0.59, "grad_norm": 1.890625, "learning_rate": 4.4404864044298755e-06, "logits/chosen": -2.23799467086792, "logits/rejected": -2.245177745819092, "logps/chosen": -108.29356384277344, "logps/rejected": -121.1603012084961, "loss": 0.653, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.43523016571998596, "rewards/margins": 0.12470052391290665, "rewards/rejected": -0.559930682182312, "step": 820 }, { "epoch": 0.6, "grad_norm": 1.578125, "learning_rate": 4.420493945100702e-06, "logits/chosen": -2.266139507293701, "logits/rejected": -2.2764334678649902, "logps/chosen": -99.6155014038086, "logps/rejected": -117.4134521484375, "loss": 0.6368, "rewards/accuracies": 0.625, "rewards/chosen": -0.40347641706466675, "rewards/margins": 0.1434660702943802, "rewards/rejected": -0.5469424724578857, "step": 830 }, { "epoch": 0.61, "grad_norm": 1.3203125, "learning_rate": 4.400197244734866e-06, "logits/chosen": -2.3086845874786377, "logits/rejected": -2.3136982917785645, "logps/chosen": -105.7784652709961, "logps/rejected": -123.13346099853516, "loss": 0.6323, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.41168150305747986, "rewards/margins": 0.1643691062927246, "rewards/rejected": -0.5760505795478821, "step": 840 }, { "epoch": 0.61, "grad_norm": 1.6171875, "learning_rate": 4.379599518697444e-06, "logits/chosen": -2.302346706390381, "logits/rejected": -2.305290699005127, "logps/chosen": -110.01615905761719, "logps/rejected": -132.0598602294922, "loss": 0.6283, "rewards/accuracies": 0.625, "rewards/chosen": -0.45303821563720703, "rewards/margins": 0.17430761456489563, "rewards/rejected": -0.627345860004425, "step": 850 }, { "epoch": 0.62, "grad_norm": 2.109375, "learning_rate": 4.3587040300414325e-06, "logits/chosen": -2.249532461166382, "logits/rejected": -2.2589190006256104, "logps/chosen": -117.6961441040039, "logps/rejected": -128.64663696289062, "loss": 0.6567, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.40040212869644165, "rewards/margins": 0.11040042340755463, "rewards/rejected": -0.5108025670051575, "step": 860 }, { "epoch": 0.63, "grad_norm": 1.296875, "learning_rate": 4.337514088990822e-06, "logits/chosen": -2.278533458709717, "logits/rejected": -2.281517267227173, "logps/chosen": -103.110595703125, "logps/rejected": -122.44902038574219, "loss": 0.6331, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.3665826916694641, "rewards/margins": 0.16001132130622864, "rewards/rejected": -0.5265940427780151, "step": 870 }, { "epoch": 0.63, "grad_norm": 2.09375, "learning_rate": 4.316033052416196e-06, "logits/chosen": -2.2408275604248047, "logits/rejected": -2.2425954341888428, "logps/chosen": -104.7763442993164, "logps/rejected": -116.91007232666016, "loss": 0.6591, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.4133949875831604, "rewards/margins": 0.10547590255737305, "rewards/rejected": -0.5188708901405334, "step": 880 }, { "epoch": 0.64, "grad_norm": 1.4296875, "learning_rate": 4.294264323302946e-06, "logits/chosen": -2.3082475662231445, "logits/rejected": -2.3192391395568848, "logps/chosen": -103.19981384277344, "logps/rejected": -117.45137023925781, "loss": 0.6493, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.413215696811676, "rewards/margins": 0.1274113655090332, "rewards/rejected": -0.5406270027160645, "step": 890 }, { "epoch": 0.65, "grad_norm": 2.140625, "learning_rate": 4.272211350212171e-06, "logits/chosen": -2.3206677436828613, "logits/rejected": -2.3214950561523438, "logps/chosen": -110.54658508300781, "logps/rejected": -124.14703369140625, "loss": 0.6599, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.38299205899238586, "rewards/margins": 0.10237312316894531, "rewards/rejected": -0.48536521196365356, "step": 900 }, { "epoch": 0.66, "grad_norm": 1.7578125, "learning_rate": 4.249877626734366e-06, "logits/chosen": -2.2740793228149414, "logits/rejected": -2.2952816486358643, "logps/chosen": -108.4576644897461, "logps/rejected": -121.01176452636719, "loss": 0.6539, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.38486912846565247, "rewards/margins": 0.1163020133972168, "rewards/rejected": -0.5011711120605469, "step": 910 }, { "epoch": 0.66, "grad_norm": 1.7109375, "learning_rate": 4.2272666909359784e-06, "logits/chosen": -2.2910335063934326, "logits/rejected": -2.295705795288086, "logps/chosen": -102.13375091552734, "logps/rejected": -124.9188003540039, "loss": 0.6204, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4369734823703766, "rewards/margins": 0.20262956619262695, "rewards/rejected": -0.6396030783653259, "step": 920 }, { "epoch": 0.67, "grad_norm": 1.1484375, "learning_rate": 4.2043821247989036e-06, "logits/chosen": -2.278778553009033, "logits/rejected": -2.2924065589904785, "logps/chosen": -103.04777526855469, "logps/rejected": -120.569091796875, "loss": 0.6403, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.38494181632995605, "rewards/margins": 0.1459546983242035, "rewards/rejected": -0.5308965444564819, "step": 930 }, { "epoch": 0.68, "grad_norm": 2.171875, "learning_rate": 4.181227553653045e-06, "logits/chosen": -2.278262138366699, "logits/rejected": -2.3009345531463623, "logps/chosen": -121.49980163574219, "logps/rejected": -137.34304809570312, "loss": 0.6449, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4821853041648865, "rewards/margins": 0.14446020126342773, "rewards/rejected": -0.6266454458236694, "step": 940 }, { "epoch": 0.68, "grad_norm": 2.203125, "learning_rate": 4.1578066456019885e-06, "logits/chosen": -2.2163925170898438, "logits/rejected": -2.204552412033081, "logps/chosen": -114.92356872558594, "logps/rejected": -136.88050842285156, "loss": 0.6338, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5124049186706543, "rewards/margins": 0.1587189882993698, "rewards/rejected": -0.6711238622665405, "step": 950 }, { "epoch": 0.69, "grad_norm": 1.734375, "learning_rate": 4.1341231109419135e-06, "logits/chosen": -2.203275442123413, "logits/rejected": -2.2119054794311523, "logps/chosen": -123.16712951660156, "logps/rejected": -137.21139526367188, "loss": 0.6585, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.5569948554039001, "rewards/margins": 0.1151203066110611, "rewards/rejected": -0.6721151471138, "step": 960 }, { "epoch": 0.7, "grad_norm": 1.5859375, "learning_rate": 4.110180701573809e-06, "logits/chosen": -2.200212001800537, "logits/rejected": -2.198477268218994, "logps/chosen": -109.5115966796875, "logps/rejected": -132.9260711669922, "loss": 0.6196, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.4715866148471832, "rewards/margins": 0.19370624423027039, "rewards/rejected": -0.6652928590774536, "step": 970 }, { "epoch": 0.71, "grad_norm": 1.0546875, "learning_rate": 4.085983210409114e-06, "logits/chosen": -2.227853775024414, "logits/rejected": -2.2186429500579834, "logps/chosen": -118.89933013916016, "logps/rejected": -136.7680206298828, "loss": 0.6514, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.49003705382347107, "rewards/margins": 0.12437830865383148, "rewards/rejected": -0.6144154071807861, "step": 980 }, { "epoch": 0.71, "grad_norm": 1.1640625, "learning_rate": 4.061534470768841e-06, "logits/chosen": -2.2407491207122803, "logits/rejected": -2.2455482482910156, "logps/chosen": -111.09078216552734, "logps/rejected": -124.2870101928711, "loss": 0.6505, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.42453131079673767, "rewards/margins": 0.11639855057001114, "rewards/rejected": -0.540929913520813, "step": 990 }, { "epoch": 0.72, "grad_norm": 2.484375, "learning_rate": 4.036838355776313e-06, "logits/chosen": -2.1629438400268555, "logits/rejected": -2.169175386428833, "logps/chosen": -115.92777252197266, "logps/rejected": -131.81134033203125, "loss": 0.6513, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.5123138427734375, "rewards/margins": 0.13096138834953308, "rewards/rejected": -0.6432752013206482, "step": 1000 }, { "epoch": 0.73, "grad_norm": 1.21875, "learning_rate": 4.011898777743594e-06, "logits/chosen": -2.211540699005127, "logits/rejected": -2.2166659832000732, "logps/chosen": -101.62760162353516, "logps/rejected": -119.66593933105469, "loss": 0.6398, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4295649528503418, "rewards/margins": 0.15017978847026825, "rewards/rejected": -0.5797447562217712, "step": 1010 }, { "epoch": 0.74, "grad_norm": 2.453125, "learning_rate": 3.9867196875517025e-06, "logits/chosen": -2.20629620552063, "logits/rejected": -2.2112419605255127, "logps/chosen": -107.81523132324219, "logps/rejected": -119.03303527832031, "loss": 0.6656, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4413573145866394, "rewards/margins": 0.09499961137771606, "rewards/rejected": -0.5363569855690002, "step": 1020 }, { "epoch": 0.74, "grad_norm": 1.8828125, "learning_rate": 3.961305074024722e-06, "logits/chosen": -2.125932216644287, "logits/rejected": -2.130676746368408, "logps/chosen": -112.90400695800781, "logps/rejected": -138.2743377685547, "loss": 0.6151, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5330706834793091, "rewards/margins": 0.2042117565870285, "rewards/rejected": -0.737282395362854, "step": 1030 }, { "epoch": 0.75, "grad_norm": 1.5234375, "learning_rate": 3.935658963297902e-06, "logits/chosen": -2.212306261062622, "logits/rejected": -2.2203996181488037, "logps/chosen": -109.0052261352539, "logps/rejected": -125.78495788574219, "loss": 0.6437, "rewards/accuracies": 0.59375, "rewards/chosen": -0.44839105010032654, "rewards/margins": 0.14669093489646912, "rewards/rejected": -0.5950819849967957, "step": 1040 }, { "epoch": 0.76, "grad_norm": 2.28125, "learning_rate": 3.90978541817984e-06, "logits/chosen": -2.1384072303771973, "logits/rejected": -2.143054485321045, "logps/chosen": -108.29044342041016, "logps/rejected": -127.79345703125, "loss": 0.6431, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5090142488479614, "rewards/margins": 0.16035351157188416, "rewards/rejected": -0.669367790222168, "step": 1050 }, { "epoch": 0.76, "grad_norm": 1.28125, "learning_rate": 3.8836885375088635e-06, "logits/chosen": -2.131621837615967, "logits/rejected": -2.1531243324279785, "logps/chosen": -115.8088150024414, "logps/rejected": -133.919921875, "loss": 0.6405, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5001341104507446, "rewards/margins": 0.1682174801826477, "rewards/rejected": -0.6683515310287476, "step": 1060 }, { "epoch": 0.77, "grad_norm": 2.28125, "learning_rate": 3.857372455503698e-06, "logits/chosen": -2.1725549697875977, "logits/rejected": -2.1732017993927, "logps/chosen": -117.09075927734375, "logps/rejected": -135.18533325195312, "loss": 0.6503, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.43959444761276245, "rewards/margins": 0.13271991908550262, "rewards/rejected": -0.5723143815994263, "step": 1070 }, { "epoch": 0.78, "grad_norm": 2.453125, "learning_rate": 3.830841341108528e-06, "logits/chosen": -2.212951421737671, "logits/rejected": -2.2198729515075684, "logps/chosen": -111.72041320800781, "logps/rejected": -132.552001953125, "loss": 0.6288, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.45366325974464417, "rewards/margins": 0.17270301282405853, "rewards/rejected": -0.6263662576675415, "step": 1080 }, { "epoch": 0.79, "grad_norm": 2.28125, "learning_rate": 3.804099397332572e-06, "logits/chosen": -2.215224027633667, "logits/rejected": -2.210907459259033, "logps/chosen": -112.65693664550781, "logps/rejected": -135.70443725585938, "loss": 0.6231, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.41242438554763794, "rewards/margins": 0.18879784643650055, "rewards/rejected": -0.6012222766876221, "step": 1090 }, { "epoch": 0.79, "grad_norm": 1.9375, "learning_rate": 3.7771508605842372e-06, "logits/chosen": -2.112990140914917, "logits/rejected": -2.1238150596618652, "logps/chosen": -116.11753845214844, "logps/rejected": -138.2332000732422, "loss": 0.6198, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5049333572387695, "rewards/margins": 0.20600661635398865, "rewards/rejected": -0.7109400033950806, "step": 1100 }, { "epoch": 0.8, "grad_norm": 2.078125, "learning_rate": 3.7500000000000005e-06, "logits/chosen": -2.1379756927490234, "logits/rejected": -2.13820219039917, "logps/chosen": -114.88435363769531, "logps/rejected": -133.52407836914062, "loss": 0.6373, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4984721541404724, "rewards/margins": 0.16139784455299377, "rewards/rejected": -0.6598700881004333, "step": 1110 }, { "epoch": 0.81, "grad_norm": 1.8125, "learning_rate": 3.7226511167681014e-06, "logits/chosen": -2.135016918182373, "logits/rejected": -2.126314163208008, "logps/chosen": -111.63895416259766, "logps/rejected": -126.47679138183594, "loss": 0.6509, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.48495396971702576, "rewards/margins": 0.13388456404209137, "rewards/rejected": -0.6188385486602783, "step": 1120 }, { "epoch": 0.81, "grad_norm": 1.8203125, "learning_rate": 3.6951085434471544e-06, "logits/chosen": -2.1722989082336426, "logits/rejected": -2.166605234146118, "logps/chosen": -105.00669860839844, "logps/rejected": -117.9762191772461, "loss": 0.6555, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.44906917214393616, "rewards/margins": 0.1117476224899292, "rewards/rejected": -0.5608168840408325, "step": 1130 }, { "epoch": 0.82, "grad_norm": 2.21875, "learning_rate": 3.6673766432797948e-06, "logits/chosen": -2.1750612258911133, "logits/rejected": -2.1879947185516357, "logps/chosen": -123.87275695800781, "logps/rejected": -144.77828979492188, "loss": 0.6334, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.584213137626648, "rewards/margins": 0.1833323985338211, "rewards/rejected": -0.7675455808639526, "step": 1140 }, { "epoch": 0.83, "grad_norm": 1.671875, "learning_rate": 3.6394598095014577e-06, "logits/chosen": -2.210446834564209, "logits/rejected": -2.213280200958252, "logps/chosen": -107.16650390625, "logps/rejected": -124.11837005615234, "loss": 0.6448, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.40074795484542847, "rewards/margins": 0.14601869881153107, "rewards/rejected": -0.5467666387557983, "step": 1150 }, { "epoch": 0.84, "grad_norm": 2.25, "learning_rate": 3.611362464644415e-06, "logits/chosen": -2.128871202468872, "logits/rejected": -2.1372876167297363, "logps/chosen": -116.8992919921875, "logps/rejected": -126.83099365234375, "loss": 0.6706, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.47680729627609253, "rewards/margins": 0.09380488097667694, "rewards/rejected": -0.5706123113632202, "step": 1160 }, { "epoch": 0.84, "grad_norm": 1.734375, "learning_rate": 3.5830890598371636e-06, "logits/chosen": -2.23905611038208, "logits/rejected": -2.252377510070801, "logps/chosen": -107.7120361328125, "logps/rejected": -124.34709167480469, "loss": 0.6327, "rewards/accuracies": 0.625, "rewards/chosen": -0.4397021234035492, "rewards/margins": 0.1647356003522873, "rewards/rejected": -0.6044376492500305, "step": 1170 }, { "epoch": 0.85, "grad_norm": 1.859375, "learning_rate": 3.5546440740992856e-06, "logits/chosen": -2.1930408477783203, "logits/rejected": -2.2014918327331543, "logps/chosen": -117.5343017578125, "logps/rejected": -131.0255889892578, "loss": 0.6547, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.5026694536209106, "rewards/margins": 0.11716220527887344, "rewards/rejected": -0.6198316812515259, "step": 1180 }, { "epoch": 0.86, "grad_norm": 1.765625, "learning_rate": 3.5260320136318927e-06, "logits/chosen": -2.1664159297943115, "logits/rejected": -2.176593542098999, "logps/chosen": -120.20884704589844, "logps/rejected": -136.73406982421875, "loss": 0.637, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.522860586643219, "rewards/margins": 0.15705768764019012, "rewards/rejected": -0.6799181699752808, "step": 1190 }, { "epoch": 0.86, "grad_norm": 2.140625, "learning_rate": 3.4972574111037587e-06, "logits/chosen": -2.1755106449127197, "logits/rejected": -2.1772923469543457, "logps/chosen": -115.28816223144531, "logps/rejected": -133.49465942382812, "loss": 0.6426, "rewards/accuracies": 0.625, "rewards/chosen": -0.44179558753967285, "rewards/margins": 0.16654905676841736, "rewards/rejected": -0.6083446741104126, "step": 1200 }, { "epoch": 0.87, "grad_norm": 1.484375, "learning_rate": 3.468324824933267e-06, "logits/chosen": -2.151540756225586, "logits/rejected": -2.1717865467071533, "logps/chosen": -115.64791107177734, "logps/rejected": -132.0801239013672, "loss": 0.6446, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.548166036605835, "rewards/margins": 0.13710884749889374, "rewards/rejected": -0.6852747797966003, "step": 1210 }, { "epoch": 0.88, "grad_norm": 1.0390625, "learning_rate": 3.4392388385662713e-06, "logits/chosen": -2.1935017108917236, "logits/rejected": -2.195500612258911, "logps/chosen": -107.0605697631836, "logps/rejected": -129.38616943359375, "loss": 0.6337, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.43120819330215454, "rewards/margins": 0.18481549620628357, "rewards/rejected": -0.6160237193107605, "step": 1220 }, { "epoch": 0.89, "grad_norm": 1.671875, "learning_rate": 3.410004059749996e-06, "logits/chosen": -2.164797067642212, "logits/rejected": -2.172008514404297, "logps/chosen": -110.7061767578125, "logps/rejected": -132.82736206054688, "loss": 0.6242, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.4369031488895416, "rewards/margins": 0.19907937943935394, "rewards/rejected": -0.6359825134277344, "step": 1230 }, { "epoch": 0.89, "grad_norm": 1.6796875, "learning_rate": 3.3806251198030843e-06, "logits/chosen": -2.1183745861053467, "logits/rejected": -2.13506817817688, "logps/chosen": -103.0443344116211, "logps/rejected": -128.4714813232422, "loss": 0.6102, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4332190155982971, "rewards/margins": 0.22365593910217285, "rewards/rejected": -0.65687495470047, "step": 1240 }, { "epoch": 0.9, "grad_norm": 1.484375, "learning_rate": 3.351106672881915e-06, "logits/chosen": -2.1771786212921143, "logits/rejected": -2.1897802352905273, "logps/chosen": -114.33122253417969, "logps/rejected": -135.31690979003906, "loss": 0.6334, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.44074517488479614, "rewards/margins": 0.17149756848812103, "rewards/rejected": -0.6122426986694336, "step": 1250 }, { "epoch": 0.91, "grad_norm": 2.21875, "learning_rate": 3.3214533952433017e-06, "logits/chosen": -2.203437328338623, "logits/rejected": -2.194852113723755, "logps/chosen": -114.83846282958984, "logps/rejected": -132.64036560058594, "loss": 0.6596, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.5022262334823608, "rewards/margins": 0.1002851277589798, "rewards/rejected": -0.6025113463401794, "step": 1260 }, { "epoch": 0.92, "grad_norm": 1.421875, "learning_rate": 3.291669984503682e-06, "logits/chosen": -2.09834361076355, "logits/rejected": -2.1034817695617676, "logps/chosen": -119.2296371459961, "logps/rejected": -144.80642700195312, "loss": 0.6184, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5479999780654907, "rewards/margins": 0.22145429253578186, "rewards/rejected": -0.7694542407989502, "step": 1270 }, { "epoch": 0.92, "grad_norm": 2.09375, "learning_rate": 3.261761158894937e-06, "logits/chosen": -2.072908878326416, "logits/rejected": -2.075850009918213, "logps/chosen": -121.2236099243164, "logps/rejected": -149.28993225097656, "loss": 0.6121, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5698887705802917, "rewards/margins": 0.23544566333293915, "rewards/rejected": -0.8053344488143921, "step": 1280 }, { "epoch": 0.93, "grad_norm": 3.453125, "learning_rate": 3.231731656516936e-06, "logits/chosen": -2.1070938110351562, "logits/rejected": -2.1028828620910645, "logps/chosen": -110.7509765625, "logps/rejected": -132.27105712890625, "loss": 0.6311, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.46972590684890747, "rewards/margins": 0.17397567629814148, "rewards/rejected": -0.6437015533447266, "step": 1290 }, { "epoch": 0.94, "grad_norm": 1.5234375, "learning_rate": 3.2015862345869335e-06, "logits/chosen": -2.1732888221740723, "logits/rejected": -2.181213855743408, "logps/chosen": -111.25699615478516, "logps/rejected": -123.9486083984375, "loss": 0.6552, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.44834476709365845, "rewards/margins": 0.11553524434566498, "rewards/rejected": -0.5638800263404846, "step": 1300 }, { "epoch": 0.94, "grad_norm": 1.7734375, "learning_rate": 3.171329668685942e-06, "logits/chosen": -2.0767674446105957, "logits/rejected": -2.070704936981201, "logps/chosen": -110.462890625, "logps/rejected": -133.4669647216797, "loss": 0.6249, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5272938013076782, "rewards/margins": 0.18891780078411102, "rewards/rejected": -0.7162116765975952, "step": 1310 }, { "epoch": 0.95, "grad_norm": 1.078125, "learning_rate": 3.140966752002193e-06, "logits/chosen": -2.0980172157287598, "logits/rejected": -2.102271556854248, "logps/chosen": -103.88057708740234, "logps/rejected": -130.67318725585938, "loss": 0.6066, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.39303597807884216, "rewards/margins": 0.2400202453136444, "rewards/rejected": -0.6330562233924866, "step": 1320 }, { "epoch": 0.96, "grad_norm": 1.4921875, "learning_rate": 3.1105022945718076e-06, "logits/chosen": -2.0586235523223877, "logits/rejected": -2.080989360809326, "logps/chosen": -132.25013732910156, "logps/rejected": -150.72528076171875, "loss": 0.641, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6205312013626099, "rewards/margins": 0.177308589220047, "rewards/rejected": -0.7978397607803345, "step": 1330 }, { "epoch": 0.97, "grad_norm": 1.78125, "learning_rate": 3.079941122516803e-06, "logits/chosen": -2.0391013622283936, "logits/rejected": -2.037480592727661, "logps/chosen": -114.75526428222656, "logps/rejected": -132.84378051757812, "loss": 0.6534, "rewards/accuracies": 0.59375, "rewards/chosen": -0.5115147829055786, "rewards/margins": 0.1457727551460266, "rewards/rejected": -0.6572875380516052, "step": 1340 }, { "epoch": 0.97, "grad_norm": 1.2578125, "learning_rate": 3.0492880772805433e-06, "logits/chosen": -2.05072283744812, "logits/rejected": -2.057342767715454, "logps/chosen": -120.1377182006836, "logps/rejected": -134.4777374267578, "loss": 0.6499, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.4882054328918457, "rewards/margins": 0.1324242353439331, "rewards/rejected": -0.6206297278404236, "step": 1350 }, { "epoch": 0.98, "grad_norm": 1.5, "learning_rate": 3.018548014860769e-06, "logits/chosen": -2.007279872894287, "logits/rejected": -2.0130622386932373, "logps/chosen": -120.3442153930664, "logps/rejected": -143.51101684570312, "loss": 0.6304, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5948348045349121, "rewards/margins": 0.20342986285686493, "rewards/rejected": -0.7982646822929382, "step": 1360 }, { "epoch": 0.99, "grad_norm": 2.3125, "learning_rate": 2.9877258050403214e-06, "logits/chosen": -2.031801223754883, "logits/rejected": -2.023563861846924, "logps/chosen": -121.27830505371094, "logps/rejected": -140.32757568359375, "loss": 0.6447, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6015397310256958, "rewards/margins": 0.1492132693529129, "rewards/rejected": -0.7507530450820923, "step": 1370 }, { "epoch": 0.99, "grad_norm": 1.7578125, "learning_rate": 2.9568263306156754e-06, "logits/chosen": -2.0874016284942627, "logits/rejected": -2.0975565910339355, "logps/chosen": -107.88935852050781, "logps/rejected": -121.3756103515625, "loss": 0.6615, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.4233587682247162, "rewards/margins": 0.1184331625699997, "rewards/rejected": -0.5417919754981995, "step": 1380 }, { "epoch": 1.0, "grad_norm": 2.0, "learning_rate": 2.9258544866234206e-06, "logits/chosen": -2.070168972015381, "logits/rejected": -2.0742554664611816, "logps/chosen": -112.909423828125, "logps/rejected": -130.5807342529297, "loss": 0.6372, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.49084582924842834, "rewards/margins": 0.15788979828357697, "rewards/rejected": -0.6487356424331665, "step": 1390 }, { "epoch": 1.01, "grad_norm": 1.6015625, "learning_rate": 2.8948151795647994e-06, "logits/chosen": -1.9922540187835693, "logits/rejected": -2.0036025047302246, "logps/chosen": -108.33551025390625, "logps/rejected": -134.94259643554688, "loss": 0.6044, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.44488996267318726, "rewards/margins": 0.2470279037952423, "rewards/rejected": -0.6919177770614624, "step": 1400 }, { "epoch": 1.02, "grad_norm": 2.34375, "learning_rate": 2.863713326628422e-06, "logits/chosen": -1.9860804080963135, "logits/rejected": -1.98525071144104, "logps/chosen": -111.2559585571289, "logps/rejected": -138.44357299804688, "loss": 0.6063, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4986530840396881, "rewards/margins": 0.2310309112071991, "rewards/rejected": -0.7296839952468872, "step": 1410 }, { "epoch": 1.02, "grad_norm": 1.828125, "learning_rate": 2.8325538549113006e-06, "logits/chosen": -2.030186891555786, "logits/rejected": -2.0408942699432373, "logps/chosen": -113.96142578125, "logps/rejected": -141.45428466796875, "loss": 0.6113, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.46484389901161194, "rewards/margins": 0.2421310842037201, "rewards/rejected": -0.706974983215332, "step": 1420 }, { "epoch": 1.03, "grad_norm": 2.515625, "learning_rate": 2.8013417006383078e-06, "logits/chosen": -1.9514284133911133, "logits/rejected": -1.963894248008728, "logps/chosen": -110.47412109375, "logps/rejected": -128.65701293945312, "loss": 0.6357, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5639623999595642, "rewards/margins": 0.1714598834514618, "rewards/rejected": -0.7354224324226379, "step": 1430 }, { "epoch": 1.04, "grad_norm": 2.65625, "learning_rate": 2.770081808380186e-06, "logits/chosen": -2.04837703704834, "logits/rejected": -2.05118989944458, "logps/chosen": -125.71846771240234, "logps/rejected": -144.2294158935547, "loss": 0.6284, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.48360365629196167, "rewards/margins": 0.19055330753326416, "rewards/rejected": -0.6741569638252258, "step": 1440 }, { "epoch": 1.04, "grad_norm": 2.328125, "learning_rate": 2.7387791302702398e-06, "logits/chosen": -1.980463981628418, "logits/rejected": -1.980164885520935, "logps/chosen": -123.45314025878906, "logps/rejected": -152.37655639648438, "loss": 0.6185, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6476872563362122, "rewards/margins": 0.214513897895813, "rewards/rejected": -0.8622010946273804, "step": 1450 }, { "epoch": 1.05, "grad_norm": 1.4296875, "learning_rate": 2.707438625219827e-06, "logits/chosen": -1.942488670349121, "logits/rejected": -1.9465347528457642, "logps/chosen": -128.45228576660156, "logps/rejected": -162.00949096679688, "loss": 0.5957, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6348342895507812, "rewards/margins": 0.292163610458374, "rewards/rejected": -0.9269979596138, "step": 1460 }, { "epoch": 1.06, "grad_norm": 1.2421875, "learning_rate": 2.67606525813278e-06, "logits/chosen": -1.9286657571792603, "logits/rejected": -1.9460529088974, "logps/chosen": -115.72102355957031, "logps/rejected": -142.26600646972656, "loss": 0.6079, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5551806092262268, "rewards/margins": 0.2322642058134079, "rewards/rejected": -0.7874448299407959, "step": 1470 }, { "epoch": 1.07, "grad_norm": 1.6796875, "learning_rate": 2.6446639991188716e-06, "logits/chosen": -1.973655343055725, "logits/rejected": -1.9923969507217407, "logps/chosen": -116.53816223144531, "logps/rejected": -137.4978790283203, "loss": 0.6328, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.5160804986953735, "rewards/margins": 0.17561517655849457, "rewards/rejected": -0.6916956305503845, "step": 1480 }, { "epoch": 1.07, "grad_norm": 2.1875, "learning_rate": 2.6132398227064615e-06, "logits/chosen": -2.0569424629211426, "logits/rejected": -2.061692237854004, "logps/chosen": -129.03684997558594, "logps/rejected": -151.86997985839844, "loss": 0.6229, "rewards/accuracies": 0.625, "rewards/chosen": -0.5522770285606384, "rewards/margins": 0.2137361317873001, "rewards/rejected": -0.7660132050514221, "step": 1490 }, { "epoch": 1.08, "grad_norm": 2.46875, "learning_rate": 2.5817977070544408e-06, "logits/chosen": -1.9222244024276733, "logits/rejected": -1.928789496421814, "logps/chosen": -122.2890396118164, "logps/rejected": -146.0366668701172, "loss": 0.6174, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6112322807312012, "rewards/margins": 0.2106863260269165, "rewards/rejected": -0.8219184875488281, "step": 1500 }, { "epoch": 1.09, "grad_norm": 2.0, "learning_rate": 2.550342633163601e-06, "logits/chosen": -1.994757890701294, "logits/rejected": -1.998810052871704, "logps/chosen": -119.19169616699219, "logps/rejected": -146.54074096679688, "loss": 0.6078, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.596774697303772, "rewards/margins": 0.23899777233600616, "rewards/rejected": -0.8357726335525513, "step": 1510 }, { "epoch": 1.1, "grad_norm": 1.9609375, "learning_rate": 2.5188795840875546e-06, "logits/chosen": -1.98430597782135, "logits/rejected": -1.989297866821289, "logps/chosen": -124.1072769165039, "logps/rejected": -133.15005493164062, "loss": 0.6559, "rewards/accuracies": 0.625, "rewards/chosen": -0.5129493474960327, "rewards/margins": 0.10699422657489777, "rewards/rejected": -0.6199434995651245, "step": 1520 }, { "epoch": 1.1, "grad_norm": 1.8828125, "learning_rate": 2.487413544143325e-06, "logits/chosen": -2.003361701965332, "logits/rejected": -1.9991636276245117, "logps/chosen": -120.54267883300781, "logps/rejected": -145.92556762695312, "loss": 0.6157, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5307844281196594, "rewards/margins": 0.21387752890586853, "rewards/rejected": -0.7446619868278503, "step": 1530 }, { "epoch": 1.11, "grad_norm": 1.8125, "learning_rate": 2.4559494981217464e-06, "logits/chosen": -2.009737968444824, "logits/rejected": -2.0052223205566406, "logps/chosen": -115.0971450805664, "logps/rejected": -140.93968200683594, "loss": 0.6113, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.513275146484375, "rewards/margins": 0.22737202048301697, "rewards/rejected": -0.7406471967697144, "step": 1540 }, { "epoch": 1.12, "grad_norm": 1.3828125, "learning_rate": 2.4244924304977785e-06, "logits/chosen": -1.9526363611221313, "logits/rejected": -1.9619600772857666, "logps/chosen": -117.15467834472656, "logps/rejected": -141.62472534179688, "loss": 0.6155, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.529160737991333, "rewards/margins": 0.21543464064598083, "rewards/rejected": -0.7445953488349915, "step": 1550 }, { "epoch": 1.12, "grad_norm": 2.96875, "learning_rate": 2.3930473246408752e-06, "logits/chosen": -2.0411906242370605, "logits/rejected": -2.056326389312744, "logps/chosen": -129.92892456054688, "logps/rejected": -157.43417358398438, "loss": 0.6072, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.612835705280304, "rewards/margins": 0.2452736347913742, "rewards/rejected": -0.8581092953681946, "step": 1560 }, { "epoch": 1.13, "grad_norm": 2.109375, "learning_rate": 2.3616191620255307e-06, "logits/chosen": -2.016146421432495, "logits/rejected": -2.031141996383667, "logps/chosen": -125.10477447509766, "logps/rejected": -144.62144470214844, "loss": 0.6344, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5862436890602112, "rewards/margins": 0.16933271288871765, "rewards/rejected": -0.7555764317512512, "step": 1570 }, { "epoch": 1.14, "grad_norm": 1.84375, "learning_rate": 2.3302129214421244e-06, "logits/chosen": -1.9942152500152588, "logits/rejected": -1.9925035238265991, "logps/chosen": -126.97408294677734, "logps/rejected": -157.83267211914062, "loss": 0.5967, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5849136114120483, "rewards/margins": 0.2714024782180786, "rewards/rejected": -0.8563162088394165, "step": 1580 }, { "epoch": 1.15, "grad_norm": 1.2578125, "learning_rate": 2.2988335782081854e-06, "logits/chosen": -1.9507849216461182, "logits/rejected": -1.9640982151031494, "logps/chosen": -114.99700927734375, "logps/rejected": -141.39306640625, "loss": 0.6105, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5272036194801331, "rewards/margins": 0.21672149002552032, "rewards/rejected": -0.7439250349998474, "step": 1590 }, { "epoch": 1.15, "grad_norm": 2.53125, "learning_rate": 2.2674861033802182e-06, "logits/chosen": -1.9975817203521729, "logits/rejected": -2.006187915802002, "logps/chosen": -121.58160400390625, "logps/rejected": -147.51416015625, "loss": 0.6135, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5389881134033203, "rewards/margins": 0.24041156470775604, "rewards/rejected": -0.7793997526168823, "step": 1600 }, { "epoch": 1.16, "grad_norm": 2.1875, "learning_rate": 2.236175462966192e-06, "logits/chosen": -1.9745140075683594, "logits/rejected": -1.990915060043335, "logps/chosen": -119.48726654052734, "logps/rejected": -139.58609008789062, "loss": 0.6336, "rewards/accuracies": 0.625, "rewards/chosen": -0.5391074419021606, "rewards/margins": 0.17995604872703552, "rewards/rejected": -0.7190635204315186, "step": 1610 }, { "epoch": 1.17, "grad_norm": 1.2421875, "learning_rate": 2.204906617138839e-06, "logits/chosen": -2.052870750427246, "logits/rejected": -2.0588982105255127, "logps/chosen": -115.16014099121094, "logps/rejected": -138.7529754638672, "loss": 0.6187, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.46134477853775024, "rewards/margins": 0.18981757760047913, "rewards/rejected": -0.651162326335907, "step": 1620 }, { "epoch": 1.17, "grad_norm": 2.3125, "learning_rate": 2.173684519449872e-06, "logits/chosen": -2.017367124557495, "logits/rejected": -2.0284934043884277, "logps/chosen": -118.7997055053711, "logps/rejected": -136.1349639892578, "loss": 0.6259, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.49513015151023865, "rewards/margins": 0.1775195300579071, "rewards/rejected": -0.672649621963501, "step": 1630 }, { "epoch": 1.18, "grad_norm": 1.609375, "learning_rate": 2.1425141160452495e-06, "logits/chosen": -1.9408687353134155, "logits/rejected": -1.9594764709472656, "logps/chosen": -116.89726257324219, "logps/rejected": -135.9090118408203, "loss": 0.6256, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5176368951797485, "rewards/margins": 0.18570610880851746, "rewards/rejected": -0.7033429145812988, "step": 1640 }, { "epoch": 1.19, "grad_norm": 2.046875, "learning_rate": 2.1114003448816205e-06, "logits/chosen": -1.9267289638519287, "logits/rejected": -1.930748701095581, "logps/chosen": -111.66670227050781, "logps/rejected": -129.04257202148438, "loss": 0.6341, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5672639608383179, "rewards/margins": 0.16399827599525452, "rewards/rejected": -0.7312622666358948, "step": 1650 }, { "epoch": 1.2, "grad_norm": 2.03125, "learning_rate": 2.080348134944063e-06, "logits/chosen": -1.9702529907226562, "logits/rejected": -1.9817975759506226, "logps/chosen": -119.13056945800781, "logps/rejected": -137.2602081298828, "loss": 0.6419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5436175465583801, "rewards/margins": 0.15731260180473328, "rewards/rejected": -0.7009302377700806, "step": 1660 }, { "epoch": 1.2, "grad_norm": 1.890625, "learning_rate": 2.049362405465236e-06, "logits/chosen": -2.0406806468963623, "logits/rejected": -2.043137550354004, "logps/chosen": -112.21296691894531, "logps/rejected": -136.54315185546875, "loss": 0.6188, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4973304271697998, "rewards/margins": 0.20603354275226593, "rewards/rejected": -0.7033639550209045, "step": 1670 }, { "epoch": 1.21, "grad_norm": 1.8515625, "learning_rate": 2.0184480651460943e-06, "logits/chosen": -1.961282730102539, "logits/rejected": -1.9708878993988037, "logps/chosen": -121.48686218261719, "logps/rejected": -150.19656372070312, "loss": 0.5987, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5785536170005798, "rewards/margins": 0.25495901703834534, "rewards/rejected": -0.8335126638412476, "step": 1680 }, { "epoch": 1.22, "grad_norm": 2.3125, "learning_rate": 1.9876100113782534e-06, "logits/chosen": -2.0227205753326416, "logits/rejected": -2.0364012718200684, "logps/chosen": -114.74979400634766, "logps/rejected": -138.68084716796875, "loss": 0.6135, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4629599153995514, "rewards/margins": 0.2168576419353485, "rewards/rejected": -0.6798175573348999, "step": 1690 }, { "epoch": 1.23, "grad_norm": 1.75, "learning_rate": 1.9568531294681585e-06, "logits/chosen": -1.9471362829208374, "logits/rejected": -1.9518378973007202, "logps/chosen": -122.4446792602539, "logps/rejected": -156.8361053466797, "loss": 0.5817, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5372573137283325, "rewards/margins": 0.3052898943424225, "rewards/rejected": -0.8425471186637878, "step": 1700 }, { "epoch": 1.23, "grad_norm": 2.96875, "learning_rate": 1.926182291863162e-06, "logits/chosen": -1.8842859268188477, "logits/rejected": -1.8872559070587158, "logps/chosen": -115.28511810302734, "logps/rejected": -142.72998046875, "loss": 0.6057, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5601629018783569, "rewards/margins": 0.23351116478443146, "rewards/rejected": -0.7936740517616272, "step": 1710 }, { "epoch": 1.24, "grad_norm": 2.484375, "learning_rate": 1.895602357379637e-06, "logits/chosen": -1.851300597190857, "logits/rejected": -1.8685184717178345, "logps/chosen": -120.60140228271484, "logps/rejected": -148.75662231445312, "loss": 0.6097, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5954613089561462, "rewards/margins": 0.26449450850486755, "rewards/rejected": -0.859955906867981, "step": 1720 }, { "epoch": 1.25, "grad_norm": 2.390625, "learning_rate": 1.8651181704332578e-06, "logits/chosen": -1.9334551095962524, "logits/rejected": -1.9329140186309814, "logps/chosen": -126.85723876953125, "logps/rejected": -153.77548217773438, "loss": 0.6113, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6201340556144714, "rewards/margins": 0.23935556411743164, "rewards/rejected": -0.8594895601272583, "step": 1730 }, { "epoch": 1.25, "grad_norm": 1.4296875, "learning_rate": 1.8347345602715543e-06, "logits/chosen": -1.9892994165420532, "logits/rejected": -2.0142014026641846, "logps/chosen": -119.41783142089844, "logps/rejected": -146.21707153320312, "loss": 0.6015, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5245205163955688, "rewards/margins": 0.25306034088134766, "rewards/rejected": -0.7775809168815613, "step": 1740 }, { "epoch": 1.26, "grad_norm": 1.546875, "learning_rate": 1.8044563402088686e-06, "logits/chosen": -1.9546706676483154, "logits/rejected": -1.9724689722061157, "logps/chosen": -130.3236541748047, "logps/rejected": -160.91209411621094, "loss": 0.5816, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6136053800582886, "rewards/margins": 0.3152478337287903, "rewards/rejected": -0.9288532137870789, "step": 1750 }, { "epoch": 1.27, "grad_norm": 2.46875, "learning_rate": 1.7742883068638447e-06, "logits/chosen": -2.0497043132781982, "logits/rejected": -2.048368453979492, "logps/chosen": -127.9777603149414, "logps/rejected": -154.79327392578125, "loss": 0.6093, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5853675603866577, "rewards/margins": 0.23666468262672424, "rewards/rejected": -0.8220322728157043, "step": 1760 }, { "epoch": 1.28, "grad_norm": 1.5625, "learning_rate": 1.7442352393995516e-06, "logits/chosen": -1.9354140758514404, "logits/rejected": -1.9446359872817993, "logps/chosen": -124.5389175415039, "logps/rejected": -148.15814208984375, "loss": 0.6249, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6259867548942566, "rewards/margins": 0.19919133186340332, "rewards/rejected": -0.8251781463623047, "step": 1770 }, { "epoch": 1.28, "grad_norm": 1.65625, "learning_rate": 1.7143018987663814e-06, "logits/chosen": -1.9998855590820312, "logits/rejected": -2.0096142292022705, "logps/chosen": -126.11322021484375, "logps/rejected": -145.43692016601562, "loss": 0.6251, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5246410369873047, "rewards/margins": 0.19330283999443054, "rewards/rejected": -0.7179439663887024, "step": 1780 }, { "epoch": 1.29, "grad_norm": 2.25, "learning_rate": 1.6844930269478274e-06, "logits/chosen": -1.9050662517547607, "logits/rejected": -1.9045469760894775, "logps/chosen": -123.74418640136719, "logps/rejected": -137.86448669433594, "loss": 0.6512, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5779796242713928, "rewards/margins": 0.1330142766237259, "rewards/rejected": -0.7109938859939575, "step": 1790 }, { "epoch": 1.3, "grad_norm": 2.828125, "learning_rate": 1.6548133462092647e-06, "logits/chosen": -1.9649972915649414, "logits/rejected": -1.9714637994766235, "logps/chosen": -129.48873901367188, "logps/rejected": -158.52069091796875, "loss": 0.6204, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6639618277549744, "rewards/margins": 0.22173753380775452, "rewards/rejected": -0.885699450969696, "step": 1800 }, { "epoch": 1.3, "grad_norm": 2.375, "learning_rate": 1.6252675583498644e-06, "logits/chosen": -1.9044713973999023, "logits/rejected": -1.9047183990478516, "logps/chosen": -114.80845642089844, "logps/rejected": -141.17929077148438, "loss": 0.606, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.506965696811676, "rewards/margins": 0.22917525470256805, "rewards/rejected": -0.7361409068107605, "step": 1810 }, { "epoch": 1.31, "grad_norm": 2.40625, "learning_rate": 1.5958603439577381e-06, "logits/chosen": -1.883062720298767, "logits/rejected": -1.8782352209091187, "logps/chosen": -115.2265853881836, "logps/rejected": -145.7313690185547, "loss": 0.6037, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5807043313980103, "rewards/margins": 0.2627798616886139, "rewards/rejected": -0.843484103679657, "step": 1820 }, { "epoch": 1.32, "grad_norm": 1.9375, "learning_rate": 1.5665963616684477e-06, "logits/chosen": -1.8872991800308228, "logits/rejected": -1.9082088470458984, "logps/chosen": -118.02657318115234, "logps/rejected": -144.72972106933594, "loss": 0.6075, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5587154626846313, "rewards/margins": 0.24482004344463348, "rewards/rejected": -0.8035355806350708, "step": 1830 }, { "epoch": 1.33, "grad_norm": 2.03125, "learning_rate": 1.5374802474269973e-06, "logits/chosen": -1.8889667987823486, "logits/rejected": -1.8945300579071045, "logps/chosen": -120.35169982910156, "logps/rejected": -144.44015502929688, "loss": 0.6084, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.5581179857254028, "rewards/margins": 0.23101505637168884, "rewards/rejected": -0.7891330718994141, "step": 1840 }, { "epoch": 1.33, "grad_norm": 1.828125, "learning_rate": 1.5085166137534124e-06, "logits/chosen": -1.8958622217178345, "logits/rejected": -1.8905636072158813, "logps/chosen": -124.46125793457031, "logps/rejected": -151.10842895507812, "loss": 0.6129, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6295716762542725, "rewards/margins": 0.23376984894275665, "rewards/rejected": -0.8633416295051575, "step": 1850 }, { "epoch": 1.34, "grad_norm": 2.140625, "learning_rate": 1.479710049012033e-06, "logits/chosen": -1.9351632595062256, "logits/rejected": -1.9478946924209595, "logps/chosen": -121.83003234863281, "logps/rejected": -154.7652587890625, "loss": 0.5865, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5323207974433899, "rewards/margins": 0.2836820185184479, "rewards/rejected": -0.8160028457641602, "step": 1860 }, { "epoch": 1.35, "grad_norm": 1.34375, "learning_rate": 1.4510651166846369e-06, "logits/chosen": -1.8797328472137451, "logits/rejected": -1.9063024520874023, "logps/chosen": -112.57334899902344, "logps/rejected": -139.17276000976562, "loss": 0.6023, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5350430011749268, "rewards/margins": 0.23087510466575623, "rewards/rejected": -0.7659180760383606, "step": 1870 }, { "epoch": 1.35, "grad_norm": 1.8046875, "learning_rate": 1.4225863546474944e-06, "logits/chosen": -1.9153077602386475, "logits/rejected": -1.92630934715271, "logps/chosen": -117.9030532836914, "logps/rejected": -144.8286895751953, "loss": 0.6007, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5061390399932861, "rewards/margins": 0.2523689270019531, "rewards/rejected": -0.7585079669952393, "step": 1880 }, { "epoch": 1.36, "grad_norm": 1.625, "learning_rate": 1.3942782744524974e-06, "logits/chosen": -1.9394657611846924, "logits/rejected": -1.9521135091781616, "logps/chosen": -122.05293273925781, "logps/rejected": -145.07711791992188, "loss": 0.6148, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5658047199249268, "rewards/margins": 0.20031043887138367, "rewards/rejected": -0.7661150693893433, "step": 1890 }, { "epoch": 1.37, "grad_norm": 2.046875, "learning_rate": 1.3661453606124353e-06, "logits/chosen": -1.8490660190582275, "logits/rejected": -1.849898099899292, "logps/chosen": -117.15348815917969, "logps/rejected": -144.00486755371094, "loss": 0.6162, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.5616058111190796, "rewards/margins": 0.2321668118238449, "rewards/rejected": -0.7937726378440857, "step": 1900 }, { "epoch": 1.38, "grad_norm": 1.9453125, "learning_rate": 1.3381920698905788e-06, "logits/chosen": -1.8940210342407227, "logits/rejected": -1.8968864679336548, "logps/chosen": -122.25953674316406, "logps/rejected": -151.90289306640625, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": -0.587809145450592, "rewards/margins": 0.24534039199352264, "rewards/rejected": -0.8331495523452759, "step": 1910 }, { "epoch": 1.38, "grad_norm": 1.453125, "learning_rate": 1.3104228305946385e-06, "logits/chosen": -1.8536640405654907, "logits/rejected": -1.8629567623138428, "logps/chosen": -108.5637435913086, "logps/rejected": -140.7650146484375, "loss": 0.5988, "rewards/accuracies": 0.65625, "rewards/chosen": -0.5375715494155884, "rewards/margins": 0.26606285572052, "rewards/rejected": -0.8036344647407532, "step": 1920 }, { "epoch": 1.39, "grad_norm": 1.90625, "learning_rate": 1.2828420418752442e-06, "logits/chosen": -1.8929815292358398, "logits/rejected": -1.9167912006378174, "logps/chosen": -130.51766967773438, "logps/rejected": -146.90655517578125, "loss": 0.6459, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5917934775352478, "rewards/margins": 0.16113656759262085, "rewards/rejected": -0.7529300451278687, "step": 1930 }, { "epoch": 1.4, "grad_norm": 2.0, "learning_rate": 1.2554540730290437e-06, "logits/chosen": -1.8626874685287476, "logits/rejected": -1.8674083948135376, "logps/chosen": -122.74143981933594, "logps/rejected": -148.19137573242188, "loss": 0.6133, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6179603338241577, "rewards/margins": 0.22583599388599396, "rewards/rejected": -0.8437963724136353, "step": 1940 }, { "epoch": 1.41, "grad_norm": 2.78125, "learning_rate": 1.2282632628065197e-06, "logits/chosen": -1.8630259037017822, "logits/rejected": -1.8686736822128296, "logps/chosen": -127.19625091552734, "logps/rejected": -152.7012939453125, "loss": 0.6144, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6396566033363342, "rewards/margins": 0.2267201840877533, "rewards/rejected": -0.8663768768310547, "step": 1950 }, { "epoch": 1.41, "grad_norm": 2.09375, "learning_rate": 1.2012739187246575e-06, "logits/chosen": -1.9101310968399048, "logits/rejected": -1.9150078296661377, "logps/chosen": -125.06834411621094, "logps/rejected": -152.13970947265625, "loss": 0.6079, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6242455840110779, "rewards/margins": 0.23204731941223145, "rewards/rejected": -0.8562929034233093, "step": 1960 }, { "epoch": 1.42, "grad_norm": 1.7890625, "learning_rate": 1.1744903163845578e-06, "logits/chosen": -1.9141194820404053, "logits/rejected": -1.9090967178344727, "logps/chosen": -125.41385650634766, "logps/rejected": -153.21505737304688, "loss": 0.6196, "rewards/accuracies": 0.625, "rewards/chosen": -0.6701093912124634, "rewards/margins": 0.23631341755390167, "rewards/rejected": -0.9064227938652039, "step": 1970 }, { "epoch": 1.43, "grad_norm": 2.3125, "learning_rate": 1.1479166987940981e-06, "logits/chosen": -1.9218595027923584, "logits/rejected": -1.9358152151107788, "logps/chosen": -121.7696533203125, "logps/rejected": -142.92947387695312, "loss": 0.6406, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6072500944137573, "rewards/margins": 0.1642817258834839, "rewards/rejected": -0.771531879901886, "step": 1980 }, { "epoch": 1.43, "grad_norm": 1.890625, "learning_rate": 1.121557275695771e-06, "logits/chosen": -1.8173805475234985, "logits/rejected": -1.8257992267608643, "logps/chosen": -123.26933288574219, "logps/rejected": -149.6290740966797, "loss": 0.6063, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6274382472038269, "rewards/margins": 0.2434215098619461, "rewards/rejected": -0.8708597421646118, "step": 1990 }, { "epoch": 1.44, "grad_norm": 2.734375, "learning_rate": 1.0954162228997778e-06, "logits/chosen": -1.944850206375122, "logits/rejected": -1.9471490383148193, "logps/chosen": -121.35750579833984, "logps/rejected": -149.9652557373047, "loss": 0.6078, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6109983325004578, "rewards/margins": 0.23012125492095947, "rewards/rejected": -0.8411195874214172, "step": 2000 }, { "epoch": 1.45, "grad_norm": 1.8359375, "learning_rate": 1.0694976816225072e-06, "logits/chosen": -1.931652307510376, "logits/rejected": -1.9369986057281494, "logps/chosen": -121.70970153808594, "logps/rejected": -147.0286102294922, "loss": 0.6173, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6017236709594727, "rewards/margins": 0.22414302825927734, "rewards/rejected": -0.82586669921875, "step": 2010 }, { "epoch": 1.46, "grad_norm": 2.015625, "learning_rate": 1.043805757830495e-06, "logits/chosen": -1.888380765914917, "logits/rejected": -1.898215889930725, "logps/chosen": -123.5376968383789, "logps/rejected": -143.40316772460938, "loss": 0.6269, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.5882707834243774, "rewards/margins": 0.17841866612434387, "rewards/rejected": -0.7666894793510437, "step": 2020 }, { "epoch": 1.46, "grad_norm": 1.921875, "learning_rate": 1.0183445215899585e-06, "logits/chosen": -1.9046001434326172, "logits/rejected": -1.8922739028930664, "logps/chosen": -119.24836730957031, "logps/rejected": -143.8057861328125, "loss": 0.6215, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5545870065689087, "rewards/margins": 0.2050396203994751, "rewards/rejected": -0.7596266269683838, "step": 2030 }, { "epoch": 1.47, "grad_norm": 2.578125, "learning_rate": 9.931180064220276e-07, "logits/chosen": -1.92236328125, "logits/rejected": -1.924556016921997, "logps/chosen": -136.11215209960938, "logps/rejected": -159.76925659179688, "loss": 0.6302, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6600568890571594, "rewards/margins": 0.19632843136787415, "rewards/rejected": -0.8563854098320007, "step": 2040 }, { "epoch": 1.48, "grad_norm": 1.671875, "learning_rate": 9.681302086637634e-07, "logits/chosen": -1.8995593786239624, "logits/rejected": -1.922876000404358, "logps/chosen": -134.8125457763672, "logps/rejected": -150.06996154785156, "loss": 0.6384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.647091805934906, "rewards/margins": 0.1497163623571396, "rewards/rejected": -0.796808123588562, "step": 2050 }, { "epoch": 1.48, "grad_norm": 1.40625, "learning_rate": 9.433850868350619e-07, "logits/chosen": -1.8294461965560913, "logits/rejected": -1.845920205116272, "logps/chosen": -116.8541259765625, "logps/rejected": -143.2628936767578, "loss": 0.6033, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5514515042304993, "rewards/margins": 0.2609653174877167, "rewards/rejected": -0.8124168515205383, "step": 2060 }, { "epoch": 1.49, "grad_norm": 1.953125, "learning_rate": 9.188865610115572e-07, "logits/chosen": -1.921491265296936, "logits/rejected": -1.93572998046875, "logps/chosen": -126.38240814208984, "logps/rejected": -145.36019897460938, "loss": 0.6306, "rewards/accuracies": 0.625, "rewards/chosen": -0.5697722434997559, "rewards/margins": 0.17015670239925385, "rewards/rejected": -0.7399289608001709, "step": 2070 }, { "epoch": 1.5, "grad_norm": 1.7890625, "learning_rate": 8.946385122036066e-07, "logits/chosen": -1.8846461772918701, "logits/rejected": -1.8940550088882446, "logps/chosen": -121.21919250488281, "logps/rejected": -144.52047729492188, "loss": 0.6212, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5412122011184692, "rewards/margins": 0.19210803508758545, "rewards/rejected": -0.7333202958106995, "step": 2080 }, { "epoch": 1.51, "grad_norm": 1.8125, "learning_rate": 8.706447817414696e-07, "logits/chosen": -1.9248275756835938, "logits/rejected": -1.922368049621582, "logps/chosen": -127.80845642089844, "logps/rejected": -151.72030639648438, "loss": 0.6227, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.6831592917442322, "rewards/margins": 0.2034546136856079, "rewards/rejected": -0.8866138458251953, "step": 2090 }, { "epoch": 1.51, "grad_norm": 1.5390625, "learning_rate": 8.469091706667748e-07, "logits/chosen": -1.8915945291519165, "logits/rejected": -1.895379662513733, "logps/chosen": -122.15771484375, "logps/rejected": -147.4517364501953, "loss": 0.6135, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6304605007171631, "rewards/margins": 0.22080358862876892, "rewards/rejected": -0.8512641191482544, "step": 2100 }, { "epoch": 1.52, "grad_norm": 1.96875, "learning_rate": 8.234354391303606e-07, "logits/chosen": -1.8591235876083374, "logits/rejected": -1.8553224802017212, "logps/chosen": -124.28314208984375, "logps/rejected": -152.6465301513672, "loss": 0.6102, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6334540247917175, "rewards/margins": 0.24212419986724854, "rewards/rejected": -0.8755782246589661, "step": 2110 }, { "epoch": 1.53, "grad_norm": 2.15625, "learning_rate": 8.002273057966012e-07, "logits/chosen": -1.8992531299591064, "logits/rejected": -1.9243097305297852, "logps/chosen": -128.35739135742188, "logps/rejected": -149.07540893554688, "loss": 0.6183, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5612360835075378, "rewards/margins": 0.19729313254356384, "rewards/rejected": -0.7585291862487793, "step": 2120 }, { "epoch": 1.53, "grad_norm": 2.015625, "learning_rate": 7.772884472543066e-07, "logits/chosen": -1.9013763666152954, "logits/rejected": -1.9246801137924194, "logps/chosen": -124.08580017089844, "logps/rejected": -140.3448028564453, "loss": 0.6467, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.5578123927116394, "rewards/margins": 0.1421774923801422, "rewards/rejected": -0.699989914894104, "step": 2130 }, { "epoch": 1.54, "grad_norm": 2.09375, "learning_rate": 7.546224974342775e-07, "logits/chosen": -1.9061437845230103, "logits/rejected": -1.899762749671936, "logps/chosen": -136.278076171875, "logps/rejected": -161.5443115234375, "loss": 0.6124, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6313563585281372, "rewards/margins": 0.23241499066352844, "rewards/rejected": -0.8637714385986328, "step": 2140 }, { "epoch": 1.55, "grad_norm": 2.28125, "learning_rate": 7.322330470336314e-07, "logits/chosen": -1.919785499572754, "logits/rejected": -1.9172786474227905, "logps/chosen": -130.88584899902344, "logps/rejected": -155.8290252685547, "loss": 0.627, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6158424019813538, "rewards/margins": 0.20419493317604065, "rewards/rejected": -0.8200373649597168, "step": 2150 }, { "epoch": 1.56, "grad_norm": 2.6875, "learning_rate": 7.10123642946966e-07, "logits/chosen": -1.9181368350982666, "logits/rejected": -1.9378869533538818, "logps/chosen": -125.4980239868164, "logps/rejected": -148.65103149414062, "loss": 0.6113, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5430660247802734, "rewards/margins": 0.2324432134628296, "rewards/rejected": -0.775509238243103, "step": 2160 }, { "epoch": 1.56, "grad_norm": 2.296875, "learning_rate": 6.882977877044691e-07, "logits/chosen": -1.9170925617218018, "logits/rejected": -1.9317693710327148, "logps/chosen": -118.6207504272461, "logps/rejected": -140.1538543701172, "loss": 0.6322, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.576205313205719, "rewards/margins": 0.18266887962818146, "rewards/rejected": -0.758874237537384, "step": 2170 }, { "epoch": 1.57, "grad_norm": 2.0625, "learning_rate": 6.667589389170561e-07, "logits/chosen": -1.913522720336914, "logits/rejected": -1.9155197143554688, "logps/chosen": -127.30131530761719, "logps/rejected": -149.8275909423828, "loss": 0.6301, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5541995763778687, "rewards/margins": 0.18559743463993073, "rewards/rejected": -0.7397969365119934, "step": 2180 }, { "epoch": 1.58, "grad_norm": 1.9921875, "learning_rate": 6.455105087286173e-07, "logits/chosen": -1.9797407388687134, "logits/rejected": -1.9776780605316162, "logps/chosen": -130.6244659423828, "logps/rejected": -150.9293670654297, "loss": 0.6455, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6223429441452026, "rewards/margins": 0.1531866490840912, "rewards/rejected": -0.775529682636261, "step": 2190 }, { "epoch": 1.59, "grad_norm": 1.3359375, "learning_rate": 6.245558632754778e-07, "logits/chosen": -1.8683338165283203, "logits/rejected": -1.8908354043960571, "logps/chosen": -125.47029876708984, "logps/rejected": -154.7410125732422, "loss": 0.5925, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5848723649978638, "rewards/margins": 0.2577180862426758, "rewards/rejected": -0.8425905108451843, "step": 2200 }, { "epoch": 1.59, "grad_norm": 1.9375, "learning_rate": 6.038983221531353e-07, "logits/chosen": -1.9424070119857788, "logits/rejected": -1.9472051858901978, "logps/chosen": -120.87110900878906, "logps/rejected": -145.4131622314453, "loss": 0.604, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5482410192489624, "rewards/margins": 0.23777303099632263, "rewards/rejected": -0.7860140204429626, "step": 2210 }, { "epoch": 1.6, "grad_norm": 1.9296875, "learning_rate": 5.83541157890379e-07, "logits/chosen": -1.9903781414031982, "logits/rejected": -2.0020580291748047, "logps/chosen": -123.49699401855469, "logps/rejected": -151.511474609375, "loss": 0.6134, "rewards/accuracies": 0.625, "rewards/chosen": -0.501916766166687, "rewards/margins": 0.23306772112846375, "rewards/rejected": -0.7349845170974731, "step": 2220 }, { "epoch": 1.61, "grad_norm": 2.46875, "learning_rate": 5.634875954308638e-07, "logits/chosen": -1.9178415536880493, "logits/rejected": -1.908630609512329, "logps/chosen": -129.9990692138672, "logps/rejected": -152.32931518554688, "loss": 0.6365, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6379269361495972, "rewards/margins": 0.18584506213665009, "rewards/rejected": -0.8237720727920532, "step": 2230 }, { "epoch": 1.61, "grad_norm": 1.6953125, "learning_rate": 5.437408116222148e-07, "logits/chosen": -1.8094866275787354, "logits/rejected": -1.8253847360610962, "logps/chosen": -115.37788391113281, "logps/rejected": -147.9441375732422, "loss": 0.5973, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5823795199394226, "rewards/margins": 0.27050092816352844, "rewards/rejected": -0.852880597114563, "step": 2240 }, { "epoch": 1.62, "grad_norm": 2.234375, "learning_rate": 5.243039347127621e-07, "logits/chosen": -1.9520610570907593, "logits/rejected": -1.9586107730865479, "logps/chosen": -133.17977905273438, "logps/rejected": -154.48606872558594, "loss": 0.6407, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6545882225036621, "rewards/margins": 0.17907670140266418, "rewards/rejected": -0.8336648941040039, "step": 2250 }, { "epoch": 1.63, "grad_norm": 1.2421875, "learning_rate": 5.05180043855969e-07, "logits/chosen": -1.8850457668304443, "logits/rejected": -1.895381212234497, "logps/chosen": -113.73271179199219, "logps/rejected": -137.9407501220703, "loss": 0.6167, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5211442708969116, "rewards/margins": 0.21343907713890076, "rewards/rejected": -0.7345833778381348, "step": 2260 }, { "epoch": 1.64, "grad_norm": 1.8515625, "learning_rate": 4.86372168622635e-07, "logits/chosen": -1.8701765537261963, "logits/rejected": -1.897936463356018, "logps/chosen": -124.44834899902344, "logps/rejected": -146.36325073242188, "loss": 0.6207, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.6087551116943359, "rewards/margins": 0.192201167345047, "rewards/rejected": -0.8009563684463501, "step": 2270 }, { "epoch": 1.64, "grad_norm": 2.359375, "learning_rate": 4.678832885209622e-07, "logits/chosen": -1.9065357446670532, "logits/rejected": -1.9053528308868408, "logps/chosen": -133.0020294189453, "logps/rejected": -151.4831085205078, "loss": 0.6418, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6242747902870178, "rewards/margins": 0.15634949505329132, "rewards/rejected": -0.7806242108345032, "step": 2280 }, { "epoch": 1.65, "grad_norm": 2.03125, "learning_rate": 4.497163325245416e-07, "logits/chosen": -1.869490623474121, "logits/rejected": -1.8728406429290771, "logps/chosen": -129.19412231445312, "logps/rejected": -148.17112731933594, "loss": 0.6197, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.601394534111023, "rewards/margins": 0.20290544629096985, "rewards/rejected": -0.8042998313903809, "step": 2290 }, { "epoch": 1.66, "grad_norm": 2.34375, "learning_rate": 4.3187417860835386e-07, "logits/chosen": -1.8597100973129272, "logits/rejected": -1.8595672845840454, "logps/chosen": -123.48005676269531, "logps/rejected": -146.0297088623047, "loss": 0.6135, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6197245717048645, "rewards/margins": 0.22459180653095245, "rewards/rejected": -0.8443164825439453, "step": 2300 }, { "epoch": 1.66, "grad_norm": 1.7265625, "learning_rate": 4.143596532928468e-07, "logits/chosen": -1.8806402683258057, "logits/rejected": -1.901450753211975, "logps/chosen": -121.08549499511719, "logps/rejected": -143.3876190185547, "loss": 0.6214, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.48497408628463745, "rewards/margins": 0.20996761322021484, "rewards/rejected": -0.6949416995048523, "step": 2310 }, { "epoch": 1.67, "grad_norm": 1.9921875, "learning_rate": 3.971755311961606e-07, "logits/chosen": -1.9731667041778564, "logits/rejected": -1.9899402856826782, "logps/chosen": -119.02622985839844, "logps/rejected": -144.31222534179688, "loss": 0.6119, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5730189681053162, "rewards/margins": 0.22726324200630188, "rewards/rejected": -0.8002821207046509, "step": 2320 }, { "epoch": 1.68, "grad_norm": 2.609375, "learning_rate": 3.8032453459457884e-07, "logits/chosen": -1.8654229640960693, "logits/rejected": -1.8797037601470947, "logps/chosen": -130.2448272705078, "logps/rejected": -158.59487915039062, "loss": 0.6099, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.7119914293289185, "rewards/margins": 0.24810293316841125, "rewards/rejected": -0.9600943326950073, "step": 2330 }, { "epoch": 1.69, "grad_norm": 1.5234375, "learning_rate": 3.6380933299127285e-07, "logits/chosen": -1.9288314580917358, "logits/rejected": -1.943996787071228, "logps/chosen": -119.17585754394531, "logps/rejected": -147.80038452148438, "loss": 0.6022, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5984092950820923, "rewards/margins": 0.2575072944164276, "rewards/rejected": -0.8559166193008423, "step": 2340 }, { "epoch": 1.69, "grad_norm": 1.515625, "learning_rate": 3.4763254269339965e-07, "logits/chosen": -1.8540977239608765, "logits/rejected": -1.8645613193511963, "logps/chosen": -138.80377197265625, "logps/rejected": -159.77102661132812, "loss": 0.621, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6650180816650391, "rewards/margins": 0.20767009258270264, "rewards/rejected": -0.8726881146430969, "step": 2350 }, { "epoch": 1.7, "grad_norm": 2.296875, "learning_rate": 3.3179672639763737e-07, "logits/chosen": -1.9465539455413818, "logits/rejected": -1.9527965784072876, "logps/chosen": -113.87541198730469, "logps/rejected": -147.63290405273438, "loss": 0.5874, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.52118980884552, "rewards/margins": 0.291698157787323, "rewards/rejected": -0.812887966632843, "step": 2360 }, { "epoch": 1.71, "grad_norm": 2.03125, "learning_rate": 3.163043927842019e-07, "logits/chosen": -1.9162803888320923, "logits/rejected": -1.934597373008728, "logps/chosen": -128.00485229492188, "logps/rejected": -146.68711853027344, "loss": 0.6268, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.596336841583252, "rewards/margins": 0.18232461810112, "rewards/rejected": -0.7786614298820496, "step": 2370 }, { "epoch": 1.72, "grad_norm": 1.453125, "learning_rate": 3.011579961194286e-07, "logits/chosen": -1.956756591796875, "logits/rejected": -1.9522291421890259, "logps/chosen": -130.30319213867188, "logps/rejected": -157.8980712890625, "loss": 0.61, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6217106580734253, "rewards/margins": 0.24674446880817413, "rewards/rejected": -0.8684550523757935, "step": 2380 }, { "epoch": 1.72, "grad_norm": 2.109375, "learning_rate": 2.8635993586697555e-07, "logits/chosen": -1.8693218231201172, "logits/rejected": -1.8768870830535889, "logps/chosen": -117.62882995605469, "logps/rejected": -141.52174377441406, "loss": 0.6162, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.554486870765686, "rewards/margins": 0.21078386902809143, "rewards/rejected": -0.7652707695960999, "step": 2390 }, { "epoch": 1.73, "grad_norm": 2.125, "learning_rate": 2.7191255630769855e-07, "logits/chosen": -1.9183435440063477, "logits/rejected": -1.9085958003997803, "logps/chosen": -131.42051696777344, "logps/rejected": -160.8799591064453, "loss": 0.5994, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6411622762680054, "rewards/margins": 0.2594824433326721, "rewards/rejected": -0.9006446599960327, "step": 2400 }, { "epoch": 1.74, "grad_norm": 2.09375, "learning_rate": 2.5781814616827936e-07, "logits/chosen": -1.9339672327041626, "logits/rejected": -1.9259040355682373, "logps/chosen": -126.9478530883789, "logps/rejected": -150.4621124267578, "loss": 0.6299, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6331970691680908, "rewards/margins": 0.18595722317695618, "rewards/rejected": -0.8191541433334351, "step": 2410 }, { "epoch": 1.74, "grad_norm": 2.046875, "learning_rate": 2.4407893825864893e-07, "logits/chosen": -1.8841025829315186, "logits/rejected": -1.9010101556777954, "logps/chosen": -123.89164733886719, "logps/rejected": -148.62289428710938, "loss": 0.6121, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6260435581207275, "rewards/margins": 0.23781859874725342, "rewards/rejected": -0.863862156867981, "step": 2420 }, { "epoch": 1.75, "grad_norm": 2.5, "learning_rate": 2.3069710911826858e-07, "logits/chosen": -1.873400092124939, "logits/rejected": -1.8696212768554688, "logps/chosen": -131.77992248535156, "logps/rejected": -158.84988403320312, "loss": 0.6255, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.671171247959137, "rewards/margins": 0.2179645597934723, "rewards/rejected": -0.8891357183456421, "step": 2430 }, { "epoch": 1.76, "grad_norm": 1.5703125, "learning_rate": 2.176747786713282e-07, "logits/chosen": -1.8760631084442139, "logits/rejected": -1.8778858184814453, "logps/chosen": -125.80195617675781, "logps/rejected": -149.81651306152344, "loss": 0.6215, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6132515072822571, "rewards/margins": 0.2072262316942215, "rewards/rejected": -0.8204777836799622, "step": 2440 }, { "epoch": 1.77, "grad_norm": 2.859375, "learning_rate": 2.0501400989091036e-07, "logits/chosen": -1.9188148975372314, "logits/rejected": -1.9283740520477295, "logps/chosen": -126.12955474853516, "logps/rejected": -145.44830322265625, "loss": 0.6261, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5981575846672058, "rewards/margins": 0.18680432438850403, "rewards/rejected": -0.7849618792533875, "step": 2450 }, { "epoch": 1.77, "grad_norm": 3.578125, "learning_rate": 1.927168084721795e-07, "logits/chosen": -1.9040225744247437, "logits/rejected": -1.918349027633667, "logps/chosen": -121.2413558959961, "logps/rejected": -146.60833740234375, "loss": 0.6096, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5998507738113403, "rewards/margins": 0.24554471671581268, "rewards/rejected": -0.8453954458236694, "step": 2460 }, { "epoch": 1.78, "grad_norm": 1.6796875, "learning_rate": 1.8078512251464285e-07, "logits/chosen": -1.9655431509017944, "logits/rejected": -1.9582946300506592, "logps/chosen": -131.67860412597656, "logps/rejected": -152.63572692871094, "loss": 0.6273, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5897047519683838, "rewards/margins": 0.19095389544963837, "rewards/rejected": -0.7806587219238281, "step": 2470 }, { "epoch": 1.79, "grad_norm": 2.421875, "learning_rate": 1.6922084221353607e-07, "logits/chosen": -1.915435791015625, "logits/rejected": -1.941033124923706, "logps/chosen": -124.4185562133789, "logps/rejected": -150.6072540283203, "loss": 0.6078, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5871225595474243, "rewards/margins": 0.2428620308637619, "rewards/rejected": -0.8299845457077026, "step": 2480 }, { "epoch": 1.79, "grad_norm": 1.7109375, "learning_rate": 1.5802579956038093e-07, "logits/chosen": -1.8886842727661133, "logits/rejected": -1.9026222229003906, "logps/chosen": -115.16085052490234, "logps/rejected": -143.67367553710938, "loss": 0.5948, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5255457758903503, "rewards/margins": 0.26099538803100586, "rewards/rejected": -0.7865411639213562, "step": 2490 }, { "epoch": 1.8, "grad_norm": 2.453125, "learning_rate": 1.472017680527685e-07, "logits/chosen": -1.9219309091567993, "logits/rejected": -1.9138708114624023, "logps/chosen": -121.44775390625, "logps/rejected": -151.65969848632812, "loss": 0.6042, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6054600477218628, "rewards/margins": 0.24131233990192413, "rewards/rejected": -0.846772313117981, "step": 2500 }, { "epoch": 1.81, "grad_norm": 2.515625, "learning_rate": 1.3675046241339918e-07, "logits/chosen": -1.8917341232299805, "logits/rejected": -1.9025049209594727, "logps/chosen": -127.67252349853516, "logps/rejected": -148.03077697753906, "loss": 0.6303, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5955443978309631, "rewards/margins": 0.18720856308937073, "rewards/rejected": -0.7827528715133667, "step": 2510 }, { "epoch": 1.82, "grad_norm": 1.609375, "learning_rate": 1.2667353831844585e-07, "logits/chosen": -1.83237624168396, "logits/rejected": -1.8439161777496338, "logps/chosen": -128.77362060546875, "logps/rejected": -150.86912536621094, "loss": 0.62, "rewards/accuracies": 0.625, "rewards/chosen": -0.6072179675102234, "rewards/margins": 0.20041854679584503, "rewards/rejected": -0.807636559009552, "step": 2520 }, { "epoch": 1.82, "grad_norm": 1.8359375, "learning_rate": 1.1697259213525936e-07, "logits/chosen": -1.8945062160491943, "logits/rejected": -1.8865067958831787, "logps/chosen": -113.77557373046875, "logps/rejected": -145.04769897460938, "loss": 0.5948, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5280709266662598, "rewards/margins": 0.2756304144859314, "rewards/rejected": -0.8037012815475464, "step": 2530 }, { "epoch": 1.83, "grad_norm": 2.078125, "learning_rate": 1.0764916066947795e-07, "logits/chosen": -1.801983118057251, "logits/rejected": -1.791394829750061, "logps/chosen": -131.66268920898438, "logps/rejected": -166.57620239257812, "loss": 0.5934, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7564202547073364, "rewards/margins": 0.3014541268348694, "rewards/rejected": -1.057874321937561, "step": 2540 }, { "epoch": 1.84, "grad_norm": 1.640625, "learning_rate": 9.870472092156941e-08, "logits/chosen": -1.8863308429718018, "logits/rejected": -1.8997328281402588, "logps/chosen": -120.3888931274414, "logps/rejected": -147.81582641601562, "loss": 0.6151, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6340813040733337, "rewards/margins": 0.23104877769947052, "rewards/rejected": -0.8651300668716431, "step": 2550 }, { "epoch": 1.84, "grad_norm": 2.140625, "learning_rate": 9.014068985284618e-08, "logits/chosen": -1.8636146783828735, "logits/rejected": -1.8543964624404907, "logps/chosen": -123.03385162353516, "logps/rejected": -138.28298950195312, "loss": 0.6387, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5915199518203735, "rewards/margins": 0.1568307727575302, "rewards/rejected": -0.7483507394790649, "step": 2560 }, { "epoch": 1.85, "grad_norm": 2.078125, "learning_rate": 8.19584241609936e-08, "logits/chosen": -1.983232855796814, "logits/rejected": -1.9851760864257812, "logps/chosen": -132.586669921875, "logps/rejected": -164.07855224609375, "loss": 0.5915, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.6490141153335571, "rewards/margins": 0.2791653275489807, "rewards/rejected": -0.9281795620918274, "step": 2570 }, { "epoch": 1.86, "grad_norm": 2.25, "learning_rate": 7.415922006514448e-08, "logits/chosen": -1.888055443763733, "logits/rejected": -1.9014968872070312, "logps/chosen": -122.8010482788086, "logps/rejected": -147.2124786376953, "loss": 0.6102, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.582960307598114, "rewards/margins": 0.22722116112709045, "rewards/rejected": -0.8101814985275269, "step": 2580 }, { "epoch": 1.87, "grad_norm": 1.9140625, "learning_rate": 6.674431310053519e-08, "logits/chosen": -1.9087717533111572, "logits/rejected": -1.9133113622665405, "logps/chosen": -117.87506103515625, "logps/rejected": -142.63198852539062, "loss": 0.6151, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.5861650109291077, "rewards/margins": 0.21530351042747498, "rewards/rejected": -0.8014683723449707, "step": 2590 }, { "epoch": 1.87, "grad_norm": 1.625, "learning_rate": 5.971487792277297e-08, "logits/chosen": -1.927983045578003, "logits/rejected": -1.9456886053085327, "logps/chosen": -117.11873626708984, "logps/rejected": -138.4589385986328, "loss": 0.6214, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5543134212493896, "rewards/margins": 0.19218505918979645, "rewards/rejected": -0.7464984655380249, "step": 2600 }, { "epoch": 1.88, "grad_norm": 1.8984375, "learning_rate": 5.307202812175005e-08, "logits/chosen": -1.8637211322784424, "logits/rejected": -1.878003716468811, "logps/chosen": -124.0638656616211, "logps/rejected": -146.49771118164062, "loss": 0.6155, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6212812662124634, "rewards/margins": 0.21200039982795715, "rewards/rejected": -0.8332816362380981, "step": 2610 }, { "epoch": 1.89, "grad_norm": 1.7421875, "learning_rate": 4.681681604523064e-08, "logits/chosen": -1.877753496170044, "logits/rejected": -1.8904426097869873, "logps/chosen": -126.96830749511719, "logps/rejected": -155.18118286132812, "loss": 0.6053, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.6443455815315247, "rewards/margins": 0.26543739438056946, "rewards/rejected": -0.9097830057144165, "step": 2620 }, { "epoch": 1.9, "grad_norm": 1.625, "learning_rate": 4.0950232632141205e-08, "logits/chosen": -1.9946720600128174, "logits/rejected": -2.0097415447235107, "logps/chosen": -134.808349609375, "logps/rejected": -155.46096801757812, "loss": 0.622, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.6130391359329224, "rewards/margins": 0.19625858962535858, "rewards/rejected": -0.8092976808547974, "step": 2630 }, { "epoch": 1.9, "grad_norm": 2.015625, "learning_rate": 3.547320725558495e-08, "logits/chosen": -1.8689781427383423, "logits/rejected": -1.8868701457977295, "logps/chosen": -127.9247817993164, "logps/rejected": -149.6826629638672, "loss": 0.6185, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6171109676361084, "rewards/margins": 0.20821337401866913, "rewards/rejected": -0.8253243565559387, "step": 2640 }, { "epoch": 1.91, "grad_norm": 1.3828125, "learning_rate": 3.038660757561568e-08, "logits/chosen": -1.9256298542022705, "logits/rejected": -1.9275277853012085, "logps/chosen": -131.28964233398438, "logps/rejected": -156.5281524658203, "loss": 0.6136, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5464403629302979, "rewards/margins": 0.2162129133939743, "rewards/rejected": -0.7626532316207886, "step": 2650 }, { "epoch": 1.92, "grad_norm": 2.1875, "learning_rate": 2.569123940178192e-08, "logits/chosen": -1.8706880807876587, "logits/rejected": -1.895275354385376, "logps/chosen": -126.8041000366211, "logps/rejected": -152.7987060546875, "loss": 0.6077, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.6154114007949829, "rewards/margins": 0.23175501823425293, "rewards/rejected": -0.8471664190292358, "step": 2660 }, { "epoch": 1.92, "grad_norm": 2.4375, "learning_rate": 2.1387846565474047e-08, "logits/chosen": -1.9511226415634155, "logits/rejected": -1.9575185775756836, "logps/chosen": -122.9830322265625, "logps/rejected": -150.94505310058594, "loss": 0.6051, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6026027798652649, "rewards/margins": 0.24615421891212463, "rewards/rejected": -0.8487569689750671, "step": 2670 }, { "epoch": 1.93, "grad_norm": 1.375, "learning_rate": 1.7477110802086583e-08, "logits/chosen": -1.8935045003890991, "logits/rejected": -1.9025996923446655, "logps/chosen": -128.3074188232422, "logps/rejected": -151.99522399902344, "loss": 0.6256, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6137545704841614, "rewards/margins": 0.19285576045513153, "rewards/rejected": -0.8066104054450989, "step": 2680 }, { "epoch": 1.94, "grad_norm": 2.0, "learning_rate": 1.3959651643019601e-08, "logits/chosen": -1.9040815830230713, "logits/rejected": -1.9198287725448608, "logps/chosen": -127.39599609375, "logps/rejected": -146.60008239746094, "loss": 0.6284, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5616117119789124, "rewards/margins": 0.1830439418554306, "rewards/rejected": -0.7446557283401489, "step": 2690 }, { "epoch": 1.95, "grad_norm": 2.390625, "learning_rate": 1.0836026317533887e-08, "logits/chosen": -1.925762414932251, "logits/rejected": -1.9249995946884155, "logps/chosen": -134.05189514160156, "logps/rejected": -150.71640014648438, "loss": 0.6467, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.6878820657730103, "rewards/margins": 0.15121865272521973, "rewards/rejected": -0.83910071849823, "step": 2700 }, { "epoch": 1.95, "grad_norm": 2.453125, "learning_rate": 8.106729664475178e-09, "logits/chosen": -1.9120187759399414, "logits/rejected": -1.9409675598144531, "logps/chosen": -126.06358337402344, "logps/rejected": -143.04531860351562, "loss": 0.6437, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.599898636341095, "rewards/margins": 0.15934984385967255, "rewards/rejected": -0.7592484951019287, "step": 2710 }, { "epoch": 1.96, "grad_norm": 1.78125, "learning_rate": 5.772194053882962e-09, "logits/chosen": -1.8914750814437866, "logits/rejected": -1.8805005550384521, "logps/chosen": -124.221435546875, "logps/rejected": -154.97128295898438, "loss": 0.6014, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.6593270301818848, "rewards/margins": 0.2628743350505829, "rewards/rejected": -0.9222013354301453, "step": 2720 }, { "epoch": 1.97, "grad_norm": 1.3984375, "learning_rate": 3.832789318495289e-09, "logits/chosen": -1.9097976684570312, "logits/rejected": -1.9211629629135132, "logps/chosen": -117.1760025024414, "logps/rejected": -139.37118530273438, "loss": 0.6256, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.5425055027008057, "rewards/margins": 0.20210960507392883, "rewards/rejected": -0.7446150779724121, "step": 2730 }, { "epoch": 1.97, "grad_norm": 3.078125, "learning_rate": 2.288822695160897e-09, "logits/chosen": -1.8931448459625244, "logits/rejected": -1.8963727951049805, "logps/chosen": -138.54782104492188, "logps/rejected": -168.485595703125, "loss": 0.6046, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7985516786575317, "rewards/margins": 0.25337541103363037, "rewards/rejected": -1.051927089691162, "step": 2740 }, { "epoch": 1.98, "grad_norm": 1.515625, "learning_rate": 1.1405387761664888e-09, "logits/chosen": -1.9339786767959595, "logits/rejected": -1.9223487377166748, "logps/chosen": -120.66938781738281, "logps/rejected": -145.23965454101562, "loss": 0.6159, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.5394836664199829, "rewards/margins": 0.20085513591766357, "rewards/rejected": -0.7403386831283569, "step": 2750 }, { "epoch": 1.99, "grad_norm": 2.140625, "learning_rate": 3.8811947048994494e-10, "logits/chosen": -1.9571815729141235, "logits/rejected": -1.9642584323883057, "logps/chosen": -129.06796264648438, "logps/rejected": -154.98562622070312, "loss": 0.6102, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6091276407241821, "rewards/margins": 0.23150965571403503, "rewards/rejected": -0.8406373262405396, "step": 2760 }, { "epoch": 2.0, "grad_norm": 2.453125, "learning_rate": 3.168397498115594e-11, "logits/chosen": -1.903794288635254, "logits/rejected": -1.9025567770004272, "logps/chosen": -130.3432159423828, "logps/rejected": -151.00790405273438, "loss": 0.6335, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6614962220191956, "rewards/margins": 0.1874585896730423, "rewards/rejected": -0.8489547967910767, "step": 2770 }, { "epoch": 2.0, "step": 2774, "total_flos": 0.0, "train_loss": 0.6373478753496264, "train_runtime": 5106.1264, "train_samples_per_second": 8.696, "train_steps_per_second": 0.543 } ], "logging_steps": 10, "max_steps": 2774, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }