{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 16.86738074547546, "learning_rate": 1.199040767386091e-10, "logits/chosen": -1.901450514793396, "logits/rejected": -1.9076323509216309, "logps/chosen": -0.8524526953697205, "logps/rejected": -0.9626365900039673, "loss": 1.6316, "rewards/accuracies": 0.5, "rewards/chosen": -1.704905390739441, "rewards/margins": 0.22036786377429962, "rewards/rejected": -1.9252731800079346, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 20.613645580428617, "learning_rate": 1.199040767386091e-09, "logits/chosen": -2.0206820964813232, "logits/rejected": -2.0064265727996826, "logps/chosen": -1.0048619508743286, "logps/rejected": -1.109930157661438, "loss": 1.6537, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -2.0097239017486572, "rewards/margins": 0.2101363092660904, "rewards/rejected": -2.219860315322876, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 26.0197713502751, "learning_rate": 2.398081534772182e-09, "logits/chosen": -2.021113634109497, "logits/rejected": -2.0176849365234375, "logps/chosen": -1.051918387413025, "logps/rejected": -1.1836382150650024, "loss": 1.6165, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.10383677482605, "rewards/margins": 0.26343920826911926, "rewards/rejected": -2.367276430130005, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 20.43768761976058, "learning_rate": 3.597122302158273e-09, "logits/chosen": -1.9865376949310303, "logits/rejected": -1.9792907238006592, "logps/chosen": -1.0540201663970947, "logps/rejected": -1.151362657546997, "loss": 1.6718, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1080403327941895, "rewards/margins": 0.19468507170677185, "rewards/rejected": -2.302725315093994, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 22.637677196780825, "learning_rate": 4.796163069544364e-09, "logits/chosen": -2.0319294929504395, "logits/rejected": -2.0319533348083496, "logps/chosen": -1.036224603652954, "logps/rejected": -1.1371500492095947, "loss": 1.6753, "rewards/accuracies": 0.59375, "rewards/chosen": -2.072449207305908, "rewards/margins": 0.20185065269470215, "rewards/rejected": -2.2743000984191895, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 17.18417481978632, "learning_rate": 5.995203836930456e-09, "logits/chosen": -1.9625972509384155, "logits/rejected": -1.9633257389068604, "logps/chosen": -0.9420710802078247, "logps/rejected": -1.0080515146255493, "loss": 1.7036, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8841421604156494, "rewards/margins": 0.13196071982383728, "rewards/rejected": -2.0161030292510986, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 24.35672110993842, "learning_rate": 7.194244604316546e-09, "logits/chosen": -2.0340707302093506, "logits/rejected": -2.029440402984619, "logps/chosen": -1.0892889499664307, "logps/rejected": -1.1459773778915405, "loss": 1.7177, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1785778999328613, "rewards/margins": 0.11337709426879883, "rewards/rejected": -2.291954755783081, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 23.156815234579298, "learning_rate": 8.393285371702639e-09, "logits/chosen": -2.0260024070739746, "logits/rejected": -2.01363468170166, "logps/chosen": -1.1102702617645264, "logps/rejected": -1.2045681476593018, "loss": 1.6674, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.2205405235290527, "rewards/margins": 0.18859562277793884, "rewards/rejected": -2.4091362953186035, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 28.36871838668304, "learning_rate": 9.592326139088728e-09, "logits/chosen": -2.0396475791931152, "logits/rejected": -2.036716938018799, "logps/chosen": -1.1660326719284058, "logps/rejected": -1.2376497983932495, "loss": 1.7002, "rewards/accuracies": 0.5625, "rewards/chosen": -2.3320653438568115, "rewards/margins": 0.1432342827320099, "rewards/rejected": -2.475299596786499, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 18.084958262698944, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -2.006788730621338, "logits/rejected": -2.0083227157592773, "logps/chosen": -1.0416046380996704, "logps/rejected": -1.1487059593200684, "loss": 1.6524, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.083209276199341, "rewards/margins": 0.21420249342918396, "rewards/rejected": -2.2974119186401367, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 21.40317389126799, "learning_rate": 1.1990407673860912e-08, "logits/chosen": -2.0494165420532227, "logits/rejected": -2.043391704559326, "logps/chosen": -1.0069491863250732, "logps/rejected": -1.1141040325164795, "loss": 1.6541, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0138983726501465, "rewards/margins": 0.2143097221851349, "rewards/rejected": -2.228208065032959, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 18.423852000519574, "learning_rate": 1.3189448441247003e-08, "logits/chosen": -1.9877067804336548, "logits/rejected": -1.9766420125961304, "logps/chosen": -1.029048204421997, "logps/rejected": -1.1286016702651978, "loss": 1.6653, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.058096408843994, "rewards/margins": 0.1991068422794342, "rewards/rejected": -2.2572033405303955, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 20.702019283446507, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -1.9706007242202759, "logits/rejected": -1.9688093662261963, "logps/chosen": -0.9641367197036743, "logps/rejected": -1.0660324096679688, "loss": 1.6479, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9282734394073486, "rewards/margins": 0.20379126071929932, "rewards/rejected": -2.1320648193359375, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 20.15601276103363, "learning_rate": 1.5587529976019183e-08, "logits/chosen": -2.065584659576416, "logits/rejected": -2.0650219917297363, "logps/chosen": -1.080052137374878, "logps/rejected": -1.152573585510254, "loss": 1.7001, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.160104274749756, "rewards/margins": 0.14504289627075195, "rewards/rejected": -2.305147171020508, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 23.99550514508731, "learning_rate": 1.6786570743405277e-08, "logits/chosen": -1.9799762964248657, "logits/rejected": -1.9734480381011963, "logps/chosen": -0.9775880575180054, "logps/rejected": -1.1228140592575073, "loss": 1.5978, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9551761150360107, "rewards/margins": 0.2904522120952606, "rewards/rejected": -2.2456281185150146, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 22.89932823746954, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -1.9970951080322266, "logits/rejected": -1.9926828145980835, "logps/chosen": -1.0191056728363037, "logps/rejected": -1.136625051498413, "loss": 1.6402, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0382113456726074, "rewards/margins": 0.23503896594047546, "rewards/rejected": -2.273250102996826, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 20.462749941454653, "learning_rate": 1.9184652278177456e-08, "logits/chosen": -2.0059406757354736, "logits/rejected": -1.9999682903289795, "logps/chosen": -0.9481186866760254, "logps/rejected": -1.0966413021087646, "loss": 1.5797, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8962373733520508, "rewards/margins": 0.2970453202724457, "rewards/rejected": -2.1932826042175293, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 25.008654214237108, "learning_rate": 2.038369304556355e-08, "logits/chosen": -2.0040414333343506, "logits/rejected": -1.996482491493225, "logps/chosen": -1.0370402336120605, "logps/rejected": -1.1610687971115112, "loss": 1.636, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.074080467224121, "rewards/margins": 0.24805748462677002, "rewards/rejected": -2.3221375942230225, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 26.23079891121696, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -2.0341007709503174, "logits/rejected": -2.0272812843322754, "logps/chosen": -1.0205583572387695, "logps/rejected": -1.108747124671936, "loss": 1.6873, "rewards/accuracies": 0.53125, "rewards/chosen": -2.041116714477539, "rewards/margins": 0.17637746036052704, "rewards/rejected": -2.217494249343872, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 25.948781935914305, "learning_rate": 2.278177458033573e-08, "logits/chosen": -2.072432041168213, "logits/rejected": -2.070256471633911, "logps/chosen": -0.9695008397102356, "logps/rejected": -1.0653893947601318, "loss": 1.654, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9390016794204712, "rewards/margins": 0.1917770504951477, "rewards/rejected": -2.1307787895202637, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 26.28092638656012, "learning_rate": 2.3980815347721823e-08, "logits/chosen": -2.043623924255371, "logits/rejected": -2.040480852127075, "logps/chosen": -1.02638840675354, "logps/rejected": -1.1531604528427124, "loss": 1.6195, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.05277681350708, "rewards/margins": 0.25354427099227905, "rewards/rejected": -2.306320905685425, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 23.767598989527496, "learning_rate": 2.517985611510791e-08, "logits/chosen": -2.0386040210723877, "logits/rejected": -2.035883665084839, "logps/chosen": -1.0741676092147827, "logps/rejected": -1.1504744291305542, "loss": 1.695, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1483352184295654, "rewards/margins": 0.15261325240135193, "rewards/rejected": -2.3009488582611084, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 18.04399847256739, "learning_rate": 2.6378896882494006e-08, "logits/chosen": -1.9858801364898682, "logits/rejected": -1.981899619102478, "logps/chosen": -1.0078871250152588, "logps/rejected": -1.176697850227356, "loss": 1.5723, "rewards/accuracies": 0.625, "rewards/chosen": -2.0157742500305176, "rewards/margins": 0.3376217782497406, "rewards/rejected": -2.353395700454712, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 19.47145481849836, "learning_rate": 2.7577937649880097e-08, "logits/chosen": -2.019965887069702, "logits/rejected": -2.0204622745513916, "logps/chosen": -1.013035535812378, "logps/rejected": -1.1259748935699463, "loss": 1.6389, "rewards/accuracies": 0.625, "rewards/chosen": -2.026071071624756, "rewards/margins": 0.22587890923023224, "rewards/rejected": -2.2519497871398926, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 25.436896017452572, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -2.049868583679199, "logits/rejected": -2.0446996688842773, "logps/chosen": -1.0618335008621216, "logps/rejected": -1.139091968536377, "loss": 1.7037, "rewards/accuracies": 0.5625, "rewards/chosen": -2.123667001724243, "rewards/margins": 0.15451690554618835, "rewards/rejected": -2.278183937072754, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 21.808773581340166, "learning_rate": 2.997601918465228e-08, "logits/chosen": -1.967551589012146, "logits/rejected": -1.9636993408203125, "logps/chosen": -1.0833590030670166, "logps/rejected": -1.1740168333053589, "loss": 1.6776, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.166718006134033, "rewards/margins": 0.18131573498249054, "rewards/rejected": -2.3480336666107178, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 23.879416059454005, "learning_rate": 3.1175059952038366e-08, "logits/chosen": -1.9883846044540405, "logits/rejected": -1.9965698719024658, "logps/chosen": -1.1054890155792236, "logps/rejected": -1.2166268825531006, "loss": 1.6489, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2109780311584473, "rewards/margins": 0.2222757786512375, "rewards/rejected": -2.433253765106201, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 23.46415292315461, "learning_rate": 3.237410071942446e-08, "logits/chosen": -2.066842555999756, "logits/rejected": -2.0588183403015137, "logps/chosen": -1.0719441175460815, "logps/rejected": -1.2008320093154907, "loss": 1.6138, "rewards/accuracies": 0.59375, "rewards/chosen": -2.143888235092163, "rewards/margins": 0.2577756643295288, "rewards/rejected": -2.4016640186309814, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 29.077459533277295, "learning_rate": 3.3573141486810555e-08, "logits/chosen": -2.014507293701172, "logits/rejected": -2.0130252838134766, "logps/chosen": -0.9357423782348633, "logps/rejected": -1.0497205257415771, "loss": 1.635, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.8714847564697266, "rewards/margins": 0.22795608639717102, "rewards/rejected": -2.0994410514831543, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 24.540718741705636, "learning_rate": 3.477218225419664e-08, "logits/chosen": -2.0456860065460205, "logits/rejected": -2.0476441383361816, "logps/chosen": -1.0138399600982666, "logps/rejected": -1.1081044673919678, "loss": 1.6741, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.027679920196533, "rewards/margins": 0.18852944672107697, "rewards/rejected": -2.2162089347839355, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 23.80678112503861, "learning_rate": 3.597122302158273e-08, "logits/chosen": -2.023660659790039, "logits/rejected": -2.0153934955596924, "logps/chosen": -1.0902923345565796, "logps/rejected": -1.1912401914596558, "loss": 1.6536, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.180584669113159, "rewards/margins": 0.20189563930034637, "rewards/rejected": -2.3824803829193115, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 21.247440773672857, "learning_rate": 3.717026378896883e-08, "logits/chosen": -1.9593013525009155, "logits/rejected": -1.9593509435653687, "logps/chosen": -1.0871893167495728, "logps/rejected": -1.1732357740402222, "loss": 1.6787, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1743786334991455, "rewards/margins": 0.17209310829639435, "rewards/rejected": -2.3464715480804443, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 18.906349788401887, "learning_rate": 3.836930455635491e-08, "logits/chosen": -2.029911994934082, "logits/rejected": -2.0213656425476074, "logps/chosen": -1.0086743831634521, "logps/rejected": -1.141055703163147, "loss": 1.6233, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0173487663269043, "rewards/margins": 0.26476234197616577, "rewards/rejected": -2.282111406326294, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 18.14298549042656, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -2.0148098468780518, "logits/rejected": -2.0169730186462402, "logps/chosen": -1.0459707975387573, "logps/rejected": -1.069668173789978, "loss": 1.7894, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.0919415950775146, "rewards/margins": 0.04739472270011902, "rewards/rejected": -2.139336347579956, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 21.71794264069243, "learning_rate": 4.07673860911271e-08, "logits/chosen": -2.063164710998535, "logits/rejected": -2.0576515197753906, "logps/chosen": -1.0874309539794922, "logps/rejected": -1.1679496765136719, "loss": 1.681, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1748619079589844, "rewards/margins": 0.16103747487068176, "rewards/rejected": -2.3358993530273438, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 21.72091647579136, "learning_rate": 4.1966426858513185e-08, "logits/chosen": -1.9914758205413818, "logits/rejected": -1.9858478307724, "logps/chosen": -0.9883527755737305, "logps/rejected": -1.1146571636199951, "loss": 1.6198, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.976705551147461, "rewards/margins": 0.25260910391807556, "rewards/rejected": -2.2293143272399902, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 24.239928528669363, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -1.9954763650894165, "logits/rejected": -1.9914255142211914, "logps/chosen": -1.0863357782363892, "logps/rejected": -1.2026948928833008, "loss": 1.6317, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1726715564727783, "rewards/margins": 0.23271822929382324, "rewards/rejected": -2.4053897857666016, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 21.133751661041465, "learning_rate": 4.4364508393285374e-08, "logits/chosen": -2.0022664070129395, "logits/rejected": -2.0021908283233643, "logps/chosen": -1.0523954629898071, "logps/rejected": -1.180757999420166, "loss": 1.6106, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1047909259796143, "rewards/margins": 0.2567252516746521, "rewards/rejected": -2.361515998840332, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 18.758223309316932, "learning_rate": 4.556354916067146e-08, "logits/chosen": -2.032881498336792, "logits/rejected": -2.037038803100586, "logps/chosen": -1.0132153034210205, "logps/rejected": -1.0857946872711182, "loss": 1.7134, "rewards/accuracies": 0.5625, "rewards/chosen": -2.026430606842041, "rewards/margins": 0.14515872299671173, "rewards/rejected": -2.1715893745422363, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 18.084496902551265, "learning_rate": 4.676258992805755e-08, "logits/chosen": -2.0284078121185303, "logits/rejected": -2.022207736968994, "logps/chosen": -1.022369146347046, "logps/rejected": -1.1491353511810303, "loss": 1.6141, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.044738292694092, "rewards/margins": 0.2535324990749359, "rewards/rejected": -2.2982707023620605, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 22.16978428512717, "learning_rate": 4.796163069544365e-08, "logits/chosen": -2.034036636352539, "logits/rejected": -2.034489393234253, "logps/chosen": -0.9963706731796265, "logps/rejected": -1.0486037731170654, "loss": 1.7238, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.992741346359253, "rewards/margins": 0.10446592420339584, "rewards/rejected": -2.097207546234131, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 21.273525929856714, "learning_rate": 4.916067146282973e-08, "logits/chosen": -2.0305874347686768, "logits/rejected": -2.0286340713500977, "logps/chosen": -1.07444167137146, "logps/rejected": -1.1461719274520874, "loss": 1.7045, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.14888334274292, "rewards/margins": 0.14346005022525787, "rewards/rejected": -2.292343854904175, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 19.47140296933347, "learning_rate": 4.999992091672379e-08, "logits/chosen": -2.008545398712158, "logits/rejected": -2.0128049850463867, "logps/chosen": -1.0455248355865479, "logps/rejected": -1.123600959777832, "loss": 1.6877, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0910496711730957, "rewards/margins": 0.15615199506282806, "rewards/rejected": -2.247201919555664, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 21.062266015909344, "learning_rate": 4.999851500573209e-08, "logits/chosen": -1.9909534454345703, "logits/rejected": -1.991986870765686, "logps/chosen": -1.0589145421981812, "logps/rejected": -1.0995006561279297, "loss": 1.7539, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -2.1178290843963623, "rewards/margins": 0.08117230981588364, "rewards/rejected": -2.1990013122558594, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 18.648357429904824, "learning_rate": 4.999535180235972e-08, "logits/chosen": -1.9874866008758545, "logits/rejected": -1.9876487255096436, "logps/chosen": -1.021545171737671, "logps/rejected": -1.1438276767730713, "loss": 1.6288, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.043090343475342, "rewards/margins": 0.2445647269487381, "rewards/rejected": -2.2876553535461426, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 20.41517294169414, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -2.012659788131714, "logits/rejected": -2.0090537071228027, "logps/chosen": -1.1445978879928589, "logps/rejected": -1.185278058052063, "loss": 1.7527, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.2891957759857178, "rewards/margins": 0.08136044442653656, "rewards/rejected": -2.370556116104126, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 27.95882368035639, "learning_rate": 4.9983754531428326e-08, "logits/chosen": -2.0088980197906494, "logits/rejected": -2.0034842491149902, "logps/chosen": -1.170532464981079, "logps/rejected": -1.2874171733856201, "loss": 1.636, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.341064929962158, "rewards/margins": 0.23376956582069397, "rewards/rejected": -2.5748343467712402, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 26.177277390892126, "learning_rate": 4.997532127910954e-08, "logits/chosen": -2.0459113121032715, "logits/rejected": -2.033708095550537, "logps/chosen": -1.100483775138855, "logps/rejected": -1.2025021314620972, "loss": 1.6569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.20096755027771, "rewards/margins": 0.2040366679430008, "rewards/rejected": -2.4050042629241943, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 24.775194995617756, "learning_rate": 4.996513236483331e-08, "logits/chosen": -2.100435733795166, "logits/rejected": -2.090082883834839, "logps/chosen": -0.9846943020820618, "logps/rejected": -1.106745719909668, "loss": 1.6197, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9693886041641235, "rewards/margins": 0.24410302937030792, "rewards/rejected": -2.213491439819336, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 21.314873423010035, "learning_rate": 4.9953188504838225e-08, "logits/chosen": -2.018599510192871, "logits/rejected": -2.017850399017334, "logps/chosen": -0.987831711769104, "logps/rejected": -1.1018215417861938, "loss": 1.6308, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.975663423538208, "rewards/margins": 0.22797973453998566, "rewards/rejected": -2.2036430835723877, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 20.843999364445644, "learning_rate": 4.993949053872834e-08, "logits/chosen": -2.0248796939849854, "logits/rejected": -2.011425495147705, "logps/chosen": -1.0125197172164917, "logps/rejected": -1.1393004655838013, "loss": 1.6149, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0250394344329834, "rewards/margins": 0.2535613477230072, "rewards/rejected": -2.2786009311676025, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 21.724260529295954, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -2.089134454727173, "logits/rejected": -2.0826263427734375, "logps/chosen": -1.0443626642227173, "logps/rejected": -1.1584975719451904, "loss": 1.6419, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0887253284454346, "rewards/margins": 0.22826950252056122, "rewards/rejected": -2.316995143890381, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 18.888085983647994, "learning_rate": 4.990683626304467e-08, "logits/chosen": -2.0145156383514404, "logits/rejected": -2.0130035877227783, "logps/chosen": -1.1068918704986572, "logps/rejected": -1.2029352188110352, "loss": 1.6615, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.2137837409973145, "rewards/margins": 0.19208666682243347, "rewards/rejected": -2.4058704376220703, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 19.95655560091149, "learning_rate": 4.9887882248931646e-08, "logits/chosen": -1.9786971807479858, "logits/rejected": -1.9687769412994385, "logps/chosen": -0.9844874143600464, "logps/rejected": -1.061359167098999, "loss": 1.694, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9689748287200928, "rewards/margins": 0.1537436544895172, "rewards/rejected": -2.122718334197998, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 25.36665579798685, "learning_rate": 4.986717871946393e-08, "logits/chosen": -2.000777006149292, "logits/rejected": -1.9938316345214844, "logps/chosen": -1.0306737422943115, "logps/rejected": -1.1323692798614502, "loss": 1.6597, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.061347484588623, "rewards/margins": 0.203391432762146, "rewards/rejected": -2.2647385597229004, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 19.949235093698213, "learning_rate": 4.984472713001416e-08, "logits/chosen": -1.966073751449585, "logits/rejected": -1.966660737991333, "logps/chosen": -1.0001440048217773, "logps/rejected": -1.0772383213043213, "loss": 1.7098, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.0002880096435547, "rewards/margins": 0.15418852865695953, "rewards/rejected": -2.1544766426086426, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 19.74458144238105, "learning_rate": 4.982052905883637e-08, "logits/chosen": -2.0273754596710205, "logits/rejected": -2.0279335975646973, "logps/chosen": -1.0799560546875, "logps/rejected": -1.1804721355438232, "loss": 1.6629, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.159912109375, "rewards/margins": 0.20103248953819275, "rewards/rejected": -2.3609442710876465, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 18.071530723249605, "learning_rate": 4.979458620695505e-08, "logits/chosen": -2.034860372543335, "logits/rejected": -2.020716905593872, "logps/chosen": -1.0942167043685913, "logps/rejected": -1.2081631422042847, "loss": 1.6428, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1884334087371826, "rewards/margins": 0.22789283096790314, "rewards/rejected": -2.4163262844085693, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 21.587300991983227, "learning_rate": 4.976690039804555e-08, "logits/chosen": -2.0321314334869385, "logits/rejected": -2.030474901199341, "logps/chosen": -0.9870206713676453, "logps/rejected": -1.0672633647918701, "loss": 1.6895, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9740413427352905, "rewards/margins": 0.16048535704612732, "rewards/rejected": -2.1345267295837402, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 24.004278735328736, "learning_rate": 4.973747357830592e-08, "logits/chosen": -2.0196094512939453, "logits/rejected": -2.0198519229888916, "logps/chosen": -1.0263643264770508, "logps/rejected": -1.1645493507385254, "loss": 1.5974, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0527286529541016, "rewards/margins": 0.2763698101043701, "rewards/rejected": -2.329098701477051, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 22.394722552690947, "learning_rate": 4.970630781632009e-08, "logits/chosen": -2.0787835121154785, "logits/rejected": -2.074876308441162, "logps/chosen": -1.032995343208313, "logps/rejected": -1.1753590106964111, "loss": 1.5973, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.065990686416626, "rewards/margins": 0.28472739458084106, "rewards/rejected": -2.3507180213928223, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 24.212668902315563, "learning_rate": 4.967340530291242e-08, "logits/chosen": -2.0278451442718506, "logits/rejected": -2.018012762069702, "logps/chosen": -1.0924032926559448, "logps/rejected": -1.1508691310882568, "loss": 1.7169, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1848065853118896, "rewards/margins": 0.11693187803030014, "rewards/rejected": -2.3017382621765137, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 28.0595450997105, "learning_rate": 4.9638768350993755e-08, "logits/chosen": -2.026702880859375, "logits/rejected": -2.0193984508514404, "logps/chosen": -0.9957388639450073, "logps/rejected": -1.083117961883545, "loss": 1.6767, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9914777278900146, "rewards/margins": 0.17475813627243042, "rewards/rejected": -2.16623592376709, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 22.956313206225012, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -2.0426907539367676, "logits/rejected": -2.0426487922668457, "logps/chosen": -1.0262912511825562, "logps/rejected": -1.1543877124786377, "loss": 1.6157, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0525825023651123, "rewards/margins": 0.256193071603775, "rewards/rejected": -2.3087754249572754, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 18.87407337598182, "learning_rate": 4.9564300992714914e-08, "logits/chosen": -1.9564889669418335, "logits/rejected": -1.957383155822754, "logps/chosen": -1.0102789402008057, "logps/rejected": -1.116477370262146, "loss": 1.6474, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0205578804016113, "rewards/margins": 0.2123967409133911, "rewards/rejected": -2.232954740524292, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 24.33234982153659, "learning_rate": 4.952447582110253e-08, "logits/chosen": -2.0563530921936035, "logits/rejected": -2.041901111602783, "logps/chosen": -1.0375721454620361, "logps/rejected": -1.117628812789917, "loss": 1.6908, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0751442909240723, "rewards/margins": 0.1601136028766632, "rewards/rejected": -2.235257625579834, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 26.552144602867106, "learning_rate": 4.948292668010676e-08, "logits/chosen": -2.035076856613159, "logits/rejected": -2.035987377166748, "logps/chosen": -1.0880695581436157, "logps/rejected": -1.175052285194397, "loss": 1.6847, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1761391162872314, "rewards/margins": 0.1739654839038849, "rewards/rejected": -2.350104570388794, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 24.110493312081978, "learning_rate": 4.943965649046064e-08, "logits/chosen": -2.0068695545196533, "logits/rejected": -1.9975703954696655, "logps/chosen": -1.062526822090149, "logps/rejected": -1.1664403676986694, "loss": 1.6538, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.125053644180298, "rewards/margins": 0.2078275978565216, "rewards/rejected": -2.332880735397339, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 21.908255569501478, "learning_rate": 4.9394668293879835e-08, "logits/chosen": -1.9596691131591797, "logits/rejected": -1.9506410360336304, "logps/chosen": -1.0364726781845093, "logps/rejected": -1.1055781841278076, "loss": 1.708, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0729453563690186, "rewards/margins": 0.13821099698543549, "rewards/rejected": -2.2111563682556152, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 29.910895605785804, "learning_rate": 4.93479652528488e-08, "logits/chosen": -2.021054983139038, "logits/rejected": -2.0158448219299316, "logps/chosen": -1.1041984558105469, "logps/rejected": -1.2085119485855103, "loss": 1.6632, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2083969116210938, "rewards/margins": 0.20862674713134766, "rewards/rejected": -2.4170238971710205, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 23.096708893800308, "learning_rate": 4.929955065039848e-08, "logits/chosen": -2.0168848037719727, "logits/rejected": -2.011321544647217, "logps/chosen": -1.0192147493362427, "logps/rejected": -1.1515916585922241, "loss": 1.6154, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0384294986724854, "rewards/margins": 0.26475387811660767, "rewards/rejected": -2.3031833171844482, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 21.90175405740422, "learning_rate": 4.92494278898755e-08, "logits/chosen": -1.9860484600067139, "logits/rejected": -1.9828269481658936, "logps/chosen": -0.8964320421218872, "logps/rejected": -1.0216236114501953, "loss": 1.6289, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.7928640842437744, "rewards/margins": 0.25038328766822815, "rewards/rejected": -2.0432472229003906, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 21.938421077731768, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -2.0129542350769043, "logits/rejected": -2.0067877769470215, "logps/chosen": -1.0419350862503052, "logps/rejected": -1.165678858757019, "loss": 1.6192, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0838701725006104, "rewards/margins": 0.2474878579378128, "rewards/rejected": -2.331357717514038, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 23.196215525802465, "learning_rate": 4.9144072108132725e-08, "logits/chosen": -2.009798526763916, "logits/rejected": -1.9987558126449585, "logps/chosen": -1.0216114521026611, "logps/rejected": -1.1052320003509521, "loss": 1.6903, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0432229042053223, "rewards/margins": 0.16724112629890442, "rewards/rejected": -2.2104640007019043, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 20.25848452271348, "learning_rate": 4.908884649298937e-08, "logits/chosen": -1.9975754022598267, "logits/rejected": -2.004523515701294, "logps/chosen": -1.0185396671295166, "logps/rejected": -1.0793794393539429, "loss": 1.7305, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.037079334259033, "rewards/margins": 0.12167976051568985, "rewards/rejected": -2.1587588787078857, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 26.211714705517682, "learning_rate": 4.903192753140557e-08, "logits/chosen": -2.0177195072174072, "logits/rejected": -2.012320041656494, "logps/chosen": -1.0997415781021118, "logps/rejected": -1.1906296014785767, "loss": 1.678, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1994831562042236, "rewards/margins": 0.18177607655525208, "rewards/rejected": -2.3812592029571533, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 22.758326251530672, "learning_rate": 4.897331922454931e-08, "logits/chosen": -1.9822711944580078, "logits/rejected": -1.9859788417816162, "logps/chosen": -1.0030810832977295, "logps/rejected": -1.113510012626648, "loss": 1.6515, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.006162166595459, "rewards/margins": 0.22085781395435333, "rewards/rejected": -2.227020025253296, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 25.008812386735762, "learning_rate": 4.891302569234256e-08, "logits/chosen": -1.9758611917495728, "logits/rejected": -1.9786208868026733, "logps/chosen": -0.976547122001648, "logps/rejected": -1.128833532333374, "loss": 1.5906, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.953094244003296, "rewards/margins": 0.3045729994773865, "rewards/rejected": -2.257667064666748, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 25.498520426312076, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -1.992806077003479, "logits/rejected": -1.99143385887146, "logps/chosen": -1.040136456489563, "logps/rejected": -1.1222012042999268, "loss": 1.6825, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.080272912979126, "rewards/margins": 0.16412939131259918, "rewards/rejected": -2.2444024085998535, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 19.906167092488122, "learning_rate": 4.87874000235894e-08, "logits/chosen": -2.014981985092163, "logits/rejected": -2.0091872215270996, "logps/chosen": -1.0754855871200562, "logps/rejected": -1.2333699464797974, "loss": 1.5845, "rewards/accuracies": 0.625, "rewards/chosen": -2.1509711742401123, "rewards/margins": 0.3157687187194824, "rewards/rejected": -2.4667398929595947, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 21.828224545853562, "learning_rate": 4.872207671800876e-08, "logits/chosen": -2.03572416305542, "logits/rejected": -2.032073497772217, "logps/chosen": -1.0438910722732544, "logps/rejected": -1.121517300605774, "loss": 1.7003, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.087782144546509, "rewards/margins": 0.15525242686271667, "rewards/rejected": -2.243034601211548, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 18.581452299662768, "learning_rate": 4.865508584838841e-08, "logits/chosen": -2.018582344055176, "logits/rejected": -2.0212044715881348, "logps/chosen": -1.0126211643218994, "logps/rejected": -1.1025011539459229, "loss": 1.6742, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.025242328643799, "rewards/margins": 0.1797601878643036, "rewards/rejected": -2.2050023078918457, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 24.17846087395363, "learning_rate": 4.858643212390985e-08, "logits/chosen": -2.025010585784912, "logits/rejected": -2.015291690826416, "logps/chosen": -1.0286962985992432, "logps/rejected": -1.1152880191802979, "loss": 1.6902, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0573925971984863, "rewards/margins": 0.17318376898765564, "rewards/rejected": -2.2305760383605957, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 21.183520805271574, "learning_rate": 4.851612037064643e-08, "logits/chosen": -1.997955083847046, "logits/rejected": -1.9959014654159546, "logps/chosen": -0.9596630930900574, "logps/rejected": -1.079238772392273, "loss": 1.6373, "rewards/accuracies": 0.625, "rewards/chosen": -1.9193261861801147, "rewards/margins": 0.23915132880210876, "rewards/rejected": -2.158477544784546, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 18.565915811402277, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -2.028514862060547, "logits/rejected": -2.028571844100952, "logps/chosen": -1.0867416858673096, "logps/rejected": -1.1596667766571045, "loss": 1.7054, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.173483371734619, "rewards/margins": 0.1458502858877182, "rewards/rejected": -2.319333553314209, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 18.42088743135808, "learning_rate": 4.8370542664473805e-08, "logits/chosen": -2.034885883331299, "logits/rejected": -2.0290725231170654, "logps/chosen": -1.0495412349700928, "logps/rejected": -1.154597520828247, "loss": 1.6593, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0990824699401855, "rewards/margins": 0.210112527012825, "rewards/rejected": -2.309195041656494, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 20.628656390704144, "learning_rate": 4.829528694507624e-08, "logits/chosen": -2.01246976852417, "logits/rejected": -2.008507251739502, "logps/chosen": -1.1612248420715332, "logps/rejected": -1.2183961868286133, "loss": 1.7265, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.3224496841430664, "rewards/margins": 0.11434265226125717, "rewards/rejected": -2.4367923736572266, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 22.79809575533199, "learning_rate": 4.821839366319768e-08, "logits/chosen": -2.048476457595825, "logits/rejected": -2.0423402786254883, "logps/chosen": -1.0043013095855713, "logps/rejected": -1.1224620342254639, "loss": 1.6319, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0086026191711426, "rewards/margins": 0.23632164299488068, "rewards/rejected": -2.2449240684509277, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 23.572293533611532, "learning_rate": 4.813986822411833e-08, "logits/chosen": -2.035609722137451, "logits/rejected": -2.0335443019866943, "logps/chosen": -1.0144193172454834, "logps/rejected": -1.0793102979660034, "loss": 1.7139, "rewards/accuracies": 0.53125, "rewards/chosen": -2.028838634490967, "rewards/margins": 0.12978161871433258, "rewards/rejected": -2.158620595932007, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 21.900931247180367, "learning_rate": 4.805971614785231e-08, "logits/chosen": -2.0678467750549316, "logits/rejected": -2.0664026737213135, "logps/chosen": -1.0151159763336182, "logps/rejected": -1.1114805936813354, "loss": 1.6598, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0302319526672363, "rewards/margins": 0.19272920489311218, "rewards/rejected": -2.222961187362671, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 23.236819784308135, "learning_rate": 4.797794306875963e-08, "logits/chosen": -1.9788671731948853, "logits/rejected": -1.98031747341156, "logps/chosen": -1.1417499780654907, "logps/rejected": -1.213930368423462, "loss": 1.7119, "rewards/accuracies": 0.53125, "rewards/chosen": -2.2834999561309814, "rewards/margins": 0.14436064660549164, "rewards/rejected": -2.427860736846924, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 22.823271255081032, "learning_rate": 4.7894554735150076e-08, "logits/chosen": -1.9850410223007202, "logits/rejected": -1.9888070821762085, "logps/chosen": -1.041906476020813, "logps/rejected": -1.1080873012542725, "loss": 1.7092, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.083812952041626, "rewards/margins": 0.13236171007156372, "rewards/rejected": -2.216174602508545, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 25.12031278273105, "learning_rate": 4.7809557008879185e-08, "logits/chosen": -2.013596773147583, "logits/rejected": -2.008056163787842, "logps/chosen": -0.9724063873291016, "logps/rejected": -1.0620607137680054, "loss": 1.6767, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9448127746582031, "rewards/margins": 0.17930865287780762, "rewards/rejected": -2.1241214275360107, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 20.34306897169735, "learning_rate": 4.772295586493613e-08, "logits/chosen": -2.056899309158325, "logits/rejected": -2.0542209148406982, "logps/chosen": -1.0338000059127808, "logps/rejected": -1.151123285293579, "loss": 1.6278, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0676000118255615, "rewards/margins": 0.23464636504650116, "rewards/rejected": -2.302246570587158, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 22.240011486982098, "learning_rate": 4.763475739102374e-08, "logits/chosen": -2.0088422298431396, "logits/rejected": -2.014742612838745, "logps/chosen": -1.1260392665863037, "logps/rejected": -1.1944981813430786, "loss": 1.7015, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2520785331726074, "rewards/margins": 0.1369178146123886, "rewards/rejected": -2.3889963626861572, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 18.154153171466476, "learning_rate": 4.754496778713054e-08, "logits/chosen": -1.9671573638916016, "logits/rejected": -1.9713550806045532, "logps/chosen": -1.0105915069580078, "logps/rejected": -1.1340548992156982, "loss": 1.6324, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0211830139160156, "rewards/margins": 0.2469271421432495, "rewards/rejected": -2.2681097984313965, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 23.890999661155988, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -2.0449891090393066, "logits/rejected": -2.0441854000091553, "logps/chosen": -1.0483901500701904, "logps/rejected": -1.15888249874115, "loss": 1.6441, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.096780300140381, "rewards/margins": 0.22098441421985626, "rewards/rejected": -2.3177649974823, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 24.41501348892326, "learning_rate": 4.736064054816145e-08, "logits/chosen": -2.043104410171509, "logits/rejected": -2.0392186641693115, "logps/chosen": -0.9671171307563782, "logps/rejected": -1.0935986042022705, "loss": 1.6134, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9342342615127563, "rewards/margins": 0.25296297669410706, "rewards/rejected": -2.187197208404541, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 20.20810505102117, "learning_rate": 4.726611587052933e-08, "logits/chosen": -1.972968339920044, "logits/rejected": -1.9725048542022705, "logps/chosen": -1.1076877117156982, "logps/rejected": -1.235329031944275, "loss": 1.6128, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2153754234313965, "rewards/margins": 0.25528302788734436, "rewards/rejected": -2.47065806388855, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 25.93630341697249, "learning_rate": 4.71700259768931e-08, "logits/chosen": -2.029388904571533, "logits/rejected": -2.0263442993164062, "logps/chosen": -1.1081831455230713, "logps/rejected": -1.206345796585083, "loss": 1.6698, "rewards/accuracies": 0.5, "rewards/chosen": -2.2163662910461426, "rewards/margins": 0.19632507860660553, "rewards/rejected": -2.412691593170166, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 22.29626785770212, "learning_rate": 4.707237762197549e-08, "logits/chosen": -2.008169174194336, "logits/rejected": -2.004971981048584, "logps/chosen": -1.0062434673309326, "logps/rejected": -1.1267154216766357, "loss": 1.6437, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0124869346618652, "rewards/margins": 0.24094398319721222, "rewards/rejected": -2.2534308433532715, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 26.575500970274064, "learning_rate": 4.697317767005265e-08, "logits/chosen": -2.0243871212005615, "logits/rejected": -2.020923376083374, "logps/chosen": -1.0003429651260376, "logps/rejected": -1.0944573879241943, "loss": 1.6916, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.000685930252075, "rewards/margins": 0.1882287561893463, "rewards/rejected": -2.1889147758483887, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 20.654506686037543, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -2.0210866928100586, "logits/rejected": -2.016232967376709, "logps/chosen": -1.0313045978546143, "logps/rejected": -1.1274253129959106, "loss": 1.6531, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0626091957092285, "rewards/margins": 0.19224095344543457, "rewards/rejected": -2.2548506259918213, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 20.08662493895599, "learning_rate": 4.677015097715994e-08, "logits/chosen": -1.971396803855896, "logits/rejected": -1.9708855152130127, "logps/chosen": -1.0200594663619995, "logps/rejected": -1.1550127267837524, "loss": 1.6275, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.040118932723999, "rewards/margins": 0.2699064314365387, "rewards/rejected": -2.310025453567505, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 20.042202573565547, "learning_rate": 4.666633850812825e-08, "logits/chosen": -2.0164568424224854, "logits/rejected": -2.0103211402893066, "logps/chosen": -1.0115318298339844, "logps/rejected": -1.094136118888855, "loss": 1.6801, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0230636596679688, "rewards/margins": 0.1652088165283203, "rewards/rejected": -2.18827223777771, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 20.97912918551869, "learning_rate": 4.656100298496439e-08, "logits/chosen": -1.9679771661758423, "logits/rejected": -1.964240312576294, "logps/chosen": -0.9365024566650391, "logps/rejected": -1.0687068700790405, "loss": 1.615, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8730049133300781, "rewards/margins": 0.26440873742103577, "rewards/rejected": -2.137413740158081, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 21.623225226753767, "learning_rate": 4.6454151812320715e-08, "logits/chosen": -1.996995210647583, "logits/rejected": -1.9909130334854126, "logps/chosen": -1.037967562675476, "logps/rejected": -1.1474111080169678, "loss": 1.6516, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.075935125350952, "rewards/margins": 0.2188871204853058, "rewards/rejected": -2.2948222160339355, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 23.238711963759002, "learning_rate": 4.6345792501393434e-08, "logits/chosen": -1.9984365701675415, "logits/rejected": -1.996698021888733, "logps/chosen": -1.073228359222412, "logps/rejected": -1.2013862133026123, "loss": 1.6356, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.146456718444824, "rewards/margins": 0.2563156485557556, "rewards/rejected": -2.4027724266052246, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 24.202773626093183, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -2.0284552574157715, "logits/rejected": -2.029017925262451, "logps/chosen": -1.085923671722412, "logps/rejected": -1.1962679624557495, "loss": 1.6513, "rewards/accuracies": 0.625, "rewards/chosen": -2.171847343444824, "rewards/margins": 0.22068853676319122, "rewards/rejected": -2.392535924911499, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 27.86052727034697, "learning_rate": 4.612458003901698e-08, "logits/chosen": -2.03735613822937, "logits/rejected": -2.0297584533691406, "logps/chosen": -1.1077696084976196, "logps/rejected": -1.2109193801879883, "loss": 1.6645, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2155392169952393, "rewards/margins": 0.20629949867725372, "rewards/rejected": -2.4218387603759766, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 26.683711754548025, "learning_rate": 4.6011742437890476e-08, "logits/chosen": -2.0245442390441895, "logits/rejected": -2.018993854522705, "logps/chosen": -1.0436826944351196, "logps/rejected": -1.1799969673156738, "loss": 1.606, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0873653888702393, "rewards/margins": 0.27262869477272034, "rewards/rejected": -2.3599939346313477, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 18.849886367142954, "learning_rate": 4.589742779803259e-08, "logits/chosen": -2.023536205291748, "logits/rejected": -2.016479253768921, "logps/chosen": -1.0076560974121094, "logps/rejected": -1.1300846338272095, "loss": 1.6272, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0153121948242188, "rewards/margins": 0.24485734105110168, "rewards/rejected": -2.260169267654419, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 21.926548181600204, "learning_rate": 4.5781644155290486e-08, "logits/chosen": -1.9820778369903564, "logits/rejected": -1.9744129180908203, "logps/chosen": -1.0467216968536377, "logps/rejected": -1.1080749034881592, "loss": 1.7177, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0934433937072754, "rewards/margins": 0.12270689010620117, "rewards/rejected": -2.2161498069763184, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 20.411269377785867, "learning_rate": 4.566439964877613e-08, "logits/chosen": -2.014411687850952, "logits/rejected": -2.010434627532959, "logps/chosen": -0.9983810186386108, "logps/rejected": -1.0851889848709106, "loss": 1.6852, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9967620372772217, "rewards/margins": 0.17361582815647125, "rewards/rejected": -2.1703779697418213, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 19.102313825674166, "learning_rate": 4.554570252029421e-08, "logits/chosen": -2.053616523742676, "logits/rejected": -2.0523505210876465, "logps/chosen": -1.0471397638320923, "logps/rejected": -1.1648762226104736, "loss": 1.6333, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0942795276641846, "rewards/margins": 0.2354734241962433, "rewards/rejected": -2.3297524452209473, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 21.1710130841996, "learning_rate": 4.542556111376274e-08, "logits/chosen": -2.0461740493774414, "logits/rejected": -2.039647102355957, "logps/chosen": -1.073970079421997, "logps/rejected": -1.1668533086776733, "loss": 1.6769, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.147940158843994, "rewards/margins": 0.18576659262180328, "rewards/rejected": -2.3337066173553467, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 26.408661113141083, "learning_rate": 4.5303983874626506e-08, "logits/chosen": -1.9958124160766602, "logits/rejected": -1.9943479299545288, "logps/chosen": -1.0374865531921387, "logps/rejected": -1.1170107126235962, "loss": 1.7057, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0749731063842773, "rewards/margins": 0.1590481549501419, "rewards/rejected": -2.2340214252471924, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 24.22981494259392, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.9959033727645874, "logits/rejected": -1.987290382385254, "logps/chosen": -1.014827013015747, "logps/rejected": -1.1259469985961914, "loss": 1.6418, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.029654026031494, "rewards/margins": 0.22223982214927673, "rewards/rejected": -2.251893997192383, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 25.947963806010645, "learning_rate": 4.505655618438363e-08, "logits/chosen": -1.9583690166473389, "logits/rejected": -1.9542310237884521, "logps/chosen": -1.060659646987915, "logps/rejected": -1.1657010316848755, "loss": 1.6644, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.12131929397583, "rewards/margins": 0.21008257567882538, "rewards/rejected": -2.331402063369751, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 20.34137730466121, "learning_rate": 4.4930723126421945e-08, "logits/chosen": -2.0515360832214355, "logits/rejected": -2.044708490371704, "logps/chosen": -1.0708990097045898, "logps/rejected": -1.1477235555648804, "loss": 1.6951, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1417980194091797, "rewards/margins": 0.15364903211593628, "rewards/rejected": -2.2954471111297607, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 25.575272773840386, "learning_rate": 4.48034890209227e-08, "logits/chosen": -1.9811756610870361, "logits/rejected": -1.9688783884048462, "logps/chosen": -1.0863144397735596, "logps/rejected": -1.1746426820755005, "loss": 1.6712, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.172628879547119, "rewards/margins": 0.17665648460388184, "rewards/rejected": -2.349285364151001, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 22.798493483039103, "learning_rate": 4.4674862811918155e-08, "logits/chosen": -1.9707729816436768, "logits/rejected": -1.9792941808700562, "logps/chosen": -0.9373595118522644, "logps/rejected": -1.0916470289230347, "loss": 1.5848, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.8747190237045288, "rewards/margins": 0.3085750639438629, "rewards/rejected": -2.1832940578460693, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 20.325415704711816, "learning_rate": 4.454485354129966e-08, "logits/chosen": -1.9973102807998657, "logits/rejected": -1.9927936792373657, "logps/chosen": -1.0084164142608643, "logps/rejected": -1.1153631210327148, "loss": 1.6559, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0168328285217285, "rewards/margins": 0.2138935774564743, "rewards/rejected": -2.2307262420654297, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 19.97614694130006, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -1.9734569787979126, "logits/rejected": -1.9613616466522217, "logps/chosen": -0.9839061498641968, "logps/rejected": -1.077565312385559, "loss": 1.6696, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9678122997283936, "rewards/margins": 0.18731816112995148, "rewards/rejected": -2.155130624771118, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 23.912472118172317, "learning_rate": 4.42807224682615e-08, "logits/chosen": -1.9827508926391602, "logits/rejected": -1.9806731939315796, "logps/chosen": -0.9357818365097046, "logps/rejected": -1.073239803314209, "loss": 1.6081, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8715636730194092, "rewards/margins": 0.27491581439971924, "rewards/rejected": -2.146479606628418, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 21.256561095344278, "learning_rate": 4.4146619233165604e-08, "logits/chosen": -2.018620491027832, "logits/rejected": -2.020691394805908, "logps/chosen": -1.0641486644744873, "logps/rejected": -1.2193925380706787, "loss": 1.593, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1282973289489746, "rewards/margins": 0.3104875385761261, "rewards/rejected": -2.4387850761413574, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 28.943488333626668, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -2.0126404762268066, "logits/rejected": -2.0176773071289062, "logps/chosen": -1.117441177368164, "logps/rejected": -1.2449333667755127, "loss": 1.6233, "rewards/accuracies": 0.5625, "rewards/chosen": -2.234882354736328, "rewards/margins": 0.2549843192100525, "rewards/rejected": -2.4898667335510254, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 20.962900135408486, "learning_rate": 4.387438449967594e-08, "logits/chosen": -1.980334997177124, "logits/rejected": -1.9738352298736572, "logps/chosen": -0.9648338556289673, "logps/rejected": -1.0865085124969482, "loss": 1.6239, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9296677112579346, "rewards/margins": 0.2433495968580246, "rewards/rejected": -2.1730170249938965, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 24.458861277465072, "learning_rate": 4.373627213825983e-08, "logits/chosen": -2.071169376373291, "logits/rejected": -2.066804885864258, "logps/chosen": -1.0259402990341187, "logps/rejected": -1.1633208990097046, "loss": 1.6096, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0518805980682373, "rewards/margins": 0.27476125955581665, "rewards/rejected": -2.326641798019409, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 19.716694271831496, "learning_rate": 4.359684269427848e-08, "logits/chosen": -2.033966064453125, "logits/rejected": -2.032963275909424, "logps/chosen": -0.9939344525337219, "logps/rejected": -1.0993297100067139, "loss": 1.6487, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9878689050674438, "rewards/margins": 0.2107907235622406, "rewards/rejected": -2.1986594200134277, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 25.821856198947142, "learning_rate": 4.34561059690461e-08, "logits/chosen": -2.0793344974517822, "logits/rejected": -2.0813448429107666, "logps/chosen": -1.0473977327346802, "logps/rejected": -1.1126644611358643, "loss": 1.7167, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0947954654693604, "rewards/margins": 0.1305336207151413, "rewards/rejected": -2.2253289222717285, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 24.62746740806772, "learning_rate": 4.3314071855773314e-08, "logits/chosen": -2.0430221557617188, "logits/rejected": -2.0434608459472656, "logps/chosen": -0.982768714427948, "logps/rejected": -1.0802866220474243, "loss": 1.6606, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.965537428855896, "rewards/margins": 0.19503581523895264, "rewards/rejected": -2.1605732440948486, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 24.127212441715745, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -2.0090765953063965, "logits/rejected": -2.0027029514312744, "logps/chosen": -1.0742548704147339, "logps/rejected": -1.2199418544769287, "loss": 1.5932, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1485097408294678, "rewards/margins": 0.29137399792671204, "rewards/rejected": -2.4398837089538574, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 17.637078900383425, "learning_rate": 4.3026151493252414e-08, "logits/chosen": -2.0409767627716064, "logits/rejected": -2.036644697189331, "logps/chosen": -1.0592305660247803, "logps/rejected": -1.1833205223083496, "loss": 1.6304, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1184611320495605, "rewards/margins": 0.24817979335784912, "rewards/rejected": -2.366641044616699, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 29.39068619092762, "learning_rate": 4.2880285483616895e-08, "logits/chosen": -2.0100111961364746, "logits/rejected": -2.0106101036071777, "logps/chosen": -1.0155467987060547, "logps/rejected": -1.1326874494552612, "loss": 1.6415, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0310935974121094, "rewards/margins": 0.23428134620189667, "rewards/rejected": -2.2653748989105225, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 18.381727924896822, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.9357595443725586, "logits/rejected": -1.9339239597320557, "logps/chosen": -1.012160301208496, "logps/rejected": -1.0870387554168701, "loss": 1.7061, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.024320602416992, "rewards/margins": 0.14975695312023163, "rewards/rejected": -2.1740775108337402, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 19.119001475681582, "learning_rate": 4.258479307576576e-08, "logits/chosen": -1.9824535846710205, "logits/rejected": -1.9803975820541382, "logps/chosen": -0.9621704816818237, "logps/rejected": -1.0561802387237549, "loss": 1.6757, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9243409633636475, "rewards/margins": 0.18801935017108917, "rewards/rejected": -2.1123604774475098, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 24.966184896176923, "learning_rate": 4.243518744944626e-08, "logits/chosen": -2.009141206741333, "logits/rejected": -2.0044732093811035, "logps/chosen": -0.9992243647575378, "logps/rejected": -1.1229625940322876, "loss": 1.6201, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9984487295150757, "rewards/margins": 0.2474762499332428, "rewards/rejected": -2.245925188064575, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 24.00608318418104, "learning_rate": 4.22843562014427e-08, "logits/chosen": -1.973170280456543, "logits/rejected": -1.9695253372192383, "logps/chosen": -1.0497918128967285, "logps/rejected": -1.1255929470062256, "loss": 1.693, "rewards/accuracies": 0.5625, "rewards/chosen": -2.099583625793457, "rewards/margins": 0.15160201489925385, "rewards/rejected": -2.251185894012451, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 32.486150578164526, "learning_rate": 4.2132309934569e-08, "logits/chosen": -2.051807403564453, "logits/rejected": -2.0524046421051025, "logps/chosen": -1.014695405960083, "logps/rejected": -1.129429578781128, "loss": 1.6446, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.029390811920166, "rewards/margins": 0.22946810722351074, "rewards/rejected": -2.258859157562256, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 22.195676598104907, "learning_rate": 4.197905933704989e-08, "logits/chosen": -1.9452602863311768, "logits/rejected": -1.9427295923233032, "logps/chosen": -1.0588452816009521, "logps/rejected": -1.1967337131500244, "loss": 1.6257, "rewards/accuracies": 0.59375, "rewards/chosen": -2.1176905632019043, "rewards/margins": 0.2757769823074341, "rewards/rejected": -2.393467426300049, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 26.19663828201113, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -1.9866628646850586, "logits/rejected": -1.9909366369247437, "logps/chosen": -1.010948896408081, "logps/rejected": -1.1399239301681519, "loss": 1.6307, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.021897792816162, "rewards/margins": 0.25795018672943115, "rewards/rejected": -2.2798478603363037, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 21.720677571858, "learning_rate": 4.1668988325514434e-08, "logits/chosen": -2.0138607025146484, "logits/rejected": -2.0087952613830566, "logps/chosen": -1.1150840520858765, "logps/rejected": -1.2339974641799927, "loss": 1.6559, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.230168104171753, "rewards/margins": 0.23782682418823242, "rewards/rejected": -2.4679949283599854, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 24.479412943582165, "learning_rate": 4.1512189708209844e-08, "logits/chosen": -2.0544211864471436, "logits/rejected": -2.0532872676849365, "logps/chosen": -0.9395163655281067, "logps/rejected": -1.0278379917144775, "loss": 1.6851, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.8790327310562134, "rewards/margins": 0.17664287984371185, "rewards/rejected": -2.055675983428955, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 26.59673638607669, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -1.9996252059936523, "logits/rejected": -1.9927451610565186, "logps/chosen": -1.1369383335113525, "logps/rejected": -1.2214683294296265, "loss": 1.6959, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.273876667022705, "rewards/margins": 0.16905997693538666, "rewards/rejected": -2.442936658859253, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 19.47285274113812, "learning_rate": 4.119512136122882e-08, "logits/chosen": -2.071915864944458, "logits/rejected": -2.081059217453003, "logps/chosen": -0.9938374757766724, "logps/rejected": -1.1477389335632324, "loss": 1.5921, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9876749515533447, "rewards/margins": 0.30780303478240967, "rewards/rejected": -2.295477867126465, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 19.08504962330343, "learning_rate": 4.103487392014795e-08, "logits/chosen": -1.993143081665039, "logits/rejected": -1.9810212850570679, "logps/chosen": -0.9989188313484192, "logps/rejected": -1.1618555784225464, "loss": 1.566, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9978376626968384, "rewards/margins": 0.325873464345932, "rewards/rejected": -2.3237111568450928, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 19.435185309559383, "learning_rate": 4.087349929364192e-08, "logits/chosen": -2.028613567352295, "logits/rejected": -2.0190749168395996, "logps/chosen": -0.9579511880874634, "logps/rejected": -1.0935356616973877, "loss": 1.6113, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9159023761749268, "rewards/margins": 0.2711690068244934, "rewards/rejected": -2.1870713233947754, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 20.25742795611284, "learning_rate": 4.0711008825680645e-08, "logits/chosen": -1.978436827659607, "logits/rejected": -1.9773874282836914, "logps/chosen": -1.0046164989471436, "logps/rejected": -1.1253421306610107, "loss": 1.6401, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.009232997894287, "rewards/margins": 0.24145087599754333, "rewards/rejected": -2.2506842613220215, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 22.801376083305794, "learning_rate": 4.054741393867306e-08, "logits/chosen": -1.9947843551635742, "logits/rejected": -1.9919557571411133, "logps/chosen": -1.110395073890686, "logps/rejected": -1.1635833978652954, "loss": 1.734, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.220790147781372, "rewards/margins": 0.10637688636779785, "rewards/rejected": -2.327166795730591, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 21.594100457917882, "learning_rate": 4.038272613266419e-08, "logits/chosen": -1.9986584186553955, "logits/rejected": -1.9856328964233398, "logps/chosen": -1.00888991355896, "logps/rejected": -1.1221122741699219, "loss": 1.6366, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.01777982711792, "rewards/margins": 0.2264450490474701, "rewards/rejected": -2.2442245483398438, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 20.30890318648656, "learning_rate": 4.0216956984526784e-08, "logits/chosen": -2.0419411659240723, "logits/rejected": -2.043952226638794, "logps/chosen": -1.0144214630126953, "logps/rejected": -1.1269077062606812, "loss": 1.6479, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0288429260253906, "rewards/margins": 0.22497276961803436, "rewards/rejected": -2.2538154125213623, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 18.891794539725794, "learning_rate": 4.0050118147147446e-08, "logits/chosen": -1.9847946166992188, "logits/rejected": -1.9852144718170166, "logps/chosen": -1.0975220203399658, "logps/rejected": -1.1103746891021729, "loss": 1.7972, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -2.1950440406799316, "rewards/margins": 0.02570497989654541, "rewards/rejected": -2.2207493782043457, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 20.33027712421965, "learning_rate": 3.988222134860755e-08, "logits/chosen": -2.026733875274658, "logits/rejected": -2.0179905891418457, "logps/chosen": -0.949315071105957, "logps/rejected": -1.1185479164123535, "loss": 1.5608, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.898630142211914, "rewards/margins": 0.33846551179885864, "rewards/rejected": -2.237095832824707, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 27.910943356280853, "learning_rate": 3.9713278391358724e-08, "logits/chosen": -2.032493829727173, "logits/rejected": -2.0264077186584473, "logps/chosen": -1.0231871604919434, "logps/rejected": -1.1496901512145996, "loss": 1.6171, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0463743209838867, "rewards/margins": 0.2530057728290558, "rewards/rejected": -2.299380302429199, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 21.57609759253057, "learning_rate": 3.954330115139328e-08, "logits/chosen": -2.013040065765381, "logits/rejected": -2.007878541946411, "logps/chosen": -1.027060627937317, "logps/rejected": -1.1348285675048828, "loss": 1.6561, "rewards/accuracies": 0.59375, "rewards/chosen": -2.054121255874634, "rewards/margins": 0.2155359536409378, "rewards/rejected": -2.2696571350097656, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 30.11904064080361, "learning_rate": 3.937230157740931e-08, "logits/chosen": -2.0719151496887207, "logits/rejected": -2.0658767223358154, "logps/chosen": -1.0463825464248657, "logps/rejected": -1.1857068538665771, "loss": 1.6083, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0927650928497314, "rewards/margins": 0.2786487936973572, "rewards/rejected": -2.3714137077331543, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 19.072451198453745, "learning_rate": 3.920029168997077e-08, "logits/chosen": -2.0472500324249268, "logits/rejected": -2.04541015625, "logps/chosen": -1.0023488998413086, "logps/rejected": -1.1324958801269531, "loss": 1.6165, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.004697799682617, "rewards/margins": 0.2602941393852234, "rewards/rejected": -2.2649917602539062, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 35.2520295343237, "learning_rate": 3.9027283580662476e-08, "logits/chosen": -2.0187594890594482, "logits/rejected": -2.012882709503174, "logps/chosen": -1.0463117361068726, "logps/rejected": -1.1975940465927124, "loss": 1.5982, "rewards/accuracies": 0.625, "rewards/chosen": -2.092623472213745, "rewards/margins": 0.30256450176239014, "rewards/rejected": -2.395188093185425, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 19.806903931606815, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.9918800592422485, "logits/rejected": -1.987460732460022, "logps/chosen": -0.9656961560249329, "logps/rejected": -1.1031134128570557, "loss": 1.5982, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9313923120498657, "rewards/margins": 0.27483415603637695, "rewards/rejected": -2.2062268257141113, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 24.31015030694323, "learning_rate": 3.867832141277539e-08, "logits/chosen": -2.0323398113250732, "logits/rejected": -2.0235486030578613, "logps/chosen": -1.067231297492981, "logps/rejected": -1.1818373203277588, "loss": 1.6431, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.134462594985962, "rewards/margins": 0.229211688041687, "rewards/rejected": -2.3636746406555176, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 24.734749632652033, "learning_rate": 3.850239188479606e-08, "logits/chosen": -1.988034963607788, "logits/rejected": -1.9918092489242554, "logps/chosen": -1.0088120698928833, "logps/rejected": -1.102031946182251, "loss": 1.6752, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0176241397857666, "rewards/margins": 0.1864398568868637, "rewards/rejected": -2.204063892364502, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 24.854589205091994, "learning_rate": 3.832551319442151e-08, "logits/chosen": -2.057304620742798, "logits/rejected": -2.058384418487549, "logps/chosen": -1.055830955505371, "logps/rejected": -1.1856542825698853, "loss": 1.6188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.111661911010742, "rewards/margins": 0.2596468925476074, "rewards/rejected": -2.3713085651397705, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 20.224773050019834, "learning_rate": 3.81476977754933e-08, "logits/chosen": -1.9548689126968384, "logits/rejected": -1.9512507915496826, "logps/chosen": -1.0260975360870361, "logps/rejected": -1.1001927852630615, "loss": 1.6994, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0521950721740723, "rewards/margins": 0.1481909602880478, "rewards/rejected": -2.200385570526123, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 19.65296336042163, "learning_rate": 3.796895812770114e-08, "logits/chosen": -1.9807981252670288, "logits/rejected": -1.9817959070205688, "logps/chosen": -1.0159375667572021, "logps/rejected": -1.1112862825393677, "loss": 1.6766, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0318751335144043, "rewards/margins": 0.19069740176200867, "rewards/rejected": -2.2225725650787354, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 25.584883366560767, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -2.0063586235046387, "logits/rejected": -2.0042014122009277, "logps/chosen": -1.0065621137619019, "logps/rejected": -1.0807863473892212, "loss": 1.7037, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0131242275238037, "rewards/margins": 0.14844843745231628, "rewards/rejected": -2.1615726947784424, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 21.31154348253332, "learning_rate": 3.760875646824795e-08, "logits/chosen": -1.935102105140686, "logits/rejected": -1.9389255046844482, "logps/chosen": -0.9742142558097839, "logps/rejected": -1.0820832252502441, "loss": 1.6571, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9484285116195679, "rewards/margins": 0.2157379686832428, "rewards/rejected": -2.1641664505004883, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 26.46059734693064, "learning_rate": 3.742731977727623e-08, "logits/chosen": -2.0295019149780273, "logits/rejected": -2.026423931121826, "logps/chosen": -1.0395313501358032, "logps/rejected": -1.1799407005310059, "loss": 1.6054, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0790627002716064, "rewards/margins": 0.28081852197647095, "rewards/rejected": -2.3598814010620117, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 23.581318925351276, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.9701932668685913, "logits/rejected": -1.962283730506897, "logps/chosen": -1.0118839740753174, "logps/rejected": -1.152519941329956, "loss": 1.5992, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0237679481506348, "rewards/margins": 0.28127187490463257, "rewards/rejected": -2.305039882659912, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 21.688577806368546, "learning_rate": 3.7061838443196886e-08, "logits/chosen": -2.0115764141082764, "logits/rejected": -2.013331890106201, "logps/chosen": -1.0244637727737427, "logps/rejected": -1.1531237363815308, "loss": 1.6164, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0489275455474854, "rewards/margins": 0.25731992721557617, "rewards/rejected": -2.3062474727630615, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 26.535687264689457, "learning_rate": 3.68778194919179e-08, "logits/chosen": -1.9832502603530884, "logits/rejected": -1.9843780994415283, "logps/chosen": -1.0770550966262817, "logps/rejected": -1.2046512365341187, "loss": 1.6212, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1541101932525635, "rewards/margins": 0.2551923096179962, "rewards/rejected": -2.4093024730682373, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 23.510504161502794, "learning_rate": 3.66929655789747e-08, "logits/chosen": -2.0297393798828125, "logits/rejected": -2.018545389175415, "logps/chosen": -0.9388678669929504, "logps/rejected": -1.096282720565796, "loss": 1.5836, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8777357339859009, "rewards/margins": 0.3148294985294342, "rewards/rejected": -2.192565441131592, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 19.679562411841342, "learning_rate": 3.6507289698834064e-08, "logits/chosen": -1.9751808643341064, "logits/rejected": -1.9716453552246094, "logps/chosen": -0.9839709401130676, "logps/rejected": -1.1197161674499512, "loss": 1.6221, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9679418802261353, "rewards/margins": 0.27149051427841187, "rewards/rejected": -2.2394323348999023, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 28.804133744116083, "learning_rate": 3.6320804903743684e-08, "logits/chosen": -2.0218136310577393, "logits/rejected": -2.0213685035705566, "logps/chosen": -1.0343354940414429, "logps/rejected": -1.1643495559692383, "loss": 1.624, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0686709880828857, "rewards/margins": 0.2600279450416565, "rewards/rejected": -2.3286991119384766, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 19.81307875577205, "learning_rate": 3.61335243028146e-08, "logits/chosen": -2.0029590129852295, "logits/rejected": -2.0075385570526123, "logps/chosen": -1.0911086797714233, "logps/rejected": -1.2263154983520508, "loss": 1.6167, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1822173595428467, "rewards/margins": 0.2704135477542877, "rewards/rejected": -2.4526309967041016, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 21.90261440649499, "learning_rate": 3.5945461061099736e-08, "logits/chosen": -1.96405827999115, "logits/rejected": -1.9503686428070068, "logps/chosen": -1.0430338382720947, "logps/rejected": -1.1261696815490723, "loss": 1.7049, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0860676765441895, "rewards/margins": 0.16627170145511627, "rewards/rejected": -2.2523393630981445, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 23.04646656897602, "learning_rate": 3.5756628398668446e-08, "logits/chosen": -2.050785541534424, "logits/rejected": -2.0557262897491455, "logps/chosen": -1.1324259042739868, "logps/rejected": -1.2370188236236572, "loss": 1.6725, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2648518085479736, "rewards/margins": 0.20918580889701843, "rewards/rejected": -2.4740376472473145, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 21.762752194355947, "learning_rate": 3.556703958967716e-08, "logits/chosen": -2.036355972290039, "logits/rejected": -2.031594753265381, "logps/chosen": -1.0511292219161987, "logps/rejected": -1.1884231567382812, "loss": 1.6162, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1022584438323975, "rewards/margins": 0.2745879292488098, "rewards/rejected": -2.3768463134765625, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 27.92888301379295, "learning_rate": 3.5376707961436297e-08, "logits/chosen": -2.0217976570129395, "logits/rejected": -2.0158653259277344, "logps/chosen": -1.1400907039642334, "logps/rejected": -1.2049676179885864, "loss": 1.7157, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.280181407928467, "rewards/margins": 0.1297537386417389, "rewards/rejected": -2.409935235977173, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 15.566471687063434, "learning_rate": 3.51856468934734e-08, "logits/chosen": -1.9757928848266602, "logits/rejected": -1.977355718612671, "logps/chosen": -0.9757205843925476, "logps/rejected": -1.0714739561080933, "loss": 1.6629, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9514411687850952, "rewards/margins": 0.19150659441947937, "rewards/rejected": -2.1429479122161865, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 23.43640208355738, "learning_rate": 3.499386981659262e-08, "logits/chosen": -2.059962749481201, "logits/rejected": -2.0546345710754395, "logps/chosen": -1.018078327178955, "logps/rejected": -1.2125370502471924, "loss": 1.5382, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.03615665435791, "rewards/margins": 0.3889172375202179, "rewards/rejected": -2.4250741004943848, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 25.44005060037713, "learning_rate": 3.480139021193057e-08, "logits/chosen": -1.9784746170043945, "logits/rejected": -1.980343222618103, "logps/chosen": -0.996778130531311, "logps/rejected": -1.1208679676055908, "loss": 1.6397, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.993556261062622, "rewards/margins": 0.24817998707294464, "rewards/rejected": -2.2417359352111816, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 31.448844065081957, "learning_rate": 3.4608221610008666e-08, "logits/chosen": -2.010075807571411, "logits/rejected": -2.0055999755859375, "logps/chosen": -0.9734475016593933, "logps/rejected": -1.127052903175354, "loss": 1.5902, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.9468950033187866, "rewards/margins": 0.3072105944156647, "rewards/rejected": -2.254105806350708, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 18.643772728000954, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.9838117361068726, "logits/rejected": -1.9927136898040771, "logps/chosen": -1.0180085897445679, "logps/rejected": -1.15524423122406, "loss": 1.6209, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0360171794891357, "rewards/margins": 0.2744712233543396, "rewards/rejected": -2.31048846244812, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 21.179875662198093, "learning_rate": 3.4219871777684745e-08, "logits/chosen": -1.9955089092254639, "logits/rejected": -1.9833972454071045, "logps/chosen": -0.9941864013671875, "logps/rejected": -1.119760513305664, "loss": 1.6371, "rewards/accuracies": 0.5625, "rewards/chosen": -1.988372802734375, "rewards/margins": 0.25114771723747253, "rewards/rejected": -2.239521026611328, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 21.161991965438514, "learning_rate": 3.4024717846672364e-08, "logits/chosen": -2.0328078269958496, "logits/rejected": -2.0262277126312256, "logps/chosen": -0.994450569152832, "logps/rejected": -1.1281460523605347, "loss": 1.622, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.988901138305664, "rewards/margins": 0.2673908770084381, "rewards/rejected": -2.2562921047210693, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 20.855334911295476, "learning_rate": 3.382892951526036e-08, "logits/chosen": -2.012280225753784, "logits/rejected": -2.0093696117401123, "logps/chosen": -1.0515209436416626, "logps/rejected": -1.2058277130126953, "loss": 1.5835, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.103041887283325, "rewards/margins": 0.3086133897304535, "rewards/rejected": -2.4116554260253906, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 24.525831599188695, "learning_rate": 3.3632520546559974e-08, "logits/chosen": -1.9768329858779907, "logits/rejected": -1.9647411108016968, "logps/chosen": -0.9254377484321594, "logps/rejected": -1.1011799573898315, "loss": 1.5435, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8508754968643188, "rewards/margins": 0.3514845371246338, "rewards/rejected": -2.202359914779663, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 22.285654330604462, "learning_rate": 3.34355047473107e-08, "logits/chosen": -1.9928748607635498, "logits/rejected": -1.9885931015014648, "logps/chosen": -1.0292783975601196, "logps/rejected": -1.1223747730255127, "loss": 1.6824, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0585567951202393, "rewards/margins": 0.18619287014007568, "rewards/rejected": -2.2447495460510254, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 26.73971486468477, "learning_rate": 3.323789596690971e-08, "logits/chosen": -1.9690624475479126, "logits/rejected": -1.970073938369751, "logps/chosen": -1.021382212638855, "logps/rejected": -1.1601226329803467, "loss": 1.6047, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.04276442527771, "rewards/margins": 0.2774810194969177, "rewards/rejected": -2.3202452659606934, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 18.412875989410708, "learning_rate": 3.303970809643828e-08, "logits/chosen": -1.999266266822815, "logits/rejected": -2.0038928985595703, "logps/chosen": -1.0347532033920288, "logps/rejected": -1.1695308685302734, "loss": 1.6181, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0695064067840576, "rewards/margins": 0.2695554196834564, "rewards/rejected": -2.339061737060547, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 24.682869463381866, "learning_rate": 3.2840955067685356e-08, "logits/chosen": -2.028286933898926, "logits/rejected": -2.032379627227783, "logps/chosen": -1.0541026592254639, "logps/rejected": -1.2088960409164429, "loss": 1.5824, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1082053184509277, "rewards/margins": 0.30958667397499084, "rewards/rejected": -2.4177920818328857, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 20.38443501401726, "learning_rate": 3.264165085216817e-08, "logits/chosen": -2.0381264686584473, "logits/rejected": -2.0381598472595215, "logps/chosen": -0.9351852536201477, "logps/rejected": -1.1091148853302002, "loss": 1.5565, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8703705072402954, "rewards/margins": 0.34785938262939453, "rewards/rejected": -2.2182297706604004, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 21.798655439946447, "learning_rate": 3.244180946015008e-08, "logits/chosen": -1.9653640985488892, "logits/rejected": -1.966282844543457, "logps/chosen": -1.0345097780227661, "logps/rejected": -1.102309226989746, "loss": 1.7142, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0690195560455322, "rewards/margins": 0.13559898734092712, "rewards/rejected": -2.204618453979492, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 18.91754321797937, "learning_rate": 3.224144493965578e-08, "logits/chosen": -2.046643018722534, "logits/rejected": -2.050302028656006, "logps/chosen": -0.9918837547302246, "logps/rejected": -1.1018450260162354, "loss": 1.6494, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9837675094604492, "rewards/margins": 0.21992245316505432, "rewards/rejected": -2.2036900520324707, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 20.49760479029081, "learning_rate": 3.204057137548371e-08, "logits/chosen": -2.0062310695648193, "logits/rejected": -2.0008187294006348, "logps/chosen": -0.9782951474189758, "logps/rejected": -1.0901457071304321, "loss": 1.646, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9565902948379517, "rewards/margins": 0.22370114922523499, "rewards/rejected": -2.1802914142608643, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 22.654332768618016, "learning_rate": 3.183920288821597e-08, "logits/chosen": -1.9924176931381226, "logits/rejected": -1.9890445470809937, "logps/chosen": -1.0017935037612915, "logps/rejected": -1.170503854751587, "loss": 1.565, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.003587007522583, "rewards/margins": 0.3374207317829132, "rewards/rejected": -2.341007709503174, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 27.29789736524546, "learning_rate": 3.1637353633225735e-08, "logits/chosen": -2.0368828773498535, "logits/rejected": -2.0306804180145264, "logps/chosen": -1.0293101072311401, "logps/rejected": -1.1836047172546387, "loss": 1.5882, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0586202144622803, "rewards/margins": 0.30858898162841797, "rewards/rejected": -2.3672094345092773, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 23.20736242405514, "learning_rate": 3.143503779968213e-08, "logits/chosen": -2.0104708671569824, "logits/rejected": -2.0107173919677734, "logps/chosen": -1.0185365676879883, "logps/rejected": -1.1591142416000366, "loss": 1.6225, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0370731353759766, "rewards/margins": 0.2811557948589325, "rewards/rejected": -2.3182284832000732, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 20.51470441564086, "learning_rate": 3.1232269609552875e-08, "logits/chosen": -1.9965565204620361, "logits/rejected": -1.9940776824951172, "logps/chosen": -0.99919593334198, "logps/rejected": -1.1262236833572388, "loss": 1.6305, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.99839186668396, "rewards/margins": 0.25405532121658325, "rewards/rejected": -2.2524473667144775, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 18.89505804226463, "learning_rate": 3.102906331660444e-08, "logits/chosen": -2.051891326904297, "logits/rejected": -2.0434744358062744, "logps/chosen": -0.9929302334785461, "logps/rejected": -1.1725804805755615, "loss": 1.5482, "rewards/accuracies": 0.625, "rewards/chosen": -1.9858604669570923, "rewards/margins": 0.35930031538009644, "rewards/rejected": -2.345160961151123, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 18.959996565426806, "learning_rate": 3.082543320540015e-08, "logits/chosen": -1.9977144002914429, "logits/rejected": -1.9908920526504517, "logps/chosen": -1.0085601806640625, "logps/rejected": -1.1572504043579102, "loss": 1.5927, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.017120361328125, "rewards/margins": 0.29738035798072815, "rewards/rejected": -2.3145008087158203, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 21.810322562230194, "learning_rate": 3.062139359029599e-08, "logits/chosen": -2.0235819816589355, "logits/rejected": -2.0234787464141846, "logps/chosen": -1.0301491022109985, "logps/rejected": -1.119282841682434, "loss": 1.6836, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.060298204421997, "rewards/margins": 0.1782674938440323, "rewards/rejected": -2.238565683364868, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 22.48549010986462, "learning_rate": 3.041695881443437e-08, "logits/chosen": -2.0445263385772705, "logits/rejected": -2.0399861335754395, "logps/chosen": -0.976142406463623, "logps/rejected": -1.1127902269363403, "loss": 1.6074, "rewards/accuracies": 0.625, "rewards/chosen": -1.952284812927246, "rewards/margins": 0.27329590916633606, "rewards/rejected": -2.2255804538726807, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 26.647862266073773, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -2.032400608062744, "logits/rejected": -2.0326879024505615, "logps/chosen": -1.0005394220352173, "logps/rejected": -1.1441175937652588, "loss": 1.6004, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0010788440704346, "rewards/margins": 0.2871564030647278, "rewards/rejected": -2.2882351875305176, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 23.189218350680463, "learning_rate": 3.0006961290889077e-08, "logits/chosen": -2.0161643028259277, "logits/rejected": -2.0071606636047363, "logps/chosen": -1.1198402643203735, "logps/rejected": -1.294711947441101, "loss": 1.5796, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.239680528640747, "rewards/margins": 0.349743515253067, "rewards/rejected": -2.589423894882202, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 24.73130332357524, "learning_rate": 2.980142736433833e-08, "logits/chosen": -2.0039925575256348, "logits/rejected": -1.997148871421814, "logps/chosen": -1.0338767766952515, "logps/rejected": -1.1031367778778076, "loss": 1.7145, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.067753553390503, "rewards/margins": 0.1385200470685959, "rewards/rejected": -2.2062735557556152, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 28.756647199650402, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -2.0324764251708984, "logits/rejected": -2.0177855491638184, "logps/chosen": -1.1434462070465088, "logps/rejected": -1.2467788457870483, "loss": 1.6477, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.2868924140930176, "rewards/margins": 0.20666587352752686, "rewards/rejected": -2.4935576915740967, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 21.5393656583957, "learning_rate": 2.9389361421596725e-08, "logits/chosen": -1.9500755071640015, "logits/rejected": -1.9524322748184204, "logps/chosen": -1.0599329471588135, "logps/rejected": -1.2015798091888428, "loss": 1.6039, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.119865894317627, "rewards/margins": 0.28329387307167053, "rewards/rejected": -2.4031596183776855, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 23.41149211531384, "learning_rate": 2.9182858371940126e-08, "logits/chosen": -2.033541440963745, "logits/rejected": -2.028110980987549, "logps/chosen": -1.0496987104415894, "logps/rejected": -1.185601830482483, "loss": 1.6118, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0993974208831787, "rewards/margins": 0.27180591225624084, "rewards/rejected": -2.371203660964966, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 21.934605810431464, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.9868707656860352, "logits/rejected": -1.9957765340805054, "logps/chosen": -0.9363808631896973, "logps/rejected": -1.0734131336212158, "loss": 1.6178, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8727617263793945, "rewards/margins": 0.2740646004676819, "rewards/rejected": -2.1468262672424316, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 25.364146062349946, "learning_rate": 2.8768984696593384e-08, "logits/chosen": -1.9792448282241821, "logits/rejected": -1.969897985458374, "logps/chosen": -1.0175460577011108, "logps/rejected": -1.1425501108169556, "loss": 1.6413, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0350921154022217, "rewards/margins": 0.2500078082084656, "rewards/rejected": -2.285100221633911, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 21.74965631180048, "learning_rate": 2.8561643164513637e-08, "logits/chosen": -1.9004192352294922, "logits/rejected": -1.896916389465332, "logps/chosen": -1.0500614643096924, "logps/rejected": -1.1739121675491333, "loss": 1.6305, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1001229286193848, "rewards/margins": 0.24770136177539825, "rewards/rejected": -2.3478243350982666, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 23.658035524464136, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -1.9793847799301147, "logits/rejected": -1.9849811792373657, "logps/chosen": -1.0610580444335938, "logps/rejected": -1.18191659450531, "loss": 1.633, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1221160888671875, "rewards/margins": 0.24171726405620575, "rewards/rejected": -2.36383318901062, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 23.128495896756156, "learning_rate": 2.8146223586784573e-08, "logits/chosen": -1.9796043634414673, "logits/rejected": -1.9717155694961548, "logps/chosen": -1.0671274662017822, "logps/rejected": -1.20950448513031, "loss": 1.6081, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1342549324035645, "rewards/margins": 0.28475409746170044, "rewards/rejected": -2.41900897026062, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 30.055700048770536, "learning_rate": 2.7938174743416205e-08, "logits/chosen": -1.9360496997833252, "logits/rejected": -1.9328253269195557, "logps/chosen": -1.0519496202468872, "logps/rejected": -1.1694730520248413, "loss": 1.6427, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1038992404937744, "rewards/margins": 0.23504677414894104, "rewards/rejected": -2.3389461040496826, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 23.129654916877083, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -1.99872624874115, "logits/rejected": -1.9997644424438477, "logps/chosen": -1.113937497138977, "logps/rejected": -1.2002607583999634, "loss": 1.692, "rewards/accuracies": 0.59375, "rewards/chosen": -2.227874994277954, "rewards/margins": 0.17264637351036072, "rewards/rejected": -2.4005215167999268, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 22.434070942115937, "learning_rate": 2.7521472071516772e-08, "logits/chosen": -1.9986212253570557, "logits/rejected": -1.9973223209381104, "logps/chosen": -0.9446593523025513, "logps/rejected": -1.0659997463226318, "loss": 1.6354, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8893187046051025, "rewards/margins": 0.24268050491809845, "rewards/rejected": -2.1319994926452637, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 25.289102625881196, "learning_rate": 2.731284753546289e-08, "logits/chosen": -1.9849393367767334, "logits/rejected": -1.983152985572815, "logps/chosen": -1.0848243236541748, "logps/rejected": -1.2314993143081665, "loss": 1.5972, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1696486473083496, "rewards/margins": 0.2933499813079834, "rewards/rejected": -2.462998628616333, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 25.47161141421992, "learning_rate": 2.710406041577751e-08, "logits/chosen": -2.0457510948181152, "logits/rejected": -2.0424182415008545, "logps/chosen": -1.0336768627166748, "logps/rejected": -1.193527102470398, "loss": 1.5814, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0673537254333496, "rewards/margins": 0.3197005093097687, "rewards/rejected": -2.387054204940796, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 21.178219637982075, "learning_rate": 2.6895125389333017e-08, "logits/chosen": -2.007823944091797, "logits/rejected": -2.003513813018799, "logps/chosen": -1.025743842124939, "logps/rejected": -1.186995267868042, "loss": 1.5756, "rewards/accuracies": 0.625, "rewards/chosen": -2.051487684249878, "rewards/margins": 0.3225029706954956, "rewards/rejected": -2.373990535736084, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 20.17448755091904, "learning_rate": 2.6686057143399028e-08, "logits/chosen": -2.004152774810791, "logits/rejected": -2.0057718753814697, "logps/chosen": -1.0627915859222412, "logps/rejected": -1.1675150394439697, "loss": 1.6769, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1255831718444824, "rewards/margins": 0.20944683253765106, "rewards/rejected": -2.3350300788879395, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 22.801188380825646, "learning_rate": 2.647687037460996e-08, "logits/chosen": -2.012364149093628, "logits/rejected": -2.0119943618774414, "logps/chosen": -1.087121844291687, "logps/rejected": -1.2915680408477783, "loss": 1.5342, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.174243688583374, "rewards/margins": 0.4088924825191498, "rewards/rejected": -2.5831360816955566, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 24.06904938833163, "learning_rate": 2.626757978793187e-08, "logits/chosen": -2.013692855834961, "logits/rejected": -2.0071637630462646, "logps/chosen": -1.089404821395874, "logps/rejected": -1.219020128250122, "loss": 1.631, "rewards/accuracies": 0.625, "rewards/chosen": -2.178809642791748, "rewards/margins": 0.2592305541038513, "rewards/rejected": -2.438040256500244, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 27.544711922367103, "learning_rate": 2.6058200095628797e-08, "logits/chosen": -1.9903295040130615, "logits/rejected": -1.9937385320663452, "logps/chosen": -0.9185327291488647, "logps/rejected": -1.098573088645935, "loss": 1.5548, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8370654582977295, "rewards/margins": 0.36008089780807495, "rewards/rejected": -2.19714617729187, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 22.727902157451695, "learning_rate": 2.584874601622854e-08, "logits/chosen": -2.051220417022705, "logits/rejected": -2.042055368423462, "logps/chosen": -1.0876684188842773, "logps/rejected": -1.2251036167144775, "loss": 1.6311, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1753368377685547, "rewards/margins": 0.2748700678348541, "rewards/rejected": -2.450207233428955, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 25.621292936059685, "learning_rate": 2.5639232273487993e-08, "logits/chosen": -1.9734121561050415, "logits/rejected": -1.9636056423187256, "logps/chosen": -0.9798167943954468, "logps/rejected": -1.1094855070114136, "loss": 1.6263, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9596335887908936, "rewards/margins": 0.2593373954296112, "rewards/rejected": -2.218971014022827, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 25.394662467686896, "learning_rate": 2.5429673595358142e-08, "logits/chosen": -2.011430263519287, "logits/rejected": -2.010061264038086, "logps/chosen": -1.0478408336639404, "logps/rejected": -1.1755692958831787, "loss": 1.626, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.095681667327881, "rewards/margins": 0.2554570436477661, "rewards/rejected": -2.3511385917663574, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 27.708485064047853, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.977311134338379, "logits/rejected": -1.9665164947509766, "logps/chosen": -1.1222455501556396, "logps/rejected": -1.2458299398422241, "loss": 1.6213, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.2444911003112793, "rewards/margins": 0.24716897308826447, "rewards/rejected": -2.4916598796844482, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 22.4782708123149, "learning_rate": 2.5010480359492838e-08, "logits/chosen": -1.9557056427001953, "logits/rejected": -1.9528486728668213, "logps/chosen": -1.053770661354065, "logps/rejected": -1.1224170923233032, "loss": 1.7247, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.10754132270813, "rewards/margins": 0.13729265332221985, "rewards/rejected": -2.2448341846466064, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 25.005432685859258, "learning_rate": 2.480087526931091e-08, "logits/chosen": -2.002633571624756, "logits/rejected": -1.9903074502944946, "logps/chosen": -1.005736231803894, "logps/rejected": -1.1290199756622314, "loss": 1.6409, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.011472463607788, "rewards/margins": 0.2465672492980957, "rewards/rejected": -2.258039951324463, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 21.988807710224364, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -1.9641501903533936, "logits/rejected": -1.9605156183242798, "logps/chosen": -1.0775701999664307, "logps/rejected": -1.164699912071228, "loss": 1.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.1551403999328613, "rewards/margins": 0.1742594838142395, "rewards/rejected": -2.329399824142456, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 27.10350169462609, "learning_rate": 2.4381721815274443e-08, "logits/chosen": -2.032998561859131, "logits/rejected": -2.033088207244873, "logps/chosen": -1.023777723312378, "logps/rejected": -1.1642982959747314, "loss": 1.6157, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.047555446624756, "rewards/margins": 0.2810412347316742, "rewards/rejected": -2.328596591949463, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 23.12831763727554, "learning_rate": 2.4172202916176936e-08, "logits/chosen": -2.039386749267578, "logits/rejected": -2.0418286323547363, "logps/chosen": -0.9720922708511353, "logps/rejected": -1.1477550268173218, "loss": 1.5723, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9441845417022705, "rewards/margins": 0.35132530331611633, "rewards/rejected": -2.2955100536346436, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 22.606325423263293, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -1.983278512954712, "logits/rejected": -1.9811782836914062, "logps/chosen": -0.9583452343940735, "logps/rejected": -1.131553053855896, "loss": 1.5721, "rewards/accuracies": 0.59375, "rewards/chosen": -1.916690468788147, "rewards/margins": 0.34641581773757935, "rewards/rejected": -2.263106107711792, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 26.480966843343246, "learning_rate": 2.3753354414355334e-08, "logits/chosen": -1.9475946426391602, "logits/rejected": -1.9369595050811768, "logps/chosen": -1.0692945718765259, "logps/rejected": -1.1918474435806274, "loss": 1.6427, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1385891437530518, "rewards/margins": 0.2451055943965912, "rewards/rejected": -2.383694887161255, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 21.832228435816848, "learning_rate": 2.3544054254951408e-08, "logits/chosen": -1.9868978261947632, "logits/rejected": -1.9784021377563477, "logps/chosen": -0.9384185075759888, "logps/rejected": -1.1477484703063965, "loss": 1.5162, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8768370151519775, "rewards/margins": 0.41866016387939453, "rewards/rejected": -2.295496940612793, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 21.6438925609118, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -2.0314948558807373, "logits/rejected": -2.0240464210510254, "logps/chosen": -1.0995467901229858, "logps/rejected": -1.1759902238845825, "loss": 1.7071, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.1990935802459717, "rewards/margins": 0.15288688242435455, "rewards/rejected": -2.351980447769165, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 23.54141309768647, "learning_rate": 2.3125775682807826e-08, "logits/chosen": -2.047222852706909, "logits/rejected": -2.0470428466796875, "logps/chosen": -1.1707763671875, "logps/rejected": -1.2789132595062256, "loss": 1.6606, "rewards/accuracies": 0.625, "rewards/chosen": -2.341552734375, "rewards/margins": 0.21627375483512878, "rewards/rejected": -2.557826519012451, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 24.08562200155348, "learning_rate": 2.291682667332464e-08, "logits/chosen": -2.0585429668426514, "logits/rejected": -2.0534658432006836, "logps/chosen": -1.0525509119033813, "logps/rejected": -1.188474416732788, "loss": 1.6182, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1051018238067627, "rewards/margins": 0.27184706926345825, "rewards/rejected": -2.376948833465576, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 19.19031720086758, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -2.017749309539795, "logits/rejected": -2.0119831562042236, "logps/chosen": -1.0370593070983887, "logps/rejected": -1.2245299816131592, "loss": 1.5611, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0741186141967773, "rewards/margins": 0.37494122982025146, "rewards/rejected": -2.4490599632263184, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 26.402120685629185, "learning_rate": 2.2499382647765797e-08, "logits/chosen": -2.0183489322662354, "logits/rejected": -2.014817476272583, "logps/chosen": -1.0770655870437622, "logps/rejected": -1.1719729900360107, "loss": 1.6807, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1541311740875244, "rewards/margins": 0.18981480598449707, "rewards/rejected": -2.3439459800720215, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 25.14482018520918, "learning_rate": 2.2290916976281427e-08, "logits/chosen": -2.00168514251709, "logits/rejected": -1.9956331253051758, "logps/chosen": -1.0041890144348145, "logps/rejected": -1.146061897277832, "loss": 1.6336, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.008378028869629, "rewards/margins": 0.2837457060813904, "rewards/rejected": -2.292123794555664, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 21.890039531986915, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -1.9787976741790771, "logits/rejected": -1.9721981287002563, "logps/chosen": -1.0208359956741333, "logps/rejected": -1.2223812341690063, "loss": 1.526, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.0416719913482666, "rewards/margins": 0.4030904769897461, "rewards/rejected": -2.4447624683380127, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 23.372489835442224, "learning_rate": 2.1874571586252177e-08, "logits/chosen": -2.0238265991210938, "logits/rejected": -2.0168251991271973, "logps/chosen": -1.0317771434783936, "logps/rejected": -1.116687536239624, "loss": 1.6916, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.063554286956787, "rewards/margins": 0.1698208600282669, "rewards/rejected": -2.233375072479248, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 23.848360204628783, "learning_rate": 2.1666721135069037e-08, "logits/chosen": -2.013340473175049, "logits/rejected": -2.0101757049560547, "logps/chosen": -1.1160831451416016, "logps/rejected": -1.216787338256836, "loss": 1.6787, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.232166290283203, "rewards/margins": 0.20140838623046875, "rewards/rejected": -2.433574676513672, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 18.807956131645753, "learning_rate": 2.145910499962628e-08, "logits/chosen": -2.0611259937286377, "logits/rejected": -2.053446054458618, "logps/chosen": -0.9635663032531738, "logps/rejected": -1.1173702478408813, "loss": 1.6001, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9271326065063477, "rewards/margins": 0.307607501745224, "rewards/rejected": -2.2347404956817627, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 26.945758127617577, "learning_rate": 2.1251737774480915e-08, "logits/chosen": -2.0376479625701904, "logits/rejected": -2.0281331539154053, "logps/chosen": -1.1738313436508179, "logps/rejected": -1.2722257375717163, "loss": 1.7024, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.3476626873016357, "rewards/margins": 0.19678860902786255, "rewards/rejected": -2.5444514751434326, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 20.196166037220575, "learning_rate": 2.104463403669264e-08, "logits/chosen": -1.9929859638214111, "logits/rejected": -1.9903604984283447, "logps/chosen": -1.0489277839660645, "logps/rejected": -1.2035558223724365, "loss": 1.5986, "rewards/accuracies": 0.625, "rewards/chosen": -2.097855567932129, "rewards/margins": 0.30925604701042175, "rewards/rejected": -2.407111644744873, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 20.20560502456053, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.969743013381958, "logits/rejected": -1.965444564819336, "logps/chosen": -0.9422667622566223, "logps/rejected": -1.085220217704773, "loss": 1.6026, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8845335245132446, "rewards/margins": 0.28590697050094604, "rewards/rejected": -2.170440435409546, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 22.23939047890069, "learning_rate": 2.063127523779219e-08, "logits/chosen": -1.972491979598999, "logits/rejected": -1.968279480934143, "logps/chosen": -1.0105453729629517, "logps/rejected": -1.2083196640014648, "loss": 1.5204, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0210907459259033, "rewards/margins": 0.39554858207702637, "rewards/rejected": -2.4166393280029297, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 24.255549525289887, "learning_rate": 2.0425049234096737e-08, "logits/chosen": -1.9838634729385376, "logits/rejected": -1.9782577753067017, "logps/chosen": -1.0133405923843384, "logps/rejected": -1.140114188194275, "loss": 1.64, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0266811847686768, "rewards/margins": 0.25354695320129395, "rewards/rejected": -2.28022837638855, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 22.981178435808168, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.9544254541397095, "logits/rejected": -1.9535505771636963, "logps/chosen": -1.0188138484954834, "logps/rejected": -1.1775290966033936, "loss": 1.5995, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.037627696990967, "rewards/margins": 0.3174312710762024, "rewards/rejected": -2.355058193206787, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 21.549806800618708, "learning_rate": 2.0013576501378823e-08, "logits/chosen": -1.9713141918182373, "logits/rejected": -1.9649717807769775, "logps/chosen": -1.0106983184814453, "logps/rejected": -1.1603963375091553, "loss": 1.6075, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0213966369628906, "rewards/margins": 0.299396276473999, "rewards/rejected": -2.3207926750183105, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 24.091025119222916, "learning_rate": 1.9808358697190426e-08, "logits/chosen": -1.9644063711166382, "logits/rejected": -1.9606761932373047, "logps/chosen": -0.9372221231460571, "logps/rejected": -1.0825825929641724, "loss": 1.6166, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8744442462921143, "rewards/margins": 0.29072102904319763, "rewards/rejected": -2.1651651859283447, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 25.850963029851194, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -2.0107369422912598, "logits/rejected": -2.0008318424224854, "logps/chosen": -0.9533821940422058, "logps/rejected": -1.132520079612732, "loss": 1.5549, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9067643880844116, "rewards/margins": 0.35827580094337463, "rewards/rejected": -2.265040159225464, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 24.220299047462706, "learning_rate": 1.9399032341961886e-08, "logits/chosen": -1.9717953205108643, "logits/rejected": -1.9558881521224976, "logps/chosen": -0.9942399263381958, "logps/rejected": -1.0755915641784668, "loss": 1.7049, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -1.9884798526763916, "rewards/margins": 0.16270342469215393, "rewards/rejected": -2.1511831283569336, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 30.892400879462205, "learning_rate": 1.9194952564874323e-08, "logits/chosen": -2.0171079635620117, "logits/rejected": -2.0111467838287354, "logps/chosen": -1.070192813873291, "logps/rejected": -1.2229851484298706, "loss": 1.5849, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.140385627746582, "rewards/margins": 0.30558472871780396, "rewards/rejected": -2.445970296859741, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 23.431545625970763, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.9772651195526123, "logits/rejected": -1.9714912176132202, "logps/chosen": -1.068394422531128, "logps/rejected": -1.1617610454559326, "loss": 1.6824, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.136788845062256, "rewards/margins": 0.1867334395647049, "rewards/rejected": -2.3235220909118652, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 20.004677801235854, "learning_rate": 1.8788031540572327e-08, "logits/chosen": -1.978497862815857, "logits/rejected": -1.9703800678253174, "logps/chosen": -1.0038588047027588, "logps/rejected": -1.161897897720337, "loss": 1.587, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0077176094055176, "rewards/margins": 0.3160780966281891, "rewards/rejected": -2.323795795440674, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 20.24036550761059, "learning_rate": 1.858521889822565e-08, "logits/chosen": -1.9954450130462646, "logits/rejected": -1.9979982376098633, "logps/chosen": -0.9771059155464172, "logps/rejected": -1.0936977863311768, "loss": 1.6527, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -1.9542118310928345, "rewards/margins": 0.23318378627300262, "rewards/rejected": -2.1873955726623535, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 20.66519853223484, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -1.9780279397964478, "logits/rejected": -1.9730167388916016, "logps/chosen": -0.9886795878410339, "logps/rejected": -1.1276530027389526, "loss": 1.6026, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9773591756820679, "rewards/margins": 0.27794694900512695, "rewards/rejected": -2.2553060054779053, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 26.108731556451694, "learning_rate": 1.8180960636595234e-08, "logits/chosen": -1.960745096206665, "logits/rejected": -1.9583278894424438, "logps/chosen": -1.0417678356170654, "logps/rejected": -1.1960971355438232, "loss": 1.5969, "rewards/accuracies": 0.5625, "rewards/chosen": -2.083535671234131, "rewards/margins": 0.3086581826210022, "rewards/rejected": -2.3921942710876465, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 23.584221176641442, "learning_rate": 1.7979543434998015e-08, "logits/chosen": -2.031872272491455, "logits/rejected": -2.0369346141815186, "logps/chosen": -1.1310575008392334, "logps/rejected": -1.2239940166473389, "loss": 1.6753, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.262115001678467, "rewards/margins": 0.18587279319763184, "rewards/rejected": -2.4479880332946777, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 31.364543417190443, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -1.991267442703247, "logits/rejected": -1.9845441579818726, "logps/chosen": -1.1004201173782349, "logps/rejected": -1.2034924030303955, "loss": 1.6821, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.2008402347564697, "rewards/margins": 0.2061447650194168, "rewards/rejected": -2.406984806060791, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 27.52041460967926, "learning_rate": 1.757820368323213e-08, "logits/chosen": -1.9875816106796265, "logits/rejected": -1.9780056476593018, "logps/chosen": -1.110682725906372, "logps/rejected": -1.279497742652893, "loss": 1.5744, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.221365451812744, "rewards/margins": 0.33762988448143005, "rewards/rejected": -2.558995485305786, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 25.96541697879584, "learning_rate": 1.7378309345590803e-08, "logits/chosen": -2.0010290145874023, "logits/rejected": -2.010678291320801, "logps/chosen": -1.092329502105713, "logps/rejected": -1.2433438301086426, "loss": 1.6021, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.184659004211426, "rewards/margins": 0.30202871561050415, "rewards/rejected": -2.486687660217285, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 23.602517249179833, "learning_rate": 1.717895078133088e-08, "logits/chosen": -2.050629138946533, "logits/rejected": -2.046673059463501, "logps/chosen": -1.0645157098770142, "logps/rejected": -1.2170007228851318, "loss": 1.5971, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1290314197540283, "rewards/margins": 0.3049699366092682, "rewards/rejected": -2.4340014457702637, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 25.434308420606417, "learning_rate": 1.698014200453624e-08, "logits/chosen": -2.0077168941497803, "logits/rejected": -2.0154366493225098, "logps/chosen": -1.0369855165481567, "logps/rejected": -1.1728894710540771, "loss": 1.6069, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0739710330963135, "rewards/margins": 0.2718077301979065, "rewards/rejected": -2.3457789421081543, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 29.16858521012446, "learning_rate": 1.6781896990642964e-08, "logits/chosen": -1.9381119012832642, "logits/rejected": -1.935669183731079, "logps/chosen": -1.1523436307907104, "logps/rejected": -1.2525025606155396, "loss": 1.6738, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.304687261581421, "rewards/margins": 0.20031766593456268, "rewards/rejected": -2.505005121231079, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 27.868014793291156, "learning_rate": 1.658422967545693e-08, "logits/chosen": -2.040616750717163, "logits/rejected": -2.027702808380127, "logps/chosen": -1.0133957862854004, "logps/rejected": -1.1359028816223145, "loss": 1.6456, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.026791572570801, "rewards/margins": 0.2450142651796341, "rewards/rejected": -2.271805763244629, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 25.46611731423695, "learning_rate": 1.638715395417418e-08, "logits/chosen": -2.0178420543670654, "logits/rejected": -2.0159246921539307, "logps/chosen": -1.0762913227081299, "logps/rejected": -1.2214852571487427, "loss": 1.6057, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1525826454162598, "rewards/margins": 0.2903878688812256, "rewards/rejected": -2.4429705142974854, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 26.127102018934355, "learning_rate": 1.619068368040416e-08, "logits/chosen": -2.016381025314331, "logits/rejected": -2.0123202800750732, "logps/chosen": -1.006974458694458, "logps/rejected": -1.1961872577667236, "loss": 1.5383, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.013948917388916, "rewards/margins": 0.3784259855747223, "rewards/rejected": -2.3923745155334473, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 20.1846253317206, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -1.9545295238494873, "logits/rejected": -1.9548801183700562, "logps/chosen": -1.0389777421951294, "logps/rejected": -1.1589481830596924, "loss": 1.6403, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.077955484390259, "rewards/margins": 0.2399410903453827, "rewards/rejected": -2.3178963661193848, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 24.5836136749927, "learning_rate": 1.5799614676066906e-08, "logits/chosen": -2.0632104873657227, "logits/rejected": -2.0603370666503906, "logps/chosen": -0.9570878744125366, "logps/rejected": -1.1055129766464233, "loss": 1.5921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9141757488250732, "rewards/margins": 0.29685014486312866, "rewards/rejected": -2.2110259532928467, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 18.998833098514673, "learning_rate": 1.560504343603587e-08, "logits/chosen": -1.9754539728164673, "logits/rejected": -1.9758739471435547, "logps/chosen": -1.0740633010864258, "logps/rejected": -1.239249348640442, "loss": 1.5754, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1481266021728516, "rewards/margins": 0.33037224411964417, "rewards/rejected": -2.478498697280884, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 22.303646966857205, "learning_rate": 1.541113262265748e-08, "logits/chosen": -2.0610458850860596, "logits/rejected": -2.0591893196105957, "logps/chosen": -1.0352472066879272, "logps/rejected": -1.1631605625152588, "loss": 1.6291, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0704944133758545, "rewards/margins": 0.2558266520500183, "rewards/rejected": -2.3263211250305176, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 30.315418813112018, "learning_rate": 1.5217895867061227e-08, "logits/chosen": -2.0036141872406006, "logits/rejected": -1.9978039264678955, "logps/chosen": -1.0909672975540161, "logps/rejected": -1.1984527111053467, "loss": 1.6751, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1819345951080322, "rewards/margins": 0.21497103571891785, "rewards/rejected": -2.3969054222106934, "step": 2770 }, { "epoch": 2.0028818443804033, "grad_norm": 26.757307871717927, "learning_rate": 1.5025346752993098e-08, "logits/chosen": -1.9917653799057007, "logits/rejected": -1.9936946630477905, "logps/chosen": -1.0804020166397095, "logps/rejected": -1.2135058641433716, "loss": 1.6287, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.160804033279419, "rewards/margins": 0.2662079930305481, "rewards/rejected": -2.427011728286743, "step": 2780 }, { "epoch": 2.010086455331412, "grad_norm": 28.71516711767607, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -2.0444860458374023, "logits/rejected": -2.046851396560669, "logps/chosen": -1.0067435503005981, "logps/rejected": -1.2035396099090576, "loss": 1.5564, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0134871006011963, "rewards/margins": 0.3935920298099518, "rewards/rejected": -2.4070792198181152, "step": 2790 }, { "epoch": 2.0172910662824206, "grad_norm": 21.521526962630887, "learning_rate": 1.4642365541781993e-08, "logits/chosen": -1.9570014476776123, "logits/rejected": -1.9486595392227173, "logps/chosen": -1.0369818210601807, "logps/rejected": -1.2129871845245361, "loss": 1.565, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0739636421203613, "rewards/margins": 0.35201066732406616, "rewards/rejected": -2.4259743690490723, "step": 2800 }, { "epoch": 2.0244956772334293, "grad_norm": 20.735986073532114, "learning_rate": 1.4451960366636745e-08, "logits/chosen": -2.021026134490967, "logits/rejected": -2.032320737838745, "logps/chosen": -1.046454668045044, "logps/rejected": -1.190464735031128, "loss": 1.6037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.092909336090088, "rewards/margins": 0.2880205512046814, "rewards/rejected": -2.380929470062256, "step": 2810 }, { "epoch": 2.031700288184438, "grad_norm": 23.466486185487597, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -2.0087318420410156, "logits/rejected": -2.005138397216797, "logps/chosen": -1.0375940799713135, "logps/rejected": -1.2084938287734985, "loss": 1.5637, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.075188159942627, "rewards/margins": 0.3417995572090149, "rewards/rejected": -2.416987657546997, "step": 2820 }, { "epoch": 2.0389048991354466, "grad_norm": 23.31922472892848, "learning_rate": 1.407338779981389e-08, "logits/chosen": -1.987663984298706, "logits/rejected": -1.985594391822815, "logps/chosen": -0.9225015640258789, "logps/rejected": -1.1155905723571777, "loss": 1.5198, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8450031280517578, "rewards/margins": 0.386178195476532, "rewards/rejected": -2.2311811447143555, "step": 2830 }, { "epoch": 2.0461095100864553, "grad_norm": 25.250402894361468, "learning_rate": 1.3885247020224534e-08, "logits/chosen": -1.999804139137268, "logits/rejected": -1.9952776432037354, "logps/chosen": -1.010948896408081, "logps/rejected": -1.1508748531341553, "loss": 1.6127, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.021897792816162, "rewards/margins": 0.27985164523124695, "rewards/rejected": -2.3017497062683105, "step": 2840 }, { "epoch": 2.053314121037464, "grad_norm": 21.182177574612197, "learning_rate": 1.369788756187445e-08, "logits/chosen": -2.0057382583618164, "logits/rejected": -2.0026371479034424, "logps/chosen": -1.034969449043274, "logps/rejected": -1.1372885704040527, "loss": 1.6686, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.069938898086548, "rewards/margins": 0.20463857054710388, "rewards/rejected": -2.2745771408081055, "step": 2850 }, { "epoch": 2.0605187319884726, "grad_norm": 22.0672882845519, "learning_rate": 1.3511322595359925e-08, "logits/chosen": -2.029994249343872, "logits/rejected": -2.021892547607422, "logps/chosen": -0.9452686309814453, "logps/rejected": -1.1269365549087524, "loss": 1.5467, "rewards/accuracies": 0.65625, "rewards/chosen": -1.8905372619628906, "rewards/margins": 0.36333557963371277, "rewards/rejected": -2.253873109817505, "step": 2860 }, { "epoch": 2.0677233429394812, "grad_norm": 21.005601512034414, "learning_rate": 1.3325565235427716e-08, "logits/chosen": -2.0221211910247803, "logits/rejected": -2.020735263824463, "logps/chosen": -0.9910463094711304, "logps/rejected": -1.1467411518096924, "loss": 1.5929, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9820926189422607, "rewards/margins": 0.3113897144794464, "rewards/rejected": -2.2934823036193848, "step": 2870 }, { "epoch": 2.07492795389049, "grad_norm": 21.704546586617013, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -1.9898440837860107, "logits/rejected": -1.9922568798065186, "logps/chosen": -0.979377269744873, "logps/rejected": -1.125458002090454, "loss": 1.6002, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.958754539489746, "rewards/margins": 0.2921615540981293, "rewards/rejected": -2.250916004180908, "step": 2880 }, { "epoch": 2.0821325648414986, "grad_norm": 23.05835307849785, "learning_rate": 1.2956525509522451e-08, "logits/chosen": -1.9692738056182861, "logits/rejected": -1.9688589572906494, "logps/chosen": -1.1163288354873657, "logps/rejected": -1.2278274297714233, "loss": 1.6629, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2326576709747314, "rewards/margins": 0.22299735248088837, "rewards/rejected": -2.4556548595428467, "step": 2890 }, { "epoch": 2.089337175792507, "grad_norm": 23.89882897604448, "learning_rate": 1.2773269085518267e-08, "logits/chosen": -2.0057029724121094, "logits/rejected": -2.007406234741211, "logps/chosen": -1.082912564277649, "logps/rejected": -1.2209830284118652, "loss": 1.6083, "rewards/accuracies": 0.65625, "rewards/chosen": -2.165825128555298, "rewards/margins": 0.27614089846611023, "rewards/rejected": -2.4419660568237305, "step": 2900 }, { "epoch": 2.096541786743516, "grad_norm": 26.767663277581054, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -2.061650276184082, "logits/rejected": -2.05505633354187, "logps/chosen": -1.0687328577041626, "logps/rejected": -1.1909129619598389, "loss": 1.6479, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.137465715408325, "rewards/margins": 0.24436035752296448, "rewards/rejected": -2.3818259239196777, "step": 2910 }, { "epoch": 2.1037463976945245, "grad_norm": 24.87095647307409, "learning_rate": 1.2409347525350775e-08, "logits/chosen": -2.0226938724517822, "logits/rejected": -2.012955904006958, "logps/chosen": -1.11465585231781, "logps/rejected": -1.274889349937439, "loss": 1.5801, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.22931170463562, "rewards/margins": 0.32046717405319214, "rewards/rejected": -2.549778699874878, "step": 2920 }, { "epoch": 2.110951008645533, "grad_norm": 26.0814644187234, "learning_rate": 1.2228707971370421e-08, "logits/chosen": -2.01188325881958, "logits/rejected": -2.0049335956573486, "logps/chosen": -0.9994518160820007, "logps/rejected": -1.1240427494049072, "loss": 1.6453, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9989036321640015, "rewards/margins": 0.24918179214000702, "rewards/rejected": -2.2480854988098145, "step": 2930 }, { "epoch": 2.118155619596542, "grad_norm": 25.7899367951667, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -2.010864019393921, "logits/rejected": -1.994388222694397, "logps/chosen": -1.124501347541809, "logps/rejected": -1.2469813823699951, "loss": 1.6399, "rewards/accuracies": 0.59375, "rewards/chosen": -2.249002695083618, "rewards/margins": 0.24496027827262878, "rewards/rejected": -2.4939627647399902, "step": 2940 }, { "epoch": 2.1253602305475505, "grad_norm": 34.01305409800202, "learning_rate": 1.187013480579762e-08, "logits/chosen": -2.00197172164917, "logits/rejected": -2.004648208618164, "logps/chosen": -1.0525786876678467, "logps/rejected": -1.1973202228546143, "loss": 1.6217, "rewards/accuracies": 0.625, "rewards/chosen": -2.1051573753356934, "rewards/margins": 0.28948310017585754, "rewards/rejected": -2.3946404457092285, "step": 2950 }, { "epoch": 2.132564841498559, "grad_norm": 48.722792574193626, "learning_rate": 1.1692226400418073e-08, "logits/chosen": -1.9430949687957764, "logits/rejected": -1.9416509866714478, "logps/chosen": -1.0946460962295532, "logps/rejected": -1.2351014614105225, "loss": 1.64, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1892921924591064, "rewards/margins": 0.2809109091758728, "rewards/rejected": -2.470202922821045, "step": 2960 }, { "epoch": 2.139769452449568, "grad_norm": 20.442787504036996, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -1.9755675792694092, "logits/rejected": -1.9697250127792358, "logps/chosen": -1.0182015895843506, "logps/rejected": -1.212741494178772, "loss": 1.5268, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.036403179168701, "rewards/margins": 0.3890801966190338, "rewards/rejected": -2.425482988357544, "step": 2970 }, { "epoch": 2.1469740634005765, "grad_norm": 23.636262265997587, "learning_rate": 1.133922847472496e-08, "logits/chosen": -1.9919281005859375, "logits/rejected": -1.9928478002548218, "logps/chosen": -1.1163393259048462, "logps/rejected": -1.2261422872543335, "loss": 1.6771, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.2326786518096924, "rewards/margins": 0.21960613131523132, "rewards/rejected": -2.452284574508667, "step": 2980 }, { "epoch": 2.154178674351585, "grad_norm": 27.35793823071837, "learning_rate": 1.1164163768707952e-08, "logits/chosen": -1.9976240396499634, "logits/rejected": -1.9923988580703735, "logps/chosen": -1.0124857425689697, "logps/rejected": -1.1620383262634277, "loss": 1.605, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0249714851379395, "rewards/margins": 0.2991052567958832, "rewards/rejected": -2.3240766525268555, "step": 2990 }, { "epoch": 2.161383285302594, "grad_norm": 21.14670400379884, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -1.965047836303711, "logits/rejected": -1.9639549255371094, "logps/chosen": -1.026025414466858, "logps/rejected": -1.221134901046753, "loss": 1.5614, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.052050828933716, "rewards/margins": 0.39021891355514526, "rewards/rejected": -2.442269802093506, "step": 3000 }, { "epoch": 2.1685878962536025, "grad_norm": 22.47627800274925, "learning_rate": 1.0816964401097739e-08, "logits/chosen": -1.9546226263046265, "logits/rejected": -1.951666235923767, "logps/chosen": -0.9643720388412476, "logps/rejected": -1.0974457263946533, "loss": 1.6301, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9287440776824951, "rewards/margins": 0.26614707708358765, "rewards/rejected": -2.1948914527893066, "step": 3010 }, { "epoch": 2.175792507204611, "grad_norm": 24.81301945148687, "learning_rate": 1.0644854146186406e-08, "logits/chosen": -2.0162227153778076, "logits/rejected": -2.0100228786468506, "logps/chosen": -1.0355775356292725, "logps/rejected": -1.2059860229492188, "loss": 1.5756, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.071155071258545, "rewards/margins": 0.34081682562828064, "rewards/rejected": -2.4119720458984375, "step": 3020 }, { "epoch": 2.18299711815562, "grad_norm": 22.691794166345073, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -2.000042676925659, "logits/rejected": -1.9917463064193726, "logps/chosen": -1.0259116888046265, "logps/rejected": -1.199589490890503, "loss": 1.5627, "rewards/accuracies": 0.625, "rewards/chosen": -2.051823377609253, "rewards/margins": 0.34735578298568726, "rewards/rejected": -2.399178981781006, "step": 3030 }, { "epoch": 2.1902017291066285, "grad_norm": 23.732869238641477, "learning_rate": 1.030367298650201e-08, "logits/chosen": -2.012312650680542, "logits/rejected": -2.0125322341918945, "logps/chosen": -1.0479185581207275, "logps/rejected": -1.2102346420288086, "loss": 1.5779, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.095837116241455, "rewards/margins": 0.324631929397583, "rewards/rejected": -2.420469284057617, "step": 3040 }, { "epoch": 2.1974063400576367, "grad_norm": 25.38335204026135, "learning_rate": 1.0134626065355675e-08, "logits/chosen": -2.0673251152038574, "logits/rejected": -2.064276695251465, "logps/chosen": -1.0288770198822021, "logps/rejected": -1.1877471208572388, "loss": 1.6031, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0577540397644043, "rewards/margins": 0.31774038076400757, "rewards/rejected": -2.3754942417144775, "step": 3050 }, { "epoch": 2.2046109510086453, "grad_norm": 23.686385016463422, "learning_rate": 9.966624118614611e-09, "logits/chosen": -2.0056352615356445, "logits/rejected": -2.000843048095703, "logps/chosen": -1.0693120956420898, "logps/rejected": -1.2264524698257446, "loss": 1.6057, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1386241912841797, "rewards/margins": 0.3142808973789215, "rewards/rejected": -2.4529049396514893, "step": 3060 }, { "epoch": 2.211815561959654, "grad_norm": 18.302601585475063, "learning_rate": 9.799678956121976e-09, "logits/chosen": -1.9648053646087646, "logits/rejected": -1.9606415033340454, "logps/chosen": -1.0401145219802856, "logps/rejected": -1.1557649374008179, "loss": 1.6316, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0802290439605713, "rewards/margins": 0.23130115866661072, "rewards/rejected": -2.3115298748016357, "step": 3070 }, { "epoch": 2.2190201729106627, "grad_norm": 27.958099571573886, "learning_rate": 9.633802313433314e-09, "logits/chosen": -1.939426064491272, "logits/rejected": -1.9455314874649048, "logps/chosen": -1.026546835899353, "logps/rejected": -1.1441318988800049, "loss": 1.6306, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.053093671798706, "rewards/margins": 0.23517021536827087, "rewards/rejected": -2.2882637977600098, "step": 3080 }, { "epoch": 2.2262247838616713, "grad_norm": 24.057068078233453, "learning_rate": 9.469005850991705e-09, "logits/chosen": -2.005563259124756, "logits/rejected": -1.9999898672103882, "logps/chosen": -1.021996021270752, "logps/rejected": -1.1510010957717896, "loss": 1.6554, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.043992042541504, "rewards/margins": 0.2580098509788513, "rewards/rejected": -2.302002191543579, "step": 3090 }, { "epoch": 2.23342939481268, "grad_norm": 22.993189794217244, "learning_rate": 9.305301153307949e-09, "logits/chosen": -2.0027341842651367, "logits/rejected": -2.010744571685791, "logps/chosen": -0.9593653678894043, "logps/rejected": -1.1363437175750732, "loss": 1.5707, "rewards/accuracies": 0.625, "rewards/chosen": -1.9187307357788086, "rewards/margins": 0.3539566397666931, "rewards/rejected": -2.2726874351501465, "step": 3100 }, { "epoch": 2.2406340057636887, "grad_norm": 22.689624680301527, "learning_rate": 9.142699728146336e-09, "logits/chosen": -1.975015640258789, "logits/rejected": -1.9685420989990234, "logps/chosen": -1.0418678522109985, "logps/rejected": -1.1868336200714111, "loss": 1.6201, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.083735704421997, "rewards/margins": 0.2899312674999237, "rewards/rejected": -2.3736672401428223, "step": 3110 }, { "epoch": 2.2478386167146973, "grad_norm": 21.35327217286005, "learning_rate": 8.981213005715627e-09, "logits/chosen": -1.9979450702667236, "logits/rejected": -2.0012001991271973, "logps/chosen": -1.0012747049331665, "logps/rejected": -1.1851648092269897, "loss": 1.56, "rewards/accuracies": 0.65625, "rewards/chosen": -2.002549409866333, "rewards/margins": 0.36778026819229126, "rewards/rejected": -2.3703296184539795, "step": 3120 }, { "epoch": 2.255043227665706, "grad_norm": 26.383183029607626, "learning_rate": 8.820852337865611e-09, "logits/chosen": -2.0243782997131348, "logits/rejected": -2.020660161972046, "logps/chosen": -1.0028272867202759, "logps/rejected": -1.1637529134750366, "loss": 1.5862, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0056545734405518, "rewards/margins": 0.3218514025211334, "rewards/rejected": -2.3275058269500732, "step": 3130 }, { "epoch": 2.2622478386167146, "grad_norm": 21.395454754738285, "learning_rate": 8.661628997289044e-09, "logits/chosen": -1.9658374786376953, "logits/rejected": -1.9619413614273071, "logps/chosen": -1.023062825202942, "logps/rejected": -1.1900697946548462, "loss": 1.5851, "rewards/accuracies": 0.5625, "rewards/chosen": -2.046125650405884, "rewards/margins": 0.33401376008987427, "rewards/rejected": -2.3801395893096924, "step": 3140 }, { "epoch": 2.2694524495677233, "grad_norm": 19.8105220075867, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.967435598373413, "logits/rejected": -1.9660450220108032, "logps/chosen": -1.0334124565124512, "logps/rejected": -1.2073090076446533, "loss": 1.58, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0668249130249023, "rewards/margins": 0.3477930426597595, "rewards/rejected": -2.4146180152893066, "step": 3150 }, { "epoch": 2.276657060518732, "grad_norm": 28.48880138636305, "learning_rate": 8.346638988193636e-09, "logits/chosen": -2.0007739067077637, "logits/rejected": -1.995672583580017, "logps/chosen": -0.9323548078536987, "logps/rejected": -1.0959632396697998, "loss": 1.5918, "rewards/accuracies": 0.59375, "rewards/chosen": -1.8647096157073975, "rewards/margins": 0.3272170424461365, "rewards/rejected": -2.1919264793395996, "step": 3160 }, { "epoch": 2.2838616714697406, "grad_norm": 27.64259270476726, "learning_rate": 8.19089446217176e-09, "logits/chosen": -1.9689916372299194, "logits/rejected": -1.958873987197876, "logps/chosen": -1.0094202756881714, "logps/rejected": -1.2147648334503174, "loss": 1.5205, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0188405513763428, "rewards/margins": 0.410689115524292, "rewards/rejected": -2.4295296669006348, "step": 3170 }, { "epoch": 2.2910662824207493, "grad_norm": 20.197095337849724, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.9783294200897217, "logits/rejected": -1.977897047996521, "logps/chosen": -0.9609595537185669, "logps/rejected": -1.055878758430481, "loss": 1.6842, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9219191074371338, "rewards/margins": 0.18983834981918335, "rewards/rejected": -2.111757516860962, "step": 3180 }, { "epoch": 2.298270893371758, "grad_norm": 26.910394163242426, "learning_rate": 7.882961107395416e-09, "logits/chosen": -1.9873777627944946, "logits/rejected": -1.9819095134735107, "logps/chosen": -1.1414562463760376, "logps/rejected": -1.198393702507019, "loss": 1.7533, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.282912492752075, "rewards/margins": 0.11387516558170319, "rewards/rejected": -2.396787405014038, "step": 3190 }, { "epoch": 2.3054755043227666, "grad_norm": 30.62934953252631, "learning_rate": 7.73079392508428e-09, "logits/chosen": -1.9583721160888672, "logits/rejected": -1.957950234413147, "logps/chosen": -1.0997769832611084, "logps/rejected": -1.304617166519165, "loss": 1.5595, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.199553966522217, "rewards/margins": 0.4096801280975342, "rewards/rejected": -2.60923433303833, "step": 3200 }, { "epoch": 2.3126801152737753, "grad_norm": 26.064769484113217, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.9889991283416748, "logits/rejected": -1.9861869812011719, "logps/chosen": -1.0576711893081665, "logps/rejected": -1.1947062015533447, "loss": 1.6269, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.115342378616333, "rewards/margins": 0.27406978607177734, "rewards/rejected": -2.3894124031066895, "step": 3210 }, { "epoch": 2.319884726224784, "grad_norm": 27.55359582356058, "learning_rate": 7.43011203348704e-09, "logits/chosen": -1.9100205898284912, "logits/rejected": -1.906876564025879, "logps/chosen": -1.062924861907959, "logps/rejected": -1.1486908197402954, "loss": 1.7031, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.125849723815918, "rewards/margins": 0.17153207957744598, "rewards/rejected": -2.297381639480591, "step": 3220 }, { "epoch": 2.3270893371757926, "grad_norm": 22.565019216219657, "learning_rate": 7.281618460896344e-09, "logits/chosen": -1.985080361366272, "logits/rejected": -1.9826555252075195, "logps/chosen": -0.9736523628234863, "logps/rejected": -1.128810167312622, "loss": 1.5883, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9473047256469727, "rewards/margins": 0.3103155493736267, "rewards/rejected": -2.257620334625244, "step": 3230 }, { "epoch": 2.3342939481268012, "grad_norm": 24.11876392443081, "learning_rate": 7.134370417364849e-09, "logits/chosen": -1.9583276510238647, "logits/rejected": -1.9580589532852173, "logps/chosen": -1.0128490924835205, "logps/rejected": -1.1612526178359985, "loss": 1.623, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.025698184967041, "rewards/margins": 0.2968069911003113, "rewards/rejected": -2.322505235671997, "step": 3240 }, { "epoch": 2.34149855907781, "grad_norm": 28.34904804817512, "learning_rate": 6.988378253821981e-09, "logits/chosen": -1.9590322971343994, "logits/rejected": -1.9579321146011353, "logps/chosen": -1.031524658203125, "logps/rejected": -1.1591465473175049, "loss": 1.634, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.06304931640625, "rewards/margins": 0.25524377822875977, "rewards/rejected": -2.3182930946350098, "step": 3250 }, { "epoch": 2.3487031700288186, "grad_norm": 24.44313047891004, "learning_rate": 6.8436522329140186e-09, "logits/chosen": -1.9688135385513306, "logits/rejected": -1.975393295288086, "logps/chosen": -1.0421944856643677, "logps/rejected": -1.176190733909607, "loss": 1.6358, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0843889713287354, "rewards/margins": 0.26799264550209045, "rewards/rejected": -2.352381467819214, "step": 3260 }, { "epoch": 2.3559077809798272, "grad_norm": 26.113896060355533, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.9689340591430664, "logits/rejected": -1.9594923257827759, "logps/chosen": -1.038690209388733, "logps/rejected": -1.1668148040771484, "loss": 1.636, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.077380418777466, "rewards/margins": 0.25624918937683105, "rewards/rejected": -2.333629608154297, "step": 3270 }, { "epoch": 2.363112391930836, "grad_norm": 27.624293457824887, "learning_rate": 6.558039223849668e-09, "logits/chosen": -2.0257744789123535, "logits/rejected": -2.0163044929504395, "logps/chosen": -1.045290470123291, "logps/rejected": -1.2717329263687134, "loss": 1.5152, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.090580940246582, "rewards/margins": 0.4528846740722656, "rewards/rejected": -2.5434658527374268, "step": 3280 }, { "epoch": 2.3703170028818445, "grad_norm": 26.125113709901196, "learning_rate": 6.417172313108471e-09, "logits/chosen": -1.949104905128479, "logits/rejected": -1.9437875747680664, "logps/chosen": -0.998646080493927, "logps/rejected": -1.1385498046875, "loss": 1.6205, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.997292160987854, "rewards/margins": 0.279807448387146, "rewards/rejected": -2.277099609375, "step": 3290 }, { "epoch": 2.377521613832853, "grad_norm": 25.77724831498868, "learning_rate": 6.277611698421179e-09, "logits/chosen": -2.012104034423828, "logits/rejected": -2.00412917137146, "logps/chosen": -0.9130659103393555, "logps/rejected": -1.1234972476959229, "loss": 1.5294, "rewards/accuracies": 0.625, "rewards/chosen": -1.826131820678711, "rewards/margins": 0.42086291313171387, "rewards/rejected": -2.2469944953918457, "step": 3300 }, { "epoch": 2.3847262247838614, "grad_norm": 27.423722821123295, "learning_rate": 6.139367190322714e-09, "logits/chosen": -2.0002529621124268, "logits/rejected": -1.999985933303833, "logps/chosen": -1.0683438777923584, "logps/rejected": -1.2363920211791992, "loss": 1.5799, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.136687755584717, "rewards/margins": 0.3360963761806488, "rewards/rejected": -2.4727840423583984, "step": 3310 }, { "epoch": 2.39193083573487, "grad_norm": 19.991775212655945, "learning_rate": 6.002448506831171e-09, "logits/chosen": -1.9950650930404663, "logits/rejected": -1.9902185201644897, "logps/chosen": -0.9909921884536743, "logps/rejected": -1.1469800472259521, "loss": 1.5889, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9819843769073486, "rewards/margins": 0.3119755983352661, "rewards/rejected": -2.2939600944519043, "step": 3320 }, { "epoch": 2.3991354466858787, "grad_norm": 21.831185665117665, "learning_rate": 5.866865272764607e-09, "logits/chosen": -2.0154030323028564, "logits/rejected": -2.015378952026367, "logps/chosen": -1.0298879146575928, "logps/rejected": -1.1846274137496948, "loss": 1.5967, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0597758293151855, "rewards/margins": 0.3094790279865265, "rewards/rejected": -2.3692548274993896, "step": 3330 }, { "epoch": 2.4063400576368874, "grad_norm": 28.7947807103091, "learning_rate": 5.7326270190645595e-09, "logits/chosen": -1.8919003009796143, "logits/rejected": -1.8937880992889404, "logps/chosen": -1.0716549158096313, "logps/rejected": -1.1915693283081055, "loss": 1.6436, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1433098316192627, "rewards/margins": 0.2398286759853363, "rewards/rejected": -2.383138656616211, "step": 3340 }, { "epoch": 2.413544668587896, "grad_norm": 21.933856781166835, "learning_rate": 5.599743182125938e-09, "logits/chosen": -2.0387024879455566, "logits/rejected": -2.038865089416504, "logps/chosen": -1.0583384037017822, "logps/rejected": -1.2039889097213745, "loss": 1.6009, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1166768074035645, "rewards/margins": 0.29130083322525024, "rewards/rejected": -2.407977819442749, "step": 3350 }, { "epoch": 2.4207492795389047, "grad_norm": 24.65065888208135, "learning_rate": 5.46822310313379e-09, "logits/chosen": -2.03857159614563, "logits/rejected": -2.0484530925750732, "logps/chosen": -1.1011087894439697, "logps/rejected": -1.2127426862716675, "loss": 1.6671, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2022175788879395, "rewards/margins": 0.22326767444610596, "rewards/rejected": -2.425485372543335, "step": 3360 }, { "epoch": 2.4279538904899134, "grad_norm": 24.58782837504702, "learning_rate": 5.33807602740658e-09, "logits/chosen": -2.0139212608337402, "logits/rejected": -2.0074281692504883, "logps/chosen": -0.9654278755187988, "logps/rejected": -1.188689112663269, "loss": 1.5063, "rewards/accuracies": 0.65625, "rewards/chosen": -1.9308557510375977, "rewards/margins": 0.4465225338935852, "rewards/rejected": -2.377378225326538, "step": 3370 }, { "epoch": 2.435158501440922, "grad_norm": 25.002863563846113, "learning_rate": 5.209311103746334e-09, "logits/chosen": -1.991249442100525, "logits/rejected": -1.9918267726898193, "logps/chosen": -1.062037706375122, "logps/rejected": -1.2459160089492798, "loss": 1.5699, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.124075412750244, "rewards/margins": 0.36775654554367065, "rewards/rejected": -2.4918320178985596, "step": 3380 }, { "epoch": 2.4423631123919307, "grad_norm": 29.00940526057699, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.9643027782440186, "logits/rejected": -1.9637250900268555, "logps/chosen": -0.9805372357368469, "logps/rejected": -1.1601638793945312, "loss": 1.5569, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9610744714736938, "rewards/margins": 0.35925301909446716, "rewards/rejected": -2.3203277587890625, "step": 3390 }, { "epoch": 2.4495677233429394, "grad_norm": 22.309943391629336, "learning_rate": 4.955963821400599e-09, "logits/chosen": -2.0172548294067383, "logits/rejected": -2.0117599964141846, "logps/chosen": -1.040705919265747, "logps/rejected": -1.1921679973602295, "loss": 1.6115, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.081411838531494, "rewards/margins": 0.3029238283634186, "rewards/rejected": -2.384335994720459, "step": 3400 }, { "epoch": 2.456772334293948, "grad_norm": 18.75415359602338, "learning_rate": 4.831399271982928e-09, "logits/chosen": -1.9422295093536377, "logits/rejected": -1.934175729751587, "logps/chosen": -1.051344633102417, "logps/rejected": -1.195204734802246, "loss": 1.6304, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.102689266204834, "rewards/margins": 0.2877200245857239, "rewards/rejected": -2.390409469604492, "step": 3410 }, { "epoch": 2.4639769452449567, "grad_norm": 30.21053604777478, "learning_rate": 4.708252491915951e-09, "logits/chosen": -2.0238802433013916, "logits/rejected": -2.017911911010742, "logps/chosen": -1.0559172630310059, "logps/rejected": -1.216577172279358, "loss": 1.6101, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1118345260620117, "rewards/margins": 0.32131966948509216, "rewards/rejected": -2.433154344558716, "step": 3420 }, { "epoch": 2.4711815561959654, "grad_norm": 30.07542856849867, "learning_rate": 4.58653213790981e-09, "logits/chosen": -1.9997894763946533, "logits/rejected": -1.9920082092285156, "logps/chosen": -1.0349630117416382, "logps/rejected": -1.1963032484054565, "loss": 1.5937, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0699260234832764, "rewards/margins": 0.3226805329322815, "rewards/rejected": -2.392606496810913, "step": 3430 }, { "epoch": 2.478386167146974, "grad_norm": 23.053312586128765, "learning_rate": 4.466246766402773e-09, "logits/chosen": -1.9811131954193115, "logits/rejected": -1.975023865699768, "logps/chosen": -1.0486400127410889, "logps/rejected": -1.216692566871643, "loss": 1.5943, "rewards/accuracies": 0.625, "rewards/chosen": -2.0972800254821777, "rewards/margins": 0.3361048102378845, "rewards/rejected": -2.433385133743286, "step": 3440 }, { "epoch": 2.4855907780979827, "grad_norm": 27.71207284816762, "learning_rate": 4.347404832959775e-09, "logits/chosen": -2.0265183448791504, "logits/rejected": -2.026737928390503, "logps/chosen": -1.0446550846099854, "logps/rejected": -1.217142105102539, "loss": 1.5733, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0893101692199707, "rewards/margins": 0.3449738323688507, "rewards/rejected": -2.434284210205078, "step": 3450 }, { "epoch": 2.4927953890489913, "grad_norm": 37.25296678647402, "learning_rate": 4.230014691678016e-09, "logits/chosen": -1.9857333898544312, "logits/rejected": -1.9863560199737549, "logps/chosen": -1.0709832906723022, "logps/rejected": -1.14461350440979, "loss": 1.7112, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1419665813446045, "rewards/margins": 0.14726075530052185, "rewards/rejected": -2.28922700881958, "step": 3460 }, { "epoch": 2.5, "grad_norm": 21.496014500502305, "learning_rate": 4.114084594599707e-09, "logits/chosen": -1.986973762512207, "logits/rejected": -1.9872105121612549, "logps/chosen": -1.0206706523895264, "logps/rejected": -1.2565962076187134, "loss": 1.4966, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0413413047790527, "rewards/margins": 0.47185102105140686, "rewards/rejected": -2.5131924152374268, "step": 3470 }, { "epoch": 2.5072046109510087, "grad_norm": 25.3775928816355, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -1.9873378276824951, "logits/rejected": -1.9751040935516357, "logps/chosen": -1.026548981666565, "logps/rejected": -1.1670781373977661, "loss": 1.6105, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.05309796333313, "rewards/margins": 0.28105807304382324, "rewards/rejected": -2.3341562747955322, "step": 3480 }, { "epoch": 2.5144092219020173, "grad_norm": 23.397986233529224, "learning_rate": 3.886637027473949e-09, "logits/chosen": -1.9925390481948853, "logits/rejected": -1.9949783086776733, "logps/chosen": -1.086550235748291, "logps/rejected": -1.2638647556304932, "loss": 1.567, "rewards/accuracies": 0.625, "rewards/chosen": -2.173100471496582, "rewards/margins": 0.35462915897369385, "rewards/rejected": -2.5277295112609863, "step": 3490 }, { "epoch": 2.521613832853026, "grad_norm": 23.620445031166188, "learning_rate": 3.775135546051295e-09, "logits/chosen": -1.9295272827148438, "logits/rejected": -1.9305601119995117, "logps/chosen": -1.0361953973770142, "logps/rejected": -1.1760914325714111, "loss": 1.6193, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0723907947540283, "rewards/margins": 0.2797921895980835, "rewards/rejected": -2.3521828651428223, "step": 3500 }, { "epoch": 2.5288184438040346, "grad_norm": 29.199556258079916, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.9770501852035522, "logits/rejected": -1.9813220500946045, "logps/chosen": -1.1458697319030762, "logps/rejected": -1.2556262016296387, "loss": 1.6871, "rewards/accuracies": 0.5, "rewards/chosen": -2.2917394638061523, "rewards/margins": 0.21951286494731903, "rewards/rejected": -2.5112524032592773, "step": 3510 }, { "epoch": 2.5360230547550433, "grad_norm": 24.97063443817187, "learning_rate": 3.556616377404101e-09, "logits/chosen": -2.002643585205078, "logits/rejected": -2.000908613204956, "logps/chosen": -1.092260479927063, "logps/rejected": -1.2640482187271118, "loss": 1.5644, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.184520959854126, "rewards/margins": 0.3435753881931305, "rewards/rejected": -2.5280964374542236, "step": 3520 }, { "epoch": 2.543227665706052, "grad_norm": 24.36478594825762, "learning_rate": 3.4496140511748125e-09, "logits/chosen": -1.9859501123428345, "logits/rejected": -1.9806346893310547, "logps/chosen": -1.0690721273422241, "logps/rejected": -1.225163221359253, "loss": 1.5919, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1381442546844482, "rewards/margins": 0.3121821880340576, "rewards/rejected": -2.450326442718506, "step": 3530 }, { "epoch": 2.5504322766570606, "grad_norm": 36.81700995699646, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.9802547693252563, "logits/rejected": -1.9810454845428467, "logps/chosen": -1.1021790504455566, "logps/rejected": -1.226802110671997, "loss": 1.638, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.2043581008911133, "rewards/margins": 0.24924595654010773, "rewards/rejected": -2.453604221343994, "step": 3540 }, { "epoch": 2.5576368876080693, "grad_norm": 28.623014364985952, "learning_rate": 3.2401615234845693e-09, "logits/chosen": -1.9976619482040405, "logits/rejected": -1.9920257329940796, "logps/chosen": -1.1042211055755615, "logps/rejected": -1.2627480030059814, "loss": 1.6023, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.208442211151123, "rewards/margins": 0.3170536756515503, "rewards/rejected": -2.525496006011963, "step": 3550 }, { "epoch": 2.564841498559078, "grad_norm": 20.877272352467568, "learning_rate": 3.1377260456714375e-09, "logits/chosen": -1.89840829372406, "logits/rejected": -1.8899259567260742, "logps/chosen": -1.0741069316864014, "logps/rejected": -1.2302823066711426, "loss": 1.5833, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1482138633728027, "rewards/margins": 0.31235069036483765, "rewards/rejected": -2.460564613342285, "step": 3560 }, { "epoch": 2.5720461095100866, "grad_norm": 21.35685803536181, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -2.0214016437530518, "logits/rejected": -2.0133938789367676, "logps/chosen": -1.0598551034927368, "logps/rejected": -1.1744581460952759, "loss": 1.6537, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1197102069854736, "rewards/margins": 0.22920596599578857, "rewards/rejected": -2.3489162921905518, "step": 3570 }, { "epoch": 2.5792507204610953, "grad_norm": 21.652139413736148, "learning_rate": 2.937472665558541e-09, "logits/chosen": -2.0178732872009277, "logits/rejected": -2.0194754600524902, "logps/chosen": -1.0495593547821045, "logps/rejected": -1.1783432960510254, "loss": 1.6421, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.099118709564209, "rewards/margins": 0.25756800174713135, "rewards/rejected": -2.356686592102051, "step": 3580 }, { "epoch": 2.586455331412104, "grad_norm": 25.689703954197313, "learning_rate": 2.8396688402445053e-09, "logits/chosen": -2.0545198917388916, "logits/rejected": -2.0471858978271484, "logps/chosen": -1.026667833328247, "logps/rejected": -1.252987265586853, "loss": 1.5055, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.053335666656494, "rewards/margins": 0.45263880491256714, "rewards/rejected": -2.505974531173706, "step": 3590 }, { "epoch": 2.5936599423631126, "grad_norm": 27.84193007245708, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -1.9976928234100342, "logits/rejected": -1.991416573524475, "logps/chosen": -1.1429685354232788, "logps/rejected": -1.2610361576080322, "loss": 1.6584, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.2859370708465576, "rewards/margins": 0.2361353635787964, "rewards/rejected": -2.5220723152160645, "step": 3600 }, { "epoch": 2.6008645533141213, "grad_norm": 21.806762124742306, "learning_rate": 2.6487412946432976e-09, "logits/chosen": -1.9651085138320923, "logits/rejected": -1.96010422706604, "logps/chosen": -1.0865273475646973, "logps/rejected": -1.2380552291870117, "loss": 1.6056, "rewards/accuracies": 0.59375, "rewards/chosen": -2.1730546951293945, "rewards/margins": 0.3030560612678528, "rewards/rejected": -2.4761104583740234, "step": 3610 }, { "epoch": 2.60806916426513, "grad_norm": 27.853506917371565, "learning_rate": 2.5556309957742024e-09, "logits/chosen": -1.9727575778961182, "logits/rejected": -1.9678795337677002, "logps/chosen": -1.0326440334320068, "logps/rejected": -1.2449814081192017, "loss": 1.5164, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -2.0652880668640137, "rewards/margins": 0.4246746003627777, "rewards/rejected": -2.4899628162384033, "step": 3620 }, { "epoch": 2.6152737752161386, "grad_norm": 28.266405336091644, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -2.0244503021240234, "logits/rejected": -2.0256316661834717, "logps/chosen": -1.1292182207107544, "logps/rejected": -1.2573057413101196, "loss": 1.6565, "rewards/accuracies": 0.5625, "rewards/chosen": -2.258436441421509, "rewards/margins": 0.25617507100105286, "rewards/rejected": -2.5146114826202393, "step": 3630 }, { "epoch": 2.6224783861671472, "grad_norm": 22.57745904560482, "learning_rate": 2.3741500717865987e-09, "logits/chosen": -1.9889405965805054, "logits/rejected": -1.9999366998672485, "logps/chosen": -1.0152301788330078, "logps/rejected": -1.1731317043304443, "loss": 1.5943, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0304603576660156, "rewards/margins": 0.31580331921577454, "rewards/rejected": -2.3462634086608887, "step": 3640 }, { "epoch": 2.629682997118156, "grad_norm": 22.170753475053665, "learning_rate": 2.285792204027678e-09, "logits/chosen": -1.9736753702163696, "logits/rejected": -1.9713001251220703, "logps/chosen": -1.0239061117172241, "logps/rejected": -1.2365758419036865, "loss": 1.5034, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.0478122234344482, "rewards/margins": 0.42533937096595764, "rewards/rejected": -2.473151683807373, "step": 3650 }, { "epoch": 2.636887608069164, "grad_norm": 25.749078081153876, "learning_rate": 2.199031051284972e-09, "logits/chosen": -1.9990713596343994, "logits/rejected": -1.994596242904663, "logps/chosen": -1.0814430713653564, "logps/rejected": -1.2243701219558716, "loss": 1.632, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.162886142730713, "rewards/margins": 0.2858540117740631, "rewards/rejected": -2.448740243911743, "step": 3660 }, { "epoch": 2.6440922190201728, "grad_norm": 21.713647632214947, "learning_rate": 2.113872712509254e-09, "logits/chosen": -1.982190728187561, "logits/rejected": -1.9747440814971924, "logps/chosen": -1.1430615186691284, "logps/rejected": -1.268019676208496, "loss": 1.6487, "rewards/accuracies": 0.59375, "rewards/chosen": -2.286123037338257, "rewards/margins": 0.24991640448570251, "rewards/rejected": -2.536039352416992, "step": 3670 }, { "epoch": 2.6512968299711814, "grad_norm": 17.48737483363496, "learning_rate": 2.0303231739801143e-09, "logits/chosen": -1.9670238494873047, "logits/rejected": -1.9565740823745728, "logps/chosen": -1.031555414199829, "logps/rejected": -1.1823747158050537, "loss": 1.6014, "rewards/accuracies": 0.59375, "rewards/chosen": -2.063110828399658, "rewards/margins": 0.3016386032104492, "rewards/rejected": -2.3647494316101074, "step": 3680 }, { "epoch": 2.65850144092219, "grad_norm": 28.38148915526448, "learning_rate": 1.948388308885102e-09, "logits/chosen": -2.029740571975708, "logits/rejected": -2.0215508937835693, "logps/chosen": -1.0746675729751587, "logps/rejected": -1.195127248764038, "loss": 1.642, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1493351459503174, "rewards/margins": 0.24091950058937073, "rewards/rejected": -2.390254497528076, "step": 3690 }, { "epoch": 2.6657060518731988, "grad_norm": 31.36979240103486, "learning_rate": 1.86807387690692e-09, "logits/chosen": -2.056037664413452, "logits/rejected": -2.0529327392578125, "logps/chosen": -1.0985705852508545, "logps/rejected": -1.3052425384521484, "loss": 1.5147, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.197141170501709, "rewards/margins": 0.41334372758865356, "rewards/rejected": -2.610485076904297, "step": 3700 }, { "epoch": 2.6729106628242074, "grad_norm": 24.220968168491435, "learning_rate": 1.789385523818493e-09, "logits/chosen": -2.0213685035705566, "logits/rejected": -2.022982358932495, "logps/chosen": -1.0506846904754639, "logps/rejected": -1.2349631786346436, "loss": 1.553, "rewards/accuracies": 0.625, "rewards/chosen": -2.1013693809509277, "rewards/margins": 0.3685569763183594, "rewards/rejected": -2.469926357269287, "step": 3710 }, { "epoch": 2.680115273775216, "grad_norm": 30.993723692404313, "learning_rate": 1.712328781086131e-09, "logits/chosen": -2.040888547897339, "logits/rejected": -2.03602933883667, "logps/chosen": -1.1361511945724487, "logps/rejected": -1.2417099475860596, "loss": 1.6701, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.2723023891448975, "rewards/margins": 0.211117222905159, "rewards/rejected": -2.483419895172119, "step": 3720 }, { "epoch": 2.6873198847262247, "grad_norm": 25.834324291704004, "learning_rate": 1.6369090654806543e-09, "logits/chosen": -2.046886920928955, "logits/rejected": -2.040391683578491, "logps/chosen": -1.0335193872451782, "logps/rejected": -1.189134120941162, "loss": 1.5858, "rewards/accuracies": 0.625, "rewards/chosen": -2.0670387744903564, "rewards/margins": 0.3112295866012573, "rewards/rejected": -2.378268241882324, "step": 3730 }, { "epoch": 2.6945244956772334, "grad_norm": 23.490666350782163, "learning_rate": 1.5631316786966498e-09, "logits/chosen": -1.9731369018554688, "logits/rejected": -1.9668524265289307, "logps/chosen": -1.0346230268478394, "logps/rejected": -1.185753583908081, "loss": 1.6147, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0692460536956787, "rewards/margins": 0.3022609353065491, "rewards/rejected": -2.371507167816162, "step": 3740 }, { "epoch": 2.701729106628242, "grad_norm": 22.569153615110963, "learning_rate": 1.491001806979772e-09, "logits/chosen": -2.0251309871673584, "logits/rejected": -2.018158197402954, "logps/chosen": -1.0858871936798096, "logps/rejected": -1.2490332126617432, "loss": 1.5884, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.171774387359619, "rewards/margins": 0.3262918293476105, "rewards/rejected": -2.4980664253234863, "step": 3750 }, { "epoch": 2.7089337175792507, "grad_norm": 33.80465347026232, "learning_rate": 1.4205245207621508e-09, "logits/chosen": -1.9726536273956299, "logits/rejected": -1.9704698324203491, "logps/chosen": -1.128059983253479, "logps/rejected": -1.3132567405700684, "loss": 1.5607, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.256119966506958, "rewards/margins": 0.37039369344711304, "rewards/rejected": -2.6265134811401367, "step": 3760 }, { "epoch": 2.7161383285302594, "grad_norm": 22.173921402790466, "learning_rate": 1.3517047743059978e-09, "logits/chosen": -2.0023999214172363, "logits/rejected": -2.0059781074523926, "logps/chosen": -1.0848875045776367, "logps/rejected": -1.2582701444625854, "loss": 1.5665, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.1697750091552734, "rewards/margins": 0.3467653691768646, "rewards/rejected": -2.516540288925171, "step": 3770 }, { "epoch": 2.723342939481268, "grad_norm": 20.818341763586307, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -2.008556842803955, "logits/rejected": -2.004732847213745, "logps/chosen": -1.0449047088623047, "logps/rejected": -1.1916186809539795, "loss": 1.624, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0898094177246094, "rewards/margins": 0.29342788457870483, "rewards/rejected": -2.383237361907959, "step": 3780 }, { "epoch": 2.7305475504322767, "grad_norm": 26.718686245212997, "learning_rate": 1.2190571347958422e-09, "logits/chosen": -2.0307281017303467, "logits/rejected": -2.0321125984191895, "logps/chosen": -0.9740373492240906, "logps/rejected": -1.1882193088531494, "loss": 1.5138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9480746984481812, "rewards/margins": 0.4283638000488281, "rewards/rejected": -2.376438617706299, "step": 3790 }, { "epoch": 2.7377521613832854, "grad_norm": 21.738543212731777, "learning_rate": 1.1552385663231634e-09, "logits/chosen": -1.9871746301651, "logits/rejected": -1.9778721332550049, "logps/chosen": -1.1047613620758057, "logps/rejected": -1.209443211555481, "loss": 1.6729, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2095227241516113, "rewards/margins": 0.20936377346515656, "rewards/rejected": -2.418886423110962, "step": 3800 }, { "epoch": 2.744956772334294, "grad_norm": 23.19155618896189, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.9520400762557983, "logits/rejected": -1.9569333791732788, "logps/chosen": -1.0511337518692017, "logps/rejected": -1.2038328647613525, "loss": 1.622, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1022675037384033, "rewards/margins": 0.3053983449935913, "rewards/rejected": -2.407665729522705, "step": 3810 }, { "epoch": 2.7521613832853027, "grad_norm": 20.531328415125483, "learning_rate": 1.0326343625364608e-09, "logits/chosen": -1.9647035598754883, "logits/rejected": -1.959842324256897, "logps/chosen": -1.0526940822601318, "logps/rejected": -1.2392138242721558, "loss": 1.5449, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.1053881645202637, "rewards/margins": 0.3730394244194031, "rewards/rejected": -2.4784276485443115, "step": 3820 }, { "epoch": 2.7593659942363113, "grad_norm": 21.719358673465138, "learning_rate": 9.738573457917066e-10, "logits/chosen": -2.03556227684021, "logits/rejected": -2.0341899394989014, "logps/chosen": -1.062088966369629, "logps/rejected": -1.2683091163635254, "loss": 1.5133, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.124177932739258, "rewards/margins": 0.41244035959243774, "rewards/rejected": -2.536618232727051, "step": 3830 }, { "epoch": 2.76657060518732, "grad_norm": 25.322384547593344, "learning_rate": 9.16769267666434e-10, "logits/chosen": -2.0040173530578613, "logits/rejected": -2.0020196437835693, "logps/chosen": -1.0869022607803345, "logps/rejected": -1.1681996583938599, "loss": 1.705, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.173804521560669, "rewards/margins": 0.16259454190731049, "rewards/rejected": -2.3363993167877197, "step": 3840 }, { "epoch": 2.7737752161383287, "grad_norm": 25.10130501782085, "learning_rate": 8.613741412168113e-10, "logits/chosen": -2.008990526199341, "logits/rejected": -2.00834321975708, "logps/chosen": -1.0906997919082642, "logps/rejected": -1.2317142486572266, "loss": 1.5994, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.1813995838165283, "rewards/margins": 0.28202903270721436, "rewards/rejected": -2.463428497314453, "step": 3850 }, { "epoch": 2.7809798270893373, "grad_norm": 24.416518592438287, "learning_rate": 8.076758604914802e-10, "logits/chosen": -1.9480493068695068, "logits/rejected": -1.9436168670654297, "logps/chosen": -0.9899576306343079, "logps/rejected": -1.1366034746170044, "loss": 1.6159, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9799152612686157, "rewards/margins": 0.29329171776771545, "rewards/rejected": -2.273206949234009, "step": 3860 }, { "epoch": 2.7881844380403455, "grad_norm": 28.480348150453, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.9803158044815063, "logits/rejected": -1.9738891124725342, "logps/chosen": -1.04558265209198, "logps/rejected": -1.2032787799835205, "loss": 1.5881, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.09116530418396, "rewards/margins": 0.3153918981552124, "rewards/rejected": -2.406557559967041, "step": 3870 }, { "epoch": 2.795389048991354, "grad_norm": 20.915236612825627, "learning_rate": 7.053848157367315e-10, "logits/chosen": -1.995981216430664, "logits/rejected": -1.9908380508422852, "logps/chosen": -1.0492351055145264, "logps/rejected": -1.2123117446899414, "loss": 1.5961, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0984702110290527, "rewards/margins": 0.32615360617637634, "rewards/rejected": -2.424623489379883, "step": 3880 }, { "epoch": 2.802593659942363, "grad_norm": 19.65622065135912, "learning_rate": 6.567992423453794e-10, "logits/chosen": -2.008981704711914, "logits/rejected": -2.0076236724853516, "logps/chosen": -0.9723512530326843, "logps/rejected": -1.1025577783584595, "loss": 1.6215, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9447025060653687, "rewards/margins": 0.26041287183761597, "rewards/rejected": -2.205115556716919, "step": 3890 }, { "epoch": 2.8097982708933715, "grad_norm": 24.1130742142929, "learning_rate": 6.099248954489794e-10, "logits/chosen": -1.9493846893310547, "logits/rejected": -1.9470285177230835, "logps/chosen": -1.0799471139907837, "logps/rejected": -1.2553883790969849, "loss": 1.5655, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1598942279815674, "rewards/margins": 0.3508824408054352, "rewards/rejected": -2.5107767581939697, "step": 3900 }, { "epoch": 2.81700288184438, "grad_norm": 28.6376251625417, "learning_rate": 5.647650701205653e-10, "logits/chosen": -2.0223331451416016, "logits/rejected": -2.0144119262695312, "logps/chosen": -1.1203711032867432, "logps/rejected": -1.2936725616455078, "loss": 1.5856, "rewards/accuracies": 0.625, "rewards/chosen": -2.2407422065734863, "rewards/margins": 0.3466026186943054, "rewards/rejected": -2.5873451232910156, "step": 3910 }, { "epoch": 2.824207492795389, "grad_norm": 19.452475511473583, "learning_rate": 5.213229409093856e-10, "logits/chosen": -2.023984670639038, "logits/rejected": -2.0185182094573975, "logps/chosen": -1.065812349319458, "logps/rejected": -1.2093201875686646, "loss": 1.6177, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.131624698638916, "rewards/margins": 0.28701576590538025, "rewards/rejected": -2.418640375137329, "step": 3920 }, { "epoch": 2.8314121037463975, "grad_norm": 25.388087544991077, "learning_rate": 4.796015616177401e-10, "logits/chosen": -1.9873781204223633, "logits/rejected": -1.9817928075790405, "logps/chosen": -1.0786560773849487, "logps/rejected": -1.199804425239563, "loss": 1.6411, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1573121547698975, "rewards/margins": 0.2422964870929718, "rewards/rejected": -2.399608850479126, "step": 3930 }, { "epoch": 2.838616714697406, "grad_norm": 20.329189956275457, "learning_rate": 4.3960386508631595e-10, "logits/chosen": -1.9287586212158203, "logits/rejected": -1.9212329387664795, "logps/chosen": -0.9807891845703125, "logps/rejected": -1.1096421480178833, "loss": 1.6523, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.961578369140625, "rewards/margins": 0.2577061057090759, "rewards/rejected": -2.2192842960357666, "step": 3940 }, { "epoch": 2.845821325648415, "grad_norm": 42.782507946743614, "learning_rate": 4.013326629880243e-10, "logits/chosen": -1.9632227420806885, "logits/rejected": -1.9537756443023682, "logps/chosen": -1.1211081743240356, "logps/rejected": -1.2638612985610962, "loss": 1.6209, "rewards/accuracies": 0.5625, "rewards/chosen": -2.2422163486480713, "rewards/margins": 0.2855061888694763, "rewards/rejected": -2.5277225971221924, "step": 3950 }, { "epoch": 2.8530259365994235, "grad_norm": 23.312997369735474, "learning_rate": 3.64790645630339e-10, "logits/chosen": -1.929166555404663, "logits/rejected": -1.9286658763885498, "logps/chosen": -1.0627485513687134, "logps/rejected": -1.139578104019165, "loss": 1.7018, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1254971027374268, "rewards/margins": 0.15365901589393616, "rewards/rejected": -2.27915620803833, "step": 3960 }, { "epoch": 2.860230547550432, "grad_norm": 26.10078978992991, "learning_rate": 3.2998038176619e-10, "logits/chosen": -1.9690284729003906, "logits/rejected": -1.9605762958526611, "logps/chosen": -1.068928837776184, "logps/rejected": -1.2027775049209595, "loss": 1.6299, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.137857675552368, "rewards/margins": 0.26769721508026123, "rewards/rejected": -2.405555009841919, "step": 3970 }, { "epoch": 2.867435158501441, "grad_norm": 25.234220988826745, "learning_rate": 2.969043184133907e-10, "logits/chosen": -2.0387911796569824, "logits/rejected": -2.03761625289917, "logps/chosen": -0.9789720773696899, "logps/rejected": -1.213514804840088, "loss": 1.4718, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9579441547393799, "rewards/margins": 0.46908536553382874, "rewards/rejected": -2.427029609680176, "step": 3980 }, { "epoch": 2.8746397694524495, "grad_norm": 22.286319727271383, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -1.9648367166519165, "logits/rejected": -1.9626352787017822, "logps/chosen": -0.9818083047866821, "logps/rejected": -1.127106785774231, "loss": 1.6232, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9636166095733643, "rewards/margins": 0.2905968725681305, "rewards/rejected": -2.254213571548462, "step": 3990 }, { "epoch": 2.881844380403458, "grad_norm": 24.02561776408271, "learning_rate": 2.3596397161395607e-10, "logits/chosen": -2.040318250656128, "logits/rejected": -2.0288829803466797, "logps/chosen": -1.0755038261413574, "logps/rejected": -1.2570974826812744, "loss": 1.5654, "rewards/accuracies": 0.65625, "rewards/chosen": -2.151007652282715, "rewards/margins": 0.36318737268447876, "rewards/rejected": -2.514194965362549, "step": 4000 }, { "epoch": 2.889048991354467, "grad_norm": 32.48825757373928, "learning_rate": 2.0810397202206399e-10, "logits/chosen": -1.9489730596542358, "logits/rejected": -1.9547178745269775, "logps/chosen": -1.0728187561035156, "logps/rejected": -1.212433099746704, "loss": 1.6122, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1456375122070312, "rewards/margins": 0.2792285680770874, "rewards/rejected": -2.424866199493408, "step": 4010 }, { "epoch": 2.8962536023054755, "grad_norm": 27.02523912157932, "learning_rate": 1.819867403498737e-10, "logits/chosen": -2.024656295776367, "logits/rejected": -2.0224509239196777, "logps/chosen": -1.0818800926208496, "logps/rejected": -1.2248767614364624, "loss": 1.6218, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.163760185241699, "rewards/margins": 0.285993367433548, "rewards/rejected": -2.449753522872925, "step": 4020 }, { "epoch": 2.903458213256484, "grad_norm": 26.86229801663181, "learning_rate": 1.5761411253092382e-10, "logits/chosen": -1.9581588506698608, "logits/rejected": -1.9483264684677124, "logps/chosen": -0.9981783032417297, "logps/rejected": -1.1319841146469116, "loss": 1.6204, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9963566064834595, "rewards/margins": 0.26761192083358765, "rewards/rejected": -2.2639682292938232, "step": 4030 }, { "epoch": 2.910662824207493, "grad_norm": 24.49725819437343, "learning_rate": 1.3498780186031455e-10, "logits/chosen": -2.0099265575408936, "logits/rejected": -2.0065605640411377, "logps/chosen": -1.1728383302688599, "logps/rejected": -1.3035690784454346, "loss": 1.6505, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.3456766605377197, "rewards/margins": 0.26146143674850464, "rewards/rejected": -2.607138156890869, "step": 4040 }, { "epoch": 2.9178674351585014, "grad_norm": 19.940189005964534, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -1.9925037622451782, "logits/rejected": -1.9946651458740234, "logps/chosen": -1.0628712177276611, "logps/rejected": -1.1979005336761475, "loss": 1.6383, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1257424354553223, "rewards/margins": 0.27005869150161743, "rewards/rejected": -2.395801067352295, "step": 4050 }, { "epoch": 2.92507204610951, "grad_norm": 23.319482286830002, "learning_rate": 9.498037123825686e-11, "logits/chosen": -2.005716562271118, "logits/rejected": -2.002530574798584, "logps/chosen": -1.0336294174194336, "logps/rejected": -1.1716318130493164, "loss": 1.6185, "rewards/accuracies": 0.59375, "rewards/chosen": -2.067258834838867, "rewards/margins": 0.276004821062088, "rewards/rejected": -2.343263626098633, "step": 4060 }, { "epoch": 2.9322766570605188, "grad_norm": 25.881271974785715, "learning_rate": 7.760206364398614e-11, "logits/chosen": -2.0617315769195557, "logits/rejected": -2.0592739582061768, "logps/chosen": -1.0913032293319702, "logps/rejected": -1.2478063106536865, "loss": 1.6002, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1826064586639404, "rewards/margins": 0.3130059540271759, "rewards/rejected": -2.495612621307373, "step": 4070 }, { "epoch": 2.9394812680115274, "grad_norm": 26.85110956665357, "learning_rate": 6.19756977147029e-11, "logits/chosen": -1.9937469959259033, "logits/rejected": -1.9905986785888672, "logps/chosen": -1.042575478553772, "logps/rejected": -1.2636533975601196, "loss": 1.5114, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.085150957107544, "rewards/margins": 0.44215598702430725, "rewards/rejected": -2.5273067951202393, "step": 4080 }, { "epoch": 2.946685878962536, "grad_norm": 25.4840778275471, "learning_rate": 4.810237191940625e-11, "logits/chosen": -1.9718611240386963, "logits/rejected": -1.9710248708724976, "logps/chosen": -1.0520293712615967, "logps/rejected": -1.1911691427230835, "loss": 1.6396, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.1040587425231934, "rewards/margins": 0.2782793939113617, "rewards/rejected": -2.382338285446167, "step": 4090 }, { "epoch": 2.9538904899135447, "grad_norm": 23.758425171401605, "learning_rate": 3.5983061495617476e-11, "logits/chosen": -2.029209613800049, "logits/rejected": -2.029736042022705, "logps/chosen": -1.1354830265045166, "logps/rejected": -1.2939428091049194, "loss": 1.6003, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.270966053009033, "rewards/margins": 0.31691962480545044, "rewards/rejected": -2.587885618209839, "step": 4100 }, { "epoch": 2.9610951008645534, "grad_norm": 26.352266682348706, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -2.0186376571655273, "logits/rejected": -2.008363962173462, "logps/chosen": -1.013756275177002, "logps/rejected": -1.1949803829193115, "loss": 1.575, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.027512550354004, "rewards/margins": 0.36244791746139526, "rewards/rejected": -2.389960765838623, "step": 4110 }, { "epoch": 2.968299711815562, "grad_norm": 27.360344602179914, "learning_rate": 1.700977115254576e-11, "logits/chosen": -1.992940902709961, "logits/rejected": -1.9898687601089478, "logps/chosen": -1.00955069065094, "logps/rejected": -1.1720443964004517, "loss": 1.5826, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.01910138130188, "rewards/margins": 0.3249874711036682, "rewards/rejected": -2.3440887928009033, "step": 4120 }, { "epoch": 2.9755043227665707, "grad_norm": 25.015443326219106, "learning_rate": 1.0157124977230868e-11, "logits/chosen": -1.971022367477417, "logits/rejected": -1.9698013067245483, "logps/chosen": -0.9795894622802734, "logps/rejected": -1.1395008563995361, "loss": 1.5852, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9591789245605469, "rewards/margins": 0.3198230266571045, "rewards/rejected": -2.2790017127990723, "step": 4130 }, { "epoch": 2.9827089337175794, "grad_norm": 26.021155111282482, "learning_rate": 5.061161567596061e-12, "logits/chosen": -1.9912612438201904, "logits/rejected": -1.9871059656143188, "logps/chosen": -1.0640586614608765, "logps/rejected": -1.1597628593444824, "loss": 1.6899, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.128117322921753, "rewards/margins": 0.1914081871509552, "rewards/rejected": -2.319525718688965, "step": 4140 }, { "epoch": 2.989913544668588, "grad_norm": 25.693571919739124, "learning_rate": 1.7222391488297406e-12, "logits/chosen": -2.011465311050415, "logits/rejected": -2.007890224456787, "logps/chosen": -1.11729896068573, "logps/rejected": -1.279888391494751, "loss": 1.5877, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.23459792137146, "rewards/margins": 0.3251785635948181, "rewards/rejected": -2.559776782989502, "step": 4150 }, { "epoch": 2.9971181556195967, "grad_norm": 23.539641948439122, "learning_rate": 1.4059243338693238e-13, "logits/chosen": -1.986538290977478, "logits/rejected": -1.979648232460022, "logps/chosen": -1.0680465698242188, "logps/rejected": -1.204766035079956, "loss": 1.6145, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.1360931396484375, "rewards/margins": 0.27343878149986267, "rewards/rejected": -2.409532070159912, "step": 4160 }, { "epoch": 3.0, "step": 4164, "total_flos": 0.0, "train_loss": 0.024831957134572962, "train_runtime": 70.0566, "train_samples_per_second": 950.746, "train_steps_per_second": 59.438 } ], "logging_steps": 10, "max_steps": 4164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }