{ "best_metric": null, "best_model_checkpoint": null, "epoch": 10.0, "eval_steps": 1000, "global_step": 1250, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.008, "grad_norm": 0.5664508606614915, "learning_rate": 8e-08, "logits/chosen": -1.7671998739242554, "logits/rejected": -2.2639822959899902, "logps/chosen": -46.430763244628906, "logps/rejected": -102.85381317138672, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.08, "grad_norm": 0.5939624426575738, "learning_rate": 8.000000000000001e-07, "logits/chosen": -1.6349234580993652, "logits/rejected": -2.0293447971343994, "logps/chosen": -50.39720916748047, "logps/rejected": -98.8401870727539, "loss": 0.6929, "rewards/accuracies": 0.5069444179534912, "rewards/chosen": -0.00029719059239141643, "rewards/margins": 0.0004301935259718448, "rewards/rejected": -0.0007273841183632612, "step": 10 }, { "epoch": 0.16, "grad_norm": 0.5700892501542819, "learning_rate": 1.6000000000000001e-06, "logits/chosen": -1.6655193567276, "logits/rejected": -2.1240811347961426, "logps/chosen": -47.0224609375, "logps/rejected": -105.32938385009766, "loss": 0.6875, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.0002471635234542191, "rewards/margins": 0.012360258027911186, "rewards/rejected": -0.01211309339851141, "step": 20 }, { "epoch": 0.24, "grad_norm": 0.7554778741427768, "learning_rate": 2.4000000000000003e-06, "logits/chosen": -1.7440729141235352, "logits/rejected": -2.1188106536865234, "logps/chosen": -50.84648513793945, "logps/rejected": -106.61978912353516, "loss": 0.6579, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.0015375657239928842, "rewards/margins": 0.07141353189945221, "rewards/rejected": -0.07295109331607819, "step": 30 }, { "epoch": 0.32, "grad_norm": 0.7532828556442333, "learning_rate": 3.2000000000000003e-06, "logits/chosen": -1.6423908472061157, "logits/rejected": -2.07899808883667, "logps/chosen": -48.13755416870117, "logps/rejected": -131.5819854736328, "loss": 0.5614, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.005950761027634144, "rewards/margins": 0.3043977618217468, "rewards/rejected": -0.29844698309898376, "step": 40 }, { "epoch": 0.4, "grad_norm": 0.49124925197921077, "learning_rate": 4.000000000000001e-06, "logits/chosen": -1.577450156211853, "logits/rejected": -2.0352962017059326, "logps/chosen": -47.25825119018555, "logps/rejected": -175.77737426757812, "loss": 0.4101, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.014603497460484505, "rewards/margins": 0.7516024708747864, "rewards/rejected": -0.7369989156723022, "step": 50 }, { "epoch": 0.48, "grad_norm": 0.3421632301388367, "learning_rate": 4.800000000000001e-06, "logits/chosen": -1.54917311668396, "logits/rejected": -1.9336481094360352, "logps/chosen": -54.58576583862305, "logps/rejected": -217.2119598388672, "loss": 0.2991, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.034352801740169525, "rewards/margins": 1.1476414203643799, "rewards/rejected": -1.1819943189620972, "step": 60 }, { "epoch": 0.56, "grad_norm": 0.33380968035486847, "learning_rate": 5.600000000000001e-06, "logits/chosen": -1.4622433185577393, "logits/rejected": -1.9104700088500977, "logps/chosen": -53.6714973449707, "logps/rejected": -297.1726379394531, "loss": 0.2231, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.03121398389339447, "rewards/margins": 1.926568627357483, "rewards/rejected": -1.957782506942749, "step": 70 }, { "epoch": 0.64, "grad_norm": 0.14294152601696442, "learning_rate": 6.4000000000000006e-06, "logits/chosen": -1.3913681507110596, "logits/rejected": -1.818612813949585, "logps/chosen": -54.882850646972656, "logps/rejected": -364.594970703125, "loss": 0.2104, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.04058960825204849, "rewards/margins": 2.5959861278533936, "rewards/rejected": -2.636575937271118, "step": 80 }, { "epoch": 0.72, "grad_norm": 0.2043412163104557, "learning_rate": 7.2000000000000005e-06, "logits/chosen": -1.3484817743301392, "logits/rejected": -1.7438443899154663, "logps/chosen": -57.729034423828125, "logps/rejected": -391.08892822265625, "loss": 0.194, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -0.0659557357430458, "rewards/margins": 2.8331382274627686, "rewards/rejected": -2.8990941047668457, "step": 90 }, { "epoch": 0.8, "grad_norm": 0.45349092259459883, "learning_rate": 8.000000000000001e-06, "logits/chosen": -1.3855174779891968, "logits/rejected": -1.819236159324646, "logps/chosen": -49.61981964111328, "logps/rejected": -360.9615173339844, "loss": 0.1907, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.004681958816945553, "rewards/margins": 2.5954396724700928, "rewards/rejected": -2.5907576084136963, "step": 100 }, { "epoch": 0.88, "grad_norm": 0.2724569498424518, "learning_rate": 8.8e-06, "logits/chosen": -1.3040708303451538, "logits/rejected": -1.7432258129119873, "logps/chosen": -52.180030822753906, "logps/rejected": -379.49664306640625, "loss": 0.1868, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.02268405631184578, "rewards/margins": 2.7577483654022217, "rewards/rejected": -2.78043270111084, "step": 110 }, { "epoch": 0.96, "grad_norm": 0.17941572717033036, "learning_rate": 9.600000000000001e-06, "logits/chosen": -1.2530890703201294, "logits/rejected": -1.7477638721466064, "logps/chosen": -38.981895446777344, "logps/rejected": -443.25347900390625, "loss": 0.1556, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.07770782709121704, "rewards/margins": 3.4688849449157715, "rewards/rejected": -3.3911774158477783, "step": 120 }, { "epoch": 1.04, "grad_norm": 0.3303028590674529, "learning_rate": 9.999512620046523e-06, "logits/chosen": -1.346740484237671, "logits/rejected": -1.805371880531311, "logps/chosen": -55.48865509033203, "logps/rejected": -389.91607666015625, "loss": 0.1713, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.04284561052918434, "rewards/margins": 2.8560733795166016, "rewards/rejected": -2.898919105529785, "step": 130 }, { "epoch": 1.12, "grad_norm": 0.9056804720582001, "learning_rate": 9.995614150494293e-06, "logits/chosen": -1.234359622001648, "logits/rejected": -1.6866785287857056, "logps/chosen": -48.799949645996094, "logps/rejected": -423.95452880859375, "loss": 0.164, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.011531209573149681, "rewards/margins": 3.2391915321350098, "rewards/rejected": -3.2276599407196045, "step": 140 }, { "epoch": 1.2, "grad_norm": 0.5254416965749492, "learning_rate": 9.987820251299121e-06, "logits/chosen": -1.0958689451217651, "logits/rejected": -1.4965614080429077, "logps/chosen": -52.782798767089844, "logps/rejected": -455.8980407714844, "loss": 0.1648, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -0.012227678671479225, "rewards/margins": 3.540536880493164, "rewards/rejected": -3.552764415740967, "step": 150 }, { "epoch": 1.28, "grad_norm": 0.5108498363771904, "learning_rate": 9.976136999909156e-06, "logits/chosen": -1.1107490062713623, "logits/rejected": -1.5378646850585938, "logps/chosen": -47.4257698059082, "logps/rejected": -470.36163330078125, "loss": 0.11, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": 0.026664480566978455, "rewards/margins": 3.716306209564209, "rewards/rejected": -3.6896419525146484, "step": 160 }, { "epoch": 1.3599999999999999, "grad_norm": 1.2383483480266742, "learning_rate": 9.960573506572391e-06, "logits/chosen": -1.028438687324524, "logits/rejected": -1.5239307880401611, "logps/chosen": -42.13895034790039, "logps/rejected": -517.1678466796875, "loss": 0.1026, "rewards/accuracies": 0.96875, "rewards/chosen": 0.05908266454935074, "rewards/margins": 4.210160255432129, "rewards/rejected": -4.151078224182129, "step": 170 }, { "epoch": 1.44, "grad_norm": 0.9166087354544606, "learning_rate": 9.941141907232766e-06, "logits/chosen": -0.9610630869865417, "logits/rejected": -1.4120290279388428, "logps/chosen": -46.765838623046875, "logps/rejected": -542.84716796875, "loss": 0.109, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03420247882604599, "rewards/margins": 4.456036567687988, "rewards/rejected": -4.421834468841553, "step": 180 }, { "epoch": 1.52, "grad_norm": 1.6771630345432116, "learning_rate": 9.91785735406693e-06, "logits/chosen": -0.8817203640937805, "logits/rejected": -1.3373805284500122, "logps/chosen": -43.31850051879883, "logps/rejected": -525.7540283203125, "loss": 0.0982, "rewards/accuracies": 0.9375, "rewards/chosen": 0.06416326761245728, "rewards/margins": 4.315359115600586, "rewards/rejected": -4.251195907592773, "step": 190 }, { "epoch": 1.6, "grad_norm": 1.2635622789324346, "learning_rate": 9.890738003669029e-06, "logits/chosen": -0.9383857846260071, "logits/rejected": -1.3598968982696533, "logps/chosen": -40.324180603027344, "logps/rejected": -580.6539916992188, "loss": 0.0748, "rewards/accuracies": 0.956250011920929, "rewards/chosen": 0.09767557680606842, "rewards/margins": 4.886355876922607, "rewards/rejected": -4.788680553436279, "step": 200 }, { "epoch": 1.6800000000000002, "grad_norm": 2.051002862784939, "learning_rate": 9.859805002892733e-06, "logits/chosen": -0.747630774974823, "logits/rejected": -1.189408302307129, "logps/chosen": -38.997947692871094, "logps/rejected": -600.232421875, "loss": 0.0693, "rewards/accuracies": 0.981249988079071, "rewards/chosen": 0.10605404525995255, "rewards/margins": 5.094006538391113, "rewards/rejected": -4.98795223236084, "step": 210 }, { "epoch": 1.76, "grad_norm": 0.5973861965703521, "learning_rate": 9.825082472361558e-06, "logits/chosen": -0.769231915473938, "logits/rejected": -1.2444443702697754, "logps/chosen": -37.80399703979492, "logps/rejected": -661.2396850585938, "loss": 0.0355, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.1041645035147667, "rewards/margins": 5.686351299285889, "rewards/rejected": -5.582187652587891, "step": 220 }, { "epoch": 1.8399999999999999, "grad_norm": 0.36668201108212406, "learning_rate": 9.786597487660336e-06, "logits/chosen": -0.7400678396224976, "logits/rejected": -1.185605764389038, "logps/chosen": -38.35655975341797, "logps/rejected": -675.3729248046875, "loss": 0.0604, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.10024236142635345, "rewards/margins": 5.824278831481934, "rewards/rejected": -5.72403621673584, "step": 230 }, { "epoch": 1.92, "grad_norm": 0.18097106498610982, "learning_rate": 9.744380058222483e-06, "logits/chosen": -0.8332279324531555, "logits/rejected": -1.2189085483551025, "logps/chosen": -39.80487060546875, "logps/rejected": -665.7376708984375, "loss": 0.0445, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1086835116147995, "rewards/margins": 5.7541303634643555, "rewards/rejected": -5.64544677734375, "step": 240 }, { "epoch": 2.0, "grad_norm": 0.5482510873677592, "learning_rate": 9.698463103929542e-06, "logits/chosen": -0.8593254089355469, "logits/rejected": -1.219299077987671, "logps/chosen": -40.509761810302734, "logps/rejected": -678.4737548828125, "loss": 0.0333, "rewards/accuracies": 1.0, "rewards/chosen": 0.10330041497945786, "rewards/margins": 5.88486385345459, "rewards/rejected": -5.781562805175781, "step": 250 }, { "epoch": 2.08, "grad_norm": 0.06311428811879578, "learning_rate": 9.648882429441258e-06, "logits/chosen": -0.721664309501648, "logits/rejected": -1.1366941928863525, "logps/chosen": -43.40575408935547, "logps/rejected": -718.126953125, "loss": 0.043, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.06619155406951904, "rewards/margins": 6.222557067871094, "rewards/rejected": -6.156365394592285, "step": 260 }, { "epoch": 2.16, "grad_norm": 0.10420712161857161, "learning_rate": 9.595676696276173e-06, "logits/chosen": -0.8397006988525391, "logits/rejected": -1.1919224262237549, "logps/chosen": -45.371559143066406, "logps/rejected": -671.6702880859375, "loss": 0.0399, "rewards/accuracies": 1.0, "rewards/chosen": 0.05775808170437813, "rewards/margins": 5.775918006896973, "rewards/rejected": -5.718161106109619, "step": 270 }, { "epoch": 2.24, "grad_norm": 0.048219144314199394, "learning_rate": 9.538887392664544e-06, "logits/chosen": -0.932695746421814, "logits/rejected": -1.2857800722122192, "logps/chosen": -49.043556213378906, "logps/rejected": -700.9938354492188, "loss": 0.0302, "rewards/accuracies": 1.0, "rewards/chosen": 0.01686009392142296, "rewards/margins": 6.021510601043701, "rewards/rejected": -6.004650592803955, "step": 280 }, { "epoch": 2.32, "grad_norm": 0.052699483559969486, "learning_rate": 9.478558801197065e-06, "logits/chosen": -1.0952008962631226, "logits/rejected": -1.419945478439331, "logps/chosen": -50.7900276184082, "logps/rejected": -746.294921875, "loss": 0.0287, "rewards/accuracies": 1.0, "rewards/chosen": -0.011282497085630894, "rewards/margins": 6.445050239562988, "rewards/rejected": -6.456332206726074, "step": 290 }, { "epoch": 2.4, "grad_norm": 0.39282718133559014, "learning_rate": 9.414737964294636e-06, "logits/chosen": -1.276970386505127, "logits/rejected": -1.5558645725250244, "logps/chosen": -57.718162536621094, "logps/rejected": -758.6500244140625, "loss": 0.0284, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.07060392200946808, "rewards/margins": 6.502586364746094, "rewards/rejected": -6.573190212249756, "step": 300 }, { "epoch": 2.48, "grad_norm": 0.12204254875967184, "learning_rate": 9.347474647526095e-06, "logits/chosen": -1.4385970830917358, "logits/rejected": -1.6844545602798462, "logps/chosen": -59.007118225097656, "logps/rejected": -749.748046875, "loss": 0.0353, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.08172997832298279, "rewards/margins": 6.407279014587402, "rewards/rejected": -6.489008903503418, "step": 310 }, { "epoch": 2.56, "grad_norm": 0.05045051166529495, "learning_rate": 9.276821300802535e-06, "logits/chosen": -1.682716965675354, "logits/rejected": -1.9395768642425537, "logps/chosen": -69.35375213623047, "logps/rejected": -762.8756103515625, "loss": 0.0149, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -0.20327362418174744, "rewards/margins": 6.410320281982422, "rewards/rejected": -6.613594055175781, "step": 320 }, { "epoch": 2.64, "grad_norm": 0.041644876319752826, "learning_rate": 9.202833017478421e-06, "logits/chosen": -1.9520126581192017, "logits/rejected": -2.184046745300293, "logps/chosen": -74.17578125, "logps/rejected": -782.9901123046875, "loss": 0.0139, "rewards/accuracies": 1.0, "rewards/chosen": -0.25414201617240906, "rewards/margins": 6.548297882080078, "rewards/rejected": -6.802439212799072, "step": 330 }, { "epoch": 2.7199999999999998, "grad_norm": 0.10894239404290727, "learning_rate": 9.125567491391476e-06, "logits/chosen": -2.3633341789245605, "logits/rejected": -2.5258421897888184, "logps/chosen": -95.89537811279297, "logps/rejected": -769.3350830078125, "loss": 0.026, "rewards/accuracies": 1.0, "rewards/chosen": -0.4556370675563812, "rewards/margins": 6.227226734161377, "rewards/rejected": -6.682864189147949, "step": 340 }, { "epoch": 2.8, "grad_norm": 0.09090196291657318, "learning_rate": 9.045084971874738e-06, "logits/chosen": -2.9310622215270996, "logits/rejected": -3.0340473651885986, "logps/chosen": -181.8387451171875, "logps/rejected": -789.5302734375, "loss": 0.0108, "rewards/accuracies": 1.0, "rewards/chosen": -1.3074408769607544, "rewards/margins": 5.575064182281494, "rewards/rejected": -6.882504463195801, "step": 350 }, { "epoch": 2.88, "grad_norm": 0.12031593041735512, "learning_rate": 8.961448216775955e-06, "logits/chosen": -2.900047779083252, "logits/rejected": -3.0111327171325684, "logps/chosen": -186.22052001953125, "logps/rejected": -826.2166137695312, "loss": 0.0242, "rewards/accuracies": 1.0, "rewards/chosen": -1.3595960140228271, "rewards/margins": 5.881885051727295, "rewards/rejected": -7.241480350494385, "step": 360 }, { "epoch": 2.96, "grad_norm": 0.055136260032898635, "learning_rate": 8.874722443520898e-06, "logits/chosen": -2.855480432510376, "logits/rejected": -2.9430692195892334, "logps/chosen": -199.16647338867188, "logps/rejected": -827.880859375, "loss": 0.0346, "rewards/accuracies": 1.0, "rewards/chosen": -1.4962244033813477, "rewards/margins": 5.776437282562256, "rewards/rejected": -7.272662162780762, "step": 370 }, { "epoch": 3.04, "grad_norm": 0.05072076339049116, "learning_rate": 8.784975278258783e-06, "logits/chosen": -2.6045117378234863, "logits/rejected": -2.729827404022217, "logps/chosen": -133.8080596923828, "logps/rejected": -809.306640625, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -0.8419109582901001, "rewards/margins": 6.24552059173584, "rewards/rejected": -7.08743143081665, "step": 380 }, { "epoch": 3.12, "grad_norm": 0.7450627702438166, "learning_rate": 8.692276703129421e-06, "logits/chosen": -2.6937222480773926, "logits/rejected": -2.7929458618164062, "logps/chosen": -168.631103515625, "logps/rejected": -842.021484375, "loss": 0.0196, "rewards/accuracies": 1.0, "rewards/chosen": -1.18534255027771, "rewards/margins": 6.214483737945557, "rewards/rejected": -7.3998260498046875, "step": 390 }, { "epoch": 3.2, "grad_norm": 0.05027116077254764, "learning_rate": 8.596699001693257e-06, "logits/chosen": -2.852487087249756, "logits/rejected": -2.908648729324341, "logps/chosen": -187.98727416992188, "logps/rejected": -834.2393798828125, "loss": 0.016, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.361481785774231, "rewards/margins": 5.982842922210693, "rewards/rejected": -7.344325065612793, "step": 400 }, { "epoch": 3.2800000000000002, "grad_norm": 0.042611669944768574, "learning_rate": 8.498316702566828e-06, "logits/chosen": -2.6405489444732666, "logits/rejected": -2.737513303756714, "logps/chosen": -163.23654174804688, "logps/rejected": -875.9517822265625, "loss": 0.0099, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1402196884155273, "rewards/margins": 6.602635383605957, "rewards/rejected": -7.742854118347168, "step": 410 }, { "epoch": 3.36, "grad_norm": 0.08908259328663255, "learning_rate": 8.397206521307584e-06, "logits/chosen": -2.797304630279541, "logits/rejected": -2.8749709129333496, "logps/chosen": -200.6483917236328, "logps/rejected": -878.53466796875, "loss": 0.0056, "rewards/accuracies": 1.0, "rewards/chosen": -1.5122627019882202, "rewards/margins": 6.271607398986816, "rewards/rejected": -7.783869743347168, "step": 420 }, { "epoch": 3.44, "grad_norm": 0.11190864225013811, "learning_rate": 8.293447300593402e-06, "logits/chosen": -2.712634563446045, "logits/rejected": -2.790599822998047, "logps/chosen": -156.58209228515625, "logps/rejected": -865.5999755859375, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.0528513193130493, "rewards/margins": 6.598712921142578, "rewards/rejected": -7.651564598083496, "step": 430 }, { "epoch": 3.52, "grad_norm": 0.06216733964642332, "learning_rate": 8.18711994874345e-06, "logits/chosen": -2.7393593788146973, "logits/rejected": -2.810574531555176, "logps/chosen": -159.99278259277344, "logps/rejected": -852.8875732421875, "loss": 0.0102, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0680657625198364, "rewards/margins": 6.475743770599365, "rewards/rejected": -7.543809413909912, "step": 440 }, { "epoch": 3.6, "grad_norm": 1.6020556191748092, "learning_rate": 8.078307376628292e-06, "logits/chosen": -2.8640763759613037, "logits/rejected": -2.9100239276885986, "logps/chosen": -194.4068603515625, "logps/rejected": -889.5808715820312, "loss": 0.0241, "rewards/accuracies": 1.0, "rewards/chosen": -1.4508755207061768, "rewards/margins": 6.437768459320068, "rewards/rejected": -7.888643741607666, "step": 450 }, { "epoch": 3.68, "grad_norm": 0.08068344292732496, "learning_rate": 7.967094433018508e-06, "logits/chosen": -2.863485813140869, "logits/rejected": -2.9134774208068848, "logps/chosen": -192.51461791992188, "logps/rejected": -875.9539184570312, "loss": 0.0173, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4307234287261963, "rewards/margins": 6.317349433898926, "rewards/rejected": -7.748072624206543, "step": 460 }, { "epoch": 3.76, "grad_norm": 0.03890160957476439, "learning_rate": 7.85356783842216e-06, "logits/chosen": -2.780956983566284, "logits/rejected": -2.8286776542663574, "logps/chosen": -156.67994689941406, "logps/rejected": -865.4143676757812, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.0601476430892944, "rewards/margins": 6.580509185791016, "rewards/rejected": -7.6406569480896, "step": 470 }, { "epoch": 3.84, "grad_norm": 0.2797627859203357, "learning_rate": 7.737816117462752e-06, "logits/chosen": -2.872148036956787, "logits/rejected": -2.9261298179626465, "logps/chosen": -162.9639434814453, "logps/rejected": -861.81201171875, "loss": 0.0189, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1223922967910767, "rewards/margins": 6.500160217285156, "rewards/rejected": -7.62255334854126, "step": 480 }, { "epoch": 3.92, "grad_norm": 0.03202298420625548, "learning_rate": 7.619929529850397e-06, "logits/chosen": -3.1152751445770264, "logits/rejected": -3.124176263809204, "logps/chosen": -212.0701446533203, "logps/rejected": -899.0244140625, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -1.6227067708969116, "rewards/margins": 6.344549655914307, "rewards/rejected": -7.96725606918335, "step": 490 }, { "epoch": 4.0, "grad_norm": 0.056671500571648784, "learning_rate": 7.500000000000001e-06, "logits/chosen": -3.027770757675171, "logits/rejected": -3.058871030807495, "logps/chosen": -178.46084594726562, "logps/rejected": -894.8692626953125, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -1.2816778421401978, "rewards/margins": 6.661429405212402, "rewards/rejected": -7.943107604980469, "step": 500 }, { "epoch": 4.08, "grad_norm": 0.08007457084674505, "learning_rate": 7.378121045351378e-06, "logits/chosen": -3.0769410133361816, "logits/rejected": -3.099307060241699, "logps/chosen": -194.36056518554688, "logps/rejected": -881.5306396484375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -1.4257417917251587, "rewards/margins": 6.408936977386475, "rewards/rejected": -7.834680080413818, "step": 510 }, { "epoch": 4.16, "grad_norm": 2.4134748485667603, "learning_rate": 7.254387703447154e-06, "logits/chosen": -3.0368213653564453, "logits/rejected": -3.0519778728485107, "logps/chosen": -203.78298950195312, "logps/rejected": -914.9696044921875, "loss": 0.0071, "rewards/accuracies": 1.0, "rewards/chosen": -1.5372952222824097, "rewards/margins": 6.609500885009766, "rewards/rejected": -8.146797180175781, "step": 520 }, { "epoch": 4.24, "grad_norm": 0.029510070347368533, "learning_rate": 7.128896457825364e-06, "logits/chosen": -2.9700584411621094, "logits/rejected": -3.0200304985046387, "logps/chosen": -185.78460693359375, "logps/rejected": -919.3395385742188, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -1.3613429069519043, "rewards/margins": 6.823419094085693, "rewards/rejected": -8.184762001037598, "step": 530 }, { "epoch": 4.32, "grad_norm": 1.2493776168986983, "learning_rate": 7.0017451627844765e-06, "logits/chosen": -2.917212724685669, "logits/rejected": -2.9564435482025146, "logps/chosen": -197.5432891845703, "logps/rejected": -924.7901611328125, "loss": 0.0201, "rewards/accuracies": 1.0, "rewards/chosen": -1.476697325706482, "rewards/margins": 6.7479658126831055, "rewards/rejected": -8.224664688110352, "step": 540 }, { "epoch": 4.4, "grad_norm": 0.2763373802778404, "learning_rate": 6.873032967079562e-06, "logits/chosen": -2.7449865341186523, "logits/rejected": -2.7941806316375732, "logps/chosen": -146.8518829345703, "logps/rejected": -870.0069580078125, "loss": 0.0121, "rewards/accuracies": 1.0, "rewards/chosen": -0.9301493763923645, "rewards/margins": 6.802959442138672, "rewards/rejected": -7.733109474182129, "step": 550 }, { "epoch": 4.48, "grad_norm": 0.03334067080540203, "learning_rate": 6.7428602366090764e-06, "logits/chosen": -2.7536513805389404, "logits/rejected": -2.8025503158569336, "logps/chosen": -160.67921447753906, "logps/rejected": -924.1072998046875, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -1.0948050022125244, "rewards/margins": 7.13351583480835, "rewards/rejected": -8.228321075439453, "step": 560 }, { "epoch": 4.5600000000000005, "grad_norm": 0.10014923752683715, "learning_rate": 6.611328476152557e-06, "logits/chosen": -2.943023204803467, "logits/rejected": -2.971611261367798, "logps/chosen": -181.0494384765625, "logps/rejected": -916.7213134765625, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.306639552116394, "rewards/margins": 6.8469133377075195, "rewards/rejected": -8.153553009033203, "step": 570 }, { "epoch": 4.64, "grad_norm": 0.5303544380996533, "learning_rate": 6.4785402502202345e-06, "logits/chosen": -3.0545575618743896, "logits/rejected": -3.05729079246521, "logps/chosen": -207.95034790039062, "logps/rejected": -939.8880615234375, "loss": 0.0076, "rewards/accuracies": 1.0, "rewards/chosen": -1.5920554399490356, "rewards/margins": 6.775687217712402, "rewards/rejected": -8.367743492126465, "step": 580 }, { "epoch": 4.72, "grad_norm": 0.04084615849116197, "learning_rate": 6.344599103076329e-06, "logits/chosen": -2.9924559593200684, "logits/rejected": -2.992640972137451, "logps/chosen": -198.8438720703125, "logps/rejected": -943.1241455078125, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.5001078844070435, "rewards/margins": 6.916898250579834, "rewards/rejected": -8.417006492614746, "step": 590 }, { "epoch": 4.8, "grad_norm": 1.0475565210531403, "learning_rate": 6.209609477998339e-06, "logits/chosen": -3.0877511501312256, "logits/rejected": -3.0805959701538086, "logps/chosen": -226.8428955078125, "logps/rejected": -941.5935668945312, "loss": 0.0178, "rewards/accuracies": 0.987500011920929, "rewards/chosen": -1.7735874652862549, "rewards/margins": 6.6292877197265625, "rewards/rejected": -8.402875900268555, "step": 600 }, { "epoch": 4.88, "grad_norm": 0.0998487112139878, "learning_rate": 6.073676635835317e-06, "logits/chosen": -3.117692232131958, "logits/rejected": -3.0967986583709717, "logps/chosen": -220.5649871826172, "logps/rejected": -916.0166015625, "loss": 0.01, "rewards/accuracies": 1.0, "rewards/chosen": -1.7153180837631226, "rewards/margins": 6.428615570068359, "rewards/rejected": -8.143933296203613, "step": 610 }, { "epoch": 4.96, "grad_norm": 0.04577054636911131, "learning_rate": 5.936906572928625e-06, "logits/chosen": -2.8556697368621826, "logits/rejected": -2.8885579109191895, "logps/chosen": -171.10992431640625, "logps/rejected": -943.6531982421875, "loss": 0.0063, "rewards/accuracies": 1.0, "rewards/chosen": -1.2215626239776611, "rewards/margins": 7.193364143371582, "rewards/rejected": -8.414926528930664, "step": 620 }, { "epoch": 5.04, "grad_norm": 0.1261248214008726, "learning_rate": 5.799405938459175e-06, "logits/chosen": -2.886103868484497, "logits/rejected": -2.930838108062744, "logps/chosen": -162.7739715576172, "logps/rejected": -950.7653198242188, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -1.1451432704925537, "rewards/margins": 7.34460973739624, "rewards/rejected": -8.489753723144531, "step": 630 }, { "epoch": 5.12, "grad_norm": 0.1505621604680798, "learning_rate": 5.661281951285613e-06, "logits/chosen": -3.0620782375335693, "logits/rejected": -3.0735926628112793, "logps/chosen": -191.69161987304688, "logps/rejected": -940.0349731445312, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.417407751083374, "rewards/margins": 6.979997158050537, "rewards/rejected": -8.397405624389648, "step": 640 }, { "epoch": 5.2, "grad_norm": 0.019057548002213837, "learning_rate": 5.522642316338268e-06, "logits/chosen": -3.170454502105713, "logits/rejected": -3.1733736991882324, "logps/chosen": -206.1622314453125, "logps/rejected": -938.7789306640625, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -1.5485303401947021, "rewards/margins": 6.836629390716553, "rewards/rejected": -8.385160446166992, "step": 650 }, { "epoch": 5.28, "grad_norm": 0.04553800143055307, "learning_rate": 5.383595140634093e-06, "logits/chosen": -3.111215114593506, "logits/rejected": -3.113774061203003, "logps/chosen": -197.33230590820312, "logps/rejected": -965.2951049804688, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.4725955724716187, "rewards/margins": 7.165165901184082, "rewards/rejected": -8.637761116027832, "step": 660 }, { "epoch": 5.36, "grad_norm": 0.023672933333312794, "learning_rate": 5.244248848978067e-06, "logits/chosen": -3.087512493133545, "logits/rejected": -3.0911624431610107, "logps/chosen": -215.00796508789062, "logps/rejected": -978.7565307617188, "loss": 0.0064, "rewards/accuracies": 1.0, "rewards/chosen": -1.669549584388733, "rewards/margins": 7.078073978424072, "rewards/rejected": -8.747623443603516, "step": 670 }, { "epoch": 5.44, "grad_norm": 0.027237164042707367, "learning_rate": 5.1047120994167855e-06, "logits/chosen": -3.069511890411377, "logits/rejected": -3.0692381858825684, "logps/chosen": -202.83555603027344, "logps/rejected": -972.17578125, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.5378749370574951, "rewards/margins": 7.15402889251709, "rewards/rejected": -8.691904067993164, "step": 680 }, { "epoch": 5.52, "grad_norm": 0.0238313232064652, "learning_rate": 4.965093698510192e-06, "logits/chosen": -3.045116424560547, "logits/rejected": -3.070358991622925, "logps/chosen": -189.43704223632812, "logps/rejected": -947.7220458984375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -1.3802012205123901, "rewards/margins": 7.0903000831604, "rewards/rejected": -8.470501899719238, "step": 690 }, { "epoch": 5.6, "grad_norm": 0.06266588262969701, "learning_rate": 4.825502516487497e-06, "logits/chosen": -2.9892053604125977, "logits/rejected": -3.0090491771698, "logps/chosen": -190.52420043945312, "logps/rejected": -957.5784301757812, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -1.3977526426315308, "rewards/margins": 7.167298793792725, "rewards/rejected": -8.565052032470703, "step": 700 }, { "epoch": 5.68, "grad_norm": 0.21683933666142513, "learning_rate": 4.686047402353433e-06, "logits/chosen": -3.0948545932769775, "logits/rejected": -3.0949206352233887, "logps/chosen": -215.0717315673828, "logps/rejected": -974.2849731445312, "loss": 0.0048, "rewards/accuracies": 1.0, "rewards/chosen": -1.6559123992919922, "rewards/margins": 7.0673394203186035, "rewards/rejected": -8.723250389099121, "step": 710 }, { "epoch": 5.76, "grad_norm": 0.029076731867175645, "learning_rate": 4.546837099011101e-06, "logits/chosen": -3.138347864151001, "logits/rejected": -3.1147701740264893, "logps/chosen": -221.6696014404297, "logps/rejected": -976.2689208984375, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -1.7190300226211548, "rewards/margins": 7.019604682922363, "rewards/rejected": -8.738635063171387, "step": 720 }, { "epoch": 5.84, "grad_norm": 0.036662851444810934, "learning_rate": 4.4079801584674955e-06, "logits/chosen": -3.0819013118743896, "logits/rejected": -3.0665931701660156, "logps/chosen": -216.7307891845703, "logps/rejected": -979.8884887695312, "loss": 0.0056, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6721961498260498, "rewards/margins": 7.096380710601807, "rewards/rejected": -8.768576622009277, "step": 730 }, { "epoch": 5.92, "grad_norm": 0.03922484581534493, "learning_rate": 4.269584857187942e-06, "logits/chosen": -3.033648729324341, "logits/rejected": -3.03180193901062, "logps/chosen": -203.57614135742188, "logps/rejected": -953.6720581054688, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.5226513147354126, "rewards/margins": 7.014796257019043, "rewards/rejected": -8.537446975708008, "step": 740 }, { "epoch": 6.0, "grad_norm": 0.031567842692926854, "learning_rate": 4.131759111665349e-06, "logits/chosen": -3.0094892978668213, "logits/rejected": -3.0100715160369873, "logps/chosen": -222.1414794921875, "logps/rejected": -981.8389892578125, "loss": 0.0031, "rewards/accuracies": 1.0, "rewards/chosen": -1.7311455011367798, "rewards/margins": 7.06912899017334, "rewards/rejected": -8.800275802612305, "step": 750 }, { "epoch": 6.08, "grad_norm": 0.02449878094582341, "learning_rate": 3.994610394270178e-06, "logits/chosen": -2.970534563064575, "logits/rejected": -2.9691994190216064, "logps/chosen": -207.33847045898438, "logps/rejected": -994.8795166015625, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.5827735662460327, "rewards/margins": 7.3385796546936035, "rewards/rejected": -8.921354293823242, "step": 760 }, { "epoch": 6.16, "grad_norm": 0.02981087291288187, "learning_rate": 3.8582456494467214e-06, "logits/chosen": -2.9473607540130615, "logits/rejected": -2.956864833831787, "logps/chosen": -203.9072265625, "logps/rejected": -984.5435791015625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.5347377061843872, "rewards/margins": 7.284613132476807, "rewards/rejected": -8.819352149963379, "step": 770 }, { "epoch": 6.24, "grad_norm": 0.021043064932021155, "learning_rate": 3.7227712103210485e-06, "logits/chosen": -2.973010778427124, "logits/rejected": -2.9938042163848877, "logps/chosen": -190.59471130371094, "logps/rejected": -966.3521728515625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.3837454319000244, "rewards/margins": 7.294589996337891, "rewards/rejected": -8.678335189819336, "step": 780 }, { "epoch": 6.32, "grad_norm": 0.039673872651556516, "learning_rate": 3.5882927157856175e-06, "logits/chosen": -2.9964919090270996, "logits/rejected": -3.003882884979248, "logps/chosen": -211.8938446044922, "logps/rejected": -996.83544921875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.6143487691879272, "rewards/margins": 7.334100246429443, "rewards/rejected": -8.948450088500977, "step": 790 }, { "epoch": 6.4, "grad_norm": 0.04339817998984244, "learning_rate": 3.4549150281252635e-06, "logits/chosen": -2.9923062324523926, "logits/rejected": -3.008997917175293, "logps/chosen": -199.03875732421875, "logps/rejected": -962.5731201171875, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.4706881046295166, "rewards/margins": 7.157661437988281, "rewards/rejected": -8.628351211547852, "step": 800 }, { "epoch": 6.48, "grad_norm": 0.03256715178755916, "learning_rate": 3.322742151248726e-06, "logits/chosen": -2.947535514831543, "logits/rejected": -2.9602136611938477, "logps/chosen": -194.71202087402344, "logps/rejected": -992.3277587890625, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -1.4545636177062988, "rewards/margins": 7.46028995513916, "rewards/rejected": -8.914854049682617, "step": 810 }, { "epoch": 6.5600000000000005, "grad_norm": 0.03103799595540118, "learning_rate": 3.1918771495895395e-06, "logits/chosen": -2.986626625061035, "logits/rejected": -3.0086371898651123, "logps/chosen": -199.30032348632812, "logps/rejected": -983.8270263671875, "loss": 0.0051, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.479447603225708, "rewards/margins": 7.346575736999512, "rewards/rejected": -8.826024055480957, "step": 820 }, { "epoch": 6.64, "grad_norm": 0.012923263616360448, "learning_rate": 3.0624220677394854e-06, "logits/chosen": -2.980841636657715, "logits/rejected": -2.9940218925476074, "logps/chosen": -201.14207458496094, "logps/rejected": -989.4503784179688, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.5033818483352661, "rewards/margins": 7.376742362976074, "rewards/rejected": -8.880125045776367, "step": 830 }, { "epoch": 6.72, "grad_norm": 0.4008776832999778, "learning_rate": 2.934477850877292e-06, "logits/chosen": -2.8989338874816895, "logits/rejected": -2.904329299926758, "logps/chosen": -175.6615447998047, "logps/rejected": -993.4110107421875, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -1.2583926916122437, "rewards/margins": 7.667746543884277, "rewards/rejected": -8.926138877868652, "step": 840 }, { "epoch": 6.8, "grad_norm": 0.03242501786191666, "learning_rate": 2.8081442660546126e-06, "logits/chosen": -2.9362475872039795, "logits/rejected": -2.942796230316162, "logps/chosen": -200.99786376953125, "logps/rejected": -999.9691162109375, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.530400037765503, "rewards/margins": 7.44219970703125, "rewards/rejected": -8.9725980758667, "step": 850 }, { "epoch": 6.88, "grad_norm": 0.04701366652822717, "learning_rate": 2.683519824400693e-06, "logits/chosen": -2.9499258995056152, "logits/rejected": -2.9498021602630615, "logps/chosen": -200.02708435058594, "logps/rejected": -1003.5309448242188, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5131416320800781, "rewards/margins": 7.508401393890381, "rewards/rejected": -9.021543502807617, "step": 860 }, { "epoch": 6.96, "grad_norm": 0.06965442317131018, "learning_rate": 2.560701704306336e-06, "logits/chosen": -2.992658853530884, "logits/rejected": -3.0063395500183105, "logps/chosen": -196.2028045654297, "logps/rejected": -988.9591064453125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.4468988180160522, "rewards/margins": 7.434958457946777, "rewards/rejected": -8.881857872009277, "step": 870 }, { "epoch": 7.04, "grad_norm": 0.04943704962243662, "learning_rate": 2.4397856756471435e-06, "logits/chosen": -2.967582941055298, "logits/rejected": -2.969144344329834, "logps/chosen": -201.17965698242188, "logps/rejected": -1002.7322387695312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.5157372951507568, "rewards/margins": 7.4925055503845215, "rewards/rejected": -9.0082426071167, "step": 880 }, { "epoch": 7.12, "grad_norm": 0.026898832512229635, "learning_rate": 2.320866025105016e-06, "logits/chosen": -3.0048301219940186, "logits/rejected": -3.0222504138946533, "logps/chosen": -212.1355438232422, "logps/rejected": -974.49072265625, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.6027580499649048, "rewards/margins": 7.149576663970947, "rewards/rejected": -8.752334594726562, "step": 890 }, { "epoch": 7.2, "grad_norm": 0.018773487381868556, "learning_rate": 2.204035482646267e-06, "logits/chosen": -3.004668712615967, "logits/rejected": -3.02616024017334, "logps/chosen": -204.20449829101562, "logps/rejected": -988.2781372070312, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.5249879360198975, "rewards/margins": 7.354971408843994, "rewards/rejected": -8.879959106445312, "step": 900 }, { "epoch": 7.28, "grad_norm": 0.010977198093830244, "learning_rate": 2.0893851492135536e-06, "logits/chosen": -3.026869773864746, "logits/rejected": -3.0388169288635254, "logps/chosen": -211.68359375, "logps/rejected": -987.3611450195312, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.6020605564117432, "rewards/margins": 7.269850254058838, "rewards/rejected": -8.871912002563477, "step": 910 }, { "epoch": 7.36, "grad_norm": 0.29173762576971574, "learning_rate": 1.977004425688126e-06, "logits/chosen": -3.0339274406433105, "logits/rejected": -3.066455602645874, "logps/chosen": -212.5125732421875, "logps/rejected": -1007.0792846679688, "loss": 0.005, "rewards/accuracies": 1.0, "rewards/chosen": -1.6347990036010742, "rewards/margins": 7.410192966461182, "rewards/rejected": -9.044992446899414, "step": 920 }, { "epoch": 7.44, "grad_norm": 0.021055806854853823, "learning_rate": 1.8669809431776991e-06, "logits/chosen": -2.9736626148223877, "logits/rejected": -2.9851651191711426, "logps/chosen": -196.3205108642578, "logps/rejected": -990.7384643554688, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.459997296333313, "rewards/margins": 7.45266580581665, "rewards/rejected": -8.912662506103516, "step": 930 }, { "epoch": 7.52, "grad_norm": 0.18348617533249506, "learning_rate": 1.7594004946843458e-06, "logits/chosen": -2.942028522491455, "logits/rejected": -2.9542624950408936, "logps/chosen": -199.4943389892578, "logps/rejected": -1008.6310424804688, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -1.5059641599655151, "rewards/margins": 7.565948486328125, "rewards/rejected": -9.07191276550293, "step": 940 }, { "epoch": 7.6, "grad_norm": 0.02961600216806713, "learning_rate": 1.6543469682057105e-06, "logits/chosen": -2.9790632724761963, "logits/rejected": -2.981333017349243, "logps/chosen": -225.8614959716797, "logps/rejected": -1018.3231201171875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.773397445678711, "rewards/margins": 7.389155387878418, "rewards/rejected": -9.162553787231445, "step": 950 }, { "epoch": 7.68, "grad_norm": 0.033303351274835385, "learning_rate": 1.551902281321651e-06, "logits/chosen": -3.0063486099243164, "logits/rejected": -3.0057759284973145, "logps/chosen": -210.657470703125, "logps/rejected": -1013.4367065429688, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.6198650598526, "rewards/margins": 7.4931464195251465, "rewards/rejected": -9.113012313842773, "step": 960 }, { "epoch": 7.76, "grad_norm": 0.02991133657233578, "learning_rate": 1.4521463173173966e-06, "logits/chosen": -2.9552433490753174, "logits/rejected": -2.986539363861084, "logps/chosen": -185.34751892089844, "logps/rejected": -995.6087646484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.3463085889816284, "rewards/margins": 7.610987186431885, "rewards/rejected": -8.957294464111328, "step": 970 }, { "epoch": 7.84, "grad_norm": 0.03956036447686216, "learning_rate": 1.3551568628929434e-06, "logits/chosen": -2.987436532974243, "logits/rejected": -2.99739146232605, "logps/chosen": -202.7963409423828, "logps/rejected": -1010.6976318359375, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.5336072444915771, "rewards/margins": 7.547433376312256, "rewards/rejected": -9.081039428710938, "step": 980 }, { "epoch": 7.92, "grad_norm": 0.0380908854419757, "learning_rate": 1.2610095475073415e-06, "logits/chosen": -2.966024160385132, "logits/rejected": -2.9880573749542236, "logps/chosen": -188.83627319335938, "logps/rejected": -1017.0061645507812, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.397072196006775, "rewards/margins": 7.754624366760254, "rewards/rejected": -9.15169620513916, "step": 990 }, { "epoch": 8.0, "grad_norm": 0.056813053850928295, "learning_rate": 1.1697777844051105e-06, "logits/chosen": -3.0522654056549072, "logits/rejected": -3.037459135055542, "logps/chosen": -224.0611114501953, "logps/rejected": -1007.8203125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.7456802129745483, "rewards/margins": 7.323793888092041, "rewards/rejected": -9.069474220275879, "step": 1000 }, { "epoch": 8.08, "grad_norm": 0.04428932232427895, "learning_rate": 1.0815327133708015e-06, "logits/chosen": -2.974698305130005, "logits/rejected": -2.989356756210327, "logps/chosen": -199.12106323242188, "logps/rejected": -999.2364501953125, "loss": 0.0048, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.4827978610992432, "rewards/margins": 7.491432189941406, "rewards/rejected": -8.97422981262207, "step": 1010 }, { "epoch": 8.16, "grad_norm": 0.03094089083601851, "learning_rate": 9.963431452563331e-07, "logits/chosen": -2.9914603233337402, "logits/rejected": -2.9979825019836426, "logps/chosen": -202.41212463378906, "logps/rejected": -983.4681396484375, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.5089391469955444, "rewards/margins": 7.3303656578063965, "rewards/rejected": -8.839305877685547, "step": 1020 }, { "epoch": 8.24, "grad_norm": 0.027522596865270775, "learning_rate": 9.142755083243577e-07, "logits/chosen": -3.0142979621887207, "logits/rejected": -3.0246782302856445, "logps/chosen": -206.26199340820312, "logps/rejected": -1013.3017578125, "loss": 0.0028, "rewards/accuracies": 1.0, "rewards/chosen": -1.5663325786590576, "rewards/margins": 7.545605659484863, "rewards/rejected": -9.111937522888184, "step": 1030 }, { "epoch": 8.32, "grad_norm": 0.02636838723753889, "learning_rate": 8.353937964495029e-07, "logits/chosen": -2.9712257385253906, "logits/rejected": -2.994713544845581, "logps/chosen": -199.3070526123047, "logps/rejected": -992.4100341796875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.476993441581726, "rewards/margins": 7.447300910949707, "rewards/rejected": -8.924293518066406, "step": 1040 }, { "epoch": 8.4, "grad_norm": 0.021646075748453193, "learning_rate": 7.597595192178702e-07, "logits/chosen": -2.9999868869781494, "logits/rejected": -3.0006492137908936, "logps/chosen": -215.9517364501953, "logps/rejected": -997.6002197265625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.6561391353607178, "rewards/margins": 7.31753396987915, "rewards/rejected": -8.973673820495605, "step": 1050 }, { "epoch": 8.48, "grad_norm": 0.030961558657159674, "learning_rate": 6.874316539637127e-07, "logits/chosen": -2.9921913146972656, "logits/rejected": -2.9953560829162598, "logps/chosen": -215.5575408935547, "logps/rejected": -1014.0217895507812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.659396767616272, "rewards/margins": 7.469141960144043, "rewards/rejected": -9.128539085388184, "step": 1060 }, { "epoch": 8.56, "grad_norm": 0.021967484279154154, "learning_rate": 6.184665997806832e-07, "logits/chosen": -3.006669282913208, "logits/rejected": -3.005312442779541, "logps/chosen": -213.9212188720703, "logps/rejected": -990.9972534179688, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.6195297241210938, "rewards/margins": 7.294030666351318, "rewards/rejected": -8.91356086730957, "step": 1070 }, { "epoch": 8.64, "grad_norm": 0.042081431226620426, "learning_rate": 5.529181335435124e-07, "logits/chosen": -3.008005142211914, "logits/rejected": -3.0087175369262695, "logps/chosen": -218.51760864257812, "logps/rejected": -1024.052978515625, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.690387487411499, "rewards/margins": 7.524999141693115, "rewards/rejected": -9.215387344360352, "step": 1080 }, { "epoch": 8.72, "grad_norm": 0.043812953916419525, "learning_rate": 4.908373679744316e-07, "logits/chosen": -3.0348081588745117, "logits/rejected": -3.0385255813598633, "logps/chosen": -226.70779418945312, "logps/rejected": -1010.8932495117188, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.7693846225738525, "rewards/margins": 7.316775321960449, "rewards/rejected": -9.086159706115723, "step": 1090 }, { "epoch": 8.8, "grad_norm": 0.02641646477682858, "learning_rate": 4.322727117869951e-07, "logits/chosen": -3.014754056930542, "logits/rejected": -3.0084919929504395, "logps/chosen": -217.29788208007812, "logps/rejected": -1013.4778442382812, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.679574728012085, "rewards/margins": 7.428310394287109, "rewards/rejected": -9.107885360717773, "step": 1100 }, { "epoch": 8.88, "grad_norm": 0.027581407278890403, "learning_rate": 3.772698319384349e-07, "logits/chosen": -2.9870715141296387, "logits/rejected": -2.974972724914551, "logps/chosen": -213.857177734375, "logps/rejected": -1029.269287109375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.6536537408828735, "rewards/margins": 7.60448694229126, "rewards/rejected": -9.258139610290527, "step": 1110 }, { "epoch": 8.96, "grad_norm": 0.018398802557307123, "learning_rate": 3.258716180199278e-07, "logits/chosen": -2.969782590866089, "logits/rejected": -2.9666781425476074, "logps/chosen": -201.89608764648438, "logps/rejected": -1012.4627075195312, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.521287441253662, "rewards/margins": 7.587810516357422, "rewards/rejected": -9.109098434448242, "step": 1120 }, { "epoch": 9.04, "grad_norm": 0.02530896697144264, "learning_rate": 2.7811814881259503e-07, "logits/chosen": -2.967005491256714, "logits/rejected": -2.9650299549102783, "logps/chosen": -211.93923950195312, "logps/rejected": -1015.2527465820312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.6247079372406006, "rewards/margins": 7.509281158447266, "rewards/rejected": -9.133989334106445, "step": 1130 }, { "epoch": 9.12, "grad_norm": 0.021443293328022403, "learning_rate": 2.3404666103526542e-07, "logits/chosen": -2.951347827911377, "logits/rejected": -2.967103958129883, "logps/chosen": -192.74740600585938, "logps/rejected": -1004.8844604492188, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -1.4152947664260864, "rewards/margins": 7.6158270835876465, "rewards/rejected": -9.031121253967285, "step": 1140 }, { "epoch": 9.2, "grad_norm": 0.028269129055998828, "learning_rate": 1.9369152030840553e-07, "logits/chosen": -2.968433141708374, "logits/rejected": -2.975369930267334, "logps/chosen": -208.6031036376953, "logps/rejected": -1004.6807861328125, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.5802977085113525, "rewards/margins": 7.448237419128418, "rewards/rejected": -9.028534889221191, "step": 1150 }, { "epoch": 9.28, "grad_norm": 0.012741391504073269, "learning_rate": 1.5708419435684463e-07, "logits/chosen": -2.979626417160034, "logits/rejected": -3.0008533000946045, "logps/chosen": -203.78268432617188, "logps/rejected": -998.7214965820312, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.53437077999115, "rewards/margins": 7.443058967590332, "rewards/rejected": -8.97743034362793, "step": 1160 }, { "epoch": 9.36, "grad_norm": 0.018928993356126772, "learning_rate": 1.2425322847218368e-07, "logits/chosen": -2.9456331729888916, "logits/rejected": -2.947418689727783, "logps/chosen": -208.3197784423828, "logps/rejected": -1002.4910888671875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.5662791728973389, "rewards/margins": 7.459691524505615, "rewards/rejected": -9.025972366333008, "step": 1170 }, { "epoch": 9.44, "grad_norm": 0.11171748274756513, "learning_rate": 9.522422325404234e-08, "logits/chosen": -2.972970485687256, "logits/rejected": -2.9851505756378174, "logps/chosen": -196.48709106445312, "logps/rejected": -1000.6891479492188, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.4484045505523682, "rewards/margins": 7.5574445724487305, "rewards/rejected": -9.005849838256836, "step": 1180 }, { "epoch": 9.52, "grad_norm": 0.014643254346465179, "learning_rate": 7.001981464747565e-08, "logits/chosen": -3.006410598754883, "logits/rejected": -3.001279354095459, "logps/chosen": -239.8344268798828, "logps/rejected": -1013.0524291992188, "loss": 0.0044, "rewards/accuracies": 1.0, "rewards/chosen": -1.9079118967056274, "rewards/margins": 7.20407247543335, "rewards/rejected": -9.111984252929688, "step": 1190 }, { "epoch": 9.6, "grad_norm": 0.023445989539047585, "learning_rate": 4.865965629214819e-08, "logits/chosen": -2.9561855792999268, "logits/rejected": -2.9647486209869385, "logps/chosen": -201.7306671142578, "logps/rejected": -982.6613159179688, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.502721905708313, "rewards/margins": 7.332386016845703, "rewards/rejected": -8.835107803344727, "step": 1200 }, { "epoch": 9.68, "grad_norm": 0.0197958747361798, "learning_rate": 3.1160404197018155e-08, "logits/chosen": -2.9367284774780273, "logits/rejected": -2.94865083694458, "logps/chosen": -199.8970947265625, "logps/rejected": -1011.3294677734375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -1.5013561248779297, "rewards/margins": 7.6023712158203125, "rewards/rejected": -9.103727340698242, "step": 1210 }, { "epoch": 9.76, "grad_norm": 0.07635646097593517, "learning_rate": 1.753570375247815e-08, "logits/chosen": -3.0195131301879883, "logits/rejected": -3.0240790843963623, "logps/chosen": -220.675048828125, "logps/rejected": -1000.5875854492188, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -1.7074737548828125, "rewards/margins": 7.296438694000244, "rewards/rejected": -9.003911972045898, "step": 1220 }, { "epoch": 9.84, "grad_norm": 0.011307093391428234, "learning_rate": 7.796179090094891e-09, "logits/chosen": -2.937976837158203, "logits/rejected": -2.9619266986846924, "logps/chosen": -190.8544921875, "logps/rejected": -1021.5460815429688, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -1.419277548789978, "rewards/margins": 7.7647881507873535, "rewards/rejected": -9.184064865112305, "step": 1230 }, { "epoch": 9.92, "grad_norm": 0.017541852350439674, "learning_rate": 1.9494247982282386e-09, "logits/chosen": -2.976008653640747, "logits/rejected": -2.9958183765411377, "logps/chosen": -201.06101989746094, "logps/rejected": -1008.154296875, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -1.5034410953521729, "rewards/margins": 7.560837745666504, "rewards/rejected": -9.064278602600098, "step": 1240 }, { "epoch": 10.0, "grad_norm": 0.01886261936533329, "learning_rate": 0.0, "logits/chosen": -2.998544454574585, "logits/rejected": -3.019519329071045, "logps/chosen": -217.5911102294922, "logps/rejected": -1009.5062255859375, "loss": 0.002, "rewards/accuracies": 1.0, "rewards/chosen": -1.6805696487426758, "rewards/margins": 7.409621238708496, "rewards/rejected": -9.090190887451172, "step": 1250 }, { "epoch": 10.0, "step": 1250, "total_flos": 0.0, "train_loss": 0.05206389323771, "train_runtime": 14277.3883, "train_samples_per_second": 5.603, "train_steps_per_second": 0.088 } ], "logging_steps": 10, "max_steps": 1250, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }