{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.010468463752944255, "grad_norm": 33.21111681571131, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -0.4980102479457855, "logits/rejected": -0.5135027170181274, "logps/chosen": -1.1746745109558105, "logps/rejected": -1.3606590032577515, "loss": 2.1734, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1746745109558105, "rewards/margins": 0.1859845519065857, "rewards/rejected": -1.3606590032577515, "step": 5 }, { "epoch": 0.02093692750588851, "grad_norm": 21.16742924169967, "learning_rate": 4.166666666666667e-06, "logits/chosen": -0.5296765565872192, "logits/rejected": -0.5027884244918823, "logps/chosen": -1.1314122676849365, "logps/rejected": -1.2633330821990967, "loss": 2.1306, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -1.1314122676849365, "rewards/margins": 0.13192060589790344, "rewards/rejected": -1.2633330821990967, "step": 10 }, { "epoch": 0.031405391258832765, "grad_norm": 18.622273155389507, "learning_rate": 6.25e-06, "logits/chosen": -0.45581430196762085, "logits/rejected": -0.42932063341140747, "logps/chosen": -1.1560032367706299, "logps/rejected": -1.4923290014266968, "loss": 2.0523, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.1560032367706299, "rewards/margins": 0.3363257944583893, "rewards/rejected": -1.4923290014266968, "step": 15 }, { "epoch": 0.04187385501177702, "grad_norm": 25.84825355543498, "learning_rate": 8.333333333333334e-06, "logits/chosen": -0.6032270789146423, "logits/rejected": -0.5604568719863892, "logps/chosen": -1.2145692110061646, "logps/rejected": -1.5209157466888428, "loss": 2.101, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2145692110061646, "rewards/margins": 0.30634641647338867, "rewards/rejected": -1.5209157466888428, "step": 20 }, { "epoch": 0.05234231876472128, "grad_norm": 10.051572353875851, "learning_rate": 1.0416666666666668e-05, "logits/chosen": -0.7330023646354675, "logits/rejected": -0.6652411222457886, "logps/chosen": -1.3188468217849731, "logps/rejected": -1.643450140953064, "loss": 2.0473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3188468217849731, "rewards/margins": 0.32460346817970276, "rewards/rejected": -1.643450140953064, "step": 25 }, { "epoch": 0.06281078251766553, "grad_norm": 12.836119680084701, "learning_rate": 1.25e-05, "logits/chosen": -0.7389785051345825, "logits/rejected": -0.7157658338546753, "logps/chosen": -1.2610353231430054, "logps/rejected": -1.5368638038635254, "loss": 2.1476, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.2610353231430054, "rewards/margins": 0.27582842111587524, "rewards/rejected": -1.5368638038635254, "step": 30 }, { "epoch": 0.07327924627060979, "grad_norm": 10.916144118237353, "learning_rate": 1.4583333333333333e-05, "logits/chosen": -0.6624680757522583, "logits/rejected": -0.5841827392578125, "logps/chosen": -1.3438886404037476, "logps/rejected": -1.5585218667984009, "loss": 2.1252, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.3438886404037476, "rewards/margins": 0.2146332710981369, "rewards/rejected": -1.5585218667984009, "step": 35 }, { "epoch": 0.08374771002355404, "grad_norm": 7.904056592473059, "learning_rate": 1.6666666666666667e-05, "logits/chosen": -0.8896552920341492, "logits/rejected": -0.7669180631637573, "logps/chosen": -1.3083586692810059, "logps/rejected": -1.7862266302108765, "loss": 2.0664, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.3083586692810059, "rewards/margins": 0.47786790132522583, "rewards/rejected": -1.7862266302108765, "step": 40 }, { "epoch": 0.0942161737764983, "grad_norm": 13.316362762434997, "learning_rate": 1.8750000000000002e-05, "logits/chosen": -0.7929601669311523, "logits/rejected": -0.752467930316925, "logps/chosen": -1.2723389863967896, "logps/rejected": -1.6567331552505493, "loss": 2.0997, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2723389863967896, "rewards/margins": 0.38439422845840454, "rewards/rejected": -1.6567331552505493, "step": 45 }, { "epoch": 0.10468463752944256, "grad_norm": 16.98219621825263, "learning_rate": 1.9998927475076107e-05, "logits/chosen": -0.3519185483455658, "logits/rejected": -0.30840247869491577, "logps/chosen": -1.275742769241333, "logps/rejected": -1.7419742345809937, "loss": 2.1089, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.275742769241333, "rewards/margins": 0.4662315845489502, "rewards/rejected": -1.7419742345809937, "step": 50 }, { "epoch": 0.11515310128238682, "grad_norm": 7.79923910311647, "learning_rate": 1.998686421164407e-05, "logits/chosen": -0.13412383198738098, "logits/rejected": -0.06430118530988693, "logps/chosen": -1.3077303171157837, "logps/rejected": -1.7474453449249268, "loss": 2.0751, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3077303171157837, "rewards/margins": 0.43971508741378784, "rewards/rejected": -1.7474453449249268, "step": 55 }, { "epoch": 0.12562156503533106, "grad_norm": 7.005978339387122, "learning_rate": 1.9961413253717214e-05, "logits/chosen": -0.4779301583766937, "logits/rejected": -0.4137405455112457, "logps/chosen": -1.3986326456069946, "logps/rejected": -1.6036043167114258, "loss": 2.1009, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.3986326456069946, "rewards/margins": 0.20497193932533264, "rewards/rejected": -1.6036043167114258, "step": 60 }, { "epoch": 0.1360900287882753, "grad_norm": 6.982350763810334, "learning_rate": 1.9922608719076874e-05, "logits/chosen": -0.267805278301239, "logits/rejected": -0.1766107976436615, "logps/chosen": -1.2244327068328857, "logps/rejected": -2.0722804069519043, "loss": 2.0512, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.2244327068328857, "rewards/margins": 0.8478477597236633, "rewards/rejected": -2.0722804069519043, "step": 65 }, { "epoch": 0.14655849254121958, "grad_norm": 10.132344121190984, "learning_rate": 1.9870502626379127e-05, "logits/chosen": -0.35906368494033813, "logits/rejected": -0.33368802070617676, "logps/chosen": -1.450307846069336, "logps/rejected": -1.7906442880630493, "loss": 2.1396, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.450307846069336, "rewards/margins": 0.34033653140068054, "rewards/rejected": -1.7906442880630493, "step": 70 }, { "epoch": 0.15702695629416383, "grad_norm": 13.919238096447465, "learning_rate": 1.980516482542224e-05, "logits/chosen": -0.6731249094009399, "logits/rejected": -0.6837888956069946, "logps/chosen": -1.2502187490463257, "logps/rejected": -1.7363303899765015, "loss": 2.078, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2502187490463257, "rewards/margins": 0.4861116409301758, "rewards/rejected": -1.7363303899765015, "step": 75 }, { "epoch": 0.16749542004710807, "grad_norm": 8.714846434947708, "learning_rate": 1.972668290351084e-05, "logits/chosen": -0.8093172311782837, "logits/rejected": -0.8910678029060364, "logps/chosen": -1.3465213775634766, "logps/rejected": -1.8265445232391357, "loss": 2.1277, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.3465213775634766, "rewards/margins": 0.480023056268692, "rewards/rejected": -1.8265445232391357, "step": 80 }, { "epoch": 0.17796388380005235, "grad_norm": 34.11378786664511, "learning_rate": 1.9635162068042547e-05, "logits/chosen": -0.6499379873275757, "logits/rejected": -0.6738103628158569, "logps/chosen": -1.2838003635406494, "logps/rejected": -1.6559721231460571, "loss": 2.1205, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2838003635406494, "rewards/margins": 0.37217170000076294, "rewards/rejected": -1.6559721231460571, "step": 85 }, { "epoch": 0.1884323475529966, "grad_norm": 6.6135900227176, "learning_rate": 1.9530725005474195e-05, "logits/chosen": -0.1341579109430313, "logits/rejected": -0.1497870236635208, "logps/chosen": -1.3539865016937256, "logps/rejected": -1.7489871978759766, "loss": 2.0639, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3539865016937256, "rewards/margins": 0.39500072598457336, "rewards/rejected": -1.7489871978759766, "step": 90 }, { "epoch": 0.19890081130594087, "grad_norm": 7.360183689725487, "learning_rate": 1.9413511716856973e-05, "logits/chosen": -0.12092798948287964, "logits/rejected": -0.07471726834774017, "logps/chosen": -1.3030513525009155, "logps/rejected": -1.8159534931182861, "loss": 2.0725, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.3030513525009155, "rewards/margins": 0.512902021408081, "rewards/rejected": -1.8159534931182861, "step": 95 }, { "epoch": 0.2093692750588851, "grad_norm": 7.942085572142913, "learning_rate": 1.9283679330160726e-05, "logits/chosen": 0.026644444093108177, "logits/rejected": 0.05488858371973038, "logps/chosen": -1.3510897159576416, "logps/rejected": -1.8336998224258423, "loss": 2.0911, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3510897159576416, "rewards/margins": 0.482610285282135, "rewards/rejected": -1.8336998224258423, "step": 100 }, { "epoch": 0.21983773881182936, "grad_norm": 10.38696834889441, "learning_rate": 1.9141401889639167e-05, "logits/chosen": 0.12454743683338165, "logits/rejected": 0.1521395593881607, "logps/chosen": -1.308062195777893, "logps/rejected": -1.873884916305542, "loss": 2.029, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.308062195777893, "rewards/margins": 0.5658227205276489, "rewards/rejected": -1.873884916305542, "step": 105 }, { "epoch": 0.23030620256477363, "grad_norm": 14.10578954487523, "learning_rate": 1.898687012251826e-05, "logits/chosen": -0.14296935498714447, "logits/rejected": -0.08335347473621368, "logps/chosen": -1.3113409280776978, "logps/rejected": -1.7755804061889648, "loss": 2.0509, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3113409280776978, "rewards/margins": 0.4642394483089447, "rewards/rejected": -1.7755804061889648, "step": 110 }, { "epoch": 0.24077466631771788, "grad_norm": 7.096253964138347, "learning_rate": 1.8820291183320602e-05, "logits/chosen": -0.20576635003089905, "logits/rejected": -0.1285274177789688, "logps/chosen": -1.2730509042739868, "logps/rejected": -1.8806079626083374, "loss": 2.0506, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2730509042739868, "rewards/margins": 0.6075571179389954, "rewards/rejected": -1.8806079626083374, "step": 115 }, { "epoch": 0.2512431300706621, "grad_norm": 8.976667698418499, "learning_rate": 1.8641888376168483e-05, "logits/chosen": -0.10974551737308502, "logits/rejected": -0.07689039409160614, "logps/chosen": -1.442338466644287, "logps/rejected": -1.90249764919281, "loss": 2.1387, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.442338466644287, "rewards/margins": 0.46015921235084534, "rewards/rejected": -1.90249764919281, "step": 120 }, { "epoch": 0.26171159382360637, "grad_norm": 11.84902364416852, "learning_rate": 1.845190085543795e-05, "logits/chosen": 0.1279464215040207, "logits/rejected": 0.1599569022655487, "logps/chosen": -1.29521906375885, "logps/rejected": -1.5512622594833374, "loss": 2.0874, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.29521906375885, "rewards/margins": 0.2560431957244873, "rewards/rejected": -1.5512622594833374, "step": 125 }, { "epoch": 0.2721800575765506, "grad_norm": 12.665702263586864, "learning_rate": 1.8250583305165098e-05, "logits/chosen": 0.10071973502635956, "logits/rejected": 0.114678755402565, "logps/chosen": -1.3293492794036865, "logps/rejected": -1.6235164403915405, "loss": 2.105, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.3293492794036865, "rewards/margins": 0.29416733980178833, "rewards/rejected": -1.6235164403915405, "step": 130 }, { "epoch": 0.2826485213294949, "grad_norm": 11.168921888078422, "learning_rate": 1.8038205597634392e-05, "logits/chosen": -0.2312246859073639, "logits/rejected": -0.13947580754756927, "logps/chosen": -1.3103423118591309, "logps/rejected": -1.973184585571289, "loss": 2.0983, "rewards/accuracies": 0.78125, "rewards/chosen": -1.3103423118591309, "rewards/margins": 0.662842333316803, "rewards/rejected": -1.973184585571289, "step": 135 }, { "epoch": 0.29311698508243916, "grad_norm": 15.647952191048011, "learning_rate": 1.7815052431606702e-05, "logits/chosen": -0.27144142985343933, "logits/rejected": -0.2118106335401535, "logps/chosen": -1.3751564025878906, "logps/rejected": -2.03005051612854, "loss": 2.0429, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3751564025878906, "rewards/margins": 0.6548939943313599, "rewards/rejected": -2.03005051612854, "step": 140 }, { "epoch": 0.3035854488353834, "grad_norm": 7.242840211884775, "learning_rate": 1.7581422950671942e-05, "logits/chosen": -0.19757069647312164, "logits/rejected": -0.1668623834848404, "logps/chosen": -1.3345425128936768, "logps/rejected": -1.8127644062042236, "loss": 2.0876, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3345425128936768, "rewards/margins": 0.4782216548919678, "rewards/rejected": -1.8127644062042236, "step": 145 }, { "epoch": 0.31405391258832765, "grad_norm": 8.615419856166682, "learning_rate": 1.733763034223804e-05, "logits/chosen": -0.21767687797546387, "logits/rejected": -0.21838533878326416, "logps/chosen": -1.2229845523834229, "logps/rejected": -1.660559058189392, "loss": 2.0294, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.2229845523834229, "rewards/margins": 0.43757471442222595, "rewards/rejected": -1.660559058189392, "step": 150 }, { "epoch": 0.3245223763412719, "grad_norm": 11.4467193943517, "learning_rate": 1.7084001417693702e-05, "logits/chosen": -0.17819705605506897, "logits/rejected": -0.1267833411693573, "logps/chosen": -1.389460563659668, "logps/rejected": -1.8192943334579468, "loss": 2.084, "rewards/accuracies": 0.625, "rewards/chosen": -1.389460563659668, "rewards/margins": 0.4298337399959564, "rewards/rejected": -1.8192943334579468, "step": 155 }, { "epoch": 0.33499084009421615, "grad_norm": 8.12988482306829, "learning_rate": 1.682087617430782e-05, "logits/chosen": -0.12651406228542328, "logits/rejected": -0.04694231227040291, "logps/chosen": -1.318313479423523, "logps/rejected": -1.8056846857070923, "loss": 2.0818, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.318313479423523, "rewards/margins": 0.4873710572719574, "rewards/rejected": -1.8056846857070923, "step": 160 }, { "epoch": 0.34545930384716045, "grad_norm": 6.762373951814964, "learning_rate": 1.6548607339452853e-05, "logits/chosen": -0.10803677886724472, "logits/rejected": -0.048906028270721436, "logps/chosen": -1.25996994972229, "logps/rejected": -1.8231735229492188, "loss": 2.0354, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.25996994972229, "rewards/margins": 0.5632035732269287, "rewards/rejected": -1.8231735229492188, "step": 165 }, { "epoch": 0.3559277676001047, "grad_norm": 11.050433248891123, "learning_rate": 1.626755989776303e-05, "logits/chosen": -0.1651381254196167, "logits/rejected": -0.04633602499961853, "logps/chosen": -1.4237867593765259, "logps/rejected": -2.101548671722412, "loss": 2.0616, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.4237867593765259, "rewards/margins": 0.6777619123458862, "rewards/rejected": -2.101548671722412, "step": 170 }, { "epoch": 0.36639623135304894, "grad_norm": 7.0268134591588245, "learning_rate": 1.5978110601861408e-05, "logits/chosen": -0.12373347580432892, "logits/rejected": -0.0877654105424881, "logps/chosen": -1.3757156133651733, "logps/rejected": -1.7569023370742798, "loss": 2.072, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.3757156133651733, "rewards/margins": 0.3811867833137512, "rewards/rejected": -1.7569023370742798, "step": 175 }, { "epoch": 0.3768646951059932, "grad_norm": 12.761833101713918, "learning_rate": 1.568064746731156e-05, "logits/chosen": -0.11597935855388641, "logits/rejected": -0.1455441117286682, "logps/chosen": -1.374710202217102, "logps/rejected": -1.7882392406463623, "loss": 2.0783, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.374710202217102, "rewards/margins": 0.4135288596153259, "rewards/rejected": -1.7882392406463623, "step": 180 }, { "epoch": 0.38733315885893743, "grad_norm": 7.06376095255915, "learning_rate": 1.5375569252470897e-05, "logits/chosen": -0.16596433520317078, "logits/rejected": -0.026778871193528175, "logps/chosen": -1.3514513969421387, "logps/rejected": -2.0585455894470215, "loss": 2.0175, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3514513969421387, "rewards/margins": 0.7070940732955933, "rewards/rejected": -2.0585455894470215, "step": 185 }, { "epoch": 0.39780162261188173, "grad_norm": 8.565174444883986, "learning_rate": 1.506328492394303e-05, "logits/chosen": -0.1680208444595337, "logits/rejected": -0.10585353523492813, "logps/chosen": -1.3384554386138916, "logps/rejected": -1.7696669101715088, "loss": 2.1269, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.3384554386138916, "rewards/margins": 0.43121138215065, "rewards/rejected": -1.7696669101715088, "step": 190 }, { "epoch": 0.408270086364826, "grad_norm": 8.66730844602624, "learning_rate": 1.4744213108345605e-05, "logits/chosen": -0.18466773629188538, "logits/rejected": -0.03730706498026848, "logps/chosen": -1.3560011386871338, "logps/rejected": -1.740012764930725, "loss": 2.0877, "rewards/accuracies": 0.625, "rewards/chosen": -1.3560011386871338, "rewards/margins": 0.3840116560459137, "rewards/rejected": -1.740012764930725, "step": 195 }, { "epoch": 0.4187385501177702, "grad_norm": 7.228970778278195, "learning_rate": 1.4418781531128636e-05, "logits/chosen": -0.0062202452681958675, "logits/rejected": 0.13902577757835388, "logps/chosen": -1.3838578462600708, "logps/rejected": -1.9292205572128296, "loss": 2.0565, "rewards/accuracies": 0.59375, "rewards/chosen": -1.3838578462600708, "rewards/margins": 0.5453627705574036, "rewards/rejected": -1.9292205572128296, "step": 200 }, { "epoch": 0.42920701387071447, "grad_norm": 9.000136464867888, "learning_rate": 1.4087426443195549e-05, "logits/chosen": 0.13322147727012634, "logits/rejected": 0.31764692068099976, "logps/chosen": -1.2240257263183594, "logps/rejected": -1.729107141494751, "loss": 2.0412, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.2240257263183594, "rewards/margins": 0.5050811171531677, "rewards/rejected": -1.729107141494751, "step": 205 }, { "epoch": 0.4396754776236587, "grad_norm": 9.154719134241198, "learning_rate": 1.375059203609562e-05, "logits/chosen": 0.16319788992404938, "logits/rejected": 0.3346864581108093, "logps/chosen": -1.4042994976043701, "logps/rejected": -1.8507611751556396, "loss": 2.1446, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.4042994976043701, "rewards/margins": 0.44646158814430237, "rewards/rejected": -1.8507611751556396, "step": 210 }, { "epoch": 0.45014394137660296, "grad_norm": 6.0840861971289115, "learning_rate": 1.3408729846571716e-05, "logits/chosen": 0.09617350250482559, "logits/rejected": 0.3308163285255432, "logps/chosen": -1.284582495689392, "logps/rejected": -1.9165366888046265, "loss": 2.0609, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.284582495689392, "rewards/margins": 0.6319543123245239, "rewards/rejected": -1.9165366888046265, "step": 215 }, { "epoch": 0.46061240512954726, "grad_norm": 8.00813078143392, "learning_rate": 1.3062298151261592e-05, "logits/chosen": 0.044529713690280914, "logits/rejected": 0.3042605519294739, "logps/chosen": -1.358139157295227, "logps/rejected": -1.929091215133667, "loss": 2.0609, "rewards/accuracies": 0.625, "rewards/chosen": -1.358139157295227, "rewards/margins": 0.5709521770477295, "rewards/rejected": -1.929091215133667, "step": 220 }, { "epoch": 0.4710808688824915, "grad_norm": 7.246210126735172, "learning_rate": 1.2711761352364172e-05, "logits/chosen": 0.03733745217323303, "logits/rejected": 0.2569599449634552, "logps/chosen": -1.2875401973724365, "logps/rejected": -2.0033631324768066, "loss": 1.9734, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2875401973724365, "rewards/margins": 0.7158228754997253, "rewards/rejected": -2.0033631324768066, "step": 225 }, { "epoch": 0.48154933263543576, "grad_norm": 9.113546871891296, "learning_rate": 1.2357589355094275e-05, "logits/chosen": 0.014212149195373058, "logits/rejected": 0.32842034101486206, "logps/chosen": -1.3232357501983643, "logps/rejected": -2.133357524871826, "loss": 2.0056, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.3232357501983643, "rewards/margins": 0.8101218342781067, "rewards/rejected": -2.133357524871826, "step": 230 }, { "epoch": 0.49201779638838, "grad_norm": 8.416270817600122, "learning_rate": 1.2000256937760446e-05, "logits/chosen": 0.1557755172252655, "logits/rejected": 0.4363393187522888, "logps/chosen": -1.2627068758010864, "logps/rejected": -1.9027000665664673, "loss": 2.0341, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2627068758010864, "rewards/margins": 0.6399933099746704, "rewards/rejected": -1.9027000665664673, "step": 235 }, { "epoch": 0.5024862601413242, "grad_norm": 8.509607776673544, "learning_rate": 1.1640243115310219e-05, "logits/chosen": 0.1399160474538803, "logits/rejected": 0.41544660925865173, "logps/chosen": -1.2238709926605225, "logps/rejected": -1.8552753925323486, "loss": 2.043, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.2238709926605225, "rewards/margins": 0.6314042210578918, "rewards/rejected": -1.8552753925323486, "step": 240 }, { "epoch": 0.5129547238942685, "grad_norm": 7.675820182938368, "learning_rate": 1.127803049719605e-05, "logits/chosen": 0.1161346435546875, "logits/rejected": 0.3014758825302124, "logps/chosen": -1.3739269971847534, "logps/rejected": -1.896535873413086, "loss": 2.0516, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3739269971847534, "rewards/margins": 0.5226086378097534, "rewards/rejected": -1.896535873413086, "step": 245 }, { "epoch": 0.5234231876472127, "grad_norm": 8.105295156815169, "learning_rate": 1.091410464042268e-05, "logits/chosen": 0.13467064499855042, "logits/rejected": 0.20409516990184784, "logps/chosen": -1.283080816268921, "logps/rejected": -1.9848954677581787, "loss": 2.0297, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.283080816268921, "rewards/margins": 0.701814591884613, "rewards/rejected": -1.9848954677581787, "step": 250 }, { "epoch": 0.533891651400157, "grad_norm": 8.059542189088054, "learning_rate": 1.0548953398643276e-05, "logits/chosen": 0.16154329478740692, "logits/rejected": 0.32910025119781494, "logps/chosen": -1.3794437646865845, "logps/rejected": -2.083132743835449, "loss": 2.0164, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3794437646865845, "rewards/margins": 0.70368891954422, "rewards/rejected": -2.083132743835449, "step": 255 }, { "epoch": 0.5443601151531012, "grad_norm": 6.4999384094249955, "learning_rate": 1.0183066268176775e-05, "logits/chosen": 0.6510103940963745, "logits/rejected": 1.0612311363220215, "logps/chosen": -1.3178019523620605, "logps/rejected": -2.1374964714050293, "loss": 2.0503, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.3178019523620605, "rewards/margins": 0.8196946978569031, "rewards/rejected": -2.1374964714050293, "step": 260 }, { "epoch": 0.5548285789060455, "grad_norm": 7.313206322781419, "learning_rate": 9.81693373182323e-06, "logits/chosen": 0.484092652797699, "logits/rejected": 0.6402750015258789, "logps/chosen": -1.3769603967666626, "logps/rejected": -1.7423893213272095, "loss": 2.0132, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.3769603967666626, "rewards/margins": 0.365428626537323, "rewards/rejected": -1.7423893213272095, "step": 265 }, { "epoch": 0.5652970426589898, "grad_norm": 8.756069730601565, "learning_rate": 9.451046601356725e-06, "logits/chosen": 0.13520203530788422, "logits/rejected": 0.3411861062049866, "logps/chosen": -1.3921834230422974, "logps/rejected": -1.8440046310424805, "loss": 2.0618, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.3921834230422974, "rewards/margins": 0.4518211781978607, "rewards/rejected": -1.8440046310424805, "step": 270 }, { "epoch": 0.575765506411934, "grad_norm": 6.597441645495227, "learning_rate": 9.085895359577324e-06, "logits/chosen": -0.17431692779064178, "logits/rejected": 0.03041163645684719, "logps/chosen": -1.317625641822815, "logps/rejected": -2.10213041305542, "loss": 2.1302, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.317625641822815, "rewards/margins": 0.7845045328140259, "rewards/rejected": -2.10213041305542, "step": 275 }, { "epoch": 0.5862339701648783, "grad_norm": 7.014450023153696, "learning_rate": 8.721969502803954e-06, "logits/chosen": -0.2185564786195755, "logits/rejected": -0.061094462871551514, "logps/chosen": -1.2835057973861694, "logps/rejected": -1.7011182308197021, "loss": 2.0526, "rewards/accuracies": 0.65625, "rewards/chosen": -1.2835057973861694, "rewards/margins": 0.41761231422424316, "rewards/rejected": -1.7011182308197021, "step": 280 }, { "epoch": 0.5967024339178225, "grad_norm": 8.176491560294703, "learning_rate": 8.359756884689785e-06, "logits/chosen": -0.24773511290550232, "logits/rejected": -0.19381779432296753, "logps/chosen": -1.4067161083221436, "logps/rejected": -2.0575575828552246, "loss": 2.0134, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.4067161083221436, "rewards/margins": 0.6508415341377258, "rewards/rejected": -2.0575575828552246, "step": 285 }, { "epoch": 0.6071708976707668, "grad_norm": 7.876329412436082, "learning_rate": 7.999743062239557e-06, "logits/chosen": -0.4009264409542084, "logits/rejected": -0.22962765395641327, "logps/chosen": -1.3704339265823364, "logps/rejected": -2.099834680557251, "loss": 2.0007, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3704339265823364, "rewards/margins": 0.7294005155563354, "rewards/rejected": -2.099834680557251, "step": 290 }, { "epoch": 0.6176393614237111, "grad_norm": 7.662219802711994, "learning_rate": 7.642410644905726e-06, "logits/chosen": -0.4107929766178131, "logits/rejected": -0.22412636876106262, "logps/chosen": -1.3308082818984985, "logps/rejected": -2.054591655731201, "loss": 2.0538, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3308082818984985, "rewards/margins": 0.7237831354141235, "rewards/rejected": -2.054591655731201, "step": 295 }, { "epoch": 0.6281078251766553, "grad_norm": 8.408469077357054, "learning_rate": 7.2882386476358304e-06, "logits/chosen": -0.4748775064945221, "logits/rejected": -0.3870747983455658, "logps/chosen": -1.3256374597549438, "logps/rejected": -1.8613688945770264, "loss": 2.0223, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.3256374597549438, "rewards/margins": 0.5357314348220825, "rewards/rejected": -1.8613688945770264, "step": 300 }, { "epoch": 0.6385762889295996, "grad_norm": 9.20767722080512, "learning_rate": 6.937701848738407e-06, "logits/chosen": -0.5532702803611755, "logits/rejected": -0.517833411693573, "logps/chosen": -1.3089849948883057, "logps/rejected": -1.9914848804473877, "loss": 2.0097, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3089849948883057, "rewards/margins": 0.6825000047683716, "rewards/rejected": -1.9914848804473877, "step": 305 }, { "epoch": 0.6490447526825438, "grad_norm": 9.168807609346384, "learning_rate": 6.591270153428288e-06, "logits/chosen": -0.5921626687049866, "logits/rejected": -0.5593982934951782, "logps/chosen": -1.233320951461792, "logps/rejected": -1.824541449546814, "loss": 2.0114, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.233320951461792, "rewards/margins": 0.5912207365036011, "rewards/rejected": -1.824541449546814, "step": 310 }, { "epoch": 0.6595132164354881, "grad_norm": 11.74928248778955, "learning_rate": 6.249407963904381e-06, "logits/chosen": -0.5820972323417664, "logits/rejected": -0.37617072463035583, "logps/chosen": -1.308586835861206, "logps/rejected": -2.1290369033813477, "loss": 2.0377, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.308586835861206, "rewards/margins": 0.8204501271247864, "rewards/rejected": -2.1290369033813477, "step": 315 }, { "epoch": 0.6699816801884323, "grad_norm": 7.446497900716036, "learning_rate": 5.912573556804453e-06, "logits/chosen": -0.4366278648376465, "logits/rejected": -0.34356969594955444, "logps/chosen": -1.3265436887741089, "logps/rejected": -1.9333693981170654, "loss": 1.9563, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.3265436887741089, "rewards/margins": 0.6068258285522461, "rewards/rejected": -1.9333693981170654, "step": 320 }, { "epoch": 0.6804501439413766, "grad_norm": 7.834867022615392, "learning_rate": 5.581218468871365e-06, "logits/chosen": -0.42373937368392944, "logits/rejected": -0.1534721851348877, "logps/chosen": -1.178399682044983, "logps/rejected": -1.9840328693389893, "loss": 1.9479, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.178399682044983, "rewards/margins": 0.8056330680847168, "rewards/rejected": -1.9840328693389893, "step": 325 }, { "epoch": 0.6909186076943209, "grad_norm": 7.323036569812089, "learning_rate": 5.2557868916543996e-06, "logits/chosen": -0.24924680590629578, "logits/rejected": 0.03496779128909111, "logps/chosen": -1.226240873336792, "logps/rejected": -1.8618109226226807, "loss": 1.9899, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.226240873336792, "rewards/margins": 0.6355697512626648, "rewards/rejected": -1.8618109226226807, "step": 330 }, { "epoch": 0.7013870714472651, "grad_norm": 7.704436038290053, "learning_rate": 4.9367150760569746e-06, "logits/chosen": -0.289539635181427, "logits/rejected": 0.08722052723169327, "logps/chosen": -1.2469284534454346, "logps/rejected": -2.0381367206573486, "loss": 1.9837, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2469284534454346, "rewards/margins": 0.7912081480026245, "rewards/rejected": -2.0381367206573486, "step": 335 }, { "epoch": 0.7118555352002094, "grad_norm": 8.947529662445875, "learning_rate": 4.6244307475291025e-06, "logits/chosen": -0.18107546865940094, "logits/rejected": 0.22904996573925018, "logps/chosen": -1.446345329284668, "logps/rejected": -2.1773736476898193, "loss": 2.0338, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.446345329284668, "rewards/margins": 0.7310282588005066, "rewards/rejected": -2.1773736476898193, "step": 340 }, { "epoch": 0.7223239989531536, "grad_norm": 9.583549007245818, "learning_rate": 4.319352532688444e-06, "logits/chosen": -0.29274436831474304, "logits/rejected": 0.00033287107362411916, "logps/chosen": -1.2861610651016235, "logps/rejected": -1.9985120296478271, "loss": 2.0307, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.2861610651016235, "rewards/margins": 0.7123511433601379, "rewards/rejected": -1.9985120296478271, "step": 345 }, { "epoch": 0.7327924627060979, "grad_norm": 6.266721042983619, "learning_rate": 4.0218893981385935e-06, "logits/chosen": -0.28006237745285034, "logits/rejected": -0.14952346682548523, "logps/chosen": -1.2466567754745483, "logps/rejected": -1.7666349411010742, "loss": 2.0448, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2466567754745483, "rewards/margins": 0.5199781656265259, "rewards/rejected": -1.7666349411010742, "step": 350 }, { "epoch": 0.7432609264590422, "grad_norm": 8.529189010191509, "learning_rate": 3.732440102236975e-06, "logits/chosen": -0.38612329959869385, "logits/rejected": -0.1318252980709076, "logps/chosen": -1.1369296312332153, "logps/rejected": -1.9028323888778687, "loss": 1.9434, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -1.1369296312332153, "rewards/margins": 0.7659028172492981, "rewards/rejected": -1.9028323888778687, "step": 355 }, { "epoch": 0.7537293902119864, "grad_norm": 8.98691100719935, "learning_rate": 3.4513926605471504e-06, "logits/chosen": -0.2708672881126404, "logits/rejected": -0.002604148583486676, "logps/chosen": -1.2128788232803345, "logps/rejected": -1.881731629371643, "loss": 1.9188, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2128788232803345, "rewards/margins": 0.6688528060913086, "rewards/rejected": -1.881731629371643, "step": 360 }, { "epoch": 0.7641978539649307, "grad_norm": 8.427363605011601, "learning_rate": 3.1791238256921785e-06, "logits/chosen": -0.2245834320783615, "logits/rejected": 0.03710466995835304, "logps/chosen": -1.3998098373413086, "logps/rejected": -2.1002440452575684, "loss": 2.0448, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.3998098373413086, "rewards/margins": 0.7004340887069702, "rewards/rejected": -2.1002440452575684, "step": 365 }, { "epoch": 0.7746663177178749, "grad_norm": 8.84465057372313, "learning_rate": 2.9159985823062997e-06, "logits/chosen": -0.35639292001724243, "logits/rejected": -0.17423222959041595, "logps/chosen": -1.3097021579742432, "logps/rejected": -2.0959980487823486, "loss": 1.9799, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.3097021579742432, "rewards/margins": 0.7862957715988159, "rewards/rejected": -2.0959980487823486, "step": 370 }, { "epoch": 0.7851347814708192, "grad_norm": 7.000990509721392, "learning_rate": 2.662369657761963e-06, "logits/chosen": -0.32897457480430603, "logits/rejected": -0.3297235369682312, "logps/chosen": -1.2792903184890747, "logps/rejected": -1.8416297435760498, "loss": 2.0065, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2792903184890747, "rewards/margins": 0.5623396635055542, "rewards/rejected": -1.8416297435760498, "step": 375 }, { "epoch": 0.7956032452237635, "grad_norm": 8.146523706243913, "learning_rate": 2.418577049328058e-06, "logits/chosen": -0.3611428439617157, "logits/rejected": -0.2454133927822113, "logps/chosen": -1.266564130783081, "logps/rejected": -1.7843055725097656, "loss": 1.9381, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.266564130783081, "rewards/margins": 0.517741322517395, "rewards/rejected": -1.7843055725097656, "step": 380 }, { "epoch": 0.8060717089767077, "grad_norm": 8.167524524758202, "learning_rate": 2.1849475683932996e-06, "logits/chosen": -0.3831802010536194, "logits/rejected": -0.2638740539550781, "logps/chosen": -1.2620943784713745, "logps/rejected": -1.9156465530395508, "loss": 1.9611, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2620943784713745, "rewards/margins": 0.6535523533821106, "rewards/rejected": -1.9156465530395508, "step": 385 }, { "epoch": 0.816540172729652, "grad_norm": 8.30770659075244, "learning_rate": 1.961794402365611e-06, "logits/chosen": -0.3085532486438751, "logits/rejected": -0.14951160550117493, "logps/chosen": -1.2462884187698364, "logps/rejected": -2.1334927082061768, "loss": 1.9555, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -1.2462884187698364, "rewards/margins": 0.8872040510177612, "rewards/rejected": -2.1334927082061768, "step": 390 }, { "epoch": 0.8270086364825961, "grad_norm": 7.7825890292689905, "learning_rate": 1.7494166948349057e-06, "logits/chosen": -0.267643541097641, "logits/rejected": 0.02370324358344078, "logps/chosen": -1.3041934967041016, "logps/rejected": -2.0585289001464844, "loss": 1.9722, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3041934967041016, "rewards/margins": 0.7543356418609619, "rewards/rejected": -2.0585289001464844, "step": 395 }, { "epoch": 0.8374771002355405, "grad_norm": 11.47946040263076, "learning_rate": 1.5480991445620541e-06, "logits/chosen": -0.2226092368364334, "logits/rejected": 0.03486952185630798, "logps/chosen": -1.237866759300232, "logps/rejected": -1.9466907978057861, "loss": 1.9589, "rewards/accuracies": 0.65625, "rewards/chosen": -1.237866759300232, "rewards/margins": 0.7088239192962646, "rewards/rejected": -1.9466907978057861, "step": 400 }, { "epoch": 0.8479455639884846, "grad_norm": 8.663615848996892, "learning_rate": 1.3581116238315194e-06, "logits/chosen": -0.2045535147190094, "logits/rejected": 0.006712320260703564, "logps/chosen": -1.3987605571746826, "logps/rejected": -2.1295289993286133, "loss": 2.0021, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3987605571746826, "rewards/margins": 0.7307685017585754, "rewards/rejected": -2.1295289993286133, "step": 405 }, { "epoch": 0.8584140277414289, "grad_norm": 8.186995321334237, "learning_rate": 1.1797088166794002e-06, "logits/chosen": -0.20332176983356476, "logits/rejected": 0.029474016278982162, "logps/chosen": -1.172918677330017, "logps/rejected": -1.7614654302597046, "loss": 1.9558, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.172918677330017, "rewards/margins": 0.5885466933250427, "rewards/rejected": -1.7614654302597046, "step": 410 }, { "epoch": 0.8688824914943732, "grad_norm": 6.525083767762586, "learning_rate": 1.013129877481741e-06, "logits/chosen": -0.2240767925977707, "logits/rejected": 0.07820748537778854, "logps/chosen": -1.20893132686615, "logps/rejected": -1.91985285282135, "loss": 1.9579, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.20893132686615, "rewards/margins": 0.7109212875366211, "rewards/rejected": -1.91985285282135, "step": 415 }, { "epoch": 0.8793509552473174, "grad_norm": 9.083137976581597, "learning_rate": 8.585981103608343e-07, "logits/chosen": -0.11082730442285538, "logits/rejected": 0.09507735818624496, "logps/chosen": -1.1996517181396484, "logps/rejected": -1.8896563053131104, "loss": 2.0042, "rewards/accuracies": 0.625, "rewards/chosen": -1.1996517181396484, "rewards/margins": 0.6900044679641724, "rewards/rejected": -1.8896563053131104, "step": 420 }, { "epoch": 0.8898194190002617, "grad_norm": 6.873945114472838, "learning_rate": 7.163206698392744e-07, "logits/chosen": -0.10862954705953598, "logits/rejected": 0.1946602761745453, "logps/chosen": -1.3608647584915161, "logps/rejected": -1.970487356185913, "loss": 1.9911, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.3608647584915161, "rewards/margins": 0.6096227169036865, "rewards/rejected": -1.970487356185913, "step": 425 }, { "epoch": 0.9002878827532059, "grad_norm": 7.542928376234922, "learning_rate": 5.864882831430274e-07, "logits/chosen": -0.16213567554950714, "logits/rejected": 0.21448484063148499, "logps/chosen": -1.3169506788253784, "logps/rejected": -2.048978328704834, "loss": 1.956, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3169506788253784, "rewards/margins": 0.7320275902748108, "rewards/rejected": -2.048978328704834, "step": 430 }, { "epoch": 0.9107563465061502, "grad_norm": 10.100813855786358, "learning_rate": 4.6927499452580574e-07, "logits/chosen": -0.12442419677972794, "logits/rejected": 0.06503897905349731, "logps/chosen": -1.288496732711792, "logps/rejected": -2.066117286682129, "loss": 1.9804, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.288496732711792, "rewards/margins": 0.777620792388916, "rewards/rejected": -2.066117286682129, "step": 435 }, { "epoch": 0.9212248102590945, "grad_norm": 10.273579675957365, "learning_rate": 3.6483793195745686e-07, "logits/chosen": -0.04678087681531906, "logits/rejected": 0.3355256915092468, "logps/chosen": -1.2764372825622559, "logps/rejected": -2.03460955619812, "loss": 1.9929, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.2764372825622559, "rewards/margins": 0.7581723928451538, "rewards/rejected": -2.03460955619812, "step": 440 }, { "epoch": 0.9316932740120387, "grad_norm": 8.126486085847516, "learning_rate": 2.733170964891607e-07, "logits/chosen": -0.17456679046154022, "logits/rejected": 0.09745622426271439, "logps/chosen": -1.2472385168075562, "logps/rejected": -1.892249345779419, "loss": 1.9962, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.2472385168075562, "rewards/margins": 0.6450108289718628, "rewards/rejected": -1.892249345779419, "step": 445 }, { "epoch": 0.942161737764983, "grad_norm": 8.866488191256463, "learning_rate": 1.9483517457776436e-07, "logits/chosen": -0.05351231247186661, "logits/rejected": 0.1783636510372162, "logps/chosen": -1.2550338506698608, "logps/rejected": -1.8777573108673096, "loss": 1.9585, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -1.2550338506698608, "rewards/margins": 0.6227231621742249, "rewards/rejected": -1.8777573108673096, "step": 450 }, { "epoch": 0.9526302015179272, "grad_norm": 8.43900399314022, "learning_rate": 1.2949737362087156e-07, "logits/chosen": -0.09251005947589874, "logits/rejected": 0.2792736291885376, "logps/chosen": -1.381317377090454, "logps/rejected": -1.868950605392456, "loss": 1.9791, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.381317377090454, "rewards/margins": 0.48763322830200195, "rewards/rejected": -1.868950605392456, "step": 455 }, { "epoch": 0.9630986652708715, "grad_norm": 7.3135362806868365, "learning_rate": 7.73912809231292e-08, "logits/chosen": -0.15108491480350494, "logits/rejected": 0.16171926259994507, "logps/chosen": -1.2007992267608643, "logps/rejected": -2.045020341873169, "loss": 1.9082, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2007992267608643, "rewards/margins": 0.8442209362983704, "rewards/rejected": -2.045020341873169, "step": 460 }, { "epoch": 0.9735671290238157, "grad_norm": 8.496120629902867, "learning_rate": 3.858674628278825e-08, "logits/chosen": -0.16963747143745422, "logits/rejected": 0.33284881711006165, "logps/chosen": -1.3178845643997192, "logps/rejected": -2.077279806137085, "loss": 1.9421, "rewards/accuracies": 0.71875, "rewards/chosen": -1.3178845643997192, "rewards/margins": 0.7593953013420105, "rewards/rejected": -2.077279806137085, "step": 465 }, { "epoch": 0.98403559277676, "grad_norm": 8.29474797261466, "learning_rate": 1.3135788355934652e-08, "logits/chosen": -0.18520286679267883, "logits/rejected": 0.14646300673484802, "logps/chosen": -1.2585632801055908, "logps/rejected": -1.9155442714691162, "loss": 1.9995, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2585632801055908, "rewards/margins": 0.6569809317588806, "rewards/rejected": -1.9155442714691162, "step": 470 }, { "epoch": 0.9945040565297043, "grad_norm": 8.421060788269113, "learning_rate": 1.0725249238940916e-09, "logits/chosen": -0.2087993174791336, "logits/rejected": 0.3205938935279846, "logps/chosen": -1.2470612525939941, "logps/rejected": -1.9542922973632812, "loss": 1.985, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.2470612525939941, "rewards/margins": 0.7072311639785767, "rewards/rejected": -1.9542922973632812, "step": 475 }, { "epoch": 0.998691442030882, "step": 477, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 4.0963, "train_samples_per_second": 14924.323, "train_steps_per_second": 116.446 } ], "logging_steps": 5, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }