{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984168865435357, "eval_steps": 400, "global_step": 473, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021108179419525065, "grad_norm": 3.7888171889145084, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -1.7614977359771729, "logits/rejected": -2.1336593627929688, "logps/chosen": -258.78717041015625, "logps/rejected": -241.137451171875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.010554089709762533, "grad_norm": 5.486005258108119, "learning_rate": 5.208333333333333e-08, "logits/chosen": -1.652553915977478, "logits/rejected": -1.944653868675232, "logps/chosen": -254.9417724609375, "logps/rejected": -233.73040771484375, "loss": 0.6933, "rewards/accuracies": 0.3359375, "rewards/chosen": 0.0009400760754942894, "rewards/margins": -0.00012203870574012399, "rewards/rejected": 0.0010621148394420743, "step": 5 }, { "epoch": 0.021108179419525065, "grad_norm": 4.961389255891659, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -1.7172822952270508, "logits/rejected": -1.9224716424942017, "logps/chosen": -285.58203125, "logps/rejected": -271.65899658203125, "loss": 0.6933, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0001836848387029022, "rewards/margins": 5.627591235679574e-05, "rewards/rejected": 0.00012740897363983095, "step": 10 }, { "epoch": 0.0316622691292876, "grad_norm": 4.465341637207026, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -1.7000751495361328, "logits/rejected": -2.006362199783325, "logps/chosen": -294.66119384765625, "logps/rejected": -266.40240478515625, "loss": 0.693, "rewards/accuracies": 0.5, "rewards/chosen": 0.003800996346399188, "rewards/margins": 0.0003609915147535503, "rewards/rejected": 0.003440004540607333, "step": 15 }, { "epoch": 0.04221635883905013, "grad_norm": 4.29839711906534, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -1.5813852548599243, "logits/rejected": -1.917645812034607, "logps/chosen": -269.6716003417969, "logps/rejected": -243.76126098632812, "loss": 0.6929, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004329930525273085, "rewards/margins": 0.0014942068373784423, "rewards/rejected": 0.002835723338648677, "step": 20 }, { "epoch": 0.052770448548812667, "grad_norm": 4.1974406559327, "learning_rate": 2.604166666666667e-07, "logits/chosen": -1.4200265407562256, "logits/rejected": -1.6618592739105225, "logps/chosen": -277.4543762207031, "logps/rejected": -256.47283935546875, "loss": 0.6921, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.011705084703862667, "rewards/margins": 0.001989929471164942, "rewards/rejected": 0.009715155698359013, "step": 25 }, { "epoch": 0.0633245382585752, "grad_norm": 4.063491497272294, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -1.5429295301437378, "logits/rejected": -1.7798576354980469, "logps/chosen": -282.87689208984375, "logps/rejected": -262.7992858886719, "loss": 0.6906, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.019914668053388596, "rewards/margins": 0.004695773124694824, "rewards/rejected": 0.015218895860016346, "step": 30 }, { "epoch": 0.07387862796833773, "grad_norm": 4.21419727711893, "learning_rate": 3.645833333333333e-07, "logits/chosen": -1.5838125944137573, "logits/rejected": -1.8180118799209595, "logps/chosen": -261.9321594238281, "logps/rejected": -255.01626586914062, "loss": 0.689, "rewards/accuracies": 0.65625, "rewards/chosen": 0.029436618089675903, "rewards/margins": 0.007458895444869995, "rewards/rejected": 0.021977724507451057, "step": 35 }, { "epoch": 0.08443271767810026, "grad_norm": 3.992903303419019, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -1.5156856775283813, "logits/rejected": -1.7749900817871094, "logps/chosen": -263.44287109375, "logps/rejected": -244.74044799804688, "loss": 0.6854, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.03905363008379936, "rewards/margins": 0.01673820987343788, "rewards/rejected": 0.02231542207300663, "step": 40 }, { "epoch": 0.09498680738786279, "grad_norm": 4.26522333339902, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -1.6547425985336304, "logits/rejected": -1.8507578372955322, "logps/chosen": -268.19354248046875, "logps/rejected": -257.1205139160156, "loss": 0.6822, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.04456416517496109, "rewards/margins": 0.022208593785762787, "rewards/rejected": 0.0223555751144886, "step": 45 }, { "epoch": 0.10554089709762533, "grad_norm": 4.338593813384902, "learning_rate": 4.999726797933858e-07, "logits/chosen": -1.6332728862762451, "logits/rejected": -1.850756049156189, "logps/chosen": -263.47998046875, "logps/rejected": -249.18734741210938, "loss": 0.6782, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.037815388292074203, "rewards/margins": 0.02914128080010414, "rewards/rejected": 0.008674108423292637, "step": 50 }, { "epoch": 0.11609498680738786, "grad_norm": 4.4734059999380635, "learning_rate": 4.99665396039775e-07, "logits/chosen": -1.6244800090789795, "logits/rejected": -1.8389520645141602, "logps/chosen": -280.68115234375, "logps/rejected": -266.77935791015625, "loss": 0.6705, "rewards/accuracies": 0.6875, "rewards/chosen": 0.01766197383403778, "rewards/margins": 0.03439956158399582, "rewards/rejected": -0.016737591475248337, "step": 55 }, { "epoch": 0.1266490765171504, "grad_norm": 5.180495838186906, "learning_rate": 4.99017099386437e-07, "logits/chosen": -1.7377235889434814, "logits/rejected": -1.9698741436004639, "logps/chosen": -276.61029052734375, "logps/rejected": -265.09356689453125, "loss": 0.6677, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.04501429200172424, "rewards/margins": 0.047541338950395584, "rewards/rejected": -0.09255563467741013, "step": 60 }, { "epoch": 0.13720316622691292, "grad_norm": 5.0555182208034335, "learning_rate": 4.980286753286194e-07, "logits/chosen": -1.7377593517303467, "logits/rejected": -1.9555679559707642, "logps/chosen": -297.94842529296875, "logps/rejected": -286.69110107421875, "loss": 0.6676, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.013160338625311852, "rewards/margins": 0.06764128059148788, "rewards/rejected": -0.08080162107944489, "step": 65 }, { "epoch": 0.14775725593667546, "grad_norm": 5.593861280995437, "learning_rate": 4.967014739346915e-07, "logits/chosen": -1.902021050453186, "logits/rejected": -2.1676580905914307, "logps/chosen": -274.930908203125, "logps/rejected": -265.46917724609375, "loss": 0.6612, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.07597370445728302, "rewards/margins": 0.06604303419589996, "rewards/rejected": -0.14201673865318298, "step": 70 }, { "epoch": 0.158311345646438, "grad_norm": 5.729170497012147, "learning_rate": 4.950373080021136e-07, "logits/chosen": -1.8614518642425537, "logits/rejected": -2.113079786300659, "logps/chosen": -286.76824951171875, "logps/rejected": -274.01043701171875, "loss": 0.6598, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07419217377901077, "rewards/margins": 0.07467035204172134, "rewards/rejected": -0.14886252582073212, "step": 75 }, { "epoch": 0.16886543535620052, "grad_norm": 5.494684099688743, "learning_rate": 4.930384505813737e-07, "logits/chosen": -1.923152208328247, "logits/rejected": -2.1438252925872803, "logps/chosen": -284.2359619140625, "logps/rejected": -276.147705078125, "loss": 0.6638, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.11922915279865265, "rewards/margins": 0.0638352707028389, "rewards/rejected": -0.18306441605091095, "step": 80 }, { "epoch": 0.17941952506596306, "grad_norm": 5.815028665022688, "learning_rate": 4.907076318712738e-07, "logits/chosen": -1.9811105728149414, "logits/rejected": -2.159453868865967, "logps/chosen": -286.17047119140625, "logps/rejected": -275.25762939453125, "loss": 0.6572, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.10111021995544434, "rewards/margins": 0.09180058538913727, "rewards/rejected": -0.1929108202457428, "step": 85 }, { "epoch": 0.18997361477572558, "grad_norm": 6.05860390265305, "learning_rate": 4.88048035489807e-07, "logits/chosen": -1.8606590032577515, "logits/rejected": -2.207517147064209, "logps/chosen": -288.4847106933594, "logps/rejected": -272.2112731933594, "loss": 0.6493, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.16333934664726257, "rewards/margins": 0.09367315471172333, "rewards/rejected": -0.2570124864578247, "step": 90 }, { "epoch": 0.20052770448548812, "grad_norm": 5.9220580962205105, "learning_rate": 4.85063294125718e-07, "logits/chosen": -1.9957729578018188, "logits/rejected": -2.102470636367798, "logps/chosen": -306.8893127441406, "logps/rejected": -315.8654479980469, "loss": 0.6528, "rewards/accuracies": 0.6875, "rewards/chosen": -0.189494326710701, "rewards/margins": 0.06697932630777359, "rewards/rejected": -0.2564736604690552, "step": 95 }, { "epoch": 0.21108179419525067, "grad_norm": 7.106879633726346, "learning_rate": 4.817574845766874e-07, "logits/chosen": -1.914390206336975, "logits/rejected": -2.158510446548462, "logps/chosen": -312.228271484375, "logps/rejected": -307.2701416015625, "loss": 0.6473, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23975618183612823, "rewards/margins": 0.13584721088409424, "rewards/rejected": -0.37560343742370605, "step": 100 }, { "epoch": 0.22163588390501318, "grad_norm": 6.261854070868125, "learning_rate": 4.781351221809166e-07, "logits/chosen": -2.121222496032715, "logits/rejected": -2.3385891914367676, "logps/chosen": -288.9300231933594, "logps/rejected": -287.0550537109375, "loss": 0.6455, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.34203463792800903, "rewards/margins": 0.1223745122551918, "rewards/rejected": -0.4644091725349426, "step": 105 }, { "epoch": 0.23218997361477572, "grad_norm": 6.2411593338388816, "learning_rate": 4.742011546497182e-07, "logits/chosen": -1.9769681692123413, "logits/rejected": -2.1361823081970215, "logps/chosen": -309.54766845703125, "logps/rejected": -307.20306396484375, "loss": 0.6489, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.28532418608665466, "rewards/margins": 0.1274307519197464, "rewards/rejected": -0.41275492310523987, "step": 110 }, { "epoch": 0.24274406332453827, "grad_norm": 6.782103703812521, "learning_rate": 4.6996095530953875e-07, "logits/chosen": -1.9189682006835938, "logits/rejected": -2.1745872497558594, "logps/chosen": -314.22308349609375, "logps/rejected": -309.86859130859375, "loss": 0.6341, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3373129367828369, "rewards/margins": 0.11438401788473129, "rewards/rejected": -0.4516969621181488, "step": 115 }, { "epoch": 0.2532981530343008, "grad_norm": 7.845105191338386, "learning_rate": 4.654203157626399e-07, "logits/chosen": -2.0927116870880127, "logits/rejected": -2.4226441383361816, "logps/chosen": -330.85467529296875, "logps/rejected": -319.5343933105469, "loss": 0.6363, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.41750985383987427, "rewards/margins": 0.1433776617050171, "rewards/rejected": -0.5608875155448914, "step": 120 }, { "epoch": 0.2638522427440633, "grad_norm": 7.838145177076444, "learning_rate": 4.605854379764673e-07, "logits/chosen": -2.088397264480591, "logits/rejected": -2.309814453125, "logps/chosen": -321.032958984375, "logps/rejected": -316.51812744140625, "loss": 0.6335, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.44016337394714355, "rewards/margins": 0.12760691344738007, "rewards/rejected": -0.5677703619003296, "step": 125 }, { "epoch": 0.27440633245382584, "grad_norm": 7.175313209394211, "learning_rate": 4.5546292581250857e-07, "logits/chosen": -2.1430201530456543, "logits/rejected": -2.3672008514404297, "logps/chosen": -320.6697692871094, "logps/rejected": -315.40594482421875, "loss": 0.6314, "rewards/accuracies": 0.75, "rewards/chosen": -0.46516576409339905, "rewards/margins": 0.20094823837280273, "rewards/rejected": -0.6661140322685242, "step": 130 }, { "epoch": 0.2849604221635884, "grad_norm": 8.139621502884008, "learning_rate": 4.5005977600621275e-07, "logits/chosen": -2.157411813735962, "logits/rejected": -2.422761917114258, "logps/chosen": -334.2851867675781, "logps/rejected": -331.96240234375, "loss": 0.6361, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.5050655007362366, "rewards/margins": 0.16327856481075287, "rewards/rejected": -0.6683440208435059, "step": 135 }, { "epoch": 0.2955145118733509, "grad_norm": 8.66968270113885, "learning_rate": 4.443833686102919e-07, "logits/chosen": -2.18753981590271, "logits/rejected": -2.4200239181518555, "logps/chosen": -351.04388427734375, "logps/rejected": -355.5639953613281, "loss": 0.6345, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6600111126899719, "rewards/margins": 0.2067330777645111, "rewards/rejected": -0.8667442202568054, "step": 140 }, { "epoch": 0.30606860158311344, "grad_norm": 8.486691938463958, "learning_rate": 4.384414569144561e-07, "logits/chosen": -2.2690327167510986, "logits/rejected": -2.467618227005005, "logps/chosen": -345.2842102050781, "logps/rejected": -351.8019104003906, "loss": 0.6236, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.6318648457527161, "rewards/margins": 0.21810145676136017, "rewards/rejected": -0.8499662280082703, "step": 145 }, { "epoch": 0.316622691292876, "grad_norm": 10.799601481281647, "learning_rate": 4.3224215685535287e-07, "logits/chosen": -2.1107537746429443, "logits/rejected": -2.275696039199829, "logps/chosen": -330.2477111816406, "logps/rejected": -332.95306396484375, "loss": 0.6218, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6694210767745972, "rewards/margins": 0.16177809238433838, "rewards/rejected": -0.8311992883682251, "step": 150 }, { "epoch": 0.32717678100263853, "grad_norm": 9.91414676698451, "learning_rate": 4.2579393593117364e-07, "logits/chosen": -2.109783887863159, "logits/rejected": -2.3675389289855957, "logps/chosen": -360.96612548828125, "logps/rejected": -354.8112487792969, "loss": 0.6228, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.7774096727371216, "rewards/margins": 0.20474901795387268, "rewards/rejected": -0.9821586608886719, "step": 155 }, { "epoch": 0.33773087071240104, "grad_norm": 9.84160004233017, "learning_rate": 4.191056016360699e-07, "logits/chosen": -2.1540074348449707, "logits/rejected": -2.363142728805542, "logps/chosen": -353.576416015625, "logps/rejected": -356.2342834472656, "loss": 0.6191, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8168758153915405, "rewards/margins": 0.1971598118543625, "rewards/rejected": -1.014035701751709, "step": 160 }, { "epoch": 0.3482849604221636, "grad_norm": 10.137761591125681, "learning_rate": 4.121862894301754e-07, "logits/chosen": -2.1386771202087402, "logits/rejected": -2.463273286819458, "logps/chosen": -378.07904052734375, "logps/rejected": -362.71893310546875, "loss": 0.6188, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.91053307056427, "rewards/margins": 0.17932763695716858, "rewards/rejected": -1.0898606777191162, "step": 165 }, { "epoch": 0.35883905013192613, "grad_norm": 12.186665200345084, "learning_rate": 4.050454502616667e-07, "logits/chosen": -2.1371123790740967, "logits/rejected": -2.453059673309326, "logps/chosen": -393.2257080078125, "logps/rejected": -389.81195068359375, "loss": 0.6228, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0101797580718994, "rewards/margins": 0.24000540375709534, "rewards/rejected": -1.2501851320266724, "step": 170 }, { "epoch": 0.36939313984168864, "grad_norm": 7.8151280249971276, "learning_rate": 3.976928376579047e-07, "logits/chosen": -2.1572844982147217, "logits/rejected": -2.5259194374084473, "logps/chosen": -371.3215026855469, "logps/rejected": -361.8147277832031, "loss": 0.6206, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.8133503794670105, "rewards/margins": 0.20597751438617706, "rewards/rejected": -1.0193278789520264, "step": 175 }, { "epoch": 0.37994722955145116, "grad_norm": 10.09035062532825, "learning_rate": 3.9013849440328945e-07, "logits/chosen": -2.11098051071167, "logits/rejected": -2.379697799682617, "logps/chosen": -331.082763671875, "logps/rejected": -332.23443603515625, "loss": 0.6247, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.7421666979789734, "rewards/margins": 0.18613779544830322, "rewards/rejected": -0.9283044934272766, "step": 180 }, { "epoch": 0.39050131926121373, "grad_norm": 11.160942548142629, "learning_rate": 3.8239273882202473e-07, "logits/chosen": -2.120657444000244, "logits/rejected": -2.317275285720825, "logps/chosen": -406.42041015625, "logps/rejected": -405.0521545410156, "loss": 0.609, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.1358160972595215, "rewards/margins": 0.21516843140125275, "rewards/rejected": -1.3509845733642578, "step": 185 }, { "epoch": 0.40105540897097625, "grad_norm": 10.026177303858306, "learning_rate": 3.7446615068452804e-07, "logits/chosen": -2.2416388988494873, "logits/rejected": -2.50757098197937, "logps/chosen": -402.29205322265625, "logps/rejected": -400.1927795410156, "loss": 0.5983, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.1884238719940186, "rewards/margins": 0.24852600693702698, "rewards/rejected": -1.4369499683380127, "step": 190 }, { "epoch": 0.41160949868073876, "grad_norm": 9.915852139948614, "learning_rate": 3.6636955675673743e-07, "logits/chosen": -2.292942762374878, "logits/rejected": -2.5384058952331543, "logps/chosen": -383.97418212890625, "logps/rejected": -393.2140197753906, "loss": 0.6009, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.096928358078003, "rewards/margins": 0.28661248087882996, "rewards/rejected": -1.3835408687591553, "step": 195 }, { "epoch": 0.42216358839050133, "grad_norm": 14.20330931182508, "learning_rate": 3.5811401601205093e-07, "logits/chosen": -2.2763895988464355, "logits/rejected": -2.5465030670166016, "logps/chosen": -393.99267578125, "logps/rejected": -405.69268798828125, "loss": 0.6428, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2335624694824219, "rewards/margins": 0.20860306918621063, "rewards/rejected": -1.442165732383728, "step": 200 }, { "epoch": 0.43271767810026385, "grad_norm": 9.857364268590045, "learning_rate": 3.497108045260995e-07, "logits/chosen": -2.2732205390930176, "logits/rejected": -2.512218713760376, "logps/chosen": -384.2240905761719, "logps/rejected": -387.35052490234375, "loss": 0.6098, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.2368552684783936, "rewards/margins": 0.2555080056190491, "rewards/rejected": -1.492363452911377, "step": 205 }, { "epoch": 0.44327176781002636, "grad_norm": 10.31880014400564, "learning_rate": 3.411714000749838e-07, "logits/chosen": -2.3171160221099854, "logits/rejected": -2.6205945014953613, "logps/chosen": -408.45916748046875, "logps/rejected": -408.751220703125, "loss": 0.6023, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -1.2992274761199951, "rewards/margins": 0.2618922293186188, "rewards/rejected": -1.5611199140548706, "step": 210 }, { "epoch": 0.45382585751978893, "grad_norm": 12.303776927971242, "learning_rate": 3.3250746645801287e-07, "logits/chosen": -2.199439525604248, "logits/rejected": -2.4404823780059814, "logps/chosen": -443.233642578125, "logps/rejected": -461.68597412109375, "loss": 0.594, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6428359746932983, "rewards/margins": 0.345574289560318, "rewards/rejected": -1.9884103536605835, "step": 215 }, { "epoch": 0.46437994722955145, "grad_norm": 12.23954935903092, "learning_rate": 3.237308375663571e-07, "logits/chosen": -2.234389305114746, "logits/rejected": -2.4988560676574707, "logps/chosen": -442.395751953125, "logps/rejected": -463.5758361816406, "loss": 0.5764, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.693788766860962, "rewards/margins": 0.34694477915763855, "rewards/rejected": -2.040733575820923, "step": 220 }, { "epoch": 0.47493403693931396, "grad_norm": 14.593463763552798, "learning_rate": 3.148535012193767e-07, "logits/chosen": -2.2539751529693604, "logits/rejected": -2.492187023162842, "logps/chosen": -510.832275390625, "logps/rejected": -525.7044677734375, "loss": 0.5971, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1941466331481934, "rewards/margins": 0.3329901695251465, "rewards/rejected": -2.5271365642547607, "step": 225 }, { "epoch": 0.48548812664907653, "grad_norm": 10.847822783700963, "learning_rate": 3.0588758279070183e-07, "logits/chosen": -2.233119249343872, "logits/rejected": -2.4563241004943848, "logps/chosen": -432.57635498046875, "logps/rejected": -434.28399658203125, "loss": 0.6159, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.6671392917633057, "rewards/margins": 0.20822450518608093, "rewards/rejected": -1.8753639459609985, "step": 230 }, { "epoch": 0.49604221635883905, "grad_norm": 9.907519766746905, "learning_rate": 2.968453286464312e-07, "logits/chosen": -2.2562708854675293, "logits/rejected": -2.479283094406128, "logps/chosen": -388.1554260253906, "logps/rejected": -398.91656494140625, "loss": 0.5934, "rewards/accuracies": 0.65625, "rewards/chosen": -1.3175015449523926, "rewards/margins": 0.24448028206825256, "rewards/rejected": -1.5619816780090332, "step": 235 }, { "epoch": 0.5065963060686016, "grad_norm": 11.385701610802691, "learning_rate": 2.8773908941806877e-07, "logits/chosen": -2.2707817554473877, "logits/rejected": -2.499936103820801, "logps/chosen": -438.36834716796875, "logps/rejected": -435.4722595214844, "loss": 0.6058, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.5956835746765137, "rewards/margins": 0.22251293063163757, "rewards/rejected": -1.8181965351104736, "step": 240 }, { "epoch": 0.5171503957783641, "grad_norm": 13.386635563356508, "learning_rate": 2.785813031330473e-07, "logits/chosen": -2.2956652641296387, "logits/rejected": -2.5434913635253906, "logps/chosen": -469.11676025390625, "logps/rejected": -465.9659118652344, "loss": 0.6099, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.9268583059310913, "rewards/margins": 0.21980834007263184, "rewards/rejected": -2.1466667652130127, "step": 245 }, { "epoch": 0.5277044854881267, "grad_norm": 10.39915818076319, "learning_rate": 2.693844782258779e-07, "logits/chosen": -2.3407020568847656, "logits/rejected": -2.5239195823669434, "logps/chosen": -459.4602966308594, "logps/rejected": -466.40966796875, "loss": 0.6065, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.813126564025879, "rewards/margins": 0.22814805805683136, "rewards/rejected": -2.0412745475769043, "step": 250 }, { "epoch": 0.5382585751978892, "grad_norm": 13.963581040356289, "learning_rate": 2.601611764531342e-07, "logits/chosen": -2.2778186798095703, "logits/rejected": -2.4619011878967285, "logps/chosen": -394.53631591796875, "logps/rejected": -416.8975524902344, "loss": 0.6027, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.4522285461425781, "rewards/margins": 0.3218362331390381, "rewards/rejected": -1.7740647792816162, "step": 255 }, { "epoch": 0.5488126649076517, "grad_norm": 9.618957459005115, "learning_rate": 2.5092399573560323e-07, "logits/chosen": -2.219548463821411, "logits/rejected": -2.35976243019104, "logps/chosen": -442.07470703125, "logps/rejected": -447.6449279785156, "loss": 0.6068, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.6692779064178467, "rewards/margins": 0.20155127346515656, "rewards/rejected": -1.8708292245864868, "step": 260 }, { "epoch": 0.5593667546174143, "grad_norm": 13.385422305434652, "learning_rate": 2.4168555295104124e-07, "logits/chosen": -2.215520143508911, "logits/rejected": -2.2993171215057373, "logps/chosen": -438.2977600097656, "logps/rejected": -455.9774475097656, "loss": 0.585, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -1.5749682188034058, "rewards/margins": 0.3128505051136017, "rewards/rejected": -1.8878189325332642, "step": 265 }, { "epoch": 0.5699208443271768, "grad_norm": 20.732510246650026, "learning_rate": 2.3245846670103626e-07, "logits/chosen": -2.383749008178711, "logits/rejected": -2.67887806892395, "logps/chosen": -489.03680419921875, "logps/rejected": -513.4251708984375, "loss": 0.5809, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8371191024780273, "rewards/margins": 0.4673503041267395, "rewards/rejected": -2.304469347000122, "step": 270 }, { "epoch": 0.5804749340369393, "grad_norm": 21.85869665384531, "learning_rate": 2.232553400755159e-07, "logits/chosen": -2.5215096473693848, "logits/rejected": -2.706601619720459, "logps/chosen": -494.96881103515625, "logps/rejected": -513.2868041992188, "loss": 0.6131, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0746359825134277, "rewards/margins": 0.40606698393821716, "rewards/rejected": -2.4807028770446777, "step": 275 }, { "epoch": 0.5910290237467019, "grad_norm": 11.697047808438843, "learning_rate": 2.1408874343844294e-07, "logits/chosen": -2.4797873497009277, "logits/rejected": -2.675075054168701, "logps/chosen": -458.46148681640625, "logps/rejected": -470.7688903808594, "loss": 0.5758, "rewards/accuracies": 0.75, "rewards/chosen": -1.7771122455596924, "rewards/margins": 0.36666423082351685, "rewards/rejected": -2.1437766551971436, "step": 280 }, { "epoch": 0.6015831134564644, "grad_norm": 12.782798316845536, "learning_rate": 2.049711972582101e-07, "logits/chosen": -2.287956714630127, "logits/rejected": -2.528700590133667, "logps/chosen": -435.39569091796875, "logps/rejected": -443.40240478515625, "loss": 0.5757, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6082191467285156, "rewards/margins": 0.2757379412651062, "rewards/rejected": -1.8839571475982666, "step": 285 }, { "epoch": 0.6121372031662269, "grad_norm": 15.368915084454407, "learning_rate": 1.9591515500618588e-07, "logits/chosen": -2.276632070541382, "logits/rejected": -2.4731783866882324, "logps/chosen": -473.08233642578125, "logps/rejected": -492.99774169921875, "loss": 0.5871, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.886792540550232, "rewards/margins": 0.30836355686187744, "rewards/rejected": -2.1951560974121094, "step": 290 }, { "epoch": 0.6226912928759895, "grad_norm": 12.7131825042862, "learning_rate": 1.8693298614677112e-07, "logits/chosen": -2.141019821166992, "logits/rejected": -2.3793327808380127, "logps/chosen": -507.8687438964844, "logps/rejected": -522.310546875, "loss": 0.5797, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.982000708580017, "rewards/margins": 0.3759006857872009, "rewards/rejected": -2.3579015731811523, "step": 295 }, { "epoch": 0.633245382585752, "grad_norm": 16.694511883816396, "learning_rate": 1.7803695924219814e-07, "logits/chosen": -2.2722671031951904, "logits/rejected": -2.486273765563965, "logps/chosen": -485.3500061035156, "logps/rejected": -501.1082458496094, "loss": 0.5968, "rewards/accuracies": 0.71875, "rewards/chosen": -2.0275466442108154, "rewards/margins": 0.3500698506832123, "rewards/rejected": -2.3776164054870605, "step": 300 }, { "epoch": 0.6437994722955145, "grad_norm": 12.047975502833824, "learning_rate": 1.6923922519515067e-07, "logits/chosen": -2.2678744792938232, "logits/rejected": -2.4072413444519043, "logps/chosen": -485.83709716796875, "logps/rejected": -510.50628662109375, "loss": 0.5798, "rewards/accuracies": 0.6875, "rewards/chosen": -2.0933148860931396, "rewards/margins": 0.37190961837768555, "rewards/rejected": -2.465224504470825, "step": 305 }, { "epoch": 0.6543535620052771, "grad_norm": 15.563285458971103, "learning_rate": 1.605518006520924e-07, "logits/chosen": -2.2849347591400146, "logits/rejected": -2.5423800945281982, "logps/chosen": -502.2515563964844, "logps/rejected": -523.35400390625, "loss": 0.5888, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.3487892150878906, "rewards/margins": 0.39526933431625366, "rewards/rejected": -2.744058609008789, "step": 310 }, { "epoch": 0.6649076517150396, "grad_norm": 12.024205485012896, "learning_rate": 1.519865515899731e-07, "logits/chosen": -2.3724029064178467, "logits/rejected": -2.584688901901245, "logps/chosen": -492.44183349609375, "logps/rejected": -507.952880859375, "loss": 0.5818, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.114487409591675, "rewards/margins": 0.37069326639175415, "rewards/rejected": -2.485180616378784, "step": 315 }, { "epoch": 0.6754617414248021, "grad_norm": 12.928310439067417, "learning_rate": 1.4355517710873182e-07, "logits/chosen": -2.332822322845459, "logits/rejected": -2.580765724182129, "logps/chosen": -478.24560546875, "logps/rejected": -496.3634338378906, "loss": 0.5952, "rewards/accuracies": 0.71875, "rewards/chosen": -2.112123966217041, "rewards/margins": 0.3569082021713257, "rewards/rejected": -2.4690322875976562, "step": 320 }, { "epoch": 0.6860158311345647, "grad_norm": 14.108193064852378, "learning_rate": 1.3526919345173318e-07, "logits/chosen": -2.4187912940979004, "logits/rejected": -2.564967632293701, "logps/chosen": -518.6376953125, "logps/rejected": -543.2145385742188, "loss": 0.578, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.3633389472961426, "rewards/margins": 0.39262861013412476, "rewards/rejected": -2.755967378616333, "step": 325 }, { "epoch": 0.6965699208443272, "grad_norm": 19.566691765082115, "learning_rate": 1.2713991827596443e-07, "logits/chosen": -2.466085195541382, "logits/rejected": -2.6994807720184326, "logps/chosen": -525.8836669921875, "logps/rejected": -555.58154296875, "loss": 0.576, "rewards/accuracies": 0.71875, "rewards/chosen": -2.558525800704956, "rewards/margins": 0.42205095291137695, "rewards/rejected": -2.980576753616333, "step": 330 }, { "epoch": 0.7071240105540897, "grad_norm": 16.838554493879652, "learning_rate": 1.191784551934773e-07, "logits/chosen": -2.502603054046631, "logits/rejected": -2.686891794204712, "logps/chosen": -503.7294006347656, "logps/rejected": -527.2174072265625, "loss": 0.5818, "rewards/accuracies": 0.71875, "rewards/chosen": -2.4917781352996826, "rewards/margins": 0.3848406672477722, "rewards/rejected": -2.8766188621520996, "step": 335 }, { "epoch": 0.7176781002638523, "grad_norm": 14.504581595303138, "learning_rate": 1.1139567860518953e-07, "logits/chosen": -2.369147777557373, "logits/rejected": -2.5709891319274902, "logps/chosen": -490.6739807128906, "logps/rejected": -508.79486083984375, "loss": 0.5972, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.1547963619232178, "rewards/margins": 0.4393877387046814, "rewards/rejected": -2.594184160232544, "step": 340 }, { "epoch": 0.7282321899736148, "grad_norm": 17.356368091641894, "learning_rate": 1.0380221884776128e-07, "logits/chosen": -2.3545467853546143, "logits/rejected": -2.6243462562561035, "logps/chosen": -524.93896484375, "logps/rejected": -539.4567260742188, "loss": 0.5834, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -2.282606840133667, "rewards/margins": 0.3932141065597534, "rewards/rejected": -2.675821304321289, "step": 345 }, { "epoch": 0.7387862796833773, "grad_norm": 12.791178420216584, "learning_rate": 9.640844767383405e-08, "logits/chosen": -2.3955166339874268, "logits/rejected": -2.7560970783233643, "logps/chosen": -512.841552734375, "logps/rejected": -530.08203125, "loss": 0.574, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2674124240875244, "rewards/margins": 0.4363299012184143, "rewards/rejected": -2.703742504119873, "step": 350 }, { "epoch": 0.7493403693931399, "grad_norm": 15.246389662934607, "learning_rate": 8.922446408546378e-08, "logits/chosen": -2.1915884017944336, "logits/rejected": -2.4269826412200928, "logps/chosen": -500.866943359375, "logps/rejected": -525.7913208007812, "loss": 0.5859, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0900752544403076, "rewards/margins": 0.4145973324775696, "rewards/rejected": -2.5046725273132324, "step": 355 }, { "epoch": 0.7598944591029023, "grad_norm": 13.547069603435855, "learning_rate": 8.22600805400994e-08, "logits/chosen": -2.177799940109253, "logits/rejected": -2.3907604217529297, "logps/chosen": -484.6221618652344, "logps/rejected": -505.7215270996094, "loss": 0.5938, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -1.995901346206665, "rewards/margins": 0.3997262418270111, "rewards/rejected": -2.395627498626709, "step": 360 }, { "epoch": 0.7704485488126649, "grad_norm": 13.801750243778415, "learning_rate": 7.552480954794558e-08, "logits/chosen": -2.4111971855163574, "logits/rejected": -2.541329860687256, "logps/chosen": -483.94940185546875, "logps/rejected": -505.018798828125, "loss": 0.5786, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1984305381774902, "rewards/margins": 0.30352845788002014, "rewards/rejected": -2.5019590854644775, "step": 365 }, { "epoch": 0.7810026385224275, "grad_norm": 14.514442087318445, "learning_rate": 6.902785067901854e-08, "logits/chosen": -2.3650124073028564, "logits/rejected": -2.6508941650390625, "logps/chosen": -493.879150390625, "logps/rejected": -509.673828125, "loss": 0.5716, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.1922976970672607, "rewards/margins": 0.36951905488967896, "rewards/rejected": -2.561816930770874, "step": 370 }, { "epoch": 0.7915567282321899, "grad_norm": 17.145765706359093, "learning_rate": 6.277807799763973e-08, "logits/chosen": -2.3265914916992188, "logits/rejected": -2.5552210807800293, "logps/chosen": -559.2115478515625, "logps/rejected": -583.4457397460938, "loss": 0.5863, "rewards/accuracies": 0.71875, "rewards/chosen": -2.589047908782959, "rewards/margins": 0.4866320490837097, "rewards/rejected": -3.0756797790527344, "step": 375 }, { "epoch": 0.8021108179419525, "grad_norm": 12.09091453196307, "learning_rate": 5.678402794153145e-08, "logits/chosen": -2.2694175243377686, "logits/rejected": -2.5305798053741455, "logps/chosen": -522.9309692382812, "logps/rejected": -551.8192749023438, "loss": 0.5797, "rewards/accuracies": 0.6875, "rewards/chosen": -2.3409042358398438, "rewards/margins": 0.4437629282474518, "rewards/rejected": -2.7846672534942627, "step": 380 }, { "epoch": 0.8126649076517151, "grad_norm": 13.834538465076683, "learning_rate": 5.105388766206969e-08, "logits/chosen": -2.3926773071289062, "logits/rejected": -2.5119540691375732, "logps/chosen": -503.90850830078125, "logps/rejected": -526.0748901367188, "loss": 0.5905, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.2285289764404297, "rewards/margins": 0.36956310272216797, "rewards/rejected": -2.5980920791625977, "step": 385 }, { "epoch": 0.8232189973614775, "grad_norm": 12.082575884438302, "learning_rate": 4.5595483841620484e-08, "logits/chosen": -2.219971179962158, "logits/rejected": -2.4546897411346436, "logps/chosen": -487.01544189453125, "logps/rejected": -499.76495361328125, "loss": 0.5761, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -2.0403149127960205, "rewards/margins": 0.41723886132240295, "rewards/rejected": -2.4575533866882324, "step": 390 }, { "epoch": 0.8337730870712401, "grad_norm": 14.209125492445594, "learning_rate": 4.0416272003232526e-08, "logits/chosen": -2.3332419395446777, "logits/rejected": -2.5107414722442627, "logps/chosen": -479.9403381347656, "logps/rejected": -502.7928771972656, "loss": 0.5866, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.998462438583374, "rewards/margins": 0.44837865233421326, "rewards/rejected": -2.446840763092041, "step": 395 }, { "epoch": 0.8443271767810027, "grad_norm": 13.431649672455546, "learning_rate": 3.552332632729041e-08, "logits/chosen": -2.3189663887023926, "logits/rejected": -2.414161205291748, "logps/chosen": -480.2158203125, "logps/rejected": -508.34466552734375, "loss": 0.5675, "rewards/accuracies": 0.71875, "rewards/chosen": -1.9780933856964111, "rewards/margins": 0.3599149286746979, "rewards/rejected": -2.338008403778076, "step": 400 }, { "epoch": 0.8443271767810027, "eval_logits/chosen": -2.860567092895508, "eval_logits/rejected": -2.755436420440674, "eval_logps/chosen": -475.5936279296875, "eval_logps/rejected": -511.86138916015625, "eval_loss": 0.6271286606788635, "eval_rewards/accuracies": 0.6350806355476379, "eval_rewards/chosen": -2.127014636993408, "eval_rewards/margins": 0.2526260018348694, "eval_rewards/rejected": -2.379640579223633, "eval_runtime": 325.3184, "eval_samples_per_second": 6.074, "eval_steps_per_second": 0.381, "step": 400 }, { "epoch": 0.8548812664907651, "grad_norm": 14.805379177265603, "learning_rate": 3.092332998903416e-08, "logits/chosen": -2.3201870918273926, "logits/rejected": -2.5070722103118896, "logps/chosen": -494.46075439453125, "logps/rejected": -522.0347900390625, "loss": 0.5659, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.1851017475128174, "rewards/margins": 0.4036984443664551, "rewards/rejected": -2.5887999534606934, "step": 405 }, { "epoch": 0.8654353562005277, "grad_norm": 14.004584602296926, "learning_rate": 2.6622566030146455e-08, "logits/chosen": -2.2431082725524902, "logits/rejected": -2.432163715362549, "logps/chosen": -490.9959411621094, "logps/rejected": -510.72088623046875, "loss": 0.5799, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.1516032218933105, "rewards/margins": 0.3738686442375183, "rewards/rejected": -2.5254716873168945, "step": 410 }, { "epoch": 0.8759894459102903, "grad_norm": 12.549317692341273, "learning_rate": 2.26269087768734e-08, "logits/chosen": -2.362277030944824, "logits/rejected": -2.535696029663086, "logps/chosen": -504.3929748535156, "logps/rejected": -532.2606201171875, "loss": 0.5687, "rewards/accuracies": 0.6875, "rewards/chosen": -2.364351511001587, "rewards/margins": 0.4068065285682678, "rewards/rejected": -2.771157741546631, "step": 415 }, { "epoch": 0.8865435356200527, "grad_norm": 17.291644458564797, "learning_rate": 1.894181581640106e-08, "logits/chosen": -2.474963903427124, "logits/rejected": -2.697309970855713, "logps/chosen": -524.9340209960938, "logps/rejected": -546.8604125976562, "loss": 0.5769, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -2.467923641204834, "rewards/margins": 0.43783673644065857, "rewards/rejected": -2.9057605266571045, "step": 420 }, { "epoch": 0.8970976253298153, "grad_norm": 14.61494685094214, "learning_rate": 1.5572320542448143e-08, "logits/chosen": -2.338838577270508, "logits/rejected": -2.5825653076171875, "logps/chosen": -540.0514526367188, "logps/rejected": -563.7896118164062, "loss": 0.5923, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.444960832595825, "rewards/margins": 0.40859413146972656, "rewards/rejected": -2.8535547256469727, "step": 425 }, { "epoch": 0.9076517150395779, "grad_norm": 11.41595324078826, "learning_rate": 1.2523025280255729e-08, "logits/chosen": -2.3540937900543213, "logits/rejected": -2.604950428009033, "logps/chosen": -529.1434326171875, "logps/rejected": -546.1678466796875, "loss": 0.576, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.464961051940918, "rewards/margins": 0.4290400445461273, "rewards/rejected": -2.894001007080078, "step": 430 }, { "epoch": 0.9182058047493403, "grad_norm": 15.41017745385185, "learning_rate": 9.798095000364214e-09, "logits/chosen": -2.409531831741333, "logits/rejected": -2.6002402305603027, "logps/chosen": -515.3450317382812, "logps/rejected": -548.9906005859375, "loss": 0.5571, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -2.4495291709899902, "rewards/margins": 0.48690468072891235, "rewards/rejected": -2.936434030532837, "step": 435 }, { "epoch": 0.9287598944591029, "grad_norm": 15.067700929396372, "learning_rate": 7.401251629764876e-09, "logits/chosen": -2.5300445556640625, "logits/rejected": -2.7091293334960938, "logps/chosen": -556.22607421875, "logps/rejected": -582.80126953125, "loss": 0.587, "rewards/accuracies": 0.6875, "rewards/chosen": -2.7479639053344727, "rewards/margins": 0.405862033367157, "rewards/rejected": -3.1538259983062744, "step": 440 }, { "epoch": 0.9393139841688655, "grad_norm": 15.58719326370491, "learning_rate": 5.335768968195098e-09, "logits/chosen": -2.4501638412475586, "logits/rejected": -2.711761713027954, "logps/chosen": -547.3215942382812, "logps/rejected": -560.46142578125, "loss": 0.5694, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.6382341384887695, "rewards/margins": 0.393528550863266, "rewards/rejected": -3.0317625999450684, "step": 445 }, { "epoch": 0.9498680738786279, "grad_norm": 18.222145597670412, "learning_rate": 3.604468216521883e-09, "logits/chosen": -2.5248587131500244, "logits/rejected": -2.665889024734497, "logps/chosen": -544.0384521484375, "logps/rejected": -558.4286499023438, "loss": 0.5696, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.6324782371520996, "rewards/margins": 0.36458876729011536, "rewards/rejected": -2.9970669746398926, "step": 450 }, { "epoch": 0.9604221635883905, "grad_norm": 18.763031291028923, "learning_rate": 2.2097141233206884e-09, "logits/chosen": -2.4280190467834473, "logits/rejected": -2.6637661457061768, "logps/chosen": -531.4348754882812, "logps/rejected": -560.4275512695312, "loss": 0.5771, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.5917065143585205, "rewards/margins": 0.4408086836338043, "rewards/rejected": -3.032515287399292, "step": 455 }, { "epoch": 0.9709762532981531, "grad_norm": 13.86977592293856, "learning_rate": 1.1534117549133472e-09, "logits/chosen": -2.514380931854248, "logits/rejected": -2.7799925804138184, "logps/chosen": -552.8911743164062, "logps/rejected": -577.2693481445312, "loss": 0.5713, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.666104793548584, "rewards/margins": 0.5134469270706177, "rewards/rejected": -3.179551601409912, "step": 460 }, { "epoch": 0.9815303430079155, "grad_norm": 15.545025613547882, "learning_rate": 4.3700389327672173e-10, "logits/chosen": -2.3933727741241455, "logits/rejected": -2.630448818206787, "logps/chosen": -566.9051513671875, "logps/rejected": -592.8246459960938, "loss": 0.5817, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.771134614944458, "rewards/margins": 0.4589918553829193, "rewards/rejected": -3.230126142501831, "step": 465 }, { "epoch": 0.9920844327176781, "grad_norm": 13.156905753144738, "learning_rate": 6.146906537587982e-11, "logits/chosen": -2.4273524284362793, "logits/rejected": -2.5787882804870605, "logps/chosen": -544.8056640625, "logps/rejected": -564.3406982421875, "loss": 0.5834, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -2.588836908340454, "rewards/margins": 0.3985593914985657, "rewards/rejected": -2.987395763397217, "step": 470 }, { "epoch": 0.9984168865435357, "step": 473, "total_flos": 0.0, "train_loss": 0.6128583030015167, "train_runtime": 20192.8387, "train_samples_per_second": 3.003, "train_steps_per_second": 0.023 } ], "logging_steps": 5, "max_steps": 473, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }