diff --git "a/fdpo_test_seed_1/checkpoint-384/trainer_state.json" "b/fdpo_test_seed_1/checkpoint-384/trainer_state.json" new file mode 100644--- /dev/null +++ "b/fdpo_test_seed_1/checkpoint-384/trainer_state.json" @@ -0,0 +1,5413 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.0, + "eval_steps": 1.0, + "global_step": 384, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.01, + "learning_rate": 1e-06, + "logits/chosen": 764.4638061523438, + "logits/rejected": 761.4228515625, + "logps/chosen": -97.93035125732422, + "logps/rejected": -110.90742492675781, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.01, + "learning_rate": 1e-06, + "logits/chosen": 752.5679321289062, + "logits/rejected": 752.6337890625, + "logps/chosen": -123.5273208618164, + "logps/rejected": -142.233154296875, + "loss": 0.6369, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.01011733990162611, + "rewards/margins": 0.11587753891944885, + "rewards/rejected": -0.12599487602710724, + "step": 2 + }, + { + "epoch": 0.02, + "learning_rate": 1e-06, + "logits/chosen": 763.8212890625, + "logits/rejected": 761.1595458984375, + "logps/chosen": -120.7507095336914, + "logps/rejected": -90.5356216430664, + "loss": 0.58, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07129593193531036, + "rewards/margins": 0.24075622856616974, + "rewards/rejected": -0.16946029663085938, + "step": 3 + }, + { + "epoch": 0.03, + "learning_rate": 1e-06, + "logits/chosen": 762.6593627929688, + "logits/rejected": 761.0687866210938, + "logps/chosen": -118.54879760742188, + "logps/rejected": -102.5355453491211, + "loss": 0.6118, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.039154052734375, + "rewards/margins": 0.16989365220069885, + "rewards/rejected": -0.13073959946632385, + "step": 4 + }, + { + "epoch": 0.04, + "learning_rate": 1e-06, + "logits/chosen": 757.2559204101562, + "logits/rejected": 758.9849853515625, + "logps/chosen": -162.62420654296875, + "logps/rejected": -139.88278198242188, + "loss": 0.6521, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06438904255628586, + "rewards/margins": 0.08386535942554474, + "rewards/rejected": -0.01947631873190403, + "step": 5 + }, + { + "epoch": 0.04, + "learning_rate": 1e-06, + "logits/chosen": 756.7674560546875, + "logits/rejected": 756.3587646484375, + "logps/chosen": -110.85993957519531, + "logps/rejected": -156.24179077148438, + "loss": 0.6887, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1026763916015625, + "rewards/margins": 0.00886382907629013, + "rewards/rejected": -0.11154022067785263, + "step": 6 + }, + { + "epoch": 0.05, + "learning_rate": 1e-06, + "logits/chosen": 760.385498046875, + "logits/rejected": 764.9473266601562, + "logps/chosen": -75.86040496826172, + "logps/rejected": -46.66392517089844, + "loss": 0.734, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1554008573293686, + "rewards/margins": -0.08007355779409409, + "rewards/rejected": -0.0753272995352745, + "step": 7 + }, + { + "epoch": 0.06, + "learning_rate": 1e-06, + "logits/chosen": 754.2999877929688, + "logits/rejected": 748.8121948242188, + "logps/chosen": -107.128662109375, + "logps/rejected": -131.2296905517578, + "loss": 0.5731, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.008456421084702015, + "rewards/margins": 0.2565856873989105, + "rewards/rejected": -0.24812927842140198, + "step": 8 + }, + { + "epoch": 0.06, + "learning_rate": 1e-06, + "logits/chosen": 760.7911987304688, + "logits/rejected": 757.0386962890625, + "logps/chosen": -102.23580932617188, + "logps/rejected": -81.56920623779297, + "loss": 0.631, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15421219170093536, + "rewards/margins": 0.1283729523420334, + "rewards/rejected": -0.28258514404296875, + "step": 9 + }, + { + "epoch": 0.07, + "learning_rate": 1e-06, + "logits/chosen": 756.4664306640625, + "logits/rejected": 750.3159790039062, + "logps/chosen": -89.3166732788086, + "logps/rejected": -129.74998474121094, + "loss": 0.6783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.13186264038085938, + "rewards/margins": 0.029853053390979767, + "rewards/rejected": 0.10200958698987961, + "step": 10 + }, + { + "epoch": 0.08, + "learning_rate": 1e-06, + "logits/chosen": 758.9336547851562, + "logits/rejected": 759.9686279296875, + "logps/chosen": -110.68902587890625, + "logps/rejected": -85.22792053222656, + "loss": 0.7296, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.11720581352710724, + "rewards/margins": -0.07171325385570526, + "rewards/rejected": -0.04549255594611168, + "step": 11 + }, + { + "epoch": 0.08, + "learning_rate": 1e-06, + "logits/chosen": 762.115234375, + "logits/rejected": 763.7662963867188, + "logps/chosen": -117.40858459472656, + "logps/rejected": -93.16138458251953, + "loss": 0.6949, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.01721343956887722, + "rewards/margins": -0.0035514822229743004, + "rewards/rejected": -0.01366195734590292, + "step": 12 + }, + { + "epoch": 0.09, + "learning_rate": 1e-06, + "logits/chosen": 759.1834106445312, + "logits/rejected": 758.5989990234375, + "logps/chosen": -102.92713928222656, + "logps/rejected": -90.89684295654297, + "loss": 0.6872, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.196217343211174, + "rewards/margins": 0.01187056303024292, + "rewards/rejected": 0.1843467801809311, + "step": 13 + }, + { + "epoch": 0.1, + "learning_rate": 1e-06, + "logits/chosen": 757.360107421875, + "logits/rejected": 750.2620239257812, + "logps/chosen": -148.67514038085938, + "logps/rejected": -131.78175354003906, + "loss": 0.4797, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14040528237819672, + "rewards/margins": 0.48512572050094604, + "rewards/rejected": -0.3447204530239105, + "step": 14 + }, + { + "epoch": 0.11, + "learning_rate": 1e-06, + "logits/chosen": 737.7575073242188, + "logits/rejected": 741.4765625, + "logps/chosen": -136.55882263183594, + "logps/rejected": -121.47374725341797, + "loss": 0.5935, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09811859577894211, + "rewards/margins": 0.21031418442726135, + "rewards/rejected": -0.30843278765678406, + "step": 15 + }, + { + "epoch": 0.11, + "learning_rate": 1e-06, + "logits/chosen": 748.8475952148438, + "logits/rejected": 744.8802490234375, + "logps/chosen": -128.9879913330078, + "logps/rejected": -62.55142593383789, + "loss": 0.545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.16646575927734375, + "rewards/margins": 0.3220936059951782, + "rewards/rejected": -0.15562783181667328, + "step": 16 + }, + { + "epoch": 0.12, + "learning_rate": 1e-06, + "logits/chosen": 758.3858032226562, + "logits/rejected": 746.2037963867188, + "logps/chosen": -110.75660705566406, + "logps/rejected": -118.88664245605469, + "loss": 0.8642, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.036347199231386185, + "rewards/margins": -0.31708911061286926, + "rewards/rejected": 0.2807419002056122, + "step": 17 + }, + { + "epoch": 0.13, + "learning_rate": 1e-06, + "logits/chosen": 738.8646850585938, + "logits/rejected": 746.2620239257812, + "logps/chosen": -118.283447265625, + "logps/rejected": -112.45326232910156, + "loss": 0.6498, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06850891560316086, + "rewards/margins": 0.08872222155332565, + "rewards/rejected": -0.1572311371564865, + "step": 18 + }, + { + "epoch": 0.13, + "learning_rate": 1e-06, + "logits/chosen": 751.4152221679688, + "logits/rejected": 755.874755859375, + "logps/chosen": -100.26580810546875, + "logps/rejected": -109.46641540527344, + "loss": 0.6315, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09481048583984375, + "rewards/margins": 0.12738418579101562, + "rewards/rejected": -0.032573699951171875, + "step": 19 + }, + { + "epoch": 0.14, + "learning_rate": 1e-06, + "logits/chosen": 756.5003662109375, + "logits/rejected": 757.3355102539062, + "logps/chosen": -100.18865203857422, + "logps/rejected": -90.1058120727539, + "loss": 0.7599, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.042594145983457565, + "rewards/margins": -0.12934264540672302, + "rewards/rejected": 0.08674850314855576, + "step": 20 + }, + { + "epoch": 0.15, + "learning_rate": 1e-06, + "logits/chosen": 748.668212890625, + "logits/rejected": 747.31591796875, + "logps/chosen": -108.35651397705078, + "logps/rejected": -118.47122955322266, + "loss": 0.2804, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3621322810649872, + "rewards/margins": 1.1279884576797485, + "rewards/rejected": -0.7658562064170837, + "step": 21 + }, + { + "epoch": 0.15, + "learning_rate": 1e-06, + "logits/chosen": 756.00732421875, + "logits/rejected": 759.6976318359375, + "logps/chosen": -154.11558532714844, + "logps/rejected": -136.60943603515625, + "loss": 0.9174, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3178848326206207, + "rewards/margins": -0.407257080078125, + "rewards/rejected": 0.08937225490808487, + "step": 22 + }, + { + "epoch": 0.16, + "learning_rate": 1e-06, + "logits/chosen": 759.8823852539062, + "logits/rejected": 762.1998291015625, + "logps/chosen": -171.6786651611328, + "logps/rejected": -187.7472381591797, + "loss": 0.8705, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.26835328340530396, + "rewards/margins": -0.3278640806674957, + "rewards/rejected": 0.05951080471277237, + "step": 23 + }, + { + "epoch": 0.17, + "learning_rate": 1e-06, + "logits/chosen": 762.2452392578125, + "logits/rejected": 742.046630859375, + "logps/chosen": -138.851318359375, + "logps/rejected": -137.5217742919922, + "loss": 0.6615, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2660537660121918, + "rewards/margins": 0.06429138779640198, + "rewards/rejected": -0.33034515380859375, + "step": 24 + }, + { + "epoch": 0.18, + "learning_rate": 1e-06, + "logits/chosen": 751.2327880859375, + "logits/rejected": 749.802001953125, + "logps/chosen": -110.74943542480469, + "logps/rejected": -117.26881408691406, + "loss": 0.6655, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.20365600287914276, + "rewards/margins": 0.056072235107421875, + "rewards/rejected": 0.1475837677717209, + "step": 25 + }, + { + "epoch": 0.18, + "learning_rate": 1e-06, + "logits/chosen": 735.2847900390625, + "logits/rejected": 717.2905883789062, + "logps/chosen": -110.95075225830078, + "logps/rejected": -137.11422729492188, + "loss": 0.6817, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15410080552101135, + "rewards/margins": 0.023120880126953125, + "rewards/rejected": -0.17722168564796448, + "step": 26 + }, + { + "epoch": 0.19, + "learning_rate": 1e-06, + "logits/chosen": 764.1547241210938, + "logits/rejected": 757.8618774414062, + "logps/chosen": -116.16393280029297, + "logps/rejected": -143.43783569335938, + "loss": 0.8991, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.432321161031723, + "rewards/margins": -0.376678466796875, + "rewards/rejected": -0.05564270168542862, + "step": 27 + }, + { + "epoch": 0.2, + "learning_rate": 1e-06, + "logits/chosen": 754.07373046875, + "logits/rejected": 754.0462036132812, + "logps/chosen": -118.10738372802734, + "logps/rejected": -81.28277587890625, + "loss": 0.8209, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.20333634316921234, + "rewards/margins": -0.2410331815481186, + "rewards/rejected": 0.03769683837890625, + "step": 28 + }, + { + "epoch": 0.2, + "learning_rate": 1e-06, + "logits/chosen": 726.7611694335938, + "logits/rejected": 754.1725463867188, + "logps/chosen": -35.8173713684082, + "logps/rejected": -49.520381927490234, + "loss": 0.6968, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.18792609870433807, + "rewards/margins": -0.007364645600318909, + "rewards/rejected": -0.18056145310401917, + "step": 29 + }, + { + "epoch": 0.21, + "learning_rate": 1e-06, + "logits/chosen": 757.5687866210938, + "logits/rejected": 747.6735229492188, + "logps/chosen": -121.11041259765625, + "logps/rejected": -127.15098571777344, + "loss": 0.6606, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09317779541015625, + "rewards/margins": 0.06621399521827698, + "rewards/rejected": -0.15939179062843323, + "step": 30 + }, + { + "epoch": 0.22, + "learning_rate": 1e-06, + "logits/chosen": 751.5040893554688, + "logits/rejected": 742.9932250976562, + "logps/chosen": -102.38017272949219, + "logps/rejected": -77.6241226196289, + "loss": 0.6426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08546600490808487, + "rewards/margins": 0.10382766276597977, + "rewards/rejected": -0.18929366767406464, + "step": 31 + }, + { + "epoch": 0.23, + "learning_rate": 1e-06, + "logits/chosen": 752.1027221679688, + "logits/rejected": 760.2054443359375, + "logps/chosen": -64.5015640258789, + "logps/rejected": -81.18472290039062, + "loss": 0.802, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3496158719062805, + "rewards/margins": -0.20701028406620026, + "rewards/rejected": -0.14260558784008026, + "step": 32 + }, + { + "epoch": 0.23, + "learning_rate": 1e-06, + "logits/chosen": 736.0115356445312, + "logits/rejected": 730.6161499023438, + "logps/chosen": -72.37939453125, + "logps/rejected": -78.26362609863281, + "loss": 0.6596, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07304687798023224, + "rewards/margins": 0.06815643608570099, + "rewards/rejected": 0.00489044189453125, + "step": 33 + }, + { + "epoch": 0.24, + "learning_rate": 1e-06, + "logits/chosen": 747.4063110351562, + "logits/rejected": 751.9911499023438, + "logps/chosen": -200.87123107910156, + "logps/rejected": -140.84243774414062, + "loss": 0.9749, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.45883941650390625, + "rewards/margins": -0.5014007687568665, + "rewards/rejected": 0.04256134107708931, + "step": 34 + }, + { + "epoch": 0.25, + "learning_rate": 1e-06, + "logits/chosen": 760.1029052734375, + "logits/rejected": 765.5377807617188, + "logps/chosen": -146.69708251953125, + "logps/rejected": -86.4393081665039, + "loss": 0.3671, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25168153643608093, + "rewards/margins": 0.8130607604980469, + "rewards/rejected": -0.5613792538642883, + "step": 35 + }, + { + "epoch": 0.25, + "learning_rate": 1e-06, + "logits/chosen": 750.3985595703125, + "logits/rejected": 734.8814697265625, + "logps/chosen": -102.51422119140625, + "logps/rejected": -123.13782501220703, + "loss": 0.6403, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.10166626423597336, + "rewards/margins": 0.10860519856214523, + "rewards/rejected": -0.2102714627981186, + "step": 36 + }, + { + "epoch": 0.26, + "learning_rate": 1e-06, + "logits/chosen": 763.2600708007812, + "logits/rejected": 762.0756225585938, + "logps/chosen": -125.00343322753906, + "logps/rejected": -129.2534942626953, + "loss": 0.8149, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.46032410860061646, + "rewards/margins": -0.23026199638843536, + "rewards/rejected": -0.2300621122121811, + "step": 37 + }, + { + "epoch": 0.27, + "learning_rate": 1e-06, + "logits/chosen": 751.3994140625, + "logits/rejected": 757.9010009765625, + "logps/chosen": -107.737060546875, + "logps/rejected": -114.07667541503906, + "loss": 0.4406, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.08528061211109161, + "rewards/margins": 0.5912086367607117, + "rewards/rejected": -0.5059280395507812, + "step": 38 + }, + { + "epoch": 0.27, + "learning_rate": 1e-06, + "logits/chosen": 754.9421997070312, + "logits/rejected": 754.058349609375, + "logps/chosen": -112.81529235839844, + "logps/rejected": -124.47413635253906, + "loss": 0.6796, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.05758056789636612, + "rewards/margins": 0.027332305908203125, + "rewards/rejected": -0.08491287380456924, + "step": 39 + }, + { + "epoch": 0.28, + "learning_rate": 1e-06, + "logits/chosen": 743.2937622070312, + "logits/rejected": 751.8687133789062, + "logps/chosen": -88.56942749023438, + "logps/rejected": -112.16087341308594, + "loss": 0.9837, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.27603456377983093, + "rewards/margins": -0.5153458118438721, + "rewards/rejected": 0.23931121826171875, + "step": 40 + }, + { + "epoch": 0.29, + "learning_rate": 1e-06, + "logits/chosen": 764.50830078125, + "logits/rejected": 761.2915649414062, + "logps/chosen": -106.87338256835938, + "logps/rejected": -54.64258575439453, + "loss": 0.5876, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18732452392578125, + "rewards/margins": 0.22363892197608948, + "rewards/rejected": -0.4109634459018707, + "step": 41 + }, + { + "epoch": 0.3, + "learning_rate": 1e-06, + "logits/chosen": 743.6443481445312, + "logits/rejected": 744.7010498046875, + "logps/chosen": -80.66154479980469, + "logps/rejected": -97.54801940917969, + "loss": 0.5698, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.10201110690832138, + "rewards/margins": 0.2641037106513977, + "rewards/rejected": -0.16209259629249573, + "step": 42 + }, + { + "epoch": 0.3, + "learning_rate": 1e-06, + "logits/chosen": 702.2275390625, + "logits/rejected": 725.9920654296875, + "logps/chosen": -88.91989135742188, + "logps/rejected": -67.57276153564453, + "loss": 0.6539, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.048259735107421875, + "rewards/margins": 0.08015899360179901, + "rewards/rejected": -0.1284187287092209, + "step": 43 + }, + { + "epoch": 0.31, + "learning_rate": 1e-06, + "logits/chosen": 755.2586669921875, + "logits/rejected": 736.1990966796875, + "logps/chosen": -99.885009765625, + "logps/rejected": -141.79798889160156, + "loss": 0.4196, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05605621263384819, + "rewards/margins": 0.6512848138809204, + "rewards/rejected": -0.5952286124229431, + "step": 44 + }, + { + "epoch": 0.32, + "learning_rate": 1e-06, + "logits/chosen": 751.4073486328125, + "logits/rejected": 760.989990234375, + "logps/chosen": -128.45132446289062, + "logps/rejected": -123.00556945800781, + "loss": 0.6127, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03395385667681694, + "rewards/margins": 0.16784439980983734, + "rewards/rejected": -0.1338905394077301, + "step": 45 + }, + { + "epoch": 0.32, + "learning_rate": 1e-06, + "logits/chosen": 752.4517822265625, + "logits/rejected": 753.5572509765625, + "logps/chosen": -83.05987548828125, + "logps/rejected": -116.48635864257812, + "loss": 0.6134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09021759033203125, + "rewards/margins": 0.16639479994773865, + "rewards/rejected": -0.2566123902797699, + "step": 46 + }, + { + "epoch": 0.33, + "learning_rate": 1e-06, + "logits/chosen": 735.6661987304688, + "logits/rejected": 738.6618041992188, + "logps/chosen": -98.42833709716797, + "logps/rejected": -123.45014953613281, + "loss": 0.6275, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0876060500741005, + "rewards/margins": 0.1358238160610199, + "rewards/rejected": -0.223429873585701, + "step": 47 + }, + { + "epoch": 0.34, + "learning_rate": 1e-06, + "logits/chosen": 756.7797241210938, + "logits/rejected": 760.5571899414062, + "logps/chosen": -90.3933334350586, + "logps/rejected": -93.92127990722656, + "loss": 0.735, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.22783660888671875, + "rewards/margins": -0.08210906386375427, + "rewards/rejected": -0.14572754502296448, + "step": 48 + }, + { + "epoch": 0.35, + "learning_rate": 1e-06, + "logits/chosen": 756.03271484375, + "logits/rejected": 751.1925659179688, + "logps/chosen": -94.63128662109375, + "logps/rejected": -111.46923828125, + "loss": 0.4988, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.332620233297348, + "rewards/margins": 0.4359176754951477, + "rewards/rejected": -0.10329742729663849, + "step": 49 + }, + { + "epoch": 0.35, + "learning_rate": 1e-06, + "logits/chosen": 750.6693725585938, + "logits/rejected": 727.6410522460938, + "logps/chosen": -127.14371490478516, + "logps/rejected": -141.1189422607422, + "loss": 0.53, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.14541397988796234, + "rewards/margins": 0.3583183288574219, + "rewards/rejected": -0.21290436387062073, + "step": 50 + }, + { + "epoch": 0.36, + "learning_rate": 1e-06, + "logits/chosen": 752.72314453125, + "logits/rejected": 742.1061401367188, + "logps/chosen": -139.9878387451172, + "logps/rejected": -115.6321792602539, + "loss": 0.7613, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.0016769409412518144, + "rewards/margins": -0.13192978501319885, + "rewards/rejected": 0.13025283813476562, + "step": 51 + }, + { + "epoch": 0.37, + "learning_rate": 1e-06, + "logits/chosen": 756.05029296875, + "logits/rejected": 751.9379272460938, + "logps/chosen": -125.40816497802734, + "logps/rejected": -157.74908447265625, + "loss": 0.619, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.030683135613799095, + "rewards/margins": 0.1542045623064041, + "rewards/rejected": -0.12352142482995987, + "step": 52 + }, + { + "epoch": 0.37, + "learning_rate": 1e-06, + "logits/chosen": 748.6371459960938, + "logits/rejected": 748.2842407226562, + "logps/chosen": -58.064640045166016, + "logps/rejected": -70.60032653808594, + "loss": 0.7691, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.04222908243536949, + "rewards/margins": -0.14660683274269104, + "rewards/rejected": 0.10437774658203125, + "step": 53 + }, + { + "epoch": 0.38, + "learning_rate": 1e-06, + "logits/chosen": 758.9898681640625, + "logits/rejected": 759.9035034179688, + "logps/chosen": -92.18698120117188, + "logps/rejected": -56.28901290893555, + "loss": 0.6426, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.051839448511600494, + "rewards/margins": 0.10371094197034836, + "rewards/rejected": -0.15555039048194885, + "step": 54 + }, + { + "epoch": 0.39, + "learning_rate": 1e-06, + "logits/chosen": 757.0160522460938, + "logits/rejected": 760.1632690429688, + "logps/chosen": -124.617431640625, + "logps/rejected": -134.78347778320312, + "loss": 0.6649, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.039947509765625, + "rewards/margins": 0.05721893161535263, + "rewards/rejected": -0.09716644138097763, + "step": 55 + }, + { + "epoch": 0.39, + "learning_rate": 1e-06, + "logits/chosen": 746.8894653320312, + "logits/rejected": 748.6250610351562, + "logps/chosen": -128.22354125976562, + "logps/rejected": -121.92475891113281, + "loss": 0.5966, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3081298768520355, + "rewards/margins": 0.2034912407398224, + "rewards/rejected": -0.5116211175918579, + "step": 56 + }, + { + "epoch": 0.4, + "learning_rate": 1e-06, + "logits/chosen": 761.6463012695312, + "logits/rejected": 755.6340942382812, + "logps/chosen": -119.68124389648438, + "logps/rejected": -120.28682708740234, + "loss": 0.529, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.211955264210701, + "rewards/margins": 0.3605507016181946, + "rewards/rejected": -0.1485954374074936, + "step": 57 + }, + { + "epoch": 0.41, + "learning_rate": 1e-06, + "logits/chosen": 744.4283447265625, + "logits/rejected": 753.7869262695312, + "logps/chosen": -140.1608428955078, + "logps/rejected": -108.81074523925781, + "loss": 0.6257, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01197052001953125, + "rewards/margins": 0.1397911161184311, + "rewards/rejected": -0.12782059609889984, + "step": 58 + }, + { + "epoch": 0.42, + "learning_rate": 1e-06, + "logits/chosen": 752.8543090820312, + "logits/rejected": 755.0949096679688, + "logps/chosen": -116.94694519042969, + "logps/rejected": -146.7781524658203, + "loss": 0.4914, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.037078857421875, + "rewards/margins": 0.45475006103515625, + "rewards/rejected": -0.41767120361328125, + "step": 59 + }, + { + "epoch": 0.42, + "learning_rate": 1e-06, + "logits/chosen": 757.67529296875, + "logits/rejected": 761.3323974609375, + "logps/chosen": -89.47793579101562, + "logps/rejected": -143.24212646484375, + "loss": 0.8872, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4256172180175781, + "rewards/margins": -0.3564491271972656, + "rewards/rejected": -0.0691680908203125, + "step": 60 + }, + { + "epoch": 0.43, + "learning_rate": 1e-06, + "logits/chosen": 738.3493041992188, + "logits/rejected": 724.2689208984375, + "logps/chosen": -102.72940063476562, + "logps/rejected": -118.49750518798828, + "loss": 0.6916, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.023908233270049095, + "rewards/margins": 0.00304565392434597, + "rewards/rejected": 0.020862579345703125, + "step": 61 + }, + { + "epoch": 0.44, + "learning_rate": 1e-06, + "logits/chosen": 751.5997924804688, + "logits/rejected": 752.8407592773438, + "logps/chosen": -106.08146667480469, + "logps/rejected": -90.82438659667969, + "loss": 0.8217, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.14467011392116547, + "rewards/margins": -0.24251481890678406, + "rewards/rejected": 0.097844697535038, + "step": 62 + }, + { + "epoch": 0.44, + "learning_rate": 1e-06, + "logits/chosen": 752.1771850585938, + "logits/rejected": 747.8937377929688, + "logps/chosen": -117.58723449707031, + "logps/rejected": -115.23249816894531, + "loss": 0.6372, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4052841365337372, + "rewards/margins": 0.11519011855125427, + "rewards/rejected": -0.5204742550849915, + "step": 63 + }, + { + "epoch": 0.45, + "learning_rate": 1e-06, + "logits/chosen": 761.4396362304688, + "logits/rejected": 762.1318359375, + "logps/chosen": -129.77969360351562, + "logps/rejected": -130.31796264648438, + "loss": 0.5847, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07098998874425888, + "rewards/margins": 0.23017579317092896, + "rewards/rejected": -0.15918579697608948, + "step": 64 + }, + { + "epoch": 0.46, + "learning_rate": 1e-06, + "logits/chosen": 757.9916381835938, + "logits/rejected": 753.4136962890625, + "logps/chosen": -129.6767120361328, + "logps/rejected": -125.47244262695312, + "loss": 1.1914, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.5169326663017273, + "rewards/margins": -0.8293319940567017, + "rewards/rejected": 0.312399297952652, + "step": 65 + }, + { + "epoch": 0.46, + "learning_rate": 1e-06, + "logits/chosen": 759.7933959960938, + "logits/rejected": 759.0105590820312, + "logps/chosen": -98.1355972290039, + "logps/rejected": -93.47838592529297, + "loss": 0.8157, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.022873688489198685, + "rewards/margins": -0.2316688597202301, + "rewards/rejected": 0.20879517495632172, + "step": 66 + }, + { + "epoch": 0.47, + "learning_rate": 1e-06, + "logits/chosen": 754.3090209960938, + "logits/rejected": 743.0081787109375, + "logps/chosen": -99.8756103515625, + "logps/rejected": -108.28828430175781, + "loss": 0.5445, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06540145725011826, + "rewards/margins": 0.32337725162506104, + "rewards/rejected": -0.2579757869243622, + "step": 67 + }, + { + "epoch": 0.48, + "learning_rate": 1e-06, + "logits/chosen": 761.7742919921875, + "logits/rejected": 758.9119873046875, + "logps/chosen": -119.34580993652344, + "logps/rejected": -163.52130126953125, + "loss": 0.947, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.38124313950538635, + "rewards/margins": -0.45611801743507385, + "rewards/rejected": 0.0748748779296875, + "step": 68 + }, + { + "epoch": 0.49, + "learning_rate": 1e-06, + "logits/chosen": 760.0370483398438, + "logits/rejected": 761.59716796875, + "logps/chosen": -119.20895385742188, + "logps/rejected": -128.97647094726562, + "loss": 0.7996, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.28709107637405396, + "rewards/margins": -0.20270845293998718, + "rewards/rejected": -0.08438263088464737, + "step": 69 + }, + { + "epoch": 0.49, + "learning_rate": 1e-06, + "logits/chosen": 751.2227783203125, + "logits/rejected": 761.312255859375, + "logps/chosen": -95.9995346069336, + "logps/rejected": -100.01705932617188, + "loss": 0.8086, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.24229812622070312, + "rewards/margins": -0.218891903758049, + "rewards/rejected": -0.023406220600008965, + "step": 70 + }, + { + "epoch": 0.5, + "learning_rate": 1e-06, + "logits/chosen": 760.1939697265625, + "logits/rejected": 759.4922485351562, + "logps/chosen": -131.91094970703125, + "logps/rejected": -144.0423583984375, + "loss": 0.698, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.18262329697608948, + "rewards/margins": -0.009626775979995728, + "rewards/rejected": -0.17299652099609375, + "step": 71 + }, + { + "epoch": 0.51, + "learning_rate": 1e-06, + "logits/chosen": 757.9854125976562, + "logits/rejected": 761.405517578125, + "logps/chosen": -104.34053039550781, + "logps/rejected": -83.25271606445312, + "loss": 0.7948, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.49568939208984375, + "rewards/margins": -0.19383087754249573, + "rewards/rejected": -0.301858514547348, + "step": 72 + }, + { + "epoch": 0.51, + "learning_rate": 1e-06, + "logits/chosen": 767.4866943359375, + "logits/rejected": 761.1202392578125, + "logps/chosen": -111.77717590332031, + "logps/rejected": -97.4455337524414, + "loss": 0.5509, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11773528903722763, + "rewards/margins": 0.30806654691696167, + "rewards/rejected": -0.4258018434047699, + "step": 73 + }, + { + "epoch": 0.52, + "learning_rate": 1e-06, + "logits/chosen": 754.52001953125, + "logits/rejected": 752.1524047851562, + "logps/chosen": -121.61293029785156, + "logps/rejected": -93.05191040039062, + "loss": 0.7292, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1756126433610916, + "rewards/margins": -0.0708213821053505, + "rewards/rejected": -0.10479126125574112, + "step": 74 + }, + { + "epoch": 0.53, + "learning_rate": 1e-06, + "logits/chosen": 741.5098266601562, + "logits/rejected": 739.8911743164062, + "logps/chosen": -106.20742797851562, + "logps/rejected": -46.48041534423828, + "loss": 0.5587, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08890075981616974, + "rewards/margins": 0.28971290588378906, + "rewards/rejected": -0.37861368060112, + "step": 75 + }, + { + "epoch": 0.54, + "learning_rate": 1e-06, + "logits/chosen": 745.55810546875, + "logits/rejected": 750.806640625, + "logps/chosen": -119.76245880126953, + "logps/rejected": -101.90979766845703, + "loss": 0.8207, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3279609680175781, + "rewards/margins": -0.2407333254814148, + "rewards/rejected": -0.08722763508558273, + "step": 76 + }, + { + "epoch": 0.54, + "learning_rate": 1e-06, + "logits/chosen": 763.961181640625, + "logits/rejected": 761.4111328125, + "logps/chosen": -123.98377227783203, + "logps/rejected": -85.49230194091797, + "loss": 0.5695, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06825637817382812, + "rewards/margins": 0.26486968994140625, + "rewards/rejected": -0.3331260681152344, + "step": 77 + }, + { + "epoch": 0.55, + "learning_rate": 1e-06, + "logits/chosen": 762.0533447265625, + "logits/rejected": 759.9553833007812, + "logps/chosen": -118.5648422241211, + "logps/rejected": -99.68025207519531, + "loss": 0.6956, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2101341336965561, + "rewards/margins": -0.004981234669685364, + "rewards/rejected": -0.20515289902687073, + "step": 78 + }, + { + "epoch": 0.56, + "learning_rate": 1e-06, + "logits/chosen": 755.7527465820312, + "logits/rejected": 749.006591796875, + "logps/chosen": -128.2355499267578, + "logps/rejected": -106.59686279296875, + "loss": 0.7097, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1735221892595291, + "rewards/margins": -0.032891079783439636, + "rewards/rejected": -0.14063110947608948, + "step": 79 + }, + { + "epoch": 0.56, + "learning_rate": 1e-06, + "logits/chosen": 747.9053344726562, + "logits/rejected": 742.8861694335938, + "logps/chosen": -102.98145294189453, + "logps/rejected": -119.53512573242188, + "loss": 0.9196, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.084833525121212, + "rewards/margins": -0.4108985960483551, + "rewards/rejected": 0.3260650634765625, + "step": 80 + }, + { + "epoch": 0.57, + "learning_rate": 1e-06, + "logits/chosen": 764.1580810546875, + "logits/rejected": 764.8826293945312, + "logps/chosen": -119.13794708251953, + "logps/rejected": -152.4549560546875, + "loss": 0.6961, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.363504022359848, + "rewards/margins": -0.00585174560546875, + "rewards/rejected": -0.3576522767543793, + "step": 81 + }, + { + "epoch": 0.58, + "learning_rate": 1e-06, + "logits/chosen": 764.7913818359375, + "logits/rejected": 767.0409545898438, + "logps/chosen": -75.571533203125, + "logps/rejected": -47.680519104003906, + "loss": 0.6294, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.029735565185546875, + "rewards/margins": 0.13176842033863068, + "rewards/rejected": -0.16150398552417755, + "step": 82 + }, + { + "epoch": 0.58, + "learning_rate": 1e-06, + "logits/chosen": 753.986572265625, + "logits/rejected": 753.5350952148438, + "logps/chosen": -124.35569763183594, + "logps/rejected": -102.05960083007812, + "loss": 0.5461, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08840484917163849, + "rewards/margins": 0.3195533752441406, + "rewards/rejected": -0.4079582393169403, + "step": 83 + }, + { + "epoch": 0.59, + "learning_rate": 1e-06, + "logits/chosen": 764.2150268554688, + "logits/rejected": 762.1077880859375, + "logps/chosen": -140.03704833984375, + "logps/rejected": -129.44155883789062, + "loss": 0.5424, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.22366943955421448, + "rewards/margins": 0.3283752501010895, + "rewards/rejected": -0.104705810546875, + "step": 84 + }, + { + "epoch": 0.6, + "learning_rate": 1e-06, + "logits/chosen": 755.70703125, + "logits/rejected": 763.6488647460938, + "logps/chosen": -153.62710571289062, + "logps/rejected": -72.65528869628906, + "loss": 0.664, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.0819549560546875, + "rewards/margins": 0.05920639634132385, + "rewards/rejected": -0.14116135239601135, + "step": 85 + }, + { + "epoch": 0.61, + "learning_rate": 1e-06, + "logits/chosen": 753.8207397460938, + "logits/rejected": 753.836669921875, + "logps/chosen": -87.35313415527344, + "logps/rejected": -100.01573944091797, + "loss": 0.6427, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2605789303779602, + "rewards/margins": 0.10361480712890625, + "rewards/rejected": -0.36419373750686646, + "step": 86 + }, + { + "epoch": 0.61, + "learning_rate": 1e-06, + "logits/chosen": 745.9812622070312, + "logits/rejected": 737.1619262695312, + "logps/chosen": -143.4858856201172, + "logps/rejected": -134.9827880859375, + "loss": 1.1085, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4747863709926605, + "rewards/margins": -0.7079299688339233, + "rewards/rejected": 0.233143612742424, + "step": 87 + }, + { + "epoch": 0.62, + "learning_rate": 1e-06, + "logits/chosen": 751.6220703125, + "logits/rejected": 750.4932861328125, + "logps/chosen": -123.07611083984375, + "logps/rejected": -105.552978515625, + "loss": 0.7491, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3319442868232727, + "rewards/margins": -0.10901184380054474, + "rewards/rejected": -0.22293244302272797, + "step": 88 + }, + { + "epoch": 0.63, + "learning_rate": 1e-06, + "logits/chosen": 745.0188598632812, + "logits/rejected": 753.0629272460938, + "logps/chosen": -98.93976593017578, + "logps/rejected": -86.66239929199219, + "loss": 0.6431, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.17377853393554688, + "rewards/margins": 0.10282212495803833, + "rewards/rejected": -0.2766006588935852, + "step": 89 + }, + { + "epoch": 0.63, + "learning_rate": 1e-06, + "logits/chosen": 764.0938720703125, + "logits/rejected": 764.4071655273438, + "logps/chosen": -67.20352172851562, + "logps/rejected": -74.39288330078125, + "loss": 0.7675, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.15355224907398224, + "rewards/margins": -0.14358673989772797, + "rewards/rejected": -0.009965515695512295, + "step": 90 + }, + { + "epoch": 0.64, + "learning_rate": 1e-06, + "logits/chosen": 753.6372680664062, + "logits/rejected": 753.1512451171875, + "logps/chosen": -97.92921447753906, + "logps/rejected": -91.00151062011719, + "loss": 0.8089, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2650283873081207, + "rewards/margins": -0.21955186128616333, + "rewards/rejected": -0.045476533472537994, + "step": 91 + }, + { + "epoch": 0.65, + "learning_rate": 1e-06, + "logits/chosen": 723.7212524414062, + "logits/rejected": 720.0075073242188, + "logps/chosen": -145.670654296875, + "logps/rejected": -99.59265899658203, + "loss": 0.7359, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3436279296875, + "rewards/margins": -0.0836586058139801, + "rewards/rejected": -0.2599693238735199, + "step": 92 + }, + { + "epoch": 0.65, + "learning_rate": 1e-06, + "logits/chosen": 763.4015502929688, + "logits/rejected": 764.9708251953125, + "logps/chosen": -136.78121948242188, + "logps/rejected": -140.60479736328125, + "loss": 0.794, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.27412110567092896, + "rewards/margins": -0.1925399899482727, + "rewards/rejected": -0.08158111572265625, + "step": 93 + }, + { + "epoch": 0.66, + "learning_rate": 1e-06, + "logits/chosen": 753.7976684570312, + "logits/rejected": 754.0098266601562, + "logps/chosen": -114.50205993652344, + "logps/rejected": -147.43289184570312, + "loss": 0.7339, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.17452393472194672, + "rewards/margins": -0.07981415092945099, + "rewards/rejected": 0.2543380856513977, + "step": 94 + }, + { + "epoch": 0.67, + "learning_rate": 1e-06, + "logits/chosen": 748.94921875, + "logits/rejected": 752.2357177734375, + "logps/chosen": -115.40544128417969, + "logps/rejected": -129.00430297851562, + "loss": 0.7599, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1198020949959755, + "rewards/margins": -0.12924423813819885, + "rewards/rejected": 0.009442138485610485, + "step": 95 + }, + { + "epoch": 0.68, + "learning_rate": 1e-06, + "logits/chosen": 759.995849609375, + "logits/rejected": 761.3862915039062, + "logps/chosen": -124.8411865234375, + "logps/rejected": -121.31877899169922, + "loss": 0.802, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0161285400390625, + "rewards/margins": -0.20698776841163635, + "rewards/rejected": 0.22311630845069885, + "step": 96 + }, + { + "epoch": 0.68, + "learning_rate": 1e-06, + "logits/chosen": 743.0630493164062, + "logits/rejected": 737.2128295898438, + "logps/chosen": -126.05711364746094, + "logps/rejected": -127.29440307617188, + "loss": 0.8768, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.3359634578227997, + "rewards/margins": -0.33870774507522583, + "rewards/rejected": 0.0027442933060228825, + "step": 97 + }, + { + "epoch": 0.69, + "learning_rate": 1e-06, + "logits/chosen": 747.9443969726562, + "logits/rejected": 749.3529663085938, + "logps/chosen": -98.00287628173828, + "logps/rejected": -123.86518096923828, + "loss": 0.5937, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.019467925652861595, + "rewards/margins": 0.20989990234375, + "rewards/rejected": -0.19043198227882385, + "step": 98 + }, + { + "epoch": 0.7, + "learning_rate": 1e-06, + "logits/chosen": 757.425537109375, + "logits/rejected": 757.1771240234375, + "logps/chosen": -145.6573486328125, + "logps/rejected": -172.71075439453125, + "loss": 0.6653, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.09065399318933487, + "rewards/margins": 0.05641632154583931, + "rewards/rejected": 0.03423767164349556, + "step": 99 + }, + { + "epoch": 0.7, + "learning_rate": 1e-06, + "logits/chosen": 743.8179931640625, + "logits/rejected": 742.6279296875, + "logps/chosen": -105.31732177734375, + "logps/rejected": -105.94932556152344, + "loss": 0.61, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.05047149583697319, + "rewards/margins": 0.17373962700366974, + "rewards/rejected": -0.12326812744140625, + "step": 100 + }, + { + "epoch": 0.71, + "learning_rate": 1e-06, + "logits/chosen": 764.404541015625, + "logits/rejected": 766.1242065429688, + "logps/chosen": -74.53715515136719, + "logps/rejected": -126.6571273803711, + "loss": 0.712, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.23829498887062073, + "rewards/margins": -0.037270352244377136, + "rewards/rejected": -0.2010246366262436, + "step": 101 + }, + { + "epoch": 0.72, + "learning_rate": 1e-06, + "logits/chosen": 765.6698608398438, + "logits/rejected": 761.167724609375, + "logps/chosen": -121.03076934814453, + "logps/rejected": -146.04257202148438, + "loss": 0.5161, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3844200074672699, + "rewards/margins": 0.39238661527633667, + "rewards/rejected": -0.007966614328324795, + "step": 102 + }, + { + "epoch": 0.73, + "learning_rate": 1e-06, + "logits/chosen": 762.09814453125, + "logits/rejected": 761.5235595703125, + "logps/chosen": -74.78302001953125, + "logps/rejected": -54.076114654541016, + "loss": 0.7615, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2950142025947571, + "rewards/margins": -0.13233071565628052, + "rewards/rejected": -0.16268348693847656, + "step": 103 + }, + { + "epoch": 0.73, + "learning_rate": 1e-06, + "logits/chosen": 740.9817504882812, + "logits/rejected": 742.8893432617188, + "logps/chosen": -114.18751525878906, + "logps/rejected": -102.99320983886719, + "loss": 0.8736, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.24495545029640198, + "rewards/margins": -0.33319780230522156, + "rewards/rejected": 0.08824234455823898, + "step": 104 + }, + { + "epoch": 0.74, + "learning_rate": 1e-06, + "logits/chosen": 757.228271484375, + "logits/rejected": 754.0084228515625, + "logps/chosen": -122.21895599365234, + "logps/rejected": -109.7493667602539, + "loss": 0.5761, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.12509766221046448, + "rewards/margins": 0.2496589720249176, + "rewards/rejected": -0.12456130981445312, + "step": 105 + }, + { + "epoch": 0.75, + "learning_rate": 1e-06, + "logits/chosen": 747.152099609375, + "logits/rejected": 744.0999755859375, + "logps/chosen": -112.57635498046875, + "logps/rejected": -118.1578598022461, + "loss": 0.7861, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.13722534477710724, + "rewards/margins": -0.1780494749546051, + "rewards/rejected": 0.040824126452207565, + "step": 106 + }, + { + "epoch": 0.75, + "learning_rate": 1e-06, + "logits/chosen": 761.1990356445312, + "logits/rejected": 746.3059692382812, + "logps/chosen": -105.62643432617188, + "logps/rejected": -157.44781494140625, + "loss": 0.6916, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19440308213233948, + "rewards/margins": 0.0030639618635177612, + "rewards/rejected": -0.19746704399585724, + "step": 107 + }, + { + "epoch": 0.76, + "learning_rate": 1e-06, + "logits/chosen": 753.099609375, + "logits/rejected": 749.4813842773438, + "logps/chosen": -94.44894409179688, + "logps/rejected": -130.79641723632812, + "loss": 0.6541, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.11890258640050888, + "rewards/margins": 0.07972564548254013, + "rewards/rejected": -0.198628231883049, + "step": 108 + }, + { + "epoch": 0.77, + "learning_rate": 1e-06, + "logits/chosen": 729.2442016601562, + "logits/rejected": 728.8242797851562, + "logps/chosen": -118.32754516601562, + "logps/rejected": -128.49957275390625, + "loss": 0.5382, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.065180204808712, + "rewards/margins": 0.3383010923862457, + "rewards/rejected": -0.40348130464553833, + "step": 109 + }, + { + "epoch": 0.77, + "learning_rate": 1e-06, + "logits/chosen": 753.8582763671875, + "logits/rejected": 762.9027709960938, + "logps/chosen": -118.78437805175781, + "logps/rejected": -108.376953125, + "loss": 0.5721, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25769349932670593, + "rewards/margins": 0.25887376070022583, + "rewards/rejected": -0.0011802673107013106, + "step": 110 + }, + { + "epoch": 0.78, + "learning_rate": 1e-06, + "logits/chosen": 733.0245361328125, + "logits/rejected": 746.3375244140625, + "logps/chosen": -72.14678192138672, + "logps/rejected": -118.18450927734375, + "loss": 0.8029, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.2560882568359375, + "rewards/margins": -0.20856551826000214, + "rewards/rejected": -0.047522734850645065, + "step": 111 + }, + { + "epoch": 0.79, + "learning_rate": 1e-06, + "logits/chosen": 742.6484375, + "logits/rejected": 751.5332641601562, + "logps/chosen": -90.23307800292969, + "logps/rejected": -104.455078125, + "loss": 0.5533, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.03204040601849556, + "rewards/margins": 0.3023841977119446, + "rewards/rejected": -0.2703437805175781, + "step": 112 + }, + { + "epoch": 0.8, + "learning_rate": 1e-06, + "logits/chosen": 742.335693359375, + "logits/rejected": 740.8668823242188, + "logps/chosen": -82.45018005371094, + "logps/rejected": -106.21551513671875, + "loss": 0.5271, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.07861023396253586, + "rewards/margins": 0.36531221866607666, + "rewards/rejected": -0.2867019772529602, + "step": 113 + }, + { + "epoch": 0.8, + "learning_rate": 1e-06, + "logits/chosen": 754.1641235351562, + "logits/rejected": 759.054443359375, + "logps/chosen": -83.60963439941406, + "logps/rejected": -119.83500671386719, + "loss": 0.6445, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2449295073747635, + "rewards/margins": 0.09970168769359589, + "rewards/rejected": -0.3446311950683594, + "step": 114 + }, + { + "epoch": 0.81, + "learning_rate": 1e-06, + "logits/chosen": 753.371337890625, + "logits/rejected": 748.583984375, + "logps/chosen": -141.4698944091797, + "logps/rejected": -116.79045867919922, + "loss": 0.5158, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.2446746826171875, + "rewards/margins": 0.39311903715133667, + "rewards/rejected": -0.14844436943531036, + "step": 115 + }, + { + "epoch": 0.82, + "learning_rate": 1e-06, + "logits/chosen": 753.511474609375, + "logits/rejected": 755.4053955078125, + "logps/chosen": -112.50521087646484, + "logps/rejected": -112.9051742553711, + "loss": 0.7683, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.7129722833633423, + "rewards/margins": -0.14510804414749146, + "rewards/rejected": -0.5678642392158508, + "step": 116 + }, + { + "epoch": 0.82, + "learning_rate": 1e-06, + "logits/chosen": 748.044921875, + "logits/rejected": 752.298095703125, + "logps/chosen": -76.83850860595703, + "logps/rejected": -125.4980239868164, + "loss": 0.8519, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4200790524482727, + "rewards/margins": -0.29563599824905396, + "rewards/rejected": -0.12444305419921875, + "step": 117 + }, + { + "epoch": 0.83, + "learning_rate": 1e-06, + "logits/chosen": 751.7593383789062, + "logits/rejected": 749.989990234375, + "logps/chosen": -95.87763977050781, + "logps/rejected": -115.7607650756836, + "loss": 0.5681, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18688355386257172, + "rewards/margins": 0.2679740786552429, + "rewards/rejected": -0.45485764741897583, + "step": 118 + }, + { + "epoch": 0.84, + "learning_rate": 1e-06, + "logits/chosen": 754.4381103515625, + "logits/rejected": 743.6098022460938, + "logps/chosen": -112.17021942138672, + "logps/rejected": -114.05720520019531, + "loss": 0.9076, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.36246567964553833, + "rewards/margins": -0.3909034729003906, + "rewards/rejected": 0.02843780629336834, + "step": 119 + }, + { + "epoch": 0.85, + "learning_rate": 1e-06, + "logits/chosen": 749.8277587890625, + "logits/rejected": 766.5257568359375, + "logps/chosen": -132.20159912109375, + "logps/rejected": -144.526123046875, + "loss": 0.4665, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.49238815903663635, + "rewards/margins": 0.5202751159667969, + "rewards/rejected": -1.0126632452011108, + "step": 120 + }, + { + "epoch": 0.85, + "learning_rate": 1e-06, + "logits/chosen": 754.5902099609375, + "logits/rejected": 754.7726440429688, + "logps/chosen": -138.08837890625, + "logps/rejected": -104.69085693359375, + "loss": 0.659, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2428436279296875, + "rewards/margins": 0.06955718994140625, + "rewards/rejected": -0.31240081787109375, + "step": 121 + }, + { + "epoch": 0.86, + "learning_rate": 1e-06, + "logits/chosen": 756.3060302734375, + "logits/rejected": 753.31103515625, + "logps/chosen": -143.39187622070312, + "logps/rejected": -134.10870361328125, + "loss": 0.7605, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.14795684814453125, + "rewards/margins": -0.1303756684064865, + "rewards/rejected": -0.01758117787539959, + "step": 122 + }, + { + "epoch": 0.87, + "learning_rate": 1e-06, + "logits/chosen": 732.1908569335938, + "logits/rejected": 738.9258422851562, + "logps/chosen": -93.47746276855469, + "logps/rejected": -120.90376281738281, + "loss": 0.6415, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23574523627758026, + "rewards/margins": 0.10607834160327911, + "rewards/rejected": -0.3418235778808594, + "step": 123 + }, + { + "epoch": 0.87, + "learning_rate": 1e-06, + "logits/chosen": 765.3704833984375, + "logits/rejected": 755.211181640625, + "logps/chosen": -165.53350830078125, + "logps/rejected": -195.3149871826172, + "loss": 0.7472, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.1707717925310135, + "rewards/margins": -0.10535889118909836, + "rewards/rejected": -0.06541290134191513, + "step": 124 + }, + { + "epoch": 0.88, + "learning_rate": 1e-06, + "logits/chosen": 755.7764282226562, + "logits/rejected": 757.4081420898438, + "logps/chosen": -74.58445739746094, + "logps/rejected": -76.66616821289062, + "loss": 0.6643, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22104187309741974, + "rewards/margins": 0.05862046778202057, + "rewards/rejected": -0.2796623408794403, + "step": 125 + }, + { + "epoch": 0.89, + "learning_rate": 1e-06, + "logits/chosen": 754.1861572265625, + "logits/rejected": 751.4375610351562, + "logps/chosen": -114.86368560791016, + "logps/rejected": -115.52163696289062, + "loss": 0.5091, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.3700668513774872, + "rewards/margins": 0.40980303287506104, + "rewards/rejected": -0.039736177772283554, + "step": 126 + }, + { + "epoch": 0.89, + "learning_rate": 1e-06, + "logits/chosen": 770.7310791015625, + "logits/rejected": 769.5196533203125, + "logps/chosen": -106.07989501953125, + "logps/rejected": -112.40510559082031, + "loss": 0.7148, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.648785412311554, + "rewards/margins": -0.042759716510772705, + "rewards/rejected": -0.6060256958007812, + "step": 127 + }, + { + "epoch": 0.9, + "learning_rate": 1e-06, + "logits/chosen": 752.6226806640625, + "logits/rejected": 752.1729125976562, + "logps/chosen": -105.07903289794922, + "logps/rejected": -102.99949645996094, + "loss": 0.559, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.038236238062381744, + "rewards/margins": 0.2891387939453125, + "rewards/rejected": -0.25090256333351135, + "step": 128 + }, + { + "epoch": 0.91, + "learning_rate": 1e-06, + "logits/chosen": 763.7931518554688, + "logits/rejected": 760.289306640625, + "logps/chosen": -130.01654052734375, + "logps/rejected": -146.55398559570312, + "loss": 0.5344, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.25978395342826843, + "rewards/margins": 0.34752658009529114, + "rewards/rejected": -0.08774261921644211, + "step": 129 + }, + { + "epoch": 0.92, + "learning_rate": 1e-06, + "logits/chosen": 752.9420776367188, + "logits/rejected": 755.839111328125, + "logps/chosen": -65.53675079345703, + "logps/rejected": -101.43161010742188, + "loss": 0.768, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.23746834695339203, + "rewards/margins": -0.1444164365530014, + "rewards/rejected": -0.09305191040039062, + "step": 130 + }, + { + "epoch": 0.92, + "learning_rate": 1e-06, + "logits/chosen": 759.6641235351562, + "logits/rejected": 752.3460083007812, + "logps/chosen": -88.10502624511719, + "logps/rejected": -131.0496063232422, + "loss": 0.5026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07658081501722336, + "rewards/margins": 0.4262436032295227, + "rewards/rejected": -0.5028244256973267, + "step": 131 + }, + { + "epoch": 0.93, + "learning_rate": 1e-06, + "logits/chosen": 751.3743286132812, + "logits/rejected": 761.0347290039062, + "logps/chosen": -103.1326904296875, + "logps/rejected": -124.47802734375, + "loss": 0.8658, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.02178649976849556, + "rewards/margins": -0.31988680362701416, + "rewards/rejected": 0.2981002926826477, + "step": 132 + }, + { + "epoch": 0.94, + "learning_rate": 1e-06, + "logits/chosen": 738.010986328125, + "logits/rejected": 749.4124755859375, + "logps/chosen": -102.75056457519531, + "logps/rejected": -106.64369201660156, + "loss": 0.5289, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.23018188774585724, + "rewards/margins": 0.3609573245048523, + "rewards/rejected": -0.13077545166015625, + "step": 133 + }, + { + "epoch": 0.94, + "learning_rate": 1e-06, + "logits/chosen": 760.3431396484375, + "logits/rejected": 759.145263671875, + "logps/chosen": -147.86944580078125, + "logps/rejected": -141.9213104248047, + "loss": 0.5212, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08485870808362961, + "rewards/margins": 0.3796554505825043, + "rewards/rejected": -0.4645141661167145, + "step": 134 + }, + { + "epoch": 0.95, + "learning_rate": 1e-06, + "logits/chosen": 755.10107421875, + "logits/rejected": 740.0361328125, + "logps/chosen": -75.2493667602539, + "logps/rejected": -107.53886413574219, + "loss": 0.5842, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2933342158794403, + "rewards/margins": 0.23123016953468323, + "rewards/rejected": -0.5245643854141235, + "step": 135 + }, + { + "epoch": 0.96, + "learning_rate": 1e-06, + "logits/chosen": 761.3751220703125, + "logits/rejected": 763.627197265625, + "logps/chosen": -129.47021484375, + "logps/rejected": -110.99361419677734, + "loss": 0.8404, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.5953087210655212, + "rewards/margins": -0.27550891041755676, + "rewards/rejected": -0.3197998106479645, + "step": 136 + }, + { + "epoch": 0.96, + "learning_rate": 1e-06, + "logits/chosen": 754.5219116210938, + "logits/rejected": 755.13232421875, + "logps/chosen": -78.75035095214844, + "logps/rejected": -77.590087890625, + "loss": 0.582, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.04389190673828125, + "rewards/margins": 0.23624879121780396, + "rewards/rejected": -0.2801406979560852, + "step": 137 + }, + { + "epoch": 0.97, + "learning_rate": 1e-06, + "logits/chosen": 759.3259887695312, + "logits/rejected": 749.2542724609375, + "logps/chosen": -92.72389221191406, + "logps/rejected": -128.94461059570312, + "loss": 0.7277, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.4006607234477997, + "rewards/margins": -0.06800997257232666, + "rewards/rejected": -0.332650750875473, + "step": 138 + }, + { + "epoch": 0.98, + "learning_rate": 1e-06, + "logits/chosen": 755.8272705078125, + "logits/rejected": 759.6437377929688, + "logps/chosen": -107.02555847167969, + "logps/rejected": -52.75437927246094, + "loss": 0.4474, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.06465301662683487, + "rewards/margins": 0.5724052786827087, + "rewards/rejected": -0.5077522397041321, + "step": 139 + }, + { + "epoch": 0.99, + "learning_rate": 1e-06, + "logits/chosen": 753.6461791992188, + "logits/rejected": 743.0986328125, + "logps/chosen": -124.5606460571289, + "logps/rejected": -129.29444885253906, + "loss": 0.5162, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1964561492204666, + "rewards/margins": 0.39213335514068604, + "rewards/rejected": -0.19567719101905823, + "step": 140 + }, + { + "epoch": 0.99, + "learning_rate": 1e-06, + "logits/chosen": 762.0756225585938, + "logits/rejected": 764.7891845703125, + "logps/chosen": -120.2902603149414, + "logps/rejected": -137.1676788330078, + "loss": 0.8638, + "rewards/accuracies": 0.0, + "rewards/chosen": -0.43293991684913635, + "rewards/margins": -0.3163765072822571, + "rewards/rejected": -0.11656341701745987, + "step": 141 + }, + { + "epoch": 1.0, + "learning_rate": 1e-06, + "logits/chosen": 755.4547729492188, + "logits/rejected": 753.641357421875, + "logps/chosen": -115.51753234863281, + "logps/rejected": -127.26959991455078, + "loss": 0.6359, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.07507782429456711, + "rewards/margins": 0.11796417087316513, + "rewards/rejected": -0.19304199516773224, + "step": 142 + }, + { + "epoch": 1.01, + "learning_rate": 1e-06, + "logits/chosen": 737.989990234375, + "logits/rejected": 749.4385986328125, + "logps/chosen": -91.63616943359375, + "logps/rejected": -120.50233459472656, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3416213989257812, + "rewards/margins": 2.8582611083984375, + "rewards/rejected": -1.5166397094726562, + "step": 143 + }, + { + "epoch": 1.02, + "learning_rate": 1e-06, + "logits/chosen": 747.44189453125, + "logits/rejected": 751.6834716796875, + "logps/chosen": -60.1141471862793, + "logps/rejected": -147.59805297851562, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2523571252822876, + "rewards/margins": 3.5868029594421387, + "rewards/rejected": -2.3344459533691406, + "step": 144 + }, + { + "epoch": 1.03, + "learning_rate": 1e-06, + "logits/chosen": 760.1705322265625, + "logits/rejected": 757.1412963867188, + "logps/chosen": -104.97271728515625, + "logps/rejected": -185.25779724121094, + "loss": 0.0418, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0560661554336548, + "rewards/margins": 3.1548409461975098, + "rewards/rejected": -2.0987746715545654, + "step": 145 + }, + { + "epoch": 1.04, + "learning_rate": 1e-06, + "logits/chosen": 770.2408447265625, + "logits/rejected": 769.0567626953125, + "logps/chosen": -93.2243423461914, + "logps/rejected": -124.7294921875, + "loss": 0.0808, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6367698907852173, + "rewards/margins": 2.475234270095825, + "rewards/rejected": -1.838464379310608, + "step": 146 + }, + { + "epoch": 1.05, + "learning_rate": 1e-06, + "logits/chosen": 746.1376953125, + "logits/rejected": 747.9188232421875, + "logps/chosen": -84.068359375, + "logps/rejected": -142.59750366210938, + "loss": 0.0304, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4129196405410767, + "rewards/margins": 3.476583957672119, + "rewards/rejected": -2.063664197921753, + "step": 147 + }, + { + "epoch": 1.06, + "learning_rate": 1e-06, + "logits/chosen": 754.6676025390625, + "logits/rejected": 749.3137817382812, + "logps/chosen": -82.38626098632812, + "logps/rejected": -151.41783142089844, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.557122826576233, + "rewards/margins": 5.655279636383057, + "rewards/rejected": -4.098156929016113, + "step": 148 + }, + { + "epoch": 1.07, + "learning_rate": 1e-06, + "logits/chosen": 740.9193725585938, + "logits/rejected": 750.250732421875, + "logps/chosen": -76.56571197509766, + "logps/rejected": -120.34004974365234, + "loss": 0.0378, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3987770080566406, + "rewards/margins": 3.257617950439453, + "rewards/rejected": -1.8588409423828125, + "step": 149 + }, + { + "epoch": 1.08, + "learning_rate": 1e-06, + "logits/chosen": 756.0874633789062, + "logits/rejected": 751.5933227539062, + "logps/chosen": -112.08387756347656, + "logps/rejected": -138.200439453125, + "loss": 0.1048, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2423508167266846, + "rewards/margins": 2.2027511596679688, + "rewards/rejected": -0.960400402545929, + "step": 150 + }, + { + "epoch": 1.09, + "learning_rate": 1e-06, + "logits/chosen": 747.4507446289062, + "logits/rejected": 731.8726806640625, + "logps/chosen": -91.3673095703125, + "logps/rejected": -152.07684326171875, + "loss": 0.0162, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.013024926185608, + "rewards/margins": 4.117198467254639, + "rewards/rejected": -3.104173421859741, + "step": 151 + }, + { + "epoch": 1.1, + "learning_rate": 1e-06, + "logits/chosen": 748.8003540039062, + "logits/rejected": 750.332763671875, + "logps/chosen": -92.77163696289062, + "logps/rejected": -102.83785247802734, + "loss": 0.0965, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1863129138946533, + "rewards/margins": 2.2898149490356445, + "rewards/rejected": -1.1035019159317017, + "step": 152 + }, + { + "epoch": 1.11, + "learning_rate": 1e-06, + "logits/chosen": 747.7927856445312, + "logits/rejected": 764.7493896484375, + "logps/chosen": -113.17561340332031, + "logps/rejected": -161.32223510742188, + "loss": 0.0164, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4102104902267456, + "rewards/margins": 4.102485179901123, + "rewards/rejected": -2.692274570465088, + "step": 153 + }, + { + "epoch": 1.12, + "learning_rate": 1e-06, + "logits/chosen": 760.8968505859375, + "logits/rejected": 759.7334594726562, + "logps/chosen": -110.7530517578125, + "logps/rejected": -155.86016845703125, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9647140502929688, + "rewards/margins": 3.8554437160491943, + "rewards/rejected": -2.8907296657562256, + "step": 154 + }, + { + "epoch": 1.13, + "learning_rate": 1e-06, + "logits/chosen": 761.036865234375, + "logits/rejected": 762.6702880859375, + "logps/chosen": -125.87457275390625, + "logps/rejected": -155.1082763671875, + "loss": 0.0912, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8165435791015625, + "rewards/margins": 2.3484725952148438, + "rewards/rejected": -1.5319290161132812, + "step": 155 + }, + { + "epoch": 1.14, + "learning_rate": 1e-06, + "logits/chosen": 750.7264404296875, + "logits/rejected": 750.5714111328125, + "logps/chosen": -91.43588256835938, + "logps/rejected": -104.13253784179688, + "loss": 0.009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4638137817382812, + "rewards/margins": 4.711092948913574, + "rewards/rejected": -2.247279405593872, + "step": 156 + }, + { + "epoch": 1.15, + "learning_rate": 1e-06, + "logits/chosen": 748.8834838867188, + "logits/rejected": 758.8059692382812, + "logps/chosen": -107.6167984008789, + "logps/rejected": -138.88067626953125, + "loss": 0.0213, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1174066066741943, + "rewards/margins": 3.8388078212738037, + "rewards/rejected": -1.7214012145996094, + "step": 157 + }, + { + "epoch": 1.16, + "learning_rate": 1e-06, + "logits/chosen": 751.4725952148438, + "logits/rejected": 756.6077270507812, + "logps/chosen": -120.70336151123047, + "logps/rejected": -164.28851318359375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0233376026153564, + "rewards/margins": 5.701872825622559, + "rewards/rejected": -2.6785354614257812, + "step": 158 + }, + { + "epoch": 1.17, + "learning_rate": 1e-06, + "logits/chosen": 757.2255249023438, + "logits/rejected": 759.0044555664062, + "logps/chosen": -146.1885986328125, + "logps/rejected": -222.3453826904297, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.280653476715088, + "rewards/margins": 5.680956840515137, + "rewards/rejected": -3.400303602218628, + "step": 159 + }, + { + "epoch": 1.18, + "learning_rate": 1e-06, + "logits/chosen": 758.6777954101562, + "logits/rejected": 757.4089965820312, + "logps/chosen": -134.47557067871094, + "logps/rejected": -155.60775756835938, + "loss": 0.0446, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2545288801193237, + "rewards/margins": 3.0876877307891846, + "rewards/rejected": -1.8331588506698608, + "step": 160 + }, + { + "epoch": 1.19, + "learning_rate": 1e-06, + "logits/chosen": 761.4644775390625, + "logits/rejected": 761.8722534179688, + "logps/chosen": -57.221885681152344, + "logps/rejected": -82.77011108398438, + "loss": 0.169, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8446113467216492, + "rewards/margins": 1.6922996044158936, + "rewards/rejected": -0.8476883172988892, + "step": 161 + }, + { + "epoch": 1.2, + "learning_rate": 1e-06, + "logits/chosen": 743.7944946289062, + "logits/rejected": 740.6669311523438, + "logps/chosen": -97.65397644042969, + "logps/rejected": -137.427734375, + "loss": 0.0384, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3550125360488892, + "rewards/margins": 3.241175889968872, + "rewards/rejected": -1.886163353919983, + "step": 162 + }, + { + "epoch": 1.21, + "learning_rate": 1e-06, + "logits/chosen": 747.3677978515625, + "logits/rejected": 752.4970092773438, + "logps/chosen": -79.38906860351562, + "logps/rejected": -133.1637725830078, + "loss": 0.0102, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1824843883514404, + "rewards/margins": 4.584794044494629, + "rewards/rejected": -2.4023094177246094, + "step": 163 + }, + { + "epoch": 1.22, + "learning_rate": 1e-06, + "logits/chosen": 753.8716430664062, + "logits/rejected": 740.8131713867188, + "logps/chosen": -92.48912048339844, + "logps/rejected": -168.15040588378906, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7904014587402344, + "rewards/margins": 6.436036109924316, + "rewards/rejected": -4.645634651184082, + "step": 164 + }, + { + "epoch": 1.23, + "learning_rate": 1e-06, + "logits/chosen": 757.1979370117188, + "logits/rejected": 749.5791625976562, + "logps/chosen": -78.35336303710938, + "logps/rejected": -153.04347229003906, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8985854983329773, + "rewards/margins": 3.600796699523926, + "rewards/rejected": -2.7022111415863037, + "step": 165 + }, + { + "epoch": 1.24, + "learning_rate": 1e-06, + "logits/chosen": 747.9757080078125, + "logits/rejected": 743.7157592773438, + "logps/chosen": -101.12834167480469, + "logps/rejected": -140.71270751953125, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2406052350997925, + "rewards/margins": 4.30910062789917, + "rewards/rejected": -3.068495273590088, + "step": 166 + }, + { + "epoch": 1.25, + "learning_rate": 1e-06, + "logits/chosen": 741.9119873046875, + "logits/rejected": 750.1944580078125, + "logps/chosen": -89.87914276123047, + "logps/rejected": -104.67402648925781, + "loss": 0.0585, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7322837710380554, + "rewards/margins": 2.810047149658203, + "rewards/rejected": -2.077763319015503, + "step": 167 + }, + { + "epoch": 1.26, + "learning_rate": 1e-06, + "logits/chosen": 753.3460083007812, + "logits/rejected": 760.5264282226562, + "logps/chosen": -127.35453796386719, + "logps/rejected": -84.519775390625, + "loss": 0.0206, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.545301914215088, + "rewards/margins": 3.8729119300842285, + "rewards/rejected": -1.3276100158691406, + "step": 168 + }, + { + "epoch": 1.27, + "learning_rate": 1e-06, + "logits/chosen": 737.9996948242188, + "logits/rejected": 732.5606079101562, + "logps/chosen": -113.8556137084961, + "logps/rejected": -155.7402801513672, + "loss": 0.0238, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8841865658760071, + "rewards/margins": 3.726029872894287, + "rewards/rejected": -2.841843366622925, + "step": 169 + }, + { + "epoch": 1.28, + "learning_rate": 1e-06, + "logits/chosen": 752.3007202148438, + "logits/rejected": 753.4920043945312, + "logps/chosen": -130.18753051757812, + "logps/rejected": -196.55247497558594, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.308056592941284, + "rewards/margins": 8.994502067565918, + "rewards/rejected": -5.686445713043213, + "step": 170 + }, + { + "epoch": 1.29, + "learning_rate": 1e-06, + "logits/chosen": 737.7472534179688, + "logits/rejected": 735.9760131835938, + "logps/chosen": -75.81230163574219, + "logps/rejected": -128.3035888671875, + "loss": 0.0385, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7423980832099915, + "rewards/margins": 3.2379074096679688, + "rewards/rejected": -2.495509386062622, + "step": 171 + }, + { + "epoch": 1.3, + "learning_rate": 1e-06, + "logits/chosen": 750.780029296875, + "logits/rejected": 749.4365234375, + "logps/chosen": -100.77393341064453, + "logps/rejected": -150.34300231933594, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1465553045272827, + "rewards/margins": 3.818354606628418, + "rewards/rejected": -2.671799421310425, + "step": 172 + }, + { + "epoch": 1.31, + "learning_rate": 1e-06, + "logits/chosen": 748.6647338867188, + "logits/rejected": 749.9218139648438, + "logps/chosen": -78.33729553222656, + "logps/rejected": -213.2393798828125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.508885383605957, + "rewards/margins": 11.735502243041992, + "rewards/rejected": -7.226617336273193, + "step": 173 + }, + { + "epoch": 1.32, + "learning_rate": 1e-06, + "logits/chosen": 738.2979736328125, + "logits/rejected": 746.9979858398438, + "logps/chosen": -74.24847412109375, + "logps/rejected": -138.82366943359375, + "loss": 0.0274, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.156060814857483, + "rewards/margins": 3.583029270172119, + "rewards/rejected": -2.4269683361053467, + "step": 174 + }, + { + "epoch": 1.33, + "learning_rate": 1e-06, + "logits/chosen": 743.7095336914062, + "logits/rejected": 737.3031005859375, + "logps/chosen": -104.67143249511719, + "logps/rejected": -96.82768249511719, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5981216430664062, + "rewards/margins": 6.181375503540039, + "rewards/rejected": -3.5832536220550537, + "step": 175 + }, + { + "epoch": 1.34, + "learning_rate": 1e-06, + "logits/chosen": 746.3065185546875, + "logits/rejected": 744.6980590820312, + "logps/chosen": -109.10230255126953, + "logps/rejected": -121.86424255371094, + "loss": 0.0526, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.065436601638794, + "rewards/margins": 2.9194955825805664, + "rewards/rejected": -1.854058861732483, + "step": 176 + }, + { + "epoch": 1.35, + "learning_rate": 1e-06, + "logits/chosen": 756.8947143554688, + "logits/rejected": 749.6739501953125, + "logps/chosen": -107.39349365234375, + "logps/rejected": -144.19827270507812, + "loss": 0.0185, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.440730333328247, + "rewards/margins": 3.9804704189300537, + "rewards/rejected": -2.5397400856018066, + "step": 177 + }, + { + "epoch": 1.36, + "learning_rate": 1e-06, + "logits/chosen": 758.4752807617188, + "logits/rejected": 757.44091796875, + "logps/chosen": -66.5837173461914, + "logps/rejected": -66.93745422363281, + "loss": 0.1301, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5249161124229431, + "rewards/margins": 1.97373366355896, + "rewards/rejected": -1.448817491531372, + "step": 178 + }, + { + "epoch": 1.37, + "learning_rate": 1e-06, + "logits/chosen": 734.7947998046875, + "logits/rejected": 737.0238647460938, + "logps/chosen": -100.37646484375, + "logps/rejected": -118.0760498046875, + "loss": 0.0747, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1361496448516846, + "rewards/margins": 2.5561914443969727, + "rewards/rejected": -1.4200416803359985, + "step": 179 + }, + { + "epoch": 1.38, + "learning_rate": 1e-06, + "logits/chosen": 753.7606201171875, + "logits/rejected": 756.8986206054688, + "logps/chosen": -87.83619689941406, + "logps/rejected": -102.36812591552734, + "loss": 0.0339, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.154744029045105, + "rewards/margins": 3.3681435585021973, + "rewards/rejected": -2.213399648666382, + "step": 180 + }, + { + "epoch": 1.39, + "learning_rate": 1e-06, + "logits/chosen": 757.6575927734375, + "logits/rejected": 759.6077880859375, + "logps/chosen": -94.41954040527344, + "logps/rejected": -123.98831176757812, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.281691074371338, + "rewards/margins": 5.378046035766602, + "rewards/rejected": -3.0963547229766846, + "step": 181 + }, + { + "epoch": 1.4, + "learning_rate": 1e-06, + "logits/chosen": 748.6627807617188, + "logits/rejected": 745.3861083984375, + "logps/chosen": -108.68283081054688, + "logps/rejected": -116.69976806640625, + "loss": 0.0273, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1173973083496094, + "rewards/margins": 3.5869743824005127, + "rewards/rejected": -2.4695770740509033, + "step": 182 + }, + { + "epoch": 1.41, + "learning_rate": 1e-06, + "logits/chosen": 754.1681518554688, + "logits/rejected": 754.533203125, + "logps/chosen": -77.60491943359375, + "logps/rejected": -70.4732666015625, + "loss": 0.0495, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4063667058944702, + "rewards/margins": 2.980342388153076, + "rewards/rejected": -1.5739758014678955, + "step": 183 + }, + { + "epoch": 1.42, + "learning_rate": 1e-06, + "logits/chosen": 739.7008056640625, + "logits/rejected": 742.1857299804688, + "logps/chosen": -110.44871520996094, + "logps/rejected": -143.59414672851562, + "loss": 0.0157, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4693527221679688, + "rewards/margins": 4.147912979125977, + "rewards/rejected": -2.6785600185394287, + "step": 184 + }, + { + "epoch": 1.43, + "learning_rate": 1e-06, + "logits/chosen": 750.370849609375, + "logits/rejected": 745.7979736328125, + "logps/chosen": -109.77300262451172, + "logps/rejected": -191.3189697265625, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5941994190216064, + "rewards/margins": 5.074709415435791, + "rewards/rejected": -3.4805099964141846, + "step": 185 + }, + { + "epoch": 1.44, + "learning_rate": 1e-06, + "logits/chosen": 754.7254638671875, + "logits/rejected": 743.2255249023438, + "logps/chosen": -85.62185668945312, + "logps/rejected": -151.84507751464844, + "loss": 0.0519, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.30954286456108093, + "rewards/margins": 2.9322404861450195, + "rewards/rejected": -2.622697591781616, + "step": 186 + }, + { + "epoch": 1.45, + "learning_rate": 1e-06, + "logits/chosen": 757.7532958984375, + "logits/rejected": 755.826171875, + "logps/chosen": -85.77392578125, + "logps/rejected": -160.18296813964844, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.316641330718994, + "rewards/margins": 9.21212387084961, + "rewards/rejected": -5.895482063293457, + "step": 187 + }, + { + "epoch": 1.46, + "learning_rate": 1e-06, + "logits/chosen": 753.1141967773438, + "logits/rejected": 753.493896484375, + "logps/chosen": -78.60509490966797, + "logps/rejected": -122.5627670288086, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6284217834472656, + "rewards/margins": 5.6106672286987305, + "rewards/rejected": -2.982245683670044, + "step": 188 + }, + { + "epoch": 1.47, + "learning_rate": 1e-06, + "logits/chosen": 756.9597778320312, + "logits/rejected": 759.7149047851562, + "logps/chosen": -114.1121597290039, + "logps/rejected": -121.31015014648438, + "loss": 0.0963, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9404968619346619, + "rewards/margins": 2.291950225830078, + "rewards/rejected": -1.351453423500061, + "step": 189 + }, + { + "epoch": 1.48, + "learning_rate": 1e-06, + "logits/chosen": 759.688720703125, + "logits/rejected": 755.500732421875, + "logps/chosen": -119.5220718383789, + "logps/rejected": -169.56031799316406, + "loss": 0.0245, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3092308044433594, + "rewards/margins": 3.6976068019866943, + "rewards/rejected": -2.388375997543335, + "step": 190 + }, + { + "epoch": 1.49, + "learning_rate": 1e-06, + "logits/chosen": 748.8916625976562, + "logits/rejected": 735.9102783203125, + "logps/chosen": -100.68359375, + "logps/rejected": -137.53469848632812, + "loss": 0.0438, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7861968874931335, + "rewards/margins": 3.10550856590271, + "rewards/rejected": -2.3193116188049316, + "step": 191 + }, + { + "epoch": 1.51, + "learning_rate": 1e-06, + "logits/chosen": 753.729248046875, + "logits/rejected": 755.0476684570312, + "logps/chosen": -110.94456481933594, + "logps/rejected": -143.5272216796875, + "loss": 0.0327, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4057906866073608, + "rewards/margins": 3.4035186767578125, + "rewards/rejected": -1.9977279901504517, + "step": 192 + }, + { + "epoch": 1.52, + "learning_rate": 1e-06, + "logits/chosen": 735.844482421875, + "logits/rejected": 746.7603149414062, + "logps/chosen": -121.16556549072266, + "logps/rejected": -129.36444091796875, + "loss": 0.0165, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9114983081817627, + "rewards/margins": 4.094688415527344, + "rewards/rejected": -2.183190107345581, + "step": 193 + }, + { + "epoch": 1.53, + "learning_rate": 1e-06, + "logits/chosen": 746.9216918945312, + "logits/rejected": 733.9296264648438, + "logps/chosen": -85.75089263916016, + "logps/rejected": -130.9271240234375, + "loss": 0.0182, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4778732061386108, + "rewards/margins": 3.9997329711914062, + "rewards/rejected": -2.521859884262085, + "step": 194 + }, + { + "epoch": 1.54, + "learning_rate": 1e-06, + "logits/chosen": 719.0285034179688, + "logits/rejected": 717.2233276367188, + "logps/chosen": -107.57772064208984, + "logps/rejected": -167.483154296875, + "loss": 0.0049, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.009802222251892, + "rewards/margins": 5.311641693115234, + "rewards/rejected": -4.301839351654053, + "step": 195 + }, + { + "epoch": 1.55, + "learning_rate": 1e-06, + "logits/chosen": 760.2907104492188, + "logits/rejected": 754.987548828125, + "logps/chosen": -109.52468872070312, + "logps/rejected": -171.11509704589844, + "loss": 0.0173, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.535028100013733, + "rewards/margins": 4.0502471923828125, + "rewards/rejected": -2.515219211578369, + "step": 196 + }, + { + "epoch": 1.56, + "learning_rate": 1e-06, + "logits/chosen": 742.224853515625, + "logits/rejected": 738.7446899414062, + "logps/chosen": -95.5473861694336, + "logps/rejected": -155.90774536132812, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7238609790802002, + "rewards/margins": 5.4401702880859375, + "rewards/rejected": -3.7163093090057373, + "step": 197 + }, + { + "epoch": 1.57, + "learning_rate": 1e-06, + "logits/chosen": 758.0135498046875, + "logits/rejected": 754.9188232421875, + "logps/chosen": -89.80221557617188, + "logps/rejected": -127.39008331298828, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1661453247070312, + "rewards/margins": 7.02105188369751, + "rewards/rejected": -3.8549065589904785, + "step": 198 + }, + { + "epoch": 1.58, + "learning_rate": 1e-06, + "logits/chosen": 756.3922119140625, + "logits/rejected": 760.1611938476562, + "logps/chosen": -110.13628387451172, + "logps/rejected": -154.06964111328125, + "loss": 0.0877, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5824577212333679, + "rewards/margins": 2.3892173767089844, + "rewards/rejected": -1.8067597150802612, + "step": 199 + }, + { + "epoch": 1.59, + "learning_rate": 1e-06, + "logits/chosen": 741.9486694335938, + "logits/rejected": 731.9614868164062, + "logps/chosen": -91.28828430175781, + "logps/rejected": -125.53680419921875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0237228870391846, + "rewards/margins": 6.004284858703613, + "rewards/rejected": -4.98056173324585, + "step": 200 + }, + { + "epoch": 1.6, + "learning_rate": 1e-06, + "logits/chosen": 746.2884521484375, + "logits/rejected": 757.3328247070312, + "logps/chosen": -101.97673797607422, + "logps/rejected": -117.81609344482422, + "loss": 0.0544, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9384574890136719, + "rewards/margins": 2.883551836013794, + "rewards/rejected": -0.9450942873954773, + "step": 201 + }, + { + "epoch": 1.61, + "learning_rate": 1e-06, + "logits/chosen": 738.4723510742188, + "logits/rejected": 744.498779296875, + "logps/chosen": -166.54428100585938, + "logps/rejected": -160.50811767578125, + "loss": 0.0074, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.97385573387146, + "rewards/margins": 4.897862434387207, + "rewards/rejected": -1.924006700515747, + "step": 202 + }, + { + "epoch": 1.62, + "learning_rate": 1e-06, + "logits/chosen": 757.14501953125, + "logits/rejected": 749.2903442382812, + "logps/chosen": -102.1121826171875, + "logps/rejected": -182.97268676757812, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9728538393974304, + "rewards/margins": 4.9819817543029785, + "rewards/rejected": -4.009128093719482, + "step": 203 + }, + { + "epoch": 1.63, + "learning_rate": 1e-06, + "logits/chosen": 732.0082397460938, + "logits/rejected": 730.4036865234375, + "logps/chosen": -89.25343322753906, + "logps/rejected": -64.82356262207031, + "loss": 0.0217, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6064987182617188, + "rewards/margins": 3.819427251815796, + "rewards/rejected": -2.212928533554077, + "step": 204 + }, + { + "epoch": 1.64, + "learning_rate": 1e-06, + "logits/chosen": 749.783203125, + "logits/rejected": 749.5674438476562, + "logps/chosen": -128.81256103515625, + "logps/rejected": -194.35406494140625, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7751327753067017, + "rewards/margins": 3.905226230621338, + "rewards/rejected": -2.1300933361053467, + "step": 205 + }, + { + "epoch": 1.65, + "learning_rate": 1e-06, + "logits/chosen": 737.5679931640625, + "logits/rejected": 733.238037109375, + "logps/chosen": -99.24267578125, + "logps/rejected": -165.94277954101562, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.273516058921814, + "rewards/margins": 6.786527633666992, + "rewards/rejected": -5.513011455535889, + "step": 206 + }, + { + "epoch": 1.66, + "learning_rate": 1e-06, + "logits/chosen": 749.4509887695312, + "logits/rejected": 753.3217163085938, + "logps/chosen": -76.46129608154297, + "logps/rejected": -123.53337097167969, + "loss": 0.0139, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1653671264648438, + "rewards/margins": 4.272303581237793, + "rewards/rejected": -3.1069366931915283, + "step": 207 + }, + { + "epoch": 1.67, + "learning_rate": 1e-06, + "logits/chosen": 722.6453857421875, + "logits/rejected": 736.4693603515625, + "logps/chosen": -64.5191421508789, + "logps/rejected": -144.28598022460938, + "loss": 0.0414, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5066757202148438, + "rewards/margins": 3.1643455028533936, + "rewards/rejected": -2.65766978263855, + "step": 208 + }, + { + "epoch": 1.68, + "learning_rate": 1e-06, + "logits/chosen": 753.90869140625, + "logits/rejected": 735.1388549804688, + "logps/chosen": -93.12266540527344, + "logps/rejected": -188.0275115966797, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0559738874435425, + "rewards/margins": 4.311410427093506, + "rewards/rejected": -3.255436658859253, + "step": 209 + }, + { + "epoch": 1.69, + "learning_rate": 1e-06, + "logits/chosen": 747.8322143554688, + "logits/rejected": 751.8526000976562, + "logps/chosen": -98.87774658203125, + "logps/rejected": -63.52620315551758, + "loss": 0.0816, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8794342279434204, + "rewards/margins": 2.4643688201904297, + "rewards/rejected": -1.5849345922470093, + "step": 210 + }, + { + "epoch": 1.7, + "learning_rate": 1e-06, + "logits/chosen": 759.1729125976562, + "logits/rejected": 747.0396728515625, + "logps/chosen": -149.9312744140625, + "logps/rejected": -217.44610595703125, + "loss": 0.0252, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3894516229629517, + "rewards/margins": 3.6679763793945312, + "rewards/rejected": -2.278524875640869, + "step": 211 + }, + { + "epoch": 1.71, + "learning_rate": 1e-06, + "logits/chosen": 747.2423095703125, + "logits/rejected": 733.9888916015625, + "logps/chosen": -100.79484558105469, + "logps/rejected": -170.07681274414062, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.938378930091858, + "rewards/margins": 6.390353202819824, + "rewards/rejected": -4.451974391937256, + "step": 212 + }, + { + "epoch": 1.72, + "learning_rate": 1e-06, + "logits/chosen": 740.6461181640625, + "logits/rejected": 709.0418090820312, + "logps/chosen": -116.23504638671875, + "logps/rejected": -186.71493530273438, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2362807989120483, + "rewards/margins": 6.008784770965576, + "rewards/rejected": -4.772503852844238, + "step": 213 + }, + { + "epoch": 1.73, + "learning_rate": 1e-06, + "logits/chosen": 746.071533203125, + "logits/rejected": 751.8787231445312, + "logps/chosen": -74.63729858398438, + "logps/rejected": -136.71481323242188, + "loss": 0.066, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6523041129112244, + "rewards/margins": 2.6849160194396973, + "rewards/rejected": -2.032611846923828, + "step": 214 + }, + { + "epoch": 1.74, + "learning_rate": 1e-06, + "logits/chosen": 718.1611938476562, + "logits/rejected": 726.565673828125, + "logps/chosen": -83.01461791992188, + "logps/rejected": -140.8511199951172, + "loss": 0.0421, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8105392456054688, + "rewards/margins": 3.1470985412597656, + "rewards/rejected": -2.336559295654297, + "step": 215 + }, + { + "epoch": 1.75, + "learning_rate": 1e-06, + "logits/chosen": 743.421875, + "logits/rejected": 746.0142211914062, + "logps/chosen": -103.25149536132812, + "logps/rejected": -128.62020874023438, + "loss": 0.0909, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.21239928901195526, + "rewards/margins": 2.351767063140869, + "rewards/rejected": -2.1393678188323975, + "step": 216 + }, + { + "epoch": 1.76, + "learning_rate": 1e-06, + "logits/chosen": 747.2691650390625, + "logits/rejected": 752.845947265625, + "logps/chosen": -74.19900512695312, + "logps/rejected": -173.20458984375, + "loss": 0.0154, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.1022758483886719, + "rewards/margins": 4.167690277099609, + "rewards/rejected": -3.0654144287109375, + "step": 217 + }, + { + "epoch": 1.77, + "learning_rate": 1e-06, + "logits/chosen": 752.2735595703125, + "logits/rejected": 748.0730590820312, + "logps/chosen": -78.91561889648438, + "logps/rejected": -120.8078384399414, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.177806854248047, + "rewards/margins": 6.384255409240723, + "rewards/rejected": -4.206448554992676, + "step": 218 + }, + { + "epoch": 1.78, + "learning_rate": 1e-06, + "logits/chosen": 705.3291625976562, + "logits/rejected": 701.3724975585938, + "logps/chosen": -127.54286193847656, + "logps/rejected": -115.79811096191406, + "loss": 0.0345, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4691513776779175, + "rewards/margins": 3.349665880203247, + "rewards/rejected": -1.8805145025253296, + "step": 219 + }, + { + "epoch": 1.79, + "learning_rate": 1e-06, + "logits/chosen": 756.548095703125, + "logits/rejected": 754.713623046875, + "logps/chosen": -127.49676513671875, + "logps/rejected": -155.8701171875, + "loss": 0.0145, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.477697730064392, + "rewards/margins": 4.225259304046631, + "rewards/rejected": -2.7475616931915283, + "step": 220 + }, + { + "epoch": 1.8, + "learning_rate": 1e-06, + "logits/chosen": 751.2115478515625, + "logits/rejected": 730.0790405273438, + "logps/chosen": -118.25406646728516, + "logps/rejected": -182.3004150390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7936714887619019, + "rewards/margins": 6.60188102722168, + "rewards/rejected": -4.808209419250488, + "step": 221 + }, + { + "epoch": 1.81, + "learning_rate": 1e-06, + "logits/chosen": 755.7479248046875, + "logits/rejected": 751.756103515625, + "logps/chosen": -109.6573715209961, + "logps/rejected": -104.56623840332031, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3643836975097656, + "rewards/margins": 3.60490345954895, + "rewards/rejected": -2.2405197620391846, + "step": 222 + }, + { + "epoch": 1.82, + "learning_rate": 1e-06, + "logits/chosen": 740.5425415039062, + "logits/rejected": 737.9905395507812, + "logps/chosen": -85.0796127319336, + "logps/rejected": -138.29664611816406, + "loss": 0.0269, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8929191827774048, + "rewards/margins": 3.601365089416504, + "rewards/rejected": -2.7084457874298096, + "step": 223 + }, + { + "epoch": 1.83, + "learning_rate": 1e-06, + "logits/chosen": 720.2456665039062, + "logits/rejected": 725.3515625, + "logps/chosen": -81.45463562011719, + "logps/rejected": -143.22352600097656, + "loss": 0.0219, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6097640991210938, + "rewards/margins": 3.8105316162109375, + "rewards/rejected": -2.2007675170898438, + "step": 224 + }, + { + "epoch": 1.84, + "learning_rate": 1e-06, + "logits/chosen": 743.6019897460938, + "logits/rejected": 729.6106567382812, + "logps/chosen": -113.73159790039062, + "logps/rejected": -147.58935546875, + "loss": 0.0361, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2793610095977783, + "rewards/margins": 3.3045289516448975, + "rewards/rejected": -2.025167942047119, + "step": 225 + }, + { + "epoch": 1.85, + "learning_rate": 1e-06, + "logits/chosen": 743.1868286132812, + "logits/rejected": 744.715087890625, + "logps/chosen": -69.11923217773438, + "logps/rejected": -92.87596130371094, + "loss": 0.0633, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.919219970703125, + "rewards/margins": 2.7279481887817383, + "rewards/rejected": -1.8087280988693237, + "step": 226 + }, + { + "epoch": 1.86, + "learning_rate": 1e-06, + "logits/chosen": 745.3460693359375, + "logits/rejected": 743.1890869140625, + "logps/chosen": -104.20529174804688, + "logps/rejected": -145.69107055664062, + "loss": 0.0444, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.056146264076233, + "rewards/margins": 3.0913352966308594, + "rewards/rejected": -2.035189151763916, + "step": 227 + }, + { + "epoch": 1.87, + "learning_rate": 1e-06, + "logits/chosen": 722.90478515625, + "logits/rejected": 705.5928344726562, + "logps/chosen": -90.54049682617188, + "logps/rejected": -157.27066040039062, + "loss": 0.0061, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2427986860275269, + "rewards/margins": 5.099251747131348, + "rewards/rejected": -3.8564529418945312, + "step": 228 + }, + { + "epoch": 1.88, + "learning_rate": 1e-06, + "logits/chosen": 745.122314453125, + "logits/rejected": 745.5069580078125, + "logps/chosen": -84.00446319580078, + "logps/rejected": -117.15104675292969, + "loss": 0.015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5758247375488281, + "rewards/margins": 4.193599700927734, + "rewards/rejected": -2.6177749633789062, + "step": 229 + }, + { + "epoch": 1.89, + "learning_rate": 1e-06, + "logits/chosen": 743.2073974609375, + "logits/rejected": 725.5383911132812, + "logps/chosen": -63.20913314819336, + "logps/rejected": -122.580810546875, + "loss": 0.0515, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9106891751289368, + "rewards/margins": 2.939448118209839, + "rewards/rejected": -2.028759002685547, + "step": 230 + }, + { + "epoch": 1.9, + "learning_rate": 1e-06, + "logits/chosen": 683.9519653320312, + "logits/rejected": 709.9270629882812, + "logps/chosen": -74.44076538085938, + "logps/rejected": -98.86894226074219, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3996528387069702, + "rewards/margins": 4.657689571380615, + "rewards/rejected": -3.2580368518829346, + "step": 231 + }, + { + "epoch": 1.91, + "learning_rate": 1e-06, + "logits/chosen": 744.951904296875, + "logits/rejected": 749.2725830078125, + "logps/chosen": -105.27039337158203, + "logps/rejected": -163.56382751464844, + "loss": 0.0076, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8947563171386719, + "rewards/margins": 4.86995792388916, + "rewards/rejected": -2.975201368331909, + "step": 232 + }, + { + "epoch": 1.92, + "learning_rate": 1e-06, + "logits/chosen": 698.66357421875, + "logits/rejected": 731.2380981445312, + "logps/chosen": -29.707929611206055, + "logps/rejected": -79.537841796875, + "loss": 0.0268, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.42301806807518005, + "rewards/margins": 3.60532546043396, + "rewards/rejected": -3.182307481765747, + "step": 233 + }, + { + "epoch": 1.93, + "learning_rate": 1e-06, + "logits/chosen": 742.6114501953125, + "logits/rejected": 742.7617797851562, + "logps/chosen": -61.493804931640625, + "logps/rejected": -127.57579040527344, + "loss": 0.0549, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.1668262481689453, + "rewards/margins": 2.874296188354492, + "rewards/rejected": -2.707469940185547, + "step": 234 + }, + { + "epoch": 1.94, + "learning_rate": 1e-06, + "logits/chosen": 742.2534790039062, + "logits/rejected": 734.466796875, + "logps/chosen": -130.7904815673828, + "logps/rejected": -138.1530303955078, + "loss": 0.027, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.312615990638733, + "rewards/margins": 3.597317695617676, + "rewards/rejected": -2.2847015857696533, + "step": 235 + }, + { + "epoch": 1.95, + "learning_rate": 1e-06, + "logits/chosen": 731.26953125, + "logits/rejected": 731.0730590820312, + "logps/chosen": -48.50862503051758, + "logps/rejected": -107.28805541992188, + "loss": 0.0113, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.9133724570274353, + "rewards/margins": 4.477767467498779, + "rewards/rejected": -3.5643951892852783, + "step": 236 + }, + { + "epoch": 1.96, + "learning_rate": 1e-06, + "logits/chosen": 754.3118896484375, + "logits/rejected": 755.7796020507812, + "logps/chosen": -109.88552856445312, + "logps/rejected": -173.35076904296875, + "loss": 0.0482, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.5617378354072571, + "rewards/margins": 3.0089714527130127, + "rewards/rejected": -2.4472336769104004, + "step": 237 + }, + { + "epoch": 1.97, + "learning_rate": 1e-06, + "logits/chosen": 745.6985473632812, + "logits/rejected": 748.7868041992188, + "logps/chosen": -100.55361938476562, + "logps/rejected": -158.98077392578125, + "loss": 0.0094, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5784424543380737, + "rewards/margins": 4.663255214691162, + "rewards/rejected": -3.084812879562378, + "step": 238 + }, + { + "epoch": 1.98, + "learning_rate": 1e-06, + "logits/chosen": 744.4022216796875, + "logits/rejected": 739.286865234375, + "logps/chosen": -111.19105529785156, + "logps/rejected": -144.24342346191406, + "loss": 0.0082, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.227887749671936, + "rewards/margins": 4.801854610443115, + "rewards/rejected": -3.5739669799804688, + "step": 239 + }, + { + "epoch": 1.99, + "learning_rate": 1e-06, + "logits/chosen": 745.2330322265625, + "logits/rejected": 735.1626586914062, + "logps/chosen": -120.08739471435547, + "logps/rejected": -171.17230224609375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9991798400878906, + "rewards/margins": 7.282955169677734, + "rewards/rejected": -4.283775329589844, + "step": 240 + }, + { + "epoch": 2.0, + "learning_rate": 1e-06, + "logits/chosen": 738.9703369140625, + "logits/rejected": 739.1956176757812, + "logps/chosen": -105.7802734375, + "logps/rejected": -176.71951293945312, + "loss": 0.0239, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0467026233673096, + "rewards/margins": 3.721026659011841, + "rewards/rejected": -2.6743240356445312, + "step": 241 + }, + { + "epoch": 2.01, + "learning_rate": 1e-06, + "logits/chosen": 736.5604858398438, + "logits/rejected": 738.3843994140625, + "logps/chosen": -83.77609252929688, + "logps/rejected": -119.46920013427734, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.085867404937744, + "rewards/margins": 4.852503776550293, + "rewards/rejected": -2.766636610031128, + "step": 242 + }, + { + "epoch": 2.03, + "learning_rate": 1e-06, + "logits/chosen": 751.8400268554688, + "logits/rejected": 749.5999145507812, + "logps/chosen": -85.23765563964844, + "logps/rejected": -170.42726135253906, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3702683448791504, + "rewards/margins": 10.290180206298828, + "rewards/rejected": -6.9199113845825195, + "step": 243 + }, + { + "epoch": 2.04, + "learning_rate": 1e-06, + "logits/chosen": 743.7387084960938, + "logits/rejected": 738.4058227539062, + "logps/chosen": -101.7964096069336, + "logps/rejected": -220.40078735351562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3918588161468506, + "rewards/margins": 8.780550956726074, + "rewards/rejected": -6.3886919021606445, + "step": 244 + }, + { + "epoch": 2.05, + "learning_rate": 1e-06, + "logits/chosen": 716.8140869140625, + "logits/rejected": 730.5784912109375, + "logps/chosen": -59.19915008544922, + "logps/rejected": -172.53213500976562, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.0386749505996704, + "rewards/margins": 6.520960330963135, + "rewards/rejected": -5.482285499572754, + "step": 245 + }, + { + "epoch": 2.06, + "learning_rate": 1e-06, + "logits/chosen": 734.020751953125, + "logits/rejected": 722.5446166992188, + "logps/chosen": -96.87025451660156, + "logps/rejected": -144.06231689453125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.378239393234253, + "rewards/margins": 11.684956550598145, + "rewards/rejected": -8.306716918945312, + "step": 246 + }, + { + "epoch": 2.08, + "learning_rate": 1e-06, + "logits/chosen": 746.9415893554688, + "logits/rejected": 745.9384765625, + "logps/chosen": -76.33311462402344, + "logps/rejected": -94.30284118652344, + "loss": 0.0041, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.533547282218933, + "rewards/margins": 5.490480422973633, + "rewards/rejected": -3.9569332599639893, + "step": 247 + }, + { + "epoch": 2.09, + "learning_rate": 1e-06, + "logits/chosen": 749.957275390625, + "logits/rejected": 745.8206176757812, + "logps/chosen": -61.42109298706055, + "logps/rejected": -88.26673126220703, + "loss": 0.0098, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.041178584098816, + "rewards/margins": 4.622923851013184, + "rewards/rejected": -3.581745147705078, + "step": 248 + }, + { + "epoch": 2.1, + "learning_rate": 1e-06, + "logits/chosen": 724.5130004882812, + "logits/rejected": 722.3417358398438, + "logps/chosen": -70.00173950195312, + "logps/rejected": -154.7038116455078, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3234542608261108, + "rewards/margins": 6.458986282348633, + "rewards/rejected": -5.135531902313232, + "step": 249 + }, + { + "epoch": 2.12, + "learning_rate": 1e-06, + "logits/chosen": 692.0050659179688, + "logits/rejected": 722.229248046875, + "logps/chosen": -27.89548683166504, + "logps/rejected": -113.64689636230469, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6042623519897461, + "rewards/margins": 7.197475433349609, + "rewards/rejected": -6.593213081359863, + "step": 250 + }, + { + "epoch": 2.13, + "learning_rate": 1e-06, + "logits/chosen": 752.9841918945312, + "logits/rejected": 747.5181274414062, + "logps/chosen": -101.2496566772461, + "logps/rejected": -123.68312072753906, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.20515513420105, + "rewards/margins": 6.357362747192383, + "rewards/rejected": -4.152207851409912, + "step": 251 + }, + { + "epoch": 2.14, + "learning_rate": 1e-06, + "logits/chosen": 750.6376342773438, + "logits/rejected": 754.8546142578125, + "logps/chosen": -101.88619995117188, + "logps/rejected": -177.50592041015625, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.407466173171997, + "rewards/margins": 5.557853698730469, + "rewards/rejected": -4.150387763977051, + "step": 252 + }, + { + "epoch": 2.16, + "learning_rate": 1e-06, + "logits/chosen": 723.3779907226562, + "logits/rejected": 734.23095703125, + "logps/chosen": -66.3678207397461, + "logps/rejected": -167.58941650390625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9441261291503906, + "rewards/margins": 7.247669219970703, + "rewards/rejected": -5.3035430908203125, + "step": 253 + }, + { + "epoch": 2.17, + "learning_rate": 1e-06, + "logits/chosen": 740.6430053710938, + "logits/rejected": 740.7091674804688, + "logps/chosen": -91.03292083740234, + "logps/rejected": -85.0467529296875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.663916826248169, + "rewards/margins": 5.400906562805176, + "rewards/rejected": -3.736989736557007, + "step": 254 + }, + { + "epoch": 2.18, + "learning_rate": 1e-06, + "logits/chosen": 739.0185546875, + "logits/rejected": 751.822021484375, + "logps/chosen": -89.63487243652344, + "logps/rejected": -129.50360107421875, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1726441383361816, + "rewards/margins": 5.286489486694336, + "rewards/rejected": -2.113845109939575, + "step": 255 + }, + { + "epoch": 2.19, + "learning_rate": 1e-06, + "logits/chosen": 741.4841918945312, + "logits/rejected": 741.7670288085938, + "logps/chosen": -131.63766479492188, + "logps/rejected": -213.09771728515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.163043260574341, + "rewards/margins": 10.504013061523438, + "rewards/rejected": -7.340970039367676, + "step": 256 + }, + { + "epoch": 2.21, + "learning_rate": 1e-06, + "logits/chosen": 714.075439453125, + "logits/rejected": 719.966552734375, + "logps/chosen": -72.21794128417969, + "logps/rejected": -165.50509643554688, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.533433675765991, + "rewards/margins": 6.962358474731445, + "rewards/rejected": -4.428924560546875, + "step": 257 + }, + { + "epoch": 2.22, + "learning_rate": 1e-06, + "logits/chosen": 720.365966796875, + "logits/rejected": 734.1922607421875, + "logps/chosen": -84.0616455078125, + "logps/rejected": -138.39208984375, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.099073886871338, + "rewards/margins": 5.404688835144043, + "rewards/rejected": -3.305615186691284, + "step": 258 + }, + { + "epoch": 2.23, + "learning_rate": 1e-06, + "logits/chosen": 740.834716796875, + "logits/rejected": 737.747314453125, + "logps/chosen": -56.4475212097168, + "logps/rejected": -153.93844604492188, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.6714546084403992, + "rewards/margins": 6.015190124511719, + "rewards/rejected": -5.343735694885254, + "step": 259 + }, + { + "epoch": 2.25, + "learning_rate": 1e-06, + "logits/chosen": 725.0271606445312, + "logits/rejected": 724.025390625, + "logps/chosen": -45.07270050048828, + "logps/rejected": -142.32394409179688, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.256964921951294, + "rewards/margins": 8.324949264526367, + "rewards/rejected": -7.067984104156494, + "step": 260 + }, + { + "epoch": 2.26, + "learning_rate": 1e-06, + "logits/chosen": 751.2418823242188, + "logits/rejected": 753.0140991210938, + "logps/chosen": -117.8044204711914, + "logps/rejected": -172.86277770996094, + "loss": 0.0072, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6235588788986206, + "rewards/margins": 4.930938243865967, + "rewards/rejected": -3.3073792457580566, + "step": 261 + }, + { + "epoch": 2.27, + "learning_rate": 1e-06, + "logits/chosen": 761.27880859375, + "logits/rejected": 760.2965698242188, + "logps/chosen": -83.80908203125, + "logps/rejected": -145.1227264404297, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5782959461212158, + "rewards/margins": 5.45608377456665, + "rewards/rejected": -3.8777878284454346, + "step": 262 + }, + { + "epoch": 2.29, + "learning_rate": 1e-06, + "logits/chosen": 745.9216918945312, + "logits/rejected": 736.337646484375, + "logps/chosen": -70.63249206542969, + "logps/rejected": -189.70843505859375, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6706726551055908, + "rewards/margins": 8.039380073547363, + "rewards/rejected": -6.368707180023193, + "step": 263 + }, + { + "epoch": 2.3, + "learning_rate": 1e-06, + "logits/chosen": 734.077880859375, + "logits/rejected": 696.3433227539062, + "logps/chosen": -110.6375961303711, + "logps/rejected": -237.77435302734375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7960258722305298, + "rewards/margins": 11.674471855163574, + "rewards/rejected": -9.878445625305176, + "step": 264 + }, + { + "epoch": 2.31, + "learning_rate": 1e-06, + "logits/chosen": 740.6715087890625, + "logits/rejected": 744.1766357421875, + "logps/chosen": -91.46484375, + "logps/rejected": -187.09225463867188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4873199462890625, + "rewards/margins": 8.383281707763672, + "rewards/rejected": -5.895961284637451, + "step": 265 + }, + { + "epoch": 2.32, + "learning_rate": 1e-06, + "logits/chosen": 748.2072143554688, + "logits/rejected": 751.9159545898438, + "logps/chosen": -103.39703369140625, + "logps/rejected": -138.41419982910156, + "loss": 0.0062, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.012009382247925, + "rewards/margins": 5.0738677978515625, + "rewards/rejected": -3.0618584156036377, + "step": 266 + }, + { + "epoch": 2.34, + "learning_rate": 1e-06, + "logits/chosen": 740.0219116210938, + "logits/rejected": 744.5198974609375, + "logps/chosen": -94.05328369140625, + "logps/rejected": -191.13973999023438, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.016467332839966, + "rewards/margins": 8.749259948730469, + "rewards/rejected": -5.732792854309082, + "step": 267 + }, + { + "epoch": 2.35, + "learning_rate": 1e-06, + "logits/chosen": 742.8614501953125, + "logits/rejected": 742.6378173828125, + "logps/chosen": -73.19803619384766, + "logps/rejected": -154.29901123046875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1691277027130127, + "rewards/margins": 9.324997901916504, + "rewards/rejected": -6.155869960784912, + "step": 268 + }, + { + "epoch": 2.36, + "learning_rate": 1e-06, + "logits/chosen": 734.372802734375, + "logits/rejected": 734.20263671875, + "logps/chosen": -96.91320037841797, + "logps/rejected": -204.166259765625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9334099292755127, + "rewards/margins": 7.352408409118652, + "rewards/rejected": -5.418998718261719, + "step": 269 + }, + { + "epoch": 2.38, + "learning_rate": 1e-06, + "logits/chosen": 720.396240234375, + "logits/rejected": 717.9971313476562, + "logps/chosen": -82.05516052246094, + "logps/rejected": -112.20073699951172, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3263261318206787, + "rewards/margins": 9.276971817016602, + "rewards/rejected": -6.950645446777344, + "step": 270 + }, + { + "epoch": 2.39, + "learning_rate": 1e-06, + "logits/chosen": 740.0625, + "logits/rejected": 746.4600830078125, + "logps/chosen": -68.48933410644531, + "logps/rejected": -156.41558837890625, + "loss": 0.0051, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2671005725860596, + "rewards/margins": 5.269789695739746, + "rewards/rejected": -4.002689361572266, + "step": 271 + }, + { + "epoch": 2.4, + "learning_rate": 1e-06, + "logits/chosen": 737.6250610351562, + "logits/rejected": 734.9310913085938, + "logps/chosen": -92.7996597290039, + "logps/rejected": -175.46644592285156, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.943982720375061, + "rewards/margins": 7.128126621246338, + "rewards/rejected": -5.184144020080566, + "step": 272 + }, + { + "epoch": 2.42, + "learning_rate": 1e-06, + "logits/chosen": 721.9855346679688, + "logits/rejected": 735.0977172851562, + "logps/chosen": -110.92289733886719, + "logps/rejected": -155.6758270263672, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.935765027999878, + "rewards/margins": 7.750093460083008, + "rewards/rejected": -4.814328670501709, + "step": 273 + }, + { + "epoch": 2.43, + "learning_rate": 1e-06, + "logits/chosen": 731.5733032226562, + "logits/rejected": 738.6312866210938, + "logps/chosen": -69.69560241699219, + "logps/rejected": -158.70640563964844, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1518311500549316, + "rewards/margins": 8.108404159545898, + "rewards/rejected": -4.956573009490967, + "step": 274 + }, + { + "epoch": 2.44, + "learning_rate": 1e-06, + "logits/chosen": 726.6232299804688, + "logits/rejected": 722.6569213867188, + "logps/chosen": -88.44778442382812, + "logps/rejected": -161.01806640625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2756316661834717, + "rewards/margins": 6.5208282470703125, + "rewards/rejected": -4.24519681930542, + "step": 275 + }, + { + "epoch": 2.45, + "learning_rate": 1e-06, + "logits/chosen": 739.509521484375, + "logits/rejected": 731.6041870117188, + "logps/chosen": -75.68081665039062, + "logps/rejected": -196.58511352539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2276673316955566, + "rewards/margins": 10.842552185058594, + "rewards/rejected": -8.614885330200195, + "step": 276 + }, + { + "epoch": 2.47, + "learning_rate": 1e-06, + "logits/chosen": 734.115234375, + "logits/rejected": 736.6942138671875, + "logps/chosen": -60.92715072631836, + "logps/rejected": -111.22089385986328, + "loss": 0.0046, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7384281158447266, + "rewards/margins": 5.381649494171143, + "rewards/rejected": -3.643221378326416, + "step": 277 + }, + { + "epoch": 2.48, + "learning_rate": 1e-06, + "logits/chosen": 750.9117431640625, + "logits/rejected": 748.7080688476562, + "logps/chosen": -120.03547668457031, + "logps/rejected": -185.828369140625, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2238266468048096, + "rewards/margins": 7.9672136306762695, + "rewards/rejected": -5.743386745452881, + "step": 278 + }, + { + "epoch": 2.49, + "learning_rate": 1e-06, + "logits/chosen": 737.1796264648438, + "logits/rejected": 738.0701293945312, + "logps/chosen": -78.29095458984375, + "logps/rejected": -220.831787109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.513519287109375, + "rewards/margins": 12.499378204345703, + "rewards/rejected": -7.98585844039917, + "step": 279 + }, + { + "epoch": 2.51, + "learning_rate": 1e-06, + "logits/chosen": 726.0398559570312, + "logits/rejected": 728.8816528320312, + "logps/chosen": -74.90200805664062, + "logps/rejected": -174.2008056640625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.329554796218872, + "rewards/margins": 7.553549766540527, + "rewards/rejected": -5.223994731903076, + "step": 280 + }, + { + "epoch": 2.52, + "learning_rate": 1e-06, + "logits/chosen": 732.6264038085938, + "logits/rejected": 728.9421997070312, + "logps/chosen": -75.54058837890625, + "logps/rejected": -163.63592529296875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.846821665763855, + "rewards/margins": 7.089195728302002, + "rewards/rejected": -5.242373943328857, + "step": 281 + }, + { + "epoch": 2.53, + "learning_rate": 1e-06, + "logits/chosen": 734.3858032226562, + "logits/rejected": 737.2391967773438, + "logps/chosen": -97.11677551269531, + "logps/rejected": -147.464111328125, + "loss": 0.0078, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.8258712887763977, + "rewards/margins": 4.8496294021606445, + "rewards/rejected": -4.0237579345703125, + "step": 282 + }, + { + "epoch": 2.55, + "learning_rate": 1e-06, + "logits/chosen": 730.0067749023438, + "logits/rejected": 723.2018432617188, + "logps/chosen": -91.0189208984375, + "logps/rejected": -193.0071563720703, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1767075061798096, + "rewards/margins": 9.602957725524902, + "rewards/rejected": -7.426250457763672, + "step": 283 + }, + { + "epoch": 2.56, + "learning_rate": 1e-06, + "logits/chosen": 753.3532104492188, + "logits/rejected": 739.4624633789062, + "logps/chosen": -138.03726196289062, + "logps/rejected": -244.8419647216797, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.578852891921997, + "rewards/margins": 7.596963882446289, + "rewards/rejected": -5.018110752105713, + "step": 284 + }, + { + "epoch": 2.57, + "learning_rate": 1e-06, + "logits/chosen": 751.7630615234375, + "logits/rejected": 744.656005859375, + "logps/chosen": -101.55215454101562, + "logps/rejected": -212.72898864746094, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3322815895080566, + "rewards/margins": 9.008890151977539, + "rewards/rejected": -6.676608562469482, + "step": 285 + }, + { + "epoch": 2.58, + "learning_rate": 1e-06, + "logits/chosen": 737.717529296875, + "logits/rejected": 719.792724609375, + "logps/chosen": -89.882080078125, + "logps/rejected": -170.45858764648438, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8663482666015625, + "rewards/margins": 7.478048801422119, + "rewards/rejected": -5.611700534820557, + "step": 286 + }, + { + "epoch": 2.6, + "learning_rate": 1e-06, + "logits/chosen": 746.3302001953125, + "logits/rejected": 748.86083984375, + "logps/chosen": -88.95478820800781, + "logps/rejected": -148.36135864257812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8281662464141846, + "rewards/margins": 8.361825942993164, + "rewards/rejected": -5.5336594581604, + "step": 287 + }, + { + "epoch": 2.61, + "learning_rate": 1e-06, + "logits/chosen": 748.1126708984375, + "logits/rejected": 746.8381958007812, + "logps/chosen": -122.7348861694336, + "logps/rejected": -174.8379669189453, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4285972118377686, + "rewards/margins": 6.18477725982666, + "rewards/rejected": -3.7561798095703125, + "step": 288 + }, + { + "epoch": 2.62, + "learning_rate": 1e-06, + "logits/chosen": 750.0781860351562, + "logits/rejected": 751.6514892578125, + "logps/chosen": -96.43820190429688, + "logps/rejected": -194.43780517578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.906470537185669, + "rewards/margins": 6.462408065795898, + "rewards/rejected": -4.55593729019165, + "step": 289 + }, + { + "epoch": 2.64, + "learning_rate": 1e-06, + "logits/chosen": 724.03076171875, + "logits/rejected": 715.0072021484375, + "logps/chosen": -90.9781723022461, + "logps/rejected": -204.82730102539062, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.099966526031494, + "rewards/margins": 11.50143051147461, + "rewards/rejected": -9.401463508605957, + "step": 290 + }, + { + "epoch": 2.65, + "learning_rate": 1e-06, + "logits/chosen": 711.1498413085938, + "logits/rejected": 690.7463989257812, + "logps/chosen": -84.51719665527344, + "logps/rejected": -200.25730895996094, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8451286554336548, + "rewards/margins": 10.00024700164795, + "rewards/rejected": -8.155117988586426, + "step": 291 + }, + { + "epoch": 2.66, + "learning_rate": 1e-06, + "logits/chosen": 736.5441284179688, + "logits/rejected": 736.3696899414062, + "logps/chosen": -81.39799499511719, + "logps/rejected": -142.01377868652344, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8364715576171875, + "rewards/margins": 6.9405198097229, + "rewards/rejected": -5.104048252105713, + "step": 292 + }, + { + "epoch": 2.68, + "learning_rate": 1e-06, + "logits/chosen": 725.1179809570312, + "logits/rejected": 731.4525146484375, + "logps/chosen": -49.28761291503906, + "logps/rejected": -187.81605529785156, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.335010528564453, + "rewards/margins": 8.69125747680664, + "rewards/rejected": -6.356246471405029, + "step": 293 + }, + { + "epoch": 2.69, + "learning_rate": 1e-06, + "logits/chosen": 719.383544921875, + "logits/rejected": 712.9955444335938, + "logps/chosen": -108.6873550415039, + "logps/rejected": -208.62246704101562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4010124206542969, + "rewards/margins": 9.531074523925781, + "rewards/rejected": -8.130062103271484, + "step": 294 + }, + { + "epoch": 2.7, + "learning_rate": 1e-06, + "logits/chosen": 739.6880493164062, + "logits/rejected": 728.6071166992188, + "logps/chosen": -115.47481536865234, + "logps/rejected": -186.40603637695312, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.460437774658203, + "rewards/margins": 9.267586708068848, + "rewards/rejected": -5.8071489334106445, + "step": 295 + }, + { + "epoch": 2.71, + "learning_rate": 1e-06, + "logits/chosen": 731.1561889648438, + "logits/rejected": 725.9503784179688, + "logps/chosen": -90.48724365234375, + "logps/rejected": -172.88870239257812, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3047149181365967, + "rewards/margins": 8.59080982208252, + "rewards/rejected": -6.286094665527344, + "step": 296 + }, + { + "epoch": 2.73, + "learning_rate": 1e-06, + "logits/chosen": 744.2967529296875, + "logits/rejected": 734.2734375, + "logps/chosen": -101.18692016601562, + "logps/rejected": -175.01174926757812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.061387777328491, + "rewards/margins": 7.6824750900268555, + "rewards/rejected": -5.621087551116943, + "step": 297 + }, + { + "epoch": 2.74, + "learning_rate": 1e-06, + "logits/chosen": 733.93603515625, + "logits/rejected": 745.308349609375, + "logps/chosen": -99.65798950195312, + "logps/rejected": -169.07423400878906, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.913287401199341, + "rewards/margins": 7.654044151306152, + "rewards/rejected": -4.740756988525391, + "step": 298 + }, + { + "epoch": 2.75, + "learning_rate": 1e-06, + "logits/chosen": 742.6478881835938, + "logits/rejected": 743.1141357421875, + "logps/chosen": -104.64097595214844, + "logps/rejected": -177.47930908203125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.036149740219116, + "rewards/margins": 7.429086685180664, + "rewards/rejected": -5.392936706542969, + "step": 299 + }, + { + "epoch": 2.77, + "learning_rate": 1e-06, + "logits/chosen": 737.3350830078125, + "logits/rejected": 730.6506958007812, + "logps/chosen": -104.68534088134766, + "logps/rejected": -183.864013671875, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8784592151641846, + "rewards/margins": 9.414484977722168, + "rewards/rejected": -7.5360260009765625, + "step": 300 + }, + { + "epoch": 2.78, + "learning_rate": 1e-06, + "logits/chosen": 740.4413452148438, + "logits/rejected": 739.8828125, + "logps/chosen": -116.61955261230469, + "logps/rejected": -218.98265075683594, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.994433641433716, + "rewards/margins": 7.587386131286621, + "rewards/rejected": -4.592952251434326, + "step": 301 + }, + { + "epoch": 2.79, + "learning_rate": 1e-06, + "logits/chosen": 721.8073120117188, + "logits/rejected": 725.7706298828125, + "logps/chosen": -103.96492767333984, + "logps/rejected": -170.6640167236328, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1177315711975098, + "rewards/margins": 7.503278732299805, + "rewards/rejected": -5.385547161102295, + "step": 302 + }, + { + "epoch": 2.81, + "learning_rate": 1e-06, + "logits/chosen": 717.1700439453125, + "logits/rejected": 719.8836059570312, + "logps/chosen": -93.41561889648438, + "logps/rejected": -136.04376220703125, + "loss": 0.0064, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.832234263420105, + "rewards/margins": 5.049046993255615, + "rewards/rejected": -3.2168128490448, + "step": 303 + }, + { + "epoch": 2.82, + "learning_rate": 1e-06, + "logits/chosen": 738.9448852539062, + "logits/rejected": 740.8707275390625, + "logps/chosen": -81.84746551513672, + "logps/rejected": -140.497314453125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7536171674728394, + "rewards/margins": 7.779935836791992, + "rewards/rejected": -6.026318550109863, + "step": 304 + }, + { + "epoch": 2.83, + "learning_rate": 1e-06, + "logits/chosen": 744.9635620117188, + "logits/rejected": 720.3474731445312, + "logps/chosen": -86.17436218261719, + "logps/rejected": -229.25698852539062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7508041858673096, + "rewards/margins": 9.129188537597656, + "rewards/rejected": -7.378384590148926, + "step": 305 + }, + { + "epoch": 2.84, + "learning_rate": 1e-06, + "logits/chosen": 747.2305297851562, + "logits/rejected": 745.1190185546875, + "logps/chosen": -103.24749755859375, + "logps/rejected": -186.46243286132812, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7152694463729858, + "rewards/margins": 7.666225433349609, + "rewards/rejected": -5.950955867767334, + "step": 306 + }, + { + "epoch": 2.86, + "learning_rate": 1e-06, + "logits/chosen": 738.9415893554688, + "logits/rejected": 732.471435546875, + "logps/chosen": -100.17965698242188, + "logps/rejected": -158.85504150390625, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4327728748321533, + "rewards/margins": 5.4586334228515625, + "rewards/rejected": -3.025860548019409, + "step": 307 + }, + { + "epoch": 2.87, + "learning_rate": 1e-06, + "logits/chosen": 744.577880859375, + "logits/rejected": 740.4278564453125, + "logps/chosen": -99.29054260253906, + "logps/rejected": -224.01156616210938, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.6242836713790894, + "rewards/margins": 7.598435401916504, + "rewards/rejected": -5.974151611328125, + "step": 308 + }, + { + "epoch": 2.88, + "learning_rate": 1e-06, + "logits/chosen": 725.6573486328125, + "logits/rejected": 706.074951171875, + "logps/chosen": -87.38636779785156, + "logps/rejected": -191.2567138671875, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4111191034317017, + "rewards/margins": 8.433279991149902, + "rewards/rejected": -7.022160530090332, + "step": 309 + }, + { + "epoch": 2.9, + "learning_rate": 1e-06, + "logits/chosen": 740.6321411132812, + "logits/rejected": 717.98779296875, + "logps/chosen": -109.28819274902344, + "logps/rejected": -229.67733764648438, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.690258741378784, + "rewards/margins": 12.236160278320312, + "rewards/rejected": -9.54590129852295, + "step": 310 + }, + { + "epoch": 2.91, + "learning_rate": 1e-06, + "logits/chosen": 731.0087890625, + "logits/rejected": 730.6953125, + "logps/chosen": -79.59683990478516, + "logps/rejected": -130.19863891601562, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6477181911468506, + "rewards/margins": 8.501607894897461, + "rewards/rejected": -4.853889465332031, + "step": 311 + }, + { + "epoch": 2.92, + "learning_rate": 1e-06, + "logits/chosen": 735.4886474609375, + "logits/rejected": 717.5365600585938, + "logps/chosen": -90.39205932617188, + "logps/rejected": -211.6212158203125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9786574840545654, + "rewards/margins": 11.58507251739502, + "rewards/rejected": -8.606414794921875, + "step": 312 + }, + { + "epoch": 2.94, + "learning_rate": 1e-06, + "logits/chosen": 735.1174926757812, + "logits/rejected": 725.9493408203125, + "logps/chosen": -121.71316528320312, + "logps/rejected": -168.14352416992188, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2203476428985596, + "rewards/margins": 7.504098892211914, + "rewards/rejected": -5.283751010894775, + "step": 313 + }, + { + "epoch": 2.95, + "learning_rate": 1e-06, + "logits/chosen": 671.0048828125, + "logits/rejected": 699.376220703125, + "logps/chosen": -70.37047576904297, + "logps/rejected": -120.96990203857422, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8066818714141846, + "rewards/margins": 7.274814605712891, + "rewards/rejected": -5.468132972717285, + "step": 314 + }, + { + "epoch": 2.96, + "learning_rate": 1e-06, + "logits/chosen": 741.7174682617188, + "logits/rejected": 725.6544189453125, + "logps/chosen": -80.9742660522461, + "logps/rejected": -185.03268432617188, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7743019461631775, + "rewards/margins": 6.715760231018066, + "rewards/rejected": -5.941458225250244, + "step": 315 + }, + { + "epoch": 2.97, + "learning_rate": 1e-06, + "logits/chosen": 738.7156372070312, + "logits/rejected": 742.2384033203125, + "logps/chosen": -70.17455291748047, + "logps/rejected": -152.0988311767578, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7940415143966675, + "rewards/margins": 7.757524490356445, + "rewards/rejected": -5.963482856750488, + "step": 316 + }, + { + "epoch": 2.99, + "learning_rate": 1e-06, + "logits/chosen": 701.4963989257812, + "logits/rejected": 711.3980712890625, + "logps/chosen": -75.94883728027344, + "logps/rejected": -171.7069549560547, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5171173810958862, + "rewards/margins": 6.939260482788086, + "rewards/rejected": -5.42214298248291, + "step": 317 + }, + { + "epoch": 3.0, + "learning_rate": 1e-06, + "logits/chosen": 705.4109497070312, + "logits/rejected": 725.5880126953125, + "logps/chosen": -66.0982666015625, + "logps/rejected": -151.95394897460938, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.445521593093872, + "rewards/margins": 7.465752601623535, + "rewards/rejected": -5.020230770111084, + "step": 318 + }, + { + "epoch": 3.02, + "learning_rate": 1e-06, + "logits/chosen": 743.7249145507812, + "logits/rejected": 718.3504028320312, + "logps/chosen": -85.9219970703125, + "logps/rejected": -235.22918701171875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.7760406732559204, + "rewards/margins": 9.7516450881958, + "rewards/rejected": -7.97560453414917, + "step": 319 + }, + { + "epoch": 3.03, + "learning_rate": 1e-06, + "logits/chosen": 722.2771606445312, + "logits/rejected": 725.15087890625, + "logps/chosen": -71.75315856933594, + "logps/rejected": -186.65089416503906, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.644439697265625, + "rewards/margins": 9.113443374633789, + "rewards/rejected": -6.469003200531006, + "step": 320 + }, + { + "epoch": 3.05, + "learning_rate": 1e-06, + "logits/chosen": 706.1834716796875, + "logits/rejected": 712.8848266601562, + "logps/chosen": -67.77120971679688, + "logps/rejected": -180.27040100097656, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.978106737136841, + "rewards/margins": 8.883562088012695, + "rewards/rejected": -5.905455112457275, + "step": 321 + }, + { + "epoch": 3.06, + "learning_rate": 1e-06, + "logits/chosen": 729.4330444335938, + "logits/rejected": 732.5885620117188, + "logps/chosen": -55.003929138183594, + "logps/rejected": -132.47824096679688, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3307502269744873, + "rewards/margins": 8.099706649780273, + "rewards/rejected": -5.768956184387207, + "step": 322 + }, + { + "epoch": 3.08, + "learning_rate": 1e-06, + "logits/chosen": 743.449951171875, + "logits/rejected": 739.1897583007812, + "logps/chosen": -96.2901382446289, + "logps/rejected": -241.40524291992188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9243240356445312, + "rewards/margins": 9.63784408569336, + "rewards/rejected": -7.71351957321167, + "step": 323 + }, + { + "epoch": 3.09, + "learning_rate": 1e-06, + "logits/chosen": 707.6451416015625, + "logits/rejected": 721.5835571289062, + "logps/chosen": -56.662933349609375, + "logps/rejected": -197.05397033691406, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2922966480255127, + "rewards/margins": 9.226765632629395, + "rewards/rejected": -7.934468746185303, + "step": 324 + }, + { + "epoch": 3.11, + "learning_rate": 1e-06, + "logits/chosen": 739.4616088867188, + "logits/rejected": 716.668212890625, + "logps/chosen": -109.03681945800781, + "logps/rejected": -230.74781799316406, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.7153961658477783, + "rewards/margins": 12.368345260620117, + "rewards/rejected": -9.652949333190918, + "step": 325 + }, + { + "epoch": 3.12, + "learning_rate": 1e-06, + "logits/chosen": 728.9228515625, + "logits/rejected": 728.694091796875, + "logps/chosen": -95.12728118896484, + "logps/rejected": -218.6767578125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.112001895904541, + "rewards/margins": 8.982049942016602, + "rewards/rejected": -6.870048522949219, + "step": 326 + }, + { + "epoch": 3.14, + "learning_rate": 1e-06, + "logits/chosen": 747.5016479492188, + "logits/rejected": 749.2763061523438, + "logps/chosen": -108.90858459472656, + "logps/rejected": -194.17123413085938, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5131423473358154, + "rewards/margins": 7.951367378234863, + "rewards/rejected": -5.438224792480469, + "step": 327 + }, + { + "epoch": 3.15, + "learning_rate": 1e-06, + "logits/chosen": 738.3208618164062, + "logits/rejected": 737.7191162109375, + "logps/chosen": -113.22555541992188, + "logps/rejected": -228.81085205078125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3338334560394287, + "rewards/margins": 8.909605979919434, + "rewards/rejected": -5.575772285461426, + "step": 328 + }, + { + "epoch": 3.17, + "learning_rate": 1e-06, + "logits/chosen": 728.2122192382812, + "logits/rejected": 722.7445678710938, + "logps/chosen": -89.21581268310547, + "logps/rejected": -179.2111053466797, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4318580627441406, + "rewards/margins": 9.35019302368164, + "rewards/rejected": -6.9183349609375, + "step": 329 + }, + { + "epoch": 3.18, + "learning_rate": 1e-06, + "logits/chosen": 734.7132568359375, + "logits/rejected": 738.6228637695312, + "logps/chosen": -89.77992248535156, + "logps/rejected": -199.04345703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.6558120250701904, + "rewards/margins": 9.746892929077148, + "rewards/rejected": -7.091081142425537, + "step": 330 + }, + { + "epoch": 3.2, + "learning_rate": 1e-06, + "logits/chosen": 668.7403564453125, + "logits/rejected": 696.9133911132812, + "logps/chosen": -67.125244140625, + "logps/rejected": -146.91921997070312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1312050819396973, + "rewards/margins": 10.194269180297852, + "rewards/rejected": -8.063064575195312, + "step": 331 + }, + { + "epoch": 3.21, + "learning_rate": 1e-06, + "logits/chosen": 726.1821899414062, + "logits/rejected": 712.3114013671875, + "logps/chosen": -97.84066772460938, + "logps/rejected": -151.14306640625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.281198263168335, + "rewards/margins": 12.295990943908691, + "rewards/rejected": -9.014792442321777, + "step": 332 + }, + { + "epoch": 3.23, + "learning_rate": 1e-06, + "logits/chosen": 737.1996459960938, + "logits/rejected": 730.942626953125, + "logps/chosen": -102.52587127685547, + "logps/rejected": -227.82334899902344, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3189125061035156, + "rewards/margins": 9.449860572814941, + "rewards/rejected": -7.130948066711426, + "step": 333 + }, + { + "epoch": 3.24, + "learning_rate": 1e-06, + "logits/chosen": 741.6857299804688, + "logits/rejected": 731.0802612304688, + "logps/chosen": -99.2021255493164, + "logps/rejected": -185.6935577392578, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.2598671913146973, + "rewards/margins": 8.949135780334473, + "rewards/rejected": -6.689268589019775, + "step": 334 + }, + { + "epoch": 3.26, + "learning_rate": 1e-06, + "logits/chosen": 720.3169555664062, + "logits/rejected": 710.7181396484375, + "logps/chosen": -90.6078872680664, + "logps/rejected": -208.54405212402344, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1369950771331787, + "rewards/margins": 11.910134315490723, + "rewards/rejected": -9.773138999938965, + "step": 335 + }, + { + "epoch": 3.27, + "learning_rate": 1e-06, + "logits/chosen": 679.981689453125, + "logits/rejected": 702.9410400390625, + "logps/chosen": -26.604257583618164, + "logps/rejected": -150.62820434570312, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.7333852648735046, + "rewards/margins": 11.024728775024414, + "rewards/rejected": -10.291343688964844, + "step": 336 + }, + { + "epoch": 3.29, + "learning_rate": 1e-06, + "logits/chosen": 713.6534423828125, + "logits/rejected": 725.7046508789062, + "logps/chosen": -64.2873764038086, + "logps/rejected": -185.92950439453125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.1521706581115723, + "rewards/margins": 9.289722442626953, + "rewards/rejected": -7.137551784515381, + "step": 337 + }, + { + "epoch": 3.3, + "learning_rate": 1e-06, + "logits/chosen": 733.8445434570312, + "logits/rejected": 734.6804809570312, + "logps/chosen": -77.67139434814453, + "logps/rejected": -223.7530517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.575475215911865, + "rewards/margins": 12.853460311889648, + "rewards/rejected": -8.277984619140625, + "step": 338 + }, + { + "epoch": 3.32, + "learning_rate": 1e-06, + "logits/chosen": 743.3245849609375, + "logits/rejected": 746.0653686523438, + "logps/chosen": -87.09391021728516, + "logps/rejected": -153.65267944335938, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.014254093170166, + "rewards/margins": 9.077045440673828, + "rewards/rejected": -6.062791347503662, + "step": 339 + }, + { + "epoch": 3.33, + "learning_rate": 1e-06, + "logits/chosen": 757.9900512695312, + "logits/rejected": 756.8787841796875, + "logps/chosen": -74.44833374023438, + "logps/rejected": -165.33489990234375, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5143706798553467, + "rewards/margins": 8.413375854492188, + "rewards/rejected": -5.89900541305542, + "step": 340 + }, + { + "epoch": 3.35, + "learning_rate": 1e-06, + "logits/chosen": 736.0738525390625, + "logits/rejected": 728.460693359375, + "logps/chosen": -51.39256286621094, + "logps/rejected": -191.89385986328125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.176950454711914, + "rewards/margins": 10.316227912902832, + "rewards/rejected": -9.139277458190918, + "step": 341 + }, + { + "epoch": 3.36, + "learning_rate": 1e-06, + "logits/chosen": 747.8567504882812, + "logits/rejected": 745.39501953125, + "logps/chosen": -118.44633483886719, + "logps/rejected": -196.91204833984375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3827407360076904, + "rewards/margins": 9.234495162963867, + "rewards/rejected": -6.851754665374756, + "step": 342 + }, + { + "epoch": 3.38, + "learning_rate": 1e-06, + "logits/chosen": 725.90283203125, + "logits/rejected": 718.4945678710938, + "logps/chosen": -90.60539245605469, + "logps/rejected": -197.70269775390625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.218060255050659, + "rewards/margins": 10.11386489868164, + "rewards/rejected": -7.8958048820495605, + "step": 343 + }, + { + "epoch": 3.39, + "learning_rate": 1e-06, + "logits/chosen": 739.9036865234375, + "logits/rejected": 723.2141723632812, + "logps/chosen": -75.01350402832031, + "logps/rejected": -211.52430725097656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.3703781366348267, + "rewards/margins": 9.960999488830566, + "rewards/rejected": -8.590620994567871, + "step": 344 + }, + { + "epoch": 3.41, + "learning_rate": 1e-06, + "logits/chosen": 715.7235107421875, + "logits/rejected": 709.2985229492188, + "logps/chosen": -108.58916473388672, + "logps/rejected": -214.40255737304688, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.4108314514160156, + "rewards/margins": 10.118903160095215, + "rewards/rejected": -8.7080717086792, + "step": 345 + }, + { + "epoch": 3.42, + "learning_rate": 1e-06, + "logits/chosen": 734.409423828125, + "logits/rejected": 727.1421508789062, + "logps/chosen": -104.57987213134766, + "logps/rejected": -193.0653839111328, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8890060186386108, + "rewards/margins": 10.345169067382812, + "rewards/rejected": -8.45616340637207, + "step": 346 + }, + { + "epoch": 3.44, + "learning_rate": 1e-06, + "logits/chosen": 735.6088256835938, + "logits/rejected": 735.1978149414062, + "logps/chosen": -133.06475830078125, + "logps/rejected": -221.62530517578125, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.020334005355835, + "rewards/margins": 11.214062690734863, + "rewards/rejected": -8.19372844696045, + "step": 347 + }, + { + "epoch": 3.45, + "learning_rate": 1e-06, + "logits/chosen": 713.6419067382812, + "logits/rejected": 711.160400390625, + "logps/chosen": -83.52464294433594, + "logps/rejected": -122.23622131347656, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.179377794265747, + "rewards/margins": 10.133572578430176, + "rewards/rejected": -7.95419454574585, + "step": 348 + }, + { + "epoch": 3.47, + "learning_rate": 1e-06, + "logits/chosen": 745.6130981445312, + "logits/rejected": 744.260986328125, + "logps/chosen": -111.60773468017578, + "logps/rejected": -193.70545959472656, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.5413124561309814, + "rewards/margins": 9.18424129486084, + "rewards/rejected": -5.6429290771484375, + "step": 349 + }, + { + "epoch": 3.48, + "learning_rate": 1e-06, + "logits/chosen": 739.1451416015625, + "logits/rejected": 734.9131469726562, + "logps/chosen": -79.79393768310547, + "logps/rejected": -138.94837951660156, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.187464952468872, + "rewards/margins": 9.608952522277832, + "rewards/rejected": -8.421487808227539, + "step": 350 + }, + { + "epoch": 3.5, + "learning_rate": 1e-06, + "logits/chosen": 750.8216552734375, + "logits/rejected": 736.0921630859375, + "logps/chosen": -134.63113403320312, + "logps/rejected": -257.116943359375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9194657802581787, + "rewards/margins": 9.165074348449707, + "rewards/rejected": -6.245608806610107, + "step": 351 + }, + { + "epoch": 3.52, + "learning_rate": 1e-06, + "logits/chosen": 737.7184448242188, + "logits/rejected": 737.2736206054688, + "logps/chosen": -73.01019287109375, + "logps/rejected": -159.97283935546875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1879119873046875, + "rewards/margins": 9.911165237426758, + "rewards/rejected": -6.723252773284912, + "step": 352 + }, + { + "epoch": 3.53, + "learning_rate": 1e-06, + "logits/chosen": 736.3836059570312, + "logits/rejected": 729.2514038085938, + "logps/chosen": -87.81730651855469, + "logps/rejected": -186.51885986328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.6690080165863037, + "rewards/margins": 9.461250305175781, + "rewards/rejected": -5.792242527008057, + "step": 353 + }, + { + "epoch": 3.55, + "learning_rate": 1e-06, + "logits/chosen": 706.3058471679688, + "logits/rejected": 684.7946166992188, + "logps/chosen": -84.38475036621094, + "logps/rejected": -204.73052978515625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.8583732843399048, + "rewards/margins": 10.460813522338867, + "rewards/rejected": -8.602439880371094, + "step": 354 + }, + { + "epoch": 3.56, + "learning_rate": 1e-06, + "logits/chosen": 741.3902587890625, + "logits/rejected": 732.3666381835938, + "logps/chosen": -59.026371002197266, + "logps/rejected": -114.01712036132812, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.2806507349014282, + "rewards/margins": 7.437434673309326, + "rewards/rejected": -6.1567840576171875, + "step": 355 + }, + { + "epoch": 3.58, + "learning_rate": 1e-06, + "logits/chosen": 745.2266235351562, + "logits/rejected": 742.9217529296875, + "logps/chosen": -100.9074935913086, + "logps/rejected": -195.44195556640625, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9492698907852173, + "rewards/margins": 8.798178672790527, + "rewards/rejected": -6.848908424377441, + "step": 356 + }, + { + "epoch": 3.59, + "learning_rate": 1e-06, + "logits/chosen": 720.528076171875, + "logits/rejected": 715.5703735351562, + "logps/chosen": -79.94759368896484, + "logps/rejected": -190.8441162109375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.1256508827209473, + "rewards/margins": 10.353452682495117, + "rewards/rejected": -7.22780179977417, + "step": 357 + }, + { + "epoch": 3.61, + "learning_rate": 1e-06, + "logits/chosen": 727.7904052734375, + "logits/rejected": 723.4896240234375, + "logps/chosen": -71.66838836669922, + "logps/rejected": -179.47164916992188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.234041690826416, + "rewards/margins": 9.059988021850586, + "rewards/rejected": -6.82594633102417, + "step": 358 + }, + { + "epoch": 3.62, + "learning_rate": 1e-06, + "logits/chosen": 731.9834594726562, + "logits/rejected": 728.8046264648438, + "logps/chosen": -89.96942138671875, + "logps/rejected": -190.34788513183594, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.227006673812866, + "rewards/margins": 8.89929485321045, + "rewards/rejected": -6.672287940979004, + "step": 359 + }, + { + "epoch": 3.64, + "learning_rate": 1e-06, + "logits/chosen": 720.5106201171875, + "logits/rejected": 727.082275390625, + "logps/chosen": -47.44602966308594, + "logps/rejected": -197.17105102539062, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5191688537597656, + "rewards/margins": 9.810914993286133, + "rewards/rejected": -7.291745662689209, + "step": 360 + }, + { + "epoch": 3.65, + "learning_rate": 1e-06, + "logits/chosen": 747.0634765625, + "logits/rejected": 738.198974609375, + "logps/chosen": -93.89689636230469, + "logps/rejected": -153.9918212890625, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9404313564300537, + "rewards/margins": 10.123509407043457, + "rewards/rejected": -7.183078289031982, + "step": 361 + }, + { + "epoch": 3.67, + "learning_rate": 1e-06, + "logits/chosen": 731.3892211914062, + "logits/rejected": 745.96435546875, + "logps/chosen": -79.57083129882812, + "logps/rejected": -146.0126953125, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": 4.17904806137085, + "rewards/margins": 7.943802833557129, + "rewards/rejected": -3.7647545337677, + "step": 362 + }, + { + "epoch": 3.68, + "learning_rate": 1e-06, + "logits/chosen": 730.9630126953125, + "logits/rejected": 742.4584350585938, + "logps/chosen": -95.61009979248047, + "logps/rejected": -180.9610595703125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.3180763721466064, + "rewards/margins": 9.247515678405762, + "rewards/rejected": -5.929439544677734, + "step": 363 + }, + { + "epoch": 3.7, + "learning_rate": 1e-06, + "logits/chosen": 735.0308227539062, + "logits/rejected": 741.8463745117188, + "logps/chosen": -61.81424331665039, + "logps/rejected": -180.7491455078125, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.9346096515655518, + "rewards/margins": 8.370655059814453, + "rewards/rejected": -6.436045169830322, + "step": 364 + }, + { + "epoch": 3.71, + "learning_rate": 1e-06, + "logits/chosen": 739.5715942382812, + "logits/rejected": 739.3678588867188, + "logps/chosen": -101.19467163085938, + "logps/rejected": -197.13888549804688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.380779981613159, + "rewards/margins": 9.73967456817627, + "rewards/rejected": -7.358894348144531, + "step": 365 + }, + { + "epoch": 3.73, + "learning_rate": 1e-06, + "logits/chosen": 728.278076171875, + "logits/rejected": 686.303466796875, + "logps/chosen": -111.4708251953125, + "logps/rejected": -243.84100341796875, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.712702989578247, + "rewards/margins": 12.197813034057617, + "rewards/rejected": -10.48511028289795, + "step": 366 + }, + { + "epoch": 3.74, + "learning_rate": 1e-06, + "logits/chosen": 740.2201538085938, + "logits/rejected": 729.514892578125, + "logps/chosen": -69.4881820678711, + "logps/rejected": -201.69967651367188, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.785103678703308, + "rewards/margins": 9.352934837341309, + "rewards/rejected": -7.567831516265869, + "step": 367 + }, + { + "epoch": 3.76, + "learning_rate": 1e-06, + "logits/chosen": 735.7814331054688, + "logits/rejected": 736.7582397460938, + "logps/chosen": -78.40980529785156, + "logps/rejected": -158.1739501953125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.0973832607269287, + "rewards/margins": 9.891365051269531, + "rewards/rejected": -7.793982028961182, + "step": 368 + }, + { + "epoch": 3.77, + "learning_rate": 1e-06, + "logits/chosen": 722.5512084960938, + "logits/rejected": 702.341064453125, + "logps/chosen": -86.25050354003906, + "logps/rejected": -199.01019287109375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 1.5247055292129517, + "rewards/margins": 9.322214126586914, + "rewards/rejected": -7.797508239746094, + "step": 369 + }, + { + "epoch": 3.79, + "learning_rate": 1e-06, + "logits/chosen": 743.0562133789062, + "logits/rejected": 747.2483520507812, + "logps/chosen": -92.55030059814453, + "logps/rejected": -153.6352081298828, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0966827869415283, + "rewards/margins": 7.680642127990723, + "rewards/rejected": -4.583959102630615, + "step": 370 + }, + { + "epoch": 3.8, + "learning_rate": 1e-06, + "logits/chosen": 732.6799926757812, + "logits/rejected": 713.6985473632812, + "logps/chosen": -90.42632293701172, + "logps/rejected": -216.27874755859375, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.975231170654297, + "rewards/margins": 12.047399520874023, + "rewards/rejected": -9.072168350219727, + "step": 371 + }, + { + "epoch": 3.82, + "learning_rate": 1e-06, + "logits/chosen": 746.6068115234375, + "logits/rejected": 748.4751586914062, + "logps/chosen": -88.75025939941406, + "logps/rejected": -219.6444549560547, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.675264835357666, + "rewards/margins": 9.751867294311523, + "rewards/rejected": -7.076602458953857, + "step": 372 + }, + { + "epoch": 3.83, + "learning_rate": 1e-06, + "logits/chosen": 727.824462890625, + "logits/rejected": 727.3997802734375, + "logps/chosen": -78.22805786132812, + "logps/rejected": -135.79464721679688, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.7845962047576904, + "rewards/margins": 9.198086738586426, + "rewards/rejected": -5.413490295410156, + "step": 373 + }, + { + "epoch": 3.85, + "learning_rate": 1e-06, + "logits/chosen": 717.2174682617188, + "logits/rejected": 721.270263671875, + "logps/chosen": -101.6449203491211, + "logps/rejected": -179.93746948242188, + "loss": 0.0002, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.3497321605682373, + "rewards/margins": 8.66262435913086, + "rewards/rejected": -6.312892436981201, + "step": 374 + }, + { + "epoch": 3.86, + "learning_rate": 1e-06, + "logits/chosen": 724.8995971679688, + "logits/rejected": 727.01904296875, + "logps/chosen": -76.05685424804688, + "logps/rejected": -143.3458251953125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.8577911853790283, + "rewards/margins": 8.012090682983398, + "rewards/rejected": -5.154299259185791, + "step": 375 + }, + { + "epoch": 3.88, + "learning_rate": 1e-06, + "logits/chosen": 713.1998901367188, + "logits/rejected": 715.5352783203125, + "logps/chosen": -81.00406646728516, + "logps/rejected": -162.2042236328125, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.0733895301818848, + "rewards/margins": 8.906248092651367, + "rewards/rejected": -5.832859039306641, + "step": 376 + }, + { + "epoch": 3.89, + "learning_rate": 1e-06, + "logits/chosen": 700.8328857421875, + "logits/rejected": 722.0309448242188, + "logps/chosen": -62.340126037597656, + "logps/rejected": -167.49114990234375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.821335554122925, + "rewards/margins": 9.395286560058594, + "rewards/rejected": -6.573951244354248, + "step": 377 + }, + { + "epoch": 3.91, + "learning_rate": 1e-06, + "logits/chosen": 732.4066772460938, + "logits/rejected": 722.2620849609375, + "logps/chosen": -118.19965362548828, + "logps/rejected": -185.95077514648438, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.5716989040374756, + "rewards/margins": 9.636175155639648, + "rewards/rejected": -7.064476013183594, + "step": 378 + }, + { + "epoch": 3.92, + "learning_rate": 1e-06, + "logits/chosen": 710.7389526367188, + "logits/rejected": 725.5673217773438, + "logps/chosen": -75.5845718383789, + "logps/rejected": -164.34083557128906, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.9467811584472656, + "rewards/margins": 8.847270965576172, + "rewards/rejected": -5.900489807128906, + "step": 379 + }, + { + "epoch": 3.94, + "learning_rate": 1e-06, + "logits/chosen": 733.0737915039062, + "logits/rejected": 712.5618286132812, + "logps/chosen": -86.55747985839844, + "logps/rejected": -185.29290771484375, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.198808431625366, + "rewards/margins": 9.293940544128418, + "rewards/rejected": -7.095132350921631, + "step": 380 + }, + { + "epoch": 3.95, + "learning_rate": 1e-06, + "logits/chosen": 734.1033325195312, + "logits/rejected": 725.6203002929688, + "logps/chosen": -76.3611831665039, + "logps/rejected": -201.23793029785156, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.159630537033081, + "rewards/margins": 11.239797592163086, + "rewards/rejected": -9.080166816711426, + "step": 381 + }, + { + "epoch": 3.97, + "learning_rate": 1e-06, + "logits/chosen": 713.7753295898438, + "logits/rejected": 727.990966796875, + "logps/chosen": -108.12967681884766, + "logps/rejected": -166.75576782226562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.2150871753692627, + "rewards/margins": 9.137410163879395, + "rewards/rejected": -5.922322750091553, + "step": 382 + }, + { + "epoch": 3.98, + "learning_rate": 1e-06, + "logits/chosen": 747.9082641601562, + "logits/rejected": 740.0225219726562, + "logps/chosen": -100.58594512939453, + "logps/rejected": -221.03890991210938, + "loss": 0.0, + "rewards/accuracies": 1.0, + "rewards/chosen": 2.4289023876190186, + "rewards/margins": 9.936502456665039, + "rewards/rejected": -7.5076003074646, + "step": 383 + }, + { + "epoch": 4.0, + "learning_rate": 1e-06, + "logits/chosen": 733.404052734375, + "logits/rejected": 738.27490234375, + "logps/chosen": -92.38526153564453, + "logps/rejected": -200.24844360351562, + "loss": 0.0001, + "rewards/accuracies": 1.0, + "rewards/chosen": 3.183269500732422, + "rewards/margins": 9.826932907104492, + "rewards/rejected": -6.643662929534912, + "step": 384 + }, + { + "epoch": 4.0, + "eval_logits/chosen": 726.487060546875, + "eval_logits/rejected": 725.3931884765625, + "eval_logps/chosen": -112.1086654663086, + "eval_logps/rejected": -124.93560791015625, + "eval_loss": 0.6701663732528687, + "eval_rewards/accuracies": 0.699999988079071, + "eval_rewards/chosen": -0.6128483414649963, + "eval_rewards/margins": 0.18891727924346924, + "eval_rewards/rejected": -0.8017656207084656, + "eval_runtime": 0.3021, + "eval_samples_per_second": 33.099, + "eval_steps_per_second": 33.099, + "step": 384 + } + ], + "logging_steps": 1.0, + "max_steps": 384, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}