{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.9989866087585957, "eval_steps": 100, "global_step": 1726, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023163228374954757, "grad_norm": 107.28771573687173, "learning_rate": 4e-09, "logits/chosen": -1.124485731124878, "logits/rejected": -1.2086994647979736, "logps/chosen": -156.01296997070312, "logps/rejected": -208.54991149902344, "loss": 0.6233, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07398775964975357, "rewards/margins": 0.2667727768421173, "rewards/rejected": -0.19278500974178314, "step": 2 }, { "epoch": 0.0046326456749909515, "grad_norm": 88.08812844747567, "learning_rate": 8e-09, "logits/chosen": -1.2528715133666992, "logits/rejected": -1.2828329801559448, "logps/chosen": -132.4397430419922, "logps/rejected": -172.45309448242188, "loss": 0.7004, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17529334127902985, "rewards/margins": 0.35418701171875, "rewards/rejected": -0.17889368534088135, "step": 4 }, { "epoch": 0.006948968512486428, "grad_norm": 91.60766301044009, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -1.2843875885009766, "logits/rejected": -1.280484914779663, "logps/chosen": -152.78907775878906, "logps/rejected": -152.23944091796875, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.041480474174022675, "rewards/margins": 0.14676526188850403, "rewards/rejected": -0.1882457733154297, "step": 6 }, { "epoch": 0.009265291349981903, "grad_norm": 98.62741523160896, "learning_rate": 1.6e-08, "logits/chosen": -1.398779273033142, "logits/rejected": -1.4048051834106445, "logps/chosen": -169.36114501953125, "logps/rejected": -187.72393798828125, "loss": 0.6494, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06773313134908676, "rewards/margins": 0.3909590244293213, "rewards/rejected": -0.32322588562965393, "step": 8 }, { "epoch": 0.01158161418747738, "grad_norm": 103.80152080031493, "learning_rate": 2e-08, "logits/chosen": -1.2080261707305908, "logits/rejected": -1.287600040435791, "logps/chosen": -143.546875, "logps/rejected": -167.290283203125, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": 0.024446196854114532, "rewards/margins": 0.028562966734170914, "rewards/rejected": -0.004116774536669254, "step": 10 }, { "epoch": 0.013897937024972856, "grad_norm": 102.45515514662566, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -1.3106777667999268, "logits/rejected": -1.2940219640731812, "logps/chosen": -146.94171142578125, "logps/rejected": -157.75350952148438, "loss": 0.6722, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09310206025838852, "rewards/margins": 0.06749637424945831, "rewards/rejected": 0.0256056971848011, "step": 12 }, { "epoch": 0.01621425986246833, "grad_norm": 98.56963705951901, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -1.2968071699142456, "logits/rejected": -1.3280229568481445, "logps/chosen": -124.2802505493164, "logps/rejected": -141.4438018798828, "loss": 0.6939, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11542778462171555, "rewards/margins": 0.1968868374824524, "rewards/rejected": -0.08145906031131744, "step": 14 }, { "epoch": 0.018530582699963806, "grad_norm": 126.73593067586688, "learning_rate": 3.2e-08, "logits/chosen": -1.2126774787902832, "logits/rejected": -1.3230162858963013, "logps/chosen": -143.24264526367188, "logps/rejected": -150.46603393554688, "loss": 0.6388, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02438124269247055, "rewards/margins": 0.4613185226917267, "rewards/rejected": -0.43693727254867554, "step": 16 }, { "epoch": 0.020846905537459284, "grad_norm": 125.55466393152622, "learning_rate": 3.6e-08, "logits/chosen": -1.1398718357086182, "logits/rejected": -1.2096312046051025, "logps/chosen": -168.44686889648438, "logps/rejected": -209.51075744628906, "loss": 0.5937, "rewards/accuracies": 0.71875, "rewards/chosen": 0.058702100068330765, "rewards/margins": 0.36269423365592957, "rewards/rejected": -0.3039921820163727, "step": 18 }, { "epoch": 0.02316322837495476, "grad_norm": 99.14617505813268, "learning_rate": 4e-08, "logits/chosen": -1.18150794506073, "logits/rejected": -1.2542356252670288, "logps/chosen": -167.63018798828125, "logps/rejected": -192.35606384277344, "loss": 0.6576, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0018138289451599121, "rewards/margins": 0.2771007716655731, "rewards/rejected": -0.27891457080841064, "step": 20 }, { "epoch": 0.025479551212450234, "grad_norm": 91.07306017372633, "learning_rate": 4.4e-08, "logits/chosen": -1.2071597576141357, "logits/rejected": -1.182037115097046, "logps/chosen": -144.98622131347656, "logps/rejected": -141.1741180419922, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": 0.08638761937618256, "rewards/margins": 0.27829694747924805, "rewards/rejected": -0.1919093132019043, "step": 22 }, { "epoch": 0.027795874049945712, "grad_norm": 89.9597440051718, "learning_rate": 4.799999999999999e-08, "logits/chosen": -1.31764554977417, "logits/rejected": -1.3588660955429077, "logps/chosen": -88.85529327392578, "logps/rejected": -98.12319946289062, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": 0.034286849200725555, "rewards/margins": -0.03185552358627319, "rewards/rejected": 0.06614237278699875, "step": 24 }, { "epoch": 0.030112196887441187, "grad_norm": 103.03824203236205, "learning_rate": 5.2e-08, "logits/chosen": -1.2300812005996704, "logits/rejected": -1.3013834953308105, "logps/chosen": -136.9016571044922, "logps/rejected": -158.10443115234375, "loss": 0.6651, "rewards/accuracies": 0.5, "rewards/chosen": 0.02496076375246048, "rewards/margins": 0.11402938514947891, "rewards/rejected": -0.08906861394643784, "step": 26 }, { "epoch": 0.03242851972493666, "grad_norm": 82.73233095035695, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -1.2323240041732788, "logits/rejected": -1.279797911643982, "logps/chosen": -129.50372314453125, "logps/rejected": -149.10198974609375, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": 0.01655733771622181, "rewards/margins": 0.28547346591949463, "rewards/rejected": -0.26891613006591797, "step": 28 }, { "epoch": 0.03474484256243214, "grad_norm": 81.35170136123334, "learning_rate": 6e-08, "logits/chosen": -1.2328161001205444, "logits/rejected": -1.2630647420883179, "logps/chosen": -115.19596862792969, "logps/rejected": -122.42259216308594, "loss": 0.6297, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05066928267478943, "rewards/margins": 0.29739153385162354, "rewards/rejected": -0.24672222137451172, "step": 30 }, { "epoch": 0.03706116539992761, "grad_norm": 97.64254508804386, "learning_rate": 6.4e-08, "logits/chosen": -1.203454852104187, "logits/rejected": -1.2570163011550903, "logps/chosen": -130.0023193359375, "logps/rejected": -140.43641662597656, "loss": 0.6388, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22588226199150085, "rewards/margins": 0.21626117825508118, "rewards/rejected": 0.009621085599064827, "step": 32 }, { "epoch": 0.03937748823742309, "grad_norm": 77.47976131311218, "learning_rate": 6.8e-08, "logits/chosen": -1.3255200386047363, "logits/rejected": -1.34698486328125, "logps/chosen": -154.41970825195312, "logps/rejected": -185.44137573242188, "loss": 0.6644, "rewards/accuracies": 0.71875, "rewards/chosen": 0.219247967004776, "rewards/margins": 0.5436465740203857, "rewards/rejected": -0.32439863681793213, "step": 34 }, { "epoch": 0.04169381107491857, "grad_norm": 116.81809749704041, "learning_rate": 7.2e-08, "logits/chosen": -1.172755479812622, "logits/rejected": -1.2560861110687256, "logps/chosen": -140.07249450683594, "logps/rejected": -150.0348358154297, "loss": 0.6966, "rewards/accuracies": 0.625, "rewards/chosen": -0.06599292159080505, "rewards/margins": 0.17668266594409943, "rewards/rejected": -0.2426755726337433, "step": 36 }, { "epoch": 0.04401013391241404, "grad_norm": 106.17036044375986, "learning_rate": 7.599999999999999e-08, "logits/chosen": -1.2809267044067383, "logits/rejected": -1.3268154859542847, "logps/chosen": -150.99420166015625, "logps/rejected": -156.5166473388672, "loss": 0.6641, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0937371775507927, "rewards/margins": 0.24054178595542908, "rewards/rejected": -0.14680461585521698, "step": 38 }, { "epoch": 0.04632645674990952, "grad_norm": 92.41470133544318, "learning_rate": 8e-08, "logits/chosen": -1.117331862449646, "logits/rejected": -1.1234540939331055, "logps/chosen": -116.87013244628906, "logps/rejected": -136.917724609375, "loss": 0.6733, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0025692852213978767, "rewards/margins": 0.18151307106018066, "rewards/rejected": -0.17894375324249268, "step": 40 }, { "epoch": 0.048642779587405, "grad_norm": 93.22753554673558, "learning_rate": 8.4e-08, "logits/chosen": -1.2923729419708252, "logits/rejected": -1.297027587890625, "logps/chosen": -166.103759765625, "logps/rejected": -181.45489501953125, "loss": 0.7232, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10465113818645477, "rewards/margins": 0.18375912308692932, "rewards/rejected": -0.07910798490047455, "step": 42 }, { "epoch": 0.05095910242490047, "grad_norm": 106.85916991971588, "learning_rate": 8.8e-08, "logits/chosen": -1.1490933895111084, "logits/rejected": -1.1849195957183838, "logps/chosen": -128.70228576660156, "logps/rejected": -132.4073944091797, "loss": 0.6279, "rewards/accuracies": 0.71875, "rewards/chosen": 0.020340707153081894, "rewards/margins": 0.1922486573457718, "rewards/rejected": -0.1719079315662384, "step": 44 }, { "epoch": 0.053275425262395947, "grad_norm": 97.83169947406368, "learning_rate": 9.2e-08, "logits/chosen": -1.172762393951416, "logits/rejected": -1.2331753969192505, "logps/chosen": -158.21144104003906, "logps/rejected": -166.85243225097656, "loss": 0.678, "rewards/accuracies": 0.46875, "rewards/chosen": -0.032968997955322266, "rewards/margins": -0.025485830381512642, "rewards/rejected": -0.007483180612325668, "step": 46 }, { "epoch": 0.055591748099891425, "grad_norm": 95.31491574095924, "learning_rate": 9.599999999999999e-08, "logits/chosen": -1.1868703365325928, "logits/rejected": -1.1828659772872925, "logps/chosen": -124.27775573730469, "logps/rejected": -134.80194091796875, "loss": 0.6496, "rewards/accuracies": 0.65625, "rewards/chosen": 0.07946968823671341, "rewards/margins": 0.2973785996437073, "rewards/rejected": -0.21790890395641327, "step": 48 }, { "epoch": 0.057908070937386896, "grad_norm": 85.1406868444302, "learning_rate": 1e-07, "logits/chosen": -1.2733350992202759, "logits/rejected": -1.259924054145813, "logps/chosen": -98.49462890625, "logps/rejected": -97.88270568847656, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.0715535506606102, "rewards/margins": 0.20892760157585144, "rewards/rejected": -0.13737404346466064, "step": 50 }, { "epoch": 0.060224393774882375, "grad_norm": 125.25778955296133, "learning_rate": 1.04e-07, "logits/chosen": -1.149878740310669, "logits/rejected": -1.2266716957092285, "logps/chosen": -148.4358367919922, "logps/rejected": -194.48550415039062, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20534949004650116, "rewards/margins": 0.5171413421630859, "rewards/rejected": -0.31179192662239075, "step": 52 }, { "epoch": 0.06254071661237785, "grad_norm": 85.1210580902555, "learning_rate": 1.08e-07, "logits/chosen": -1.167691946029663, "logits/rejected": -1.2391445636749268, "logps/chosen": -89.7966537475586, "logps/rejected": -90.55988311767578, "loss": 0.652, "rewards/accuracies": 0.53125, "rewards/chosen": 0.041167087852954865, "rewards/margins": 0.031343974173069, "rewards/rejected": 0.009823101572692394, "step": 54 }, { "epoch": 0.06485703944987332, "grad_norm": 113.7038348505988, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -1.1915769577026367, "logits/rejected": -1.2897090911865234, "logps/chosen": -128.1698760986328, "logps/rejected": -162.2020721435547, "loss": 0.6575, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04976767301559448, "rewards/margins": 0.225263312458992, "rewards/rejected": -0.17549563944339752, "step": 56 }, { "epoch": 0.0671733622873688, "grad_norm": 101.75813362292396, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -1.1701388359069824, "logits/rejected": -1.2366788387298584, "logps/chosen": -125.07969665527344, "logps/rejected": -143.41500854492188, "loss": 0.6856, "rewards/accuracies": 0.46875, "rewards/chosen": 0.10177399218082428, "rewards/margins": -0.0010812487453222275, "rewards/rejected": 0.10285523533821106, "step": 58 }, { "epoch": 0.06948968512486428, "grad_norm": 126.89350965014833, "learning_rate": 1.2e-07, "logits/chosen": -1.0981509685516357, "logits/rejected": -1.0755817890167236, "logps/chosen": -97.16602325439453, "logps/rejected": -108.92754364013672, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06149844080209732, "rewards/margins": 0.26494595408439636, "rewards/rejected": -0.20344750583171844, "step": 60 }, { "epoch": 0.07180600796235975, "grad_norm": 104.04126418655044, "learning_rate": 1.24e-07, "logits/chosen": -1.349946141242981, "logits/rejected": -1.3422037363052368, "logps/chosen": -196.04966735839844, "logps/rejected": -191.64556884765625, "loss": 0.6639, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06520761549472809, "rewards/margins": 0.08913937956094742, "rewards/rejected": -0.1543469876050949, "step": 62 }, { "epoch": 0.07412233079985522, "grad_norm": 88.57358643267023, "learning_rate": 1.28e-07, "logits/chosen": -1.301114559173584, "logits/rejected": -1.3226174116134644, "logps/chosen": -193.7246856689453, "logps/rejected": -175.90676879882812, "loss": 0.7147, "rewards/accuracies": 0.65625, "rewards/chosen": -0.012029323726892471, "rewards/margins": 0.13698697090148926, "rewards/rejected": -0.14901632070541382, "step": 64 }, { "epoch": 0.07643865363735071, "grad_norm": 88.52081037901337, "learning_rate": 1.32e-07, "logits/chosen": -1.2255011796951294, "logits/rejected": -1.1852511167526245, "logps/chosen": -126.35673522949219, "logps/rejected": -129.8230743408203, "loss": 0.6276, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08929681777954102, "rewards/margins": 0.20648589730262756, "rewards/rejected": -0.29578274488449097, "step": 66 }, { "epoch": 0.07875497647484618, "grad_norm": 99.62619229467569, "learning_rate": 1.36e-07, "logits/chosen": -1.2398712635040283, "logits/rejected": -1.2752665281295776, "logps/chosen": -120.943115234375, "logps/rejected": -146.46226501464844, "loss": 0.6609, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15306755900382996, "rewards/margins": 0.28030622005462646, "rewards/rejected": -0.12723864614963531, "step": 68 }, { "epoch": 0.08107129931234165, "grad_norm": 123.76564817180537, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -1.2972962856292725, "logits/rejected": -1.2252490520477295, "logps/chosen": -225.8544921875, "logps/rejected": -202.05828857421875, "loss": 0.6665, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04239021614193916, "rewards/margins": 0.21172578632831573, "rewards/rejected": -0.2541159689426422, "step": 70 }, { "epoch": 0.08338762214983714, "grad_norm": 76.83779340876126, "learning_rate": 1.44e-07, "logits/chosen": -1.2308259010314941, "logits/rejected": -1.2874609231948853, "logps/chosen": -122.35316467285156, "logps/rejected": -140.2095489501953, "loss": 0.6382, "rewards/accuracies": 0.40625, "rewards/chosen": -0.03576444834470749, "rewards/margins": 0.04666022211313248, "rewards/rejected": -0.08242467045783997, "step": 72 }, { "epoch": 0.08570394498733261, "grad_norm": 99.14395654649428, "learning_rate": 1.48e-07, "logits/chosen": -1.2242047786712646, "logits/rejected": -1.1648895740509033, "logps/chosen": -129.6905975341797, "logps/rejected": -149.36679077148438, "loss": 0.7052, "rewards/accuracies": 0.46875, "rewards/chosen": -0.054999418556690216, "rewards/margins": -0.0660078227519989, "rewards/rejected": 0.011008389294147491, "step": 74 }, { "epoch": 0.08802026782482808, "grad_norm": 97.12896274616604, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -1.1600117683410645, "logits/rejected": -1.1804448366165161, "logps/chosen": -104.76295471191406, "logps/rejected": -119.62078857421875, "loss": 0.7319, "rewards/accuracies": 0.53125, "rewards/chosen": 0.09297358989715576, "rewards/margins": -0.027700770646333694, "rewards/rejected": 0.12067436426877975, "step": 76 }, { "epoch": 0.09033659066232357, "grad_norm": 97.18348074718145, "learning_rate": 1.56e-07, "logits/chosen": -1.32283616065979, "logits/rejected": -1.3598227500915527, "logps/chosen": -106.35183715820312, "logps/rejected": -126.29917907714844, "loss": 0.6492, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02818872779607773, "rewards/margins": 0.018800366669893265, "rewards/rejected": -0.046989068388938904, "step": 78 }, { "epoch": 0.09265291349981904, "grad_norm": 97.61734233459106, "learning_rate": 1.6e-07, "logits/chosen": -1.271921157836914, "logits/rejected": -1.2778291702270508, "logps/chosen": -131.4869384765625, "logps/rejected": -138.66790771484375, "loss": 0.674, "rewards/accuracies": 0.59375, "rewards/chosen": 0.12225940823554993, "rewards/margins": 0.13028502464294434, "rewards/rejected": -0.008025608956813812, "step": 80 }, { "epoch": 0.09496923633731451, "grad_norm": 107.33609007479366, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -1.2688210010528564, "logits/rejected": -1.157952904701233, "logps/chosen": -139.7398223876953, "logps/rejected": -118.81407928466797, "loss": 0.6931, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09918585419654846, "rewards/margins": 0.21009066700935364, "rewards/rejected": -0.11090480536222458, "step": 82 }, { "epoch": 0.09728555917481, "grad_norm": 110.01597856385645, "learning_rate": 1.68e-07, "logits/chosen": -1.2803689241409302, "logits/rejected": -1.2909519672393799, "logps/chosen": -132.66493225097656, "logps/rejected": -173.28465270996094, "loss": 0.6635, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05728979408740997, "rewards/margins": 0.36615562438964844, "rewards/rejected": -0.30886584520339966, "step": 84 }, { "epoch": 0.09960188201230546, "grad_norm": 83.44332939168942, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -1.3057663440704346, "logits/rejected": -1.385368824005127, "logps/chosen": -107.35197448730469, "logps/rejected": -131.16021728515625, "loss": 0.6259, "rewards/accuracies": 0.75, "rewards/chosen": 0.11738797277212143, "rewards/margins": 0.3890346884727478, "rewards/rejected": -0.2716467082500458, "step": 86 }, { "epoch": 0.10191820484980094, "grad_norm": 111.78643885213532, "learning_rate": 1.76e-07, "logits/chosen": -1.2848472595214844, "logits/rejected": -1.3411585092544556, "logps/chosen": -122.2959976196289, "logps/rejected": -137.3688507080078, "loss": 0.7167, "rewards/accuracies": 0.4375, "rewards/chosen": -0.042663730680942535, "rewards/margins": 0.04965965449810028, "rewards/rejected": -0.09232338517904282, "step": 88 }, { "epoch": 0.10423452768729642, "grad_norm": 98.91788911999603, "learning_rate": 1.8e-07, "logits/chosen": -1.3746676445007324, "logits/rejected": -1.4242061376571655, "logps/chosen": -165.28323364257812, "logps/rejected": -191.28463745117188, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -0.03775983303785324, "rewards/margins": 0.41289234161376953, "rewards/rejected": -0.45065221190452576, "step": 90 }, { "epoch": 0.10655085052479189, "grad_norm": 111.36490143512665, "learning_rate": 1.84e-07, "logits/chosen": -1.3450393676757812, "logits/rejected": -1.2920035123825073, "logps/chosen": -113.93867492675781, "logps/rejected": -127.0622787475586, "loss": 0.652, "rewards/accuracies": 0.59375, "rewards/chosen": 0.016492784023284912, "rewards/margins": 0.23039251565933228, "rewards/rejected": -0.21389973163604736, "step": 92 }, { "epoch": 0.10886717336228736, "grad_norm": 86.77836554288824, "learning_rate": 1.88e-07, "logits/chosen": -1.2809118032455444, "logits/rejected": -1.2533433437347412, "logps/chosen": -126.3050308227539, "logps/rejected": -146.13954162597656, "loss": 0.67, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12605877220630646, "rewards/margins": 0.10366299748420715, "rewards/rejected": -0.2297217696905136, "step": 94 }, { "epoch": 0.11118349619978285, "grad_norm": 88.7771339602004, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -1.3726493120193481, "logits/rejected": -1.3574461936950684, "logps/chosen": -166.51853942871094, "logps/rejected": -152.40924072265625, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": 0.10408200323581696, "rewards/margins": 0.2763758897781372, "rewards/rejected": -0.17229388654232025, "step": 96 }, { "epoch": 0.11349981903727832, "grad_norm": 108.88629440565376, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -1.2083911895751953, "logits/rejected": -1.2246620655059814, "logps/chosen": -102.56442260742188, "logps/rejected": -119.50433349609375, "loss": 0.6727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12194587290287018, "rewards/margins": -0.01594824716448784, "rewards/rejected": -0.10599763691425323, "step": 98 }, { "epoch": 0.11581614187477379, "grad_norm": 107.34841983393876, "learning_rate": 2e-07, "logits/chosen": -1.301100254058838, "logits/rejected": -1.2448687553405762, "logps/chosen": -98.93006896972656, "logps/rejected": -104.18254089355469, "loss": 0.6665, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13594529032707214, "rewards/margins": 0.32611793279647827, "rewards/rejected": -0.19017262756824493, "step": 100 }, { "epoch": 0.11581614187477379, "eval_logits/chosen": -1.2619309425354004, "eval_logits/rejected": -1.26002836227417, "eval_logps/chosen": -136.7689208984375, "eval_logps/rejected": -136.49838256835938, "eval_loss": 0.698452353477478, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.11377277970314026, "eval_rewards/margins": 0.14070965349674225, "eval_rewards/rejected": -0.2544824182987213, "eval_runtime": 24.9769, "eval_samples_per_second": 4.004, "eval_steps_per_second": 1.001, "step": 100 }, { "epoch": 0.11813246471226928, "grad_norm": 116.33900974943187, "learning_rate": 1.9999925339977214e-07, "logits/chosen": -1.184662938117981, "logits/rejected": -1.2719396352767944, "logps/chosen": -97.52359008789062, "logps/rejected": -115.05503845214844, "loss": 0.7448, "rewards/accuracies": 0.53125, "rewards/chosen": 0.214466392993927, "rewards/margins": 0.24747245013713837, "rewards/rejected": -0.033006034791469574, "step": 102 }, { "epoch": 0.12044878754976475, "grad_norm": 104.21886206321497, "learning_rate": 1.9999701361023685e-07, "logits/chosen": -1.3039063215255737, "logits/rejected": -1.3020060062408447, "logps/chosen": -159.8837432861328, "logps/rejected": -170.56915283203125, "loss": 0.6355, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22545436024665833, "rewards/margins": 0.24028439819812775, "rewards/rejected": -0.014830047264695168, "step": 104 }, { "epoch": 0.12276511038726022, "grad_norm": 87.7482533485662, "learning_rate": 1.9999328066483861e-07, "logits/chosen": -1.222218632698059, "logits/rejected": -1.2927945852279663, "logps/chosen": -139.34170532226562, "logps/rejected": -165.7588348388672, "loss": 0.6163, "rewards/accuracies": 0.59375, "rewards/chosen": 0.08087025582790375, "rewards/margins": 0.19403491914272308, "rewards/rejected": -0.11316468566656113, "step": 106 }, { "epoch": 0.1250814332247557, "grad_norm": 99.23120967868726, "learning_rate": 1.9998805461931787e-07, "logits/chosen": -1.4282017946243286, "logits/rejected": -1.4305387735366821, "logps/chosen": -143.5193634033203, "logps/rejected": -159.5055389404297, "loss": 0.6448, "rewards/accuracies": 0.65625, "rewards/chosen": 0.19475266337394714, "rewards/margins": 0.35647571086883545, "rewards/rejected": -0.1617230772972107, "step": 108 }, { "epoch": 0.12739775606225118, "grad_norm": 138.8867591897412, "learning_rate": 1.9998133555170987e-07, "logits/chosen": -1.251868486404419, "logits/rejected": -1.2644716501235962, "logps/chosen": -155.47824096679688, "logps/rejected": -147.9483642578125, "loss": 0.7241, "rewards/accuracies": 0.65625, "rewards/chosen": 0.009776605293154716, "rewards/margins": 0.11597751080989838, "rewards/rejected": -0.1062009185552597, "step": 110 }, { "epoch": 0.12971407889974665, "grad_norm": 83.82733566843218, "learning_rate": 1.9997312356234383e-07, "logits/chosen": -1.1328340768814087, "logits/rejected": -1.1860363483428955, "logps/chosen": -122.01078796386719, "logps/rejected": -129.853759765625, "loss": 0.6493, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04629912227392197, "rewards/margins": 0.14483632147312164, "rewards/rejected": -0.09853721410036087, "step": 112 }, { "epoch": 0.13203040173724212, "grad_norm": 93.13036576501773, "learning_rate": 1.9996341877384118e-07, "logits/chosen": -1.1620995998382568, "logits/rejected": -1.1975159645080566, "logps/chosen": -106.84819793701172, "logps/rejected": -118.56710815429688, "loss": 0.7035, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0475483313202858, "rewards/margins": 0.27000802755355835, "rewards/rejected": -0.22245970368385315, "step": 114 }, { "epoch": 0.1343467245747376, "grad_norm": 91.76740971647824, "learning_rate": 1.9995222133111386e-07, "logits/chosen": -1.41290283203125, "logits/rejected": -1.4713375568389893, "logps/chosen": -150.1622314453125, "logps/rejected": -171.03826904296875, "loss": 0.6605, "rewards/accuracies": 0.625, "rewards/chosen": -0.036708392202854156, "rewards/margins": 0.22243811190128326, "rewards/rejected": -0.259146511554718, "step": 116 }, { "epoch": 0.1366630474122331, "grad_norm": 92.6178794136077, "learning_rate": 1.9993953140136216e-07, "logits/chosen": -1.2917448282241821, "logits/rejected": -1.2992435693740845, "logps/chosen": -156.93075561523438, "logps/rejected": -169.90003967285156, "loss": 0.6092, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2660582661628723, "rewards/margins": 0.3958339989185333, "rewards/rejected": -0.1297757476568222, "step": 118 }, { "epoch": 0.13897937024972856, "grad_norm": 120.21640303391953, "learning_rate": 1.9992534917407219e-07, "logits/chosen": -1.349656105041504, "logits/rejected": -1.3740586042404175, "logps/chosen": -105.40829467773438, "logps/rejected": -121.30719757080078, "loss": 0.6782, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12624263763427734, "rewards/margins": 0.2478535771369934, "rewards/rejected": -0.12161093950271606, "step": 120 }, { "epoch": 0.14129569308722403, "grad_norm": 96.89881415216858, "learning_rate": 1.9990967486101294e-07, "logits/chosen": -1.2133190631866455, "logits/rejected": -1.2981417179107666, "logps/chosen": -143.4733123779297, "logps/rejected": -158.9942169189453, "loss": 0.6542, "rewards/accuracies": 0.78125, "rewards/chosen": 0.12114143371582031, "rewards/margins": 0.3096562922000885, "rewards/rejected": -0.18851490318775177, "step": 122 }, { "epoch": 0.1436120159247195, "grad_norm": 115.73216474991814, "learning_rate": 1.9989250869623339e-07, "logits/chosen": -1.1692413091659546, "logits/rejected": -1.1631613969802856, "logps/chosen": -75.05902099609375, "logps/rejected": -73.19275665283203, "loss": 0.6629, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004932025447487831, "rewards/margins": 0.01587551087141037, "rewards/rejected": -0.010943496599793434, "step": 124 }, { "epoch": 0.14592833876221498, "grad_norm": 91.12287746621037, "learning_rate": 1.9987385093605883e-07, "logits/chosen": -1.2355098724365234, "logits/rejected": -1.3346201181411743, "logps/chosen": -128.04537963867188, "logps/rejected": -157.95831298828125, "loss": 0.6661, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16255205869674683, "rewards/margins": 0.1615760326385498, "rewards/rejected": 0.0009760260581970215, "step": 126 }, { "epoch": 0.14824466159971045, "grad_norm": 75.57776368919416, "learning_rate": 1.9985370185908693e-07, "logits/chosen": -1.324599266052246, "logits/rejected": -1.4256994724273682, "logps/chosen": -160.19496154785156, "logps/rejected": -195.67481994628906, "loss": 0.6052, "rewards/accuracies": 0.625, "rewards/chosen": 0.2864247262477875, "rewards/margins": 0.38650280237197876, "rewards/rejected": -0.10007809102535248, "step": 128 }, { "epoch": 0.15056098443720595, "grad_norm": 99.17567816074047, "learning_rate": 1.9983206176618386e-07, "logits/chosen": -1.3276948928833008, "logits/rejected": -1.355118751525879, "logps/chosen": -150.64071655273438, "logps/rejected": -160.59710693359375, "loss": 0.6456, "rewards/accuracies": 0.59375, "rewards/chosen": 0.24799679219722748, "rewards/margins": 0.17287284135818481, "rewards/rejected": 0.07512396574020386, "step": 130 }, { "epoch": 0.15287730727470142, "grad_norm": 96.98870449195998, "learning_rate": 1.9980893098047952e-07, "logits/chosen": -1.3167519569396973, "logits/rejected": -1.3964886665344238, "logps/chosen": -180.10609436035156, "logps/rejected": -202.38543701171875, "loss": 0.6797, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005836776457726955, "rewards/margins": 0.22576376795768738, "rewards/rejected": -0.2316005378961563, "step": 132 }, { "epoch": 0.1551936301121969, "grad_norm": 117.99608582102925, "learning_rate": 1.9978430984736302e-07, "logits/chosen": -1.2505745887756348, "logits/rejected": -1.2896987199783325, "logps/chosen": -136.62095642089844, "logps/rejected": -174.75283813476562, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007682096213102341, "rewards/margins": 0.22923235595226288, "rewards/rejected": -0.2369144707918167, "step": 134 }, { "epoch": 0.15750995294969236, "grad_norm": 96.12339888540123, "learning_rate": 1.9975819873447716e-07, "logits/chosen": -1.3039547204971313, "logits/rejected": -1.2686259746551514, "logps/chosen": -148.68177795410156, "logps/rejected": -152.08419799804688, "loss": 0.7376, "rewards/accuracies": 0.4375, "rewards/chosen": 0.060760702937841415, "rewards/margins": 0.03407803922891617, "rewards/rejected": 0.026682645082473755, "step": 136 }, { "epoch": 0.15982627578718783, "grad_norm": 89.28907762268612, "learning_rate": 1.9973059803171318e-07, "logits/chosen": -1.289531946182251, "logits/rejected": -1.3106164932250977, "logps/chosen": -132.0419464111328, "logps/rejected": -141.20553588867188, "loss": 0.6417, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11375564336776733, "rewards/margins": 0.15943853557109833, "rewards/rejected": -0.045682892203330994, "step": 138 }, { "epoch": 0.1621425986246833, "grad_norm": 91.84482179377031, "learning_rate": 1.9970150815120492e-07, "logits/chosen": -1.192561388015747, "logits/rejected": -1.2516599893569946, "logps/chosen": -144.32687377929688, "logps/rejected": -173.77713012695312, "loss": 0.6664, "rewards/accuracies": 0.5625, "rewards/chosen": 0.15235400199890137, "rewards/margins": 0.528164267539978, "rewards/rejected": -0.3758102059364319, "step": 140 }, { "epoch": 0.1644589214621788, "grad_norm": 106.57145249868728, "learning_rate": 1.9967092952732263e-07, "logits/chosen": -1.2669620513916016, "logits/rejected": -1.2892065048217773, "logps/chosen": -158.38601684570312, "logps/rejected": -170.94969177246094, "loss": 0.6949, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014247164130210876, "rewards/margins": 0.1061759814620018, "rewards/rejected": -0.12042315304279327, "step": 142 }, { "epoch": 0.16677524429967427, "grad_norm": 84.38620972657165, "learning_rate": 1.9963886261666644e-07, "logits/chosen": -1.3349426984786987, "logits/rejected": -1.3626400232315063, "logps/chosen": -182.36489868164062, "logps/rejected": -181.57772827148438, "loss": 0.6661, "rewards/accuracies": 0.5, "rewards/chosen": 0.010849647223949432, "rewards/margins": -0.0017997026443481445, "rewards/rejected": 0.012649361044168472, "step": 144 }, { "epoch": 0.16909156713716975, "grad_norm": 98.62106551824567, "learning_rate": 1.996053078980596e-07, "logits/chosen": -1.2585192918777466, "logits/rejected": -1.3876447677612305, "logps/chosen": -197.3888702392578, "logps/rejected": -256.7153625488281, "loss": 0.6226, "rewards/accuracies": 0.59375, "rewards/chosen": 0.11644049733877182, "rewards/margins": 0.36041852831840515, "rewards/rejected": -0.24397803843021393, "step": 146 }, { "epoch": 0.17140788997466522, "grad_norm": 98.22080984411147, "learning_rate": 1.9957026587254133e-07, "logits/chosen": -1.337060809135437, "logits/rejected": -1.409785270690918, "logps/chosen": -132.1688995361328, "logps/rejected": -153.75076293945312, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": 0.1054624617099762, "rewards/margins": 0.2318607121706009, "rewards/rejected": -0.1263982504606247, "step": 148 }, { "epoch": 0.1737242128121607, "grad_norm": 97.21935367376963, "learning_rate": 1.9953373706335933e-07, "logits/chosen": -1.1929289102554321, "logits/rejected": -1.2236409187316895, "logps/chosen": -107.02670288085938, "logps/rejected": -127.95613098144531, "loss": 0.6247, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2545817792415619, "rewards/margins": 0.5042745471000671, "rewards/rejected": -0.24969279766082764, "step": 150 }, { "epoch": 0.17604053564965616, "grad_norm": 94.64988218894314, "learning_rate": 1.994957220159619e-07, "logits/chosen": -1.3698749542236328, "logits/rejected": -1.3630839586257935, "logps/chosen": -139.28404235839844, "logps/rejected": -144.27447509765625, "loss": 0.6708, "rewards/accuracies": 0.46875, "rewards/chosen": 0.09452275186777115, "rewards/margins": -0.054528601467609406, "rewards/rejected": 0.14905135333538055, "step": 152 }, { "epoch": 0.17835685848715166, "grad_norm": 84.21031983415891, "learning_rate": 1.9945622129798997e-07, "logits/chosen": -1.1571345329284668, "logits/rejected": -1.1760966777801514, "logps/chosen": -116.92776489257812, "logps/rejected": -130.6823272705078, "loss": 0.7077, "rewards/accuracies": 0.65625, "rewards/chosen": 0.014008231461048126, "rewards/margins": 0.1772165149450302, "rewards/rejected": -0.16320832073688507, "step": 154 }, { "epoch": 0.18067318132464713, "grad_norm": 92.78782165706316, "learning_rate": 1.994152354992684e-07, "logits/chosen": -1.2113721370697021, "logits/rejected": -1.1836888790130615, "logps/chosen": -152.64590454101562, "logps/rejected": -165.62060546875, "loss": 0.6243, "rewards/accuracies": 0.625, "rewards/chosen": 0.08571603149175644, "rewards/margins": 0.3834956884384155, "rewards/rejected": -0.2977796792984009, "step": 156 }, { "epoch": 0.1829895041621426, "grad_norm": 87.13678534194861, "learning_rate": 1.993727652317973e-07, "logits/chosen": -1.3516209125518799, "logits/rejected": -1.3507136106491089, "logps/chosen": -131.21653747558594, "logps/rejected": -137.61451721191406, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": 0.1327284872531891, "rewards/margins": 0.0872359424829483, "rewards/rejected": 0.04549254849553108, "step": 158 }, { "epoch": 0.18530582699963807, "grad_norm": 95.58101049566258, "learning_rate": 1.9932881112974295e-07, "logits/chosen": -1.33234441280365, "logits/rejected": -1.3648202419281006, "logps/chosen": -145.8714141845703, "logps/rejected": -169.70263671875, "loss": 0.6228, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2145387828350067, "rewards/margins": 0.47029000520706177, "rewards/rejected": -0.25575125217437744, "step": 160 }, { "epoch": 0.18762214983713354, "grad_norm": 102.72583148528975, "learning_rate": 1.9928337384942815e-07, "logits/chosen": -1.3217285871505737, "logits/rejected": -1.3136992454528809, "logps/chosen": -136.3543701171875, "logps/rejected": -162.57989501953125, "loss": 0.6413, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10585685819387436, "rewards/margins": 0.4493997395038605, "rewards/rejected": -0.3435429036617279, "step": 162 }, { "epoch": 0.18993847267462902, "grad_norm": 80.21906160407148, "learning_rate": 1.992364540693226e-07, "logits/chosen": -1.1656920909881592, "logits/rejected": -1.251837134361267, "logps/chosen": -107.77313995361328, "logps/rejected": -126.16749572753906, "loss": 0.6541, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11644032597541809, "rewards/margins": 0.2288074493408203, "rewards/rejected": -0.11236711591482162, "step": 164 }, { "epoch": 0.19225479551212452, "grad_norm": 86.51541074826133, "learning_rate": 1.991880524900327e-07, "logits/chosen": -1.3202519416809082, "logits/rejected": -1.3556690216064453, "logps/chosen": -161.82077026367188, "logps/rejected": -177.78990173339844, "loss": 0.6568, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14771340787410736, "rewards/margins": 0.33536669611930847, "rewards/rejected": -0.1876532882452011, "step": 166 }, { "epoch": 0.19457111834962, "grad_norm": 94.05305295088043, "learning_rate": 1.99138169834291e-07, "logits/chosen": -1.3505268096923828, "logits/rejected": -1.3973050117492676, "logps/chosen": -139.77182006835938, "logps/rejected": -162.59844970703125, "loss": 0.6877, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07011064141988754, "rewards/margins": 0.491350919008255, "rewards/rejected": -0.42124032974243164, "step": 168 }, { "epoch": 0.19688744118711546, "grad_norm": 97.64767459159383, "learning_rate": 1.9908680684694557e-07, "logits/chosen": -1.3445894718170166, "logits/rejected": -1.290609359741211, "logps/chosen": -141.98887634277344, "logps/rejected": -149.97586059570312, "loss": 0.6267, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06746640801429749, "rewards/margins": 0.2343478798866272, "rewards/rejected": -0.1668814867734909, "step": 170 }, { "epoch": 0.19920376402461093, "grad_norm": 83.46398544914094, "learning_rate": 1.990339642949488e-07, "logits/chosen": -1.2715935707092285, "logits/rejected": -1.287178874015808, "logps/chosen": -148.58975219726562, "logps/rejected": -165.84759521484375, "loss": 0.63, "rewards/accuracies": 0.65625, "rewards/chosen": 0.021569374948740005, "rewards/margins": 0.302487850189209, "rewards/rejected": -0.2809184491634369, "step": 172 }, { "epoch": 0.2015200868621064, "grad_norm": 76.3322328554944, "learning_rate": 1.9897964296734585e-07, "logits/chosen": -1.3700447082519531, "logits/rejected": -1.342909336090088, "logps/chosen": -120.27708435058594, "logps/rejected": -138.94029235839844, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": 0.13506034016609192, "rewards/margins": 0.2544079124927521, "rewards/rejected": -0.11934758722782135, "step": 174 }, { "epoch": 0.20383640969960187, "grad_norm": 107.54391320136774, "learning_rate": 1.9892384367526306e-07, "logits/chosen": -1.295619249343872, "logits/rejected": -1.310873031616211, "logps/chosen": -121.80717468261719, "logps/rejected": -130.56863403320312, "loss": 0.6713, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04329588636755943, "rewards/margins": 0.15788349509239197, "rewards/rejected": -0.11458761245012283, "step": 176 }, { "epoch": 0.20615273253709737, "grad_norm": 119.93284403188072, "learning_rate": 1.9886656725189573e-07, "logits/chosen": -1.279773473739624, "logits/rejected": -1.248420000076294, "logps/chosen": -140.36367797851562, "logps/rejected": -171.85006713867188, "loss": 0.6378, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0794132649898529, "rewards/margins": 0.44784438610076904, "rewards/rejected": -0.36843112111091614, "step": 178 }, { "epoch": 0.20846905537459284, "grad_norm": 79.14950979441363, "learning_rate": 1.9880781455249567e-07, "logits/chosen": -1.2967725992202759, "logits/rejected": -1.25602126121521, "logps/chosen": -144.56265258789062, "logps/rejected": -162.73004150390625, "loss": 0.6249, "rewards/accuracies": 0.71875, "rewards/chosen": -0.049035411328077316, "rewards/margins": 0.5332698822021484, "rewards/rejected": -0.5823052525520325, "step": 180 }, { "epoch": 0.21078537821208831, "grad_norm": 98.83687345993943, "learning_rate": 1.9874758645435846e-07, "logits/chosen": -1.2088567018508911, "logits/rejected": -1.2226208448410034, "logps/chosen": -123.06584930419922, "logps/rejected": -139.6479034423828, "loss": 0.7016, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08502492308616638, "rewards/margins": 0.19190920889377594, "rewards/rejected": -0.10688426345586777, "step": 182 }, { "epoch": 0.21310170104958379, "grad_norm": 113.97507708078155, "learning_rate": 1.986858838568103e-07, "logits/chosen": -1.3104774951934814, "logits/rejected": -1.4111891984939575, "logps/chosen": -181.88356018066406, "logps/rejected": -208.72584533691406, "loss": 0.705, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08081494271755219, "rewards/margins": 0.3886911869049072, "rewards/rejected": -0.469506174325943, "step": 184 }, { "epoch": 0.21541802388707926, "grad_norm": 77.45086288721679, "learning_rate": 1.986227076811947e-07, "logits/chosen": -1.3617669343948364, "logits/rejected": -1.4249626398086548, "logps/chosen": -129.08343505859375, "logps/rejected": -152.81179809570312, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": 0.05278458073735237, "rewards/margins": 0.2645794153213501, "rewards/rejected": -0.21179485321044922, "step": 186 }, { "epoch": 0.21773434672457473, "grad_norm": 84.35008555939693, "learning_rate": 1.985580588708586e-07, "logits/chosen": -1.3356997966766357, "logits/rejected": -1.3520371913909912, "logps/chosen": -116.34514617919922, "logps/rejected": -129.33343505859375, "loss": 0.5923, "rewards/accuracies": 0.75, "rewards/chosen": 0.2643883526325226, "rewards/margins": 0.6749240159988403, "rewards/rejected": -0.41053569316864014, "step": 188 }, { "epoch": 0.2200506695620702, "grad_norm": 123.38101336984641, "learning_rate": 1.984919383911383e-07, "logits/chosen": -1.308461308479309, "logits/rejected": -1.304238200187683, "logps/chosen": -158.46170043945312, "logps/rejected": -180.0099639892578, "loss": 0.6699, "rewards/accuracies": 0.625, "rewards/chosen": 0.2162826955318451, "rewards/margins": 0.2433900237083435, "rewards/rejected": -0.02710733562707901, "step": 190 }, { "epoch": 0.2223669923995657, "grad_norm": 86.07588724640635, "learning_rate": 1.9842434722934515e-07, "logits/chosen": -1.4127757549285889, "logits/rejected": -1.4460113048553467, "logps/chosen": -157.5387420654297, "logps/rejected": -172.41824340820312, "loss": 0.6669, "rewards/accuracies": 0.59375, "rewards/chosen": 0.1006392389535904, "rewards/margins": 0.23149970173835754, "rewards/rejected": -0.13086043298244476, "step": 192 }, { "epoch": 0.22468331523706117, "grad_norm": 94.64802234991522, "learning_rate": 1.9835528639475064e-07, "logits/chosen": -1.388432264328003, "logits/rejected": -1.4818886518478394, "logps/chosen": -141.1114959716797, "logps/rejected": -157.92218017578125, "loss": 0.697, "rewards/accuracies": 0.625, "rewards/chosen": 0.01169365644454956, "rewards/margins": 0.08859987556934357, "rewards/rejected": -0.076906219124794, "step": 194 }, { "epoch": 0.22699963807455664, "grad_norm": 89.06679411091595, "learning_rate": 1.9828475691857144e-07, "logits/chosen": -1.3492029905319214, "logits/rejected": -1.3424463272094727, "logps/chosen": -187.03414916992188, "logps/rejected": -190.73678588867188, "loss": 0.6388, "rewards/accuracies": 0.59375, "rewards/chosen": 0.22039715945720673, "rewards/margins": 0.18239110708236694, "rewards/rejected": 0.038006074726581573, "step": 196 }, { "epoch": 0.2293159609120521, "grad_norm": 95.75837422835144, "learning_rate": 1.982127598539541e-07, "logits/chosen": -1.1775927543640137, "logits/rejected": -1.1424531936645508, "logps/chosen": -148.27951049804688, "logps/rejected": -156.52732849121094, "loss": 0.6159, "rewards/accuracies": 0.53125, "rewards/chosen": 0.1567731648683548, "rewards/margins": 0.08035247027873993, "rewards/rejected": 0.07642071694135666, "step": 198 }, { "epoch": 0.23163228374954759, "grad_norm": 93.08435428016394, "learning_rate": 1.9813929627595906e-07, "logits/chosen": -1.4000098705291748, "logits/rejected": -1.3821650743484497, "logps/chosen": -139.89630126953125, "logps/rejected": -148.32470703125, "loss": 0.6393, "rewards/accuracies": 0.625, "rewards/chosen": 0.09745461493730545, "rewards/margins": 0.24615491926670074, "rewards/rejected": -0.14870032668113708, "step": 200 }, { "epoch": 0.23163228374954759, "eval_logits/chosen": -1.304500937461853, "eval_logits/rejected": -1.3002585172653198, "eval_logps/chosen": -136.9430694580078, "eval_logps/rejected": -136.71730041503906, "eval_loss": 0.6824392080307007, "eval_rewards/accuracies": 0.5600000023841858, "eval_rewards/chosen": -0.13118763267993927, "eval_rewards/margins": 0.14518636465072632, "eval_rewards/rejected": -0.2763740122318268, "eval_runtime": 26.1287, "eval_samples_per_second": 3.827, "eval_steps_per_second": 0.957, "step": 200 }, { "epoch": 0.23394860658704306, "grad_norm": 72.45361347245692, "learning_rate": 1.9806436728154483e-07, "logits/chosen": -1.335165023803711, "logits/rejected": -1.3661694526672363, "logps/chosen": -138.936767578125, "logps/rejected": -158.40025329589844, "loss": 0.6013, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007253367453813553, "rewards/margins": 0.15234613418579102, "rewards/rejected": -0.15959949791431427, "step": 202 }, { "epoch": 0.23626492942453856, "grad_norm": 90.8540277710046, "learning_rate": 1.9798797398955145e-07, "logits/chosen": -1.2384086847305298, "logits/rejected": -1.3494983911514282, "logps/chosen": -128.23599243164062, "logps/rejected": -177.1882781982422, "loss": 0.564, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33455580472946167, "rewards/margins": 0.7392863035202026, "rewards/rejected": -0.40473055839538574, "step": 204 }, { "epoch": 0.23858125226203403, "grad_norm": 74.43794613270396, "learning_rate": 1.9791011754068395e-07, "logits/chosen": -1.3185287714004517, "logits/rejected": -1.3400077819824219, "logps/chosen": -131.19667053222656, "logps/rejected": -163.21923828125, "loss": 0.6032, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1290275603532791, "rewards/margins": 0.47639700770378113, "rewards/rejected": -0.3473694622516632, "step": 206 }, { "epoch": 0.2408975750995295, "grad_norm": 95.09010015175441, "learning_rate": 1.9783079909749514e-07, "logits/chosen": -1.1546052694320679, "logits/rejected": -1.2192610502243042, "logps/chosen": -166.31753540039062, "logps/rejected": -186.45753479003906, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": 0.3212764859199524, "rewards/margins": 0.19196587800979614, "rewards/rejected": 0.12931060791015625, "step": 208 }, { "epoch": 0.24321389793702497, "grad_norm": 85.95636909904059, "learning_rate": 1.9775001984436842e-07, "logits/chosen": -1.2965327501296997, "logits/rejected": -1.3256494998931885, "logps/chosen": -150.2991943359375, "logps/rejected": -156.32376098632812, "loss": 0.6469, "rewards/accuracies": 0.375, "rewards/chosen": -0.29619836807250977, "rewards/margins": 0.0699533224105835, "rewards/rejected": -0.36615169048309326, "step": 210 }, { "epoch": 0.24553022077452044, "grad_norm": 100.06649868470993, "learning_rate": 1.9766778098749993e-07, "logits/chosen": -1.216491937637329, "logits/rejected": -1.2272950410842896, "logps/chosen": -164.16220092773438, "logps/rejected": -181.47265625, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": -0.06646190583705902, "rewards/margins": 0.46730464696884155, "rewards/rejected": -0.5337665677070618, "step": 212 }, { "epoch": 0.2478465436120159, "grad_norm": 96.52482939931645, "learning_rate": 1.975840837548807e-07, "logits/chosen": -1.3946311473846436, "logits/rejected": -1.3414866924285889, "logps/chosen": -152.69595336914062, "logps/rejected": -164.10858154296875, "loss": 0.7014, "rewards/accuracies": 0.65625, "rewards/chosen": 0.022681551054120064, "rewards/margins": 0.17960719764232635, "rewards/rejected": -0.15692564845085144, "step": 214 }, { "epoch": 0.2501628664495114, "grad_norm": 153.51800892842274, "learning_rate": 1.974989293962781e-07, "logits/chosen": -1.2091304063796997, "logits/rejected": -1.2250198125839233, "logps/chosen": -181.1391143798828, "logps/rejected": -222.74087524414062, "loss": 0.76, "rewards/accuracies": 0.625, "rewards/chosen": -0.0878646969795227, "rewards/margins": 0.9455296993255615, "rewards/rejected": -1.033394455909729, "step": 216 }, { "epoch": 0.25247918928700686, "grad_norm": 99.38458587796177, "learning_rate": 1.974123191832175e-07, "logits/chosen": -1.2188912630081177, "logits/rejected": -1.2454049587249756, "logps/chosen": -114.72016143798828, "logps/rejected": -151.92970275878906, "loss": 0.7037, "rewards/accuracies": 0.46875, "rewards/chosen": -0.28741875290870667, "rewards/margins": 0.1605263203382492, "rewards/rejected": -0.44794508814811707, "step": 218 }, { "epoch": 0.25479551212450235, "grad_norm": 81.97849529400557, "learning_rate": 1.9732425440896293e-07, "logits/chosen": -1.264250636100769, "logits/rejected": -1.3256012201309204, "logps/chosen": -124.54591369628906, "logps/rejected": -148.69161987304688, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": 0.18301630020141602, "rewards/margins": 0.42959779500961304, "rewards/rejected": -0.24658147990703583, "step": 220 }, { "epoch": 0.25711183496199785, "grad_norm": 145.57197289231644, "learning_rate": 1.9723473638849804e-07, "logits/chosen": -1.282865285873413, "logits/rejected": -1.3407156467437744, "logps/chosen": -160.74911499023438, "logps/rejected": -179.92872619628906, "loss": 0.6797, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25827834010124207, "rewards/margins": 0.23611454665660858, "rewards/rejected": -0.49439290165901184, "step": 222 }, { "epoch": 0.2594281577994933, "grad_norm": 92.12330941145905, "learning_rate": 1.9714376645850633e-07, "logits/chosen": -1.1877822875976562, "logits/rejected": -1.2320842742919922, "logps/chosen": -112.88442993164062, "logps/rejected": -153.49356079101562, "loss": 0.6879, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06163054332137108, "rewards/margins": 0.5120011568069458, "rewards/rejected": -0.5736316442489624, "step": 224 }, { "epoch": 0.2617444806369888, "grad_norm": 89.27914911232024, "learning_rate": 1.9705134597735113e-07, "logits/chosen": -1.3040626049041748, "logits/rejected": -1.3116812705993652, "logps/chosen": -138.1647491455078, "logps/rejected": -164.8112030029297, "loss": 0.6066, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5537402629852295, "rewards/margins": 0.5435131788253784, "rewards/rejected": -1.097253441810608, "step": 226 }, { "epoch": 0.26406080347448424, "grad_norm": 100.44445362175034, "learning_rate": 1.9695747632505558e-07, "logits/chosen": -1.2323942184448242, "logits/rejected": -1.3942488431930542, "logps/chosen": -95.50066375732422, "logps/rejected": -116.43473815917969, "loss": 0.6697, "rewards/accuracies": 0.625, "rewards/chosen": 0.22127968072891235, "rewards/margins": 0.21899569034576416, "rewards/rejected": 0.0022839903831481934, "step": 228 }, { "epoch": 0.26637712631197974, "grad_norm": 121.25117391544367, "learning_rate": 1.9686215890328168e-07, "logits/chosen": -1.3321830034255981, "logits/rejected": -1.2888097763061523, "logps/chosen": -139.03431701660156, "logps/rejected": -178.35134887695312, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": 0.2928890287876129, "rewards/margins": 0.5932635068893433, "rewards/rejected": -0.30037450790405273, "step": 230 }, { "epoch": 0.2686934491494752, "grad_norm": 92.88847876797445, "learning_rate": 1.9676539513530965e-07, "logits/chosen": -1.2525643110275269, "logits/rejected": -1.2690104246139526, "logps/chosen": -137.59510803222656, "logps/rejected": -170.5804901123047, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": -0.003497052937746048, "rewards/margins": 0.2578098177909851, "rewards/rejected": -0.2613069415092468, "step": 232 }, { "epoch": 0.2710097719869707, "grad_norm": 109.64954295867085, "learning_rate": 1.966671864660165e-07, "logits/chosen": -1.3433884382247925, "logits/rejected": -1.332856297492981, "logps/chosen": -98.57205200195312, "logps/rejected": -97.49153137207031, "loss": 0.6447, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011620203964412212, "rewards/margins": 0.2672192454338074, "rewards/rejected": -0.27883943915367126, "step": 234 }, { "epoch": 0.2733260948244662, "grad_norm": 88.30404740296814, "learning_rate": 1.9656753436185456e-07, "logits/chosen": -1.4166165590286255, "logits/rejected": -1.3988808393478394, "logps/chosen": -120.94856262207031, "logps/rejected": -126.8148422241211, "loss": 0.5872, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20259736478328705, "rewards/margins": 0.5475842356681824, "rewards/rejected": -0.3449868857860565, "step": 236 }, { "epoch": 0.2756424176619616, "grad_norm": 86.00138406914863, "learning_rate": 1.9646644031082948e-07, "logits/chosen": -1.26936674118042, "logits/rejected": -1.3119958639144897, "logps/chosen": -141.89259338378906, "logps/rejected": -152.75332641601562, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 0.03933039680123329, "rewards/margins": 0.42467713356018066, "rewards/rejected": -0.38534674048423767, "step": 238 }, { "epoch": 0.2779587404994571, "grad_norm": 85.11515238510546, "learning_rate": 1.9636390582247804e-07, "logits/chosen": -1.2737447023391724, "logits/rejected": -1.2945623397827148, "logps/chosen": -119.07524871826172, "logps/rejected": -129.40042114257812, "loss": 0.6561, "rewards/accuracies": 0.5, "rewards/chosen": 0.21082651615142822, "rewards/margins": 0.13657376170158386, "rewards/rejected": 0.07425275444984436, "step": 240 }, { "epoch": 0.28027506333695257, "grad_norm": 78.24054806837644, "learning_rate": 1.9625993242784577e-07, "logits/chosen": -1.3759459257125854, "logits/rejected": -1.4495054483413696, "logps/chosen": -109.1790771484375, "logps/rejected": -141.45123291015625, "loss": 0.5681, "rewards/accuracies": 0.625, "rewards/chosen": 0.19224990904331207, "rewards/margins": 0.4157065451145172, "rewards/rejected": -0.22345668077468872, "step": 242 }, { "epoch": 0.28259138617444807, "grad_norm": 100.77507144115899, "learning_rate": 1.9615452167946382e-07, "logits/chosen": -1.2527633905410767, "logits/rejected": -1.3112610578536987, "logps/chosen": -138.2284393310547, "logps/rejected": -145.6739501953125, "loss": 0.6735, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12001015990972519, "rewards/margins": 0.14908871054649353, "rewards/rejected": -0.2690988779067993, "step": 244 }, { "epoch": 0.28490770901194357, "grad_norm": 81.77193945900667, "learning_rate": 1.9604767515132598e-07, "logits/chosen": -1.30977463722229, "logits/rejected": -1.308231234550476, "logps/chosen": -146.16175842285156, "logps/rejected": -150.26780700683594, "loss": 0.6512, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02212803065776825, "rewards/margins": 0.2704678177833557, "rewards/rejected": -0.29259583353996277, "step": 246 }, { "epoch": 0.287224031849439, "grad_norm": 94.70823896676559, "learning_rate": 1.9593939443886513e-07, "logits/chosen": -1.2869986295700073, "logits/rejected": -1.3198009729385376, "logps/chosen": -205.1080322265625, "logps/rejected": -221.5395050048828, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": 0.03687068819999695, "rewards/margins": 0.5602297782897949, "rewards/rejected": -0.5233591794967651, "step": 248 }, { "epoch": 0.2895403546869345, "grad_norm": 78.53341652955227, "learning_rate": 1.9582968115892928e-07, "logits/chosen": -1.5314466953277588, "logits/rejected": -1.4816820621490479, "logps/chosen": -175.5377655029297, "logps/rejected": -167.43699645996094, "loss": 0.6442, "rewards/accuracies": 0.5625, "rewards/chosen": 0.053328827023506165, "rewards/margins": 0.07659827172756195, "rewards/rejected": -0.02326946146786213, "step": 250 }, { "epoch": 0.29185667752442995, "grad_norm": 79.61524235861766, "learning_rate": 1.9571853694975768e-07, "logits/chosen": -1.2131693363189697, "logits/rejected": -1.2827026844024658, "logps/chosen": -135.439453125, "logps/rejected": -152.61849975585938, "loss": 0.6405, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15244589745998383, "rewards/margins": 0.6494119763374329, "rewards/rejected": -0.496966153383255, "step": 252 }, { "epoch": 0.29417300036192545, "grad_norm": 98.13148651308288, "learning_rate": 1.956059634709562e-07, "logits/chosen": -1.1604809761047363, "logits/rejected": -1.2288950681686401, "logps/chosen": -131.82022094726562, "logps/rejected": -161.86351013183594, "loss": 0.6057, "rewards/accuracies": 0.75, "rewards/chosen": 0.1956767737865448, "rewards/margins": 0.9915366172790527, "rewards/rejected": -0.7958598136901855, "step": 254 }, { "epoch": 0.2964893231994209, "grad_norm": 103.85718892196564, "learning_rate": 1.9549196240347248e-07, "logits/chosen": -1.2092281579971313, "logits/rejected": -1.2428326606750488, "logps/chosen": -155.42657470703125, "logps/rejected": -153.75314331054688, "loss": 0.634, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04123011976480484, "rewards/margins": 0.5150858759880066, "rewards/rejected": -0.5563160181045532, "step": 256 }, { "epoch": 0.2988056460369164, "grad_norm": 86.63305573669103, "learning_rate": 1.9537653544957097e-07, "logits/chosen": -1.4105985164642334, "logits/rejected": -1.434030532836914, "logps/chosen": -185.7542266845703, "logps/rejected": -201.953369140625, "loss": 0.607, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03058760240674019, "rewards/margins": 0.36953964829444885, "rewards/rejected": -0.33895203471183777, "step": 258 }, { "epoch": 0.3011219688744119, "grad_norm": 76.86118592942691, "learning_rate": 1.9525968433280754e-07, "logits/chosen": -1.3709828853607178, "logits/rejected": -1.4235422611236572, "logps/chosen": -116.02484130859375, "logps/rejected": -162.0040283203125, "loss": 0.6561, "rewards/accuracies": 0.78125, "rewards/chosen": 0.22844532132148743, "rewards/margins": 0.5770739912986755, "rewards/rejected": -0.3486286997795105, "step": 260 }, { "epoch": 0.30343829171190734, "grad_norm": 87.47994543044294, "learning_rate": 1.9514141079800358e-07, "logits/chosen": -1.3765437602996826, "logits/rejected": -1.4620999097824097, "logps/chosen": -178.11399841308594, "logps/rejected": -184.82977294921875, "loss": 0.6388, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3185967206954956, "rewards/margins": 0.35556116700172424, "rewards/rejected": -0.036964427679777145, "step": 262 }, { "epoch": 0.30575461454940284, "grad_norm": 93.0963118432758, "learning_rate": 1.9502171661121997e-07, "logits/chosen": -1.3460544347763062, "logits/rejected": -1.3444372415542603, "logps/chosen": -129.8871307373047, "logps/rejected": -139.0457305908203, "loss": 0.633, "rewards/accuracies": 0.5, "rewards/chosen": 0.24807024002075195, "rewards/margins": 0.164279505610466, "rewards/rejected": 0.08379074931144714, "step": 264 }, { "epoch": 0.3080709373868983, "grad_norm": 89.98079729490527, "learning_rate": 1.9490060355973096e-07, "logits/chosen": -1.3349117040634155, "logits/rejected": -1.4269710779190063, "logps/chosen": -117.39810180664062, "logps/rejected": -144.81390380859375, "loss": 0.6071, "rewards/accuracies": 0.625, "rewards/chosen": 0.13228380680084229, "rewards/margins": 0.36814936995506287, "rewards/rejected": -0.2358655333518982, "step": 266 }, { "epoch": 0.3103872602243938, "grad_norm": 79.79141686146392, "learning_rate": 1.9477807345199713e-07, "logits/chosen": -1.2187737226486206, "logits/rejected": -1.255754828453064, "logps/chosen": -129.10220336914062, "logps/rejected": -155.4088592529297, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": 0.04840267822146416, "rewards/margins": 0.8025398850440979, "rewards/rejected": -0.7541371583938599, "step": 268 }, { "epoch": 0.3127035830618892, "grad_norm": 76.99480771293982, "learning_rate": 1.946541281176386e-07, "logits/chosen": -1.3978129625320435, "logits/rejected": -1.3492777347564697, "logps/chosen": -94.11738586425781, "logps/rejected": -111.17606353759766, "loss": 0.6439, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10735100507736206, "rewards/margins": 0.33690255880355835, "rewards/rejected": -0.22955158352851868, "step": 270 }, { "epoch": 0.3150199058993847, "grad_norm": 123.83701643097643, "learning_rate": 1.9452876940740767e-07, "logits/chosen": -1.3182752132415771, "logits/rejected": -1.3577101230621338, "logps/chosen": -179.61712646484375, "logps/rejected": -175.739501953125, "loss": 0.658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8353556394577026, "rewards/margins": -0.027862735092639923, "rewards/rejected": -0.8074928522109985, "step": 272 }, { "epoch": 0.3173362287368802, "grad_norm": 85.72321946906887, "learning_rate": 1.9440199919316122e-07, "logits/chosen": -1.288657546043396, "logits/rejected": -1.3015202283859253, "logps/chosen": -119.4478759765625, "logps/rejected": -126.77153015136719, "loss": 0.5747, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10668228566646576, "rewards/margins": 0.2372274398803711, "rewards/rejected": -0.34390971064567566, "step": 274 }, { "epoch": 0.31965255157437567, "grad_norm": 74.20876048665262, "learning_rate": 1.9427381936783265e-07, "logits/chosen": -1.2866897583007812, "logits/rejected": -1.2775256633758545, "logps/chosen": -128.80926513671875, "logps/rejected": -149.09042358398438, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": 0.045619502663612366, "rewards/margins": 0.7617495059967041, "rewards/rejected": -0.7161301374435425, "step": 276 }, { "epoch": 0.32196887441187116, "grad_norm": 92.4643464121328, "learning_rate": 1.9414423184540364e-07, "logits/chosen": -1.2458128929138184, "logits/rejected": -1.3468252420425415, "logps/chosen": -185.1983642578125, "logps/rejected": -208.1055908203125, "loss": 0.6219, "rewards/accuracies": 0.625, "rewards/chosen": -0.6145004034042358, "rewards/margins": 0.43242332339286804, "rewards/rejected": -1.0469236373901367, "step": 278 }, { "epoch": 0.3242851972493666, "grad_norm": 78.52373912784961, "learning_rate": 1.940132385608757e-07, "logits/chosen": -1.2931345701217651, "logits/rejected": -1.285240888595581, "logps/chosen": -120.32514953613281, "logps/rejected": -127.14945983886719, "loss": 0.6194, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13594146072864532, "rewards/margins": 0.45918428897857666, "rewards/rejected": -0.5951257348060608, "step": 280 }, { "epoch": 0.3266015200868621, "grad_norm": 87.99833578536212, "learning_rate": 1.9388084147024119e-07, "logits/chosen": -1.289980411529541, "logits/rejected": -1.3120776414871216, "logps/chosen": -143.28274536132812, "logps/rejected": -184.68968200683594, "loss": 0.5612, "rewards/accuracies": 0.625, "rewards/chosen": -0.36499708890914917, "rewards/margins": 0.8015878200531006, "rewards/rejected": -1.166584849357605, "step": 282 }, { "epoch": 0.3289178429243576, "grad_norm": 130.88630180608516, "learning_rate": 1.93747042550454e-07, "logits/chosen": -1.1992597579956055, "logits/rejected": -1.20537531375885, "logps/chosen": -164.58963012695312, "logps/rejected": -188.33392333984375, "loss": 0.6706, "rewards/accuracies": 0.65625, "rewards/chosen": -0.032479241490364075, "rewards/margins": 0.6490625143051147, "rewards/rejected": -0.6815417408943176, "step": 284 }, { "epoch": 0.33123416576185305, "grad_norm": 129.38381304632557, "learning_rate": 1.9361184379940027e-07, "logits/chosen": -1.3481284379959106, "logits/rejected": -1.3769184350967407, "logps/chosen": -144.51824951171875, "logps/rejected": -184.82598876953125, "loss": 0.5956, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20756953954696655, "rewards/margins": 1.1878336668014526, "rewards/rejected": -1.3954031467437744, "step": 286 }, { "epoch": 0.33355048859934855, "grad_norm": 75.58880846521522, "learning_rate": 1.9347524723586834e-07, "logits/chosen": -1.3142046928405762, "logits/rejected": -1.3535311222076416, "logps/chosen": -127.45948791503906, "logps/rejected": -134.2197723388672, "loss": 0.5643, "rewards/accuracies": 0.53125, "rewards/chosen": -0.029636431485414505, "rewards/margins": 0.3624846637248993, "rewards/rejected": -0.3921211361885071, "step": 288 }, { "epoch": 0.335866811436844, "grad_norm": 73.31791152641335, "learning_rate": 1.9333725489951874e-07, "logits/chosen": -1.279150128364563, "logits/rejected": -1.2785069942474365, "logps/chosen": -137.74392700195312, "logps/rejected": -150.1664581298828, "loss": 0.562, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09417904168367386, "rewards/margins": 0.4170314073562622, "rewards/rejected": -0.5112104415893555, "step": 290 }, { "epoch": 0.3381831342743395, "grad_norm": 92.52553029666018, "learning_rate": 1.9319786885085363e-07, "logits/chosen": -1.2694755792617798, "logits/rejected": -1.2954623699188232, "logps/chosen": -131.68209838867188, "logps/rejected": -147.16407775878906, "loss": 0.5707, "rewards/accuracies": 0.625, "rewards/chosen": -0.23095721006393433, "rewards/margins": 0.6836221814155579, "rewards/rejected": -0.9145793318748474, "step": 292 }, { "epoch": 0.34049945711183494, "grad_norm": 117.1109903878386, "learning_rate": 1.9305709117118614e-07, "logits/chosen": -1.3775343894958496, "logits/rejected": -1.40194571018219, "logps/chosen": -152.88351440429688, "logps/rejected": -152.07113647460938, "loss": 0.6534, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25070011615753174, "rewards/margins": 0.443280965089798, "rewards/rejected": -0.6939811110496521, "step": 294 }, { "epoch": 0.34281577994933043, "grad_norm": 90.63184013182904, "learning_rate": 1.929149239626092e-07, "logits/chosen": -1.3810930252075195, "logits/rejected": -1.375144600868225, "logps/chosen": -138.50286865234375, "logps/rejected": -150.77099609375, "loss": 0.6208, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08024093508720398, "rewards/margins": 0.330517053604126, "rewards/rejected": -0.41075798869132996, "step": 296 }, { "epoch": 0.34513210278682593, "grad_norm": 151.73571113362462, "learning_rate": 1.9277136934796427e-07, "logits/chosen": -1.3765751123428345, "logits/rejected": -1.3763781785964966, "logps/chosen": -133.6553497314453, "logps/rejected": -155.61392211914062, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21619975566864014, "rewards/margins": 0.11746194958686829, "rewards/rejected": -0.3336617350578308, "step": 298 }, { "epoch": 0.3474484256243214, "grad_norm": 102.44748886372176, "learning_rate": 1.926264294708095e-07, "logits/chosen": -1.2531358003616333, "logits/rejected": -1.280750036239624, "logps/chosen": -142.09063720703125, "logps/rejected": -168.24539184570312, "loss": 0.5871, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10979723930358887, "rewards/margins": 0.8283678293228149, "rewards/rejected": -0.9381651282310486, "step": 300 }, { "epoch": 0.3474484256243214, "eval_logits/chosen": -1.3154690265655518, "eval_logits/rejected": -1.3106950521469116, "eval_logps/chosen": -138.28590393066406, "eval_logps/rejected": -137.84115600585938, "eval_loss": 0.6834442615509033, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.2654740810394287, "eval_rewards/margins": 0.12328676134347916, "eval_rewards/rejected": -0.38876083493232727, "eval_runtime": 24.6332, "eval_samples_per_second": 4.06, "eval_steps_per_second": 1.015, "step": 300 }, { "epoch": 0.3497647484618169, "grad_norm": 79.77982199610355, "learning_rate": 1.9248010649538775e-07, "logits/chosen": -1.3689723014831543, "logits/rejected": -1.4326424598693848, "logps/chosen": -186.77511596679688, "logps/rejected": -194.57289123535156, "loss": 0.677, "rewards/accuracies": 0.625, "rewards/chosen": 0.07687507569789886, "rewards/margins": 0.245405375957489, "rewards/rejected": -0.16853031516075134, "step": 302 }, { "epoch": 0.3520810712993123, "grad_norm": 72.83330769963189, "learning_rate": 1.923324026065944e-07, "logits/chosen": -1.2983791828155518, "logits/rejected": -1.301888108253479, "logps/chosen": -96.0470199584961, "logps/rejected": -113.31134796142578, "loss": 0.6028, "rewards/accuracies": 0.625, "rewards/chosen": 0.10890144109725952, "rewards/margins": 0.2682499885559082, "rewards/rejected": -0.1593485325574875, "step": 304 }, { "epoch": 0.3543973941368078, "grad_norm": 100.14984360245958, "learning_rate": 1.9218332000994458e-07, "logits/chosen": -1.4329365491867065, "logits/rejected": -1.4993162155151367, "logps/chosen": -186.0762176513672, "logps/rejected": -214.90533447265625, "loss": 0.6058, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03735332563519478, "rewards/margins": 0.3314560055732727, "rewards/rejected": -0.2941026985645294, "step": 306 }, { "epoch": 0.3567137169743033, "grad_norm": 72.19620109844895, "learning_rate": 1.9203286093154026e-07, "logits/chosen": -1.2941675186157227, "logits/rejected": -1.259239673614502, "logps/chosen": -109.15111541748047, "logps/rejected": -109.48808288574219, "loss": 0.6171, "rewards/accuracies": 0.59375, "rewards/chosen": 0.24576213955879211, "rewards/margins": 0.18151941895484924, "rewards/rejected": 0.06424272805452347, "step": 308 }, { "epoch": 0.35903003981179876, "grad_norm": 106.11628012062671, "learning_rate": 1.9188102761803715e-07, "logits/chosen": -1.4155701398849487, "logits/rejected": -1.469191312789917, "logps/chosen": -192.42648315429688, "logps/rejected": -190.7394561767578, "loss": 0.6785, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15398849546909332, "rewards/margins": 0.36678701639175415, "rewards/rejected": -0.5207754373550415, "step": 310 }, { "epoch": 0.36134636264929426, "grad_norm": 70.19369769339828, "learning_rate": 1.9172782233661094e-07, "logits/chosen": -1.254553198814392, "logits/rejected": -1.1699531078338623, "logps/chosen": -127.07028198242188, "logps/rejected": -145.60787963867188, "loss": 0.6158, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17219696938991547, "rewards/margins": 0.5162093639373779, "rewards/rejected": -0.34401237964630127, "step": 312 }, { "epoch": 0.3636626854867897, "grad_norm": 90.77874249334138, "learning_rate": 1.915732473749236e-07, "logits/chosen": -1.2084178924560547, "logits/rejected": -1.1874415874481201, "logps/chosen": -166.15135192871094, "logps/rejected": -176.3106231689453, "loss": 0.629, "rewards/accuracies": 0.59375, "rewards/chosen": 0.1258632242679596, "rewards/margins": 0.4471869468688965, "rewards/rejected": -0.3213237524032593, "step": 314 }, { "epoch": 0.3659790083242852, "grad_norm": 78.58822831789924, "learning_rate": 1.914173050410892e-07, "logits/chosen": -1.3010833263397217, "logits/rejected": -1.3981972932815552, "logps/chosen": -94.9105453491211, "logps/rejected": -108.66416931152344, "loss": 0.6266, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15446007251739502, "rewards/margins": 0.30908384919166565, "rewards/rejected": -0.15462377667427063, "step": 316 }, { "epoch": 0.36829533116178065, "grad_norm": 80.5668477187345, "learning_rate": 1.9125999766363932e-07, "logits/chosen": -1.4468637704849243, "logits/rejected": -1.4837853908538818, "logps/chosen": -121.05176544189453, "logps/rejected": -132.3884735107422, "loss": 0.6187, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1501280963420868, "rewards/margins": 0.14075569808483124, "rewards/rejected": 0.009372413158416748, "step": 318 }, { "epoch": 0.37061165399927615, "grad_norm": 90.09297915942425, "learning_rate": 1.9110132759148843e-07, "logits/chosen": -1.239458680152893, "logits/rejected": -1.2513267993927002, "logps/chosen": -119.19309997558594, "logps/rejected": -135.55023193359375, "loss": 0.6107, "rewards/accuracies": 0.71875, "rewards/chosen": -0.07714903354644775, "rewards/margins": 0.40861696004867554, "rewards/rejected": -0.4857659935951233, "step": 320 }, { "epoch": 0.37292797683677165, "grad_norm": 130.3488780136265, "learning_rate": 1.9094129719389885e-07, "logits/chosen": -1.3481711149215698, "logits/rejected": -1.328981637954712, "logps/chosen": -192.10084533691406, "logps/rejected": -214.688720703125, "loss": 0.635, "rewards/accuracies": 0.625, "rewards/chosen": -0.4045405685901642, "rewards/margins": 0.6289528012275696, "rewards/rejected": -1.0334933996200562, "step": 322 }, { "epoch": 0.3752442996742671, "grad_norm": 86.21776365076336, "learning_rate": 1.907799088604451e-07, "logits/chosen": -1.1944794654846191, "logits/rejected": -1.154435157775879, "logps/chosen": -86.31254577636719, "logps/rejected": -97.8081283569336, "loss": 0.6424, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0629437267780304, "rewards/margins": 0.25958341360092163, "rewards/rejected": -0.19663970172405243, "step": 324 }, { "epoch": 0.3775606225117626, "grad_norm": 109.3152948358386, "learning_rate": 1.9061716500097862e-07, "logits/chosen": -1.3203986883163452, "logits/rejected": -1.3523664474487305, "logps/chosen": -152.81573486328125, "logps/rejected": -161.0247039794922, "loss": 0.6101, "rewards/accuracies": 0.53125, "rewards/chosen": -0.4936632812023163, "rewards/margins": 0.04809580743312836, "rewards/rejected": -0.5417591333389282, "step": 326 }, { "epoch": 0.37987694534925803, "grad_norm": 91.94400981611243, "learning_rate": 1.904530680455914e-07, "logits/chosen": -1.3758294582366943, "logits/rejected": -1.4080578088760376, "logps/chosen": -146.73672485351562, "logps/rejected": -145.2505645751953, "loss": 0.6278, "rewards/accuracies": 0.65625, "rewards/chosen": 0.012471210211515427, "rewards/margins": 0.4439522325992584, "rewards/rejected": -0.4314810335636139, "step": 328 }, { "epoch": 0.38219326818675353, "grad_norm": 105.04213501880093, "learning_rate": 1.9028762044457992e-07, "logits/chosen": -1.2461824417114258, "logits/rejected": -1.288218379020691, "logps/chosen": -126.72929382324219, "logps/rejected": -151.31341552734375, "loss": 0.6118, "rewards/accuracies": 0.6875, "rewards/chosen": 0.042198315262794495, "rewards/margins": 0.5073456764221191, "rewards/rejected": -0.46514737606048584, "step": 330 }, { "epoch": 0.38450959102424903, "grad_norm": 131.99962498687907, "learning_rate": 1.901208246684085e-07, "logits/chosen": -1.345144271850586, "logits/rejected": -1.3419792652130127, "logps/chosen": -138.4906768798828, "logps/rejected": -144.3926239013672, "loss": 0.6483, "rewards/accuracies": 0.625, "rewards/chosen": -0.3029904365539551, "rewards/margins": 0.3120897114276886, "rewards/rejected": -0.6150801181793213, "step": 332 }, { "epoch": 0.3868259138617445, "grad_norm": 192.23629969436513, "learning_rate": 1.8995268320767252e-07, "logits/chosen": -1.3834903240203857, "logits/rejected": -1.4057523012161255, "logps/chosen": -138.5772705078125, "logps/rejected": -148.1931915283203, "loss": 0.7789, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0129515016451478, "rewards/margins": 0.49864012002944946, "rewards/rejected": -0.4856886565685272, "step": 334 }, { "epoch": 0.38914223669924, "grad_norm": 82.74631507246218, "learning_rate": 1.897831985730609e-07, "logits/chosen": -1.2497293949127197, "logits/rejected": -1.2685260772705078, "logps/chosen": -135.58956909179688, "logps/rejected": -166.16636657714844, "loss": 0.6435, "rewards/accuracies": 0.8125, "rewards/chosen": -0.062045883387327194, "rewards/margins": 0.9767952561378479, "rewards/rejected": -1.0388411283493042, "step": 336 }, { "epoch": 0.3914585595367354, "grad_norm": 97.75310342784691, "learning_rate": 1.896123732953191e-07, "logits/chosen": -1.2475745677947998, "logits/rejected": -1.2074342966079712, "logps/chosen": -108.48465728759766, "logps/rejected": -131.79908752441406, "loss": 0.6321, "rewards/accuracies": 0.71875, "rewards/chosen": -0.32877668738365173, "rewards/margins": 0.5046026110649109, "rewards/rejected": -0.8333792686462402, "step": 338 }, { "epoch": 0.3937748823742309, "grad_norm": 104.56753710703906, "learning_rate": 1.8944020992521088e-07, "logits/chosen": -1.331594467163086, "logits/rejected": -1.4218388795852661, "logps/chosen": -122.07364654541016, "logps/rejected": -144.00531005859375, "loss": 0.6138, "rewards/accuracies": 0.78125, "rewards/chosen": 0.15422941744327545, "rewards/margins": 0.4605112373828888, "rewards/rejected": -0.30628180503845215, "step": 340 }, { "epoch": 0.39609120521172636, "grad_norm": 104.94507394937493, "learning_rate": 1.8926671103348047e-07, "logits/chosen": -1.3103477954864502, "logits/rejected": -1.3303866386413574, "logps/chosen": -118.01762390136719, "logps/rejected": -128.77285766601562, "loss": 0.698, "rewards/accuracies": 0.53125, "rewards/chosen": -0.1733967363834381, "rewards/margins": 0.22825026512145996, "rewards/rejected": -0.40164700150489807, "step": 342 }, { "epoch": 0.39840752804922186, "grad_norm": 87.41594646239237, "learning_rate": 1.8909187921081416e-07, "logits/chosen": -1.2882866859436035, "logits/rejected": -1.266202449798584, "logps/chosen": -144.56747436523438, "logps/rejected": -142.6608123779297, "loss": 0.6561, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08110320568084717, "rewards/margins": 0.10048308968544006, "rewards/rejected": -0.18158632516860962, "step": 344 }, { "epoch": 0.40072385088671736, "grad_norm": 166.0088927921291, "learning_rate": 1.8891571706780144e-07, "logits/chosen": -1.3238105773925781, "logits/rejected": -1.3814265727996826, "logps/chosen": -135.59217834472656, "logps/rejected": -158.6577911376953, "loss": 0.6647, "rewards/accuracies": 0.75, "rewards/chosen": -0.2648026645183563, "rewards/margins": 0.6691212058067322, "rewards/rejected": -0.9339239001274109, "step": 346 }, { "epoch": 0.4030401737242128, "grad_norm": 92.22069522105578, "learning_rate": 1.8873822723489633e-07, "logits/chosen": -1.3072634935379028, "logits/rejected": -1.3363394737243652, "logps/chosen": -179.68614196777344, "logps/rejected": -213.12120056152344, "loss": 0.6272, "rewards/accuracies": 0.75, "rewards/chosen": -0.018616102635860443, "rewards/margins": 0.4234482944011688, "rewards/rejected": -0.44206440448760986, "step": 348 }, { "epoch": 0.4053564965617083, "grad_norm": 70.97764990334171, "learning_rate": 1.8855941236237774e-07, "logits/chosen": -1.2639405727386475, "logits/rejected": -1.2773693799972534, "logps/chosen": -133.8863067626953, "logps/rejected": -170.3965606689453, "loss": 0.5784, "rewards/accuracies": 0.65625, "rewards/chosen": 0.36297571659088135, "rewards/margins": 0.6825499534606934, "rewards/rejected": -0.3195742070674896, "step": 350 }, { "epoch": 0.40767281939920375, "grad_norm": 87.3271520781356, "learning_rate": 1.883792751203102e-07, "logits/chosen": -1.2711012363433838, "logits/rejected": -1.2672007083892822, "logps/chosen": -169.25314331054688, "logps/rejected": -167.83010864257812, "loss": 0.608, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06947077065706253, "rewards/margins": 0.39652663469314575, "rewards/rejected": -0.4659973978996277, "step": 352 }, { "epoch": 0.40998914223669924, "grad_norm": 82.42288813891042, "learning_rate": 1.8819781819850382e-07, "logits/chosen": -1.2538509368896484, "logits/rejected": -1.2403154373168945, "logps/chosen": -112.01508331298828, "logps/rejected": -122.62294006347656, "loss": 0.6, "rewards/accuracies": 0.78125, "rewards/chosen": 0.310922235250473, "rewards/margins": 0.5129318237304688, "rewards/rejected": -0.20200954377651215, "step": 354 }, { "epoch": 0.41230546507419474, "grad_norm": 92.66996742577295, "learning_rate": 1.880150443064742e-07, "logits/chosen": -1.1228657960891724, "logits/rejected": -1.1974968910217285, "logps/chosen": -129.4398193359375, "logps/rejected": -178.6856689453125, "loss": 0.6907, "rewards/accuracies": 0.71875, "rewards/chosen": 0.1207706406712532, "rewards/margins": 0.8560737371444702, "rewards/rejected": -0.7353031039237976, "step": 356 }, { "epoch": 0.4146217879116902, "grad_norm": 85.77942086621498, "learning_rate": 1.8783095617340192e-07, "logits/chosen": -1.3269970417022705, "logits/rejected": -1.3102359771728516, "logps/chosen": -138.91845703125, "logps/rejected": -150.00466918945312, "loss": 0.6704, "rewards/accuracies": 0.53125, "rewards/chosen": -0.25449270009994507, "rewards/margins": 0.06581351906061172, "rewards/rejected": -0.32030627131462097, "step": 358 }, { "epoch": 0.4169381107491857, "grad_norm": 66.74779859823646, "learning_rate": 1.876455565480918e-07, "logits/chosen": -1.395142912864685, "logits/rejected": -1.4558305740356445, "logps/chosen": -138.25567626953125, "logps/rejected": -142.72232055664062, "loss": 0.608, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22209802269935608, "rewards/margins": 0.33867061138153076, "rewards/rejected": -0.11657258868217468, "step": 360 }, { "epoch": 0.41925443358668113, "grad_norm": 69.35743210486372, "learning_rate": 1.8745884819893192e-07, "logits/chosen": -1.3764009475708008, "logits/rejected": -1.4009249210357666, "logps/chosen": -125.95867919921875, "logps/rejected": -147.38038635253906, "loss": 0.5892, "rewards/accuracies": 0.46875, "rewards/chosen": 0.04307159036397934, "rewards/margins": 0.25595974922180176, "rewards/rejected": -0.21288815140724182, "step": 362 }, { "epoch": 0.42157075642417663, "grad_norm": 91.06098837601228, "learning_rate": 1.8727083391385219e-07, "logits/chosen": -1.3126693964004517, "logits/rejected": -1.359320044517517, "logps/chosen": -122.15340423583984, "logps/rejected": -152.7900390625, "loss": 0.6084, "rewards/accuracies": 0.65625, "rewards/chosen": 0.16955101490020752, "rewards/margins": 0.33028605580329895, "rewards/rejected": -0.16073507070541382, "step": 364 }, { "epoch": 0.4238870792616721, "grad_norm": 60.84745087172502, "learning_rate": 1.8708151650028278e-07, "logits/chosen": -1.3809125423431396, "logits/rejected": -1.403237223625183, "logps/chosen": -109.20733642578125, "logps/rejected": -135.84494018554688, "loss": 0.6428, "rewards/accuracies": 0.65625, "rewards/chosen": 0.13176926970481873, "rewards/margins": 0.44196146726608276, "rewards/rejected": -0.31019219756126404, "step": 366 }, { "epoch": 0.42620340209916757, "grad_norm": 83.0289812455712, "learning_rate": 1.8689089878511214e-07, "logits/chosen": -1.2712593078613281, "logits/rejected": -1.3146370649337769, "logps/chosen": -104.22183990478516, "logps/rejected": -117.42278289794922, "loss": 0.6601, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04589027911424637, "rewards/margins": 0.24963931739330292, "rewards/rejected": -0.2955296039581299, "step": 368 }, { "epoch": 0.42851972493666307, "grad_norm": 80.18268668813586, "learning_rate": 1.866989836146449e-07, "logits/chosen": -1.367477536201477, "logits/rejected": -1.4047478437423706, "logps/chosen": -156.0530242919922, "logps/rejected": -166.14857482910156, "loss": 0.6033, "rewards/accuracies": 0.65625, "rewards/chosen": 0.03725311905145645, "rewards/margins": 0.22227245569229126, "rewards/rejected": -0.18501931428909302, "step": 370 }, { "epoch": 0.4308360477741585, "grad_norm": 85.02940823274966, "learning_rate": 1.8650577385455924e-07, "logits/chosen": -1.3402721881866455, "logits/rejected": -1.3483717441558838, "logps/chosen": -129.09817504882812, "logps/rejected": -133.04421997070312, "loss": 0.578, "rewards/accuracies": 0.71875, "rewards/chosen": 0.19856195151805878, "rewards/margins": 0.32389020919799805, "rewards/rejected": -0.12532827258110046, "step": 372 }, { "epoch": 0.433152370611654, "grad_norm": 84.18493319136046, "learning_rate": 1.8631127238986416e-07, "logits/chosen": -1.3070781230926514, "logits/rejected": -1.3111450672149658, "logps/chosen": -100.65834045410156, "logps/rejected": -119.19929504394531, "loss": 0.57, "rewards/accuracies": 0.59375, "rewards/chosen": 0.15119151771068573, "rewards/margins": 0.44069719314575195, "rewards/rejected": -0.28950563073158264, "step": 374 }, { "epoch": 0.43546869344914946, "grad_norm": 78.13851817895889, "learning_rate": 1.8611548212485647e-07, "logits/chosen": -1.3796460628509521, "logits/rejected": -1.4454896450042725, "logps/chosen": -137.24407958984375, "logps/rejected": -168.12208557128906, "loss": 0.6024, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2298029363155365, "rewards/margins": 0.5796483159065247, "rewards/rejected": -0.34984540939331055, "step": 376 }, { "epoch": 0.43778501628664496, "grad_norm": 87.51427473434556, "learning_rate": 1.8591840598307724e-07, "logits/chosen": -1.3684715032577515, "logits/rejected": -1.41554856300354, "logps/chosen": -156.48861694335938, "logps/rejected": -166.43325805664062, "loss": 0.6099, "rewards/accuracies": 0.8125, "rewards/chosen": 0.05118772014975548, "rewards/margins": 0.6047709584236145, "rewards/rejected": -0.5535832047462463, "step": 378 }, { "epoch": 0.4401013391241404, "grad_norm": 97.94077875373094, "learning_rate": 1.8572004690726835e-07, "logits/chosen": -1.4304860830307007, "logits/rejected": -1.3829154968261719, "logps/chosen": -137.7032928466797, "logps/rejected": -159.42665100097656, "loss": 0.6256, "rewards/accuracies": 0.75, "rewards/chosen": 0.05912143737077713, "rewards/margins": 0.9181233644485474, "rewards/rejected": -0.8590019941329956, "step": 380 }, { "epoch": 0.4424176619616359, "grad_norm": 117.47560157505089, "learning_rate": 1.8552040785932843e-07, "logits/chosen": -1.2082271575927734, "logits/rejected": -1.32054603099823, "logps/chosen": -129.3510284423828, "logps/rejected": -139.3075714111328, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": 0.08690177649259567, "rewards/margins": 0.17026250064373016, "rewards/rejected": -0.08336074650287628, "step": 382 }, { "epoch": 0.4447339847991314, "grad_norm": 83.36856549076099, "learning_rate": 1.8531949182026864e-07, "logits/chosen": -1.213942289352417, "logits/rejected": -1.2501431703567505, "logps/chosen": -87.4649658203125, "logps/rejected": -105.31576538085938, "loss": 0.6473, "rewards/accuracies": 0.6875, "rewards/chosen": -0.07069863379001617, "rewards/margins": 0.28086185455322266, "rewards/rejected": -0.3515605032444, "step": 384 }, { "epoch": 0.44705030763662684, "grad_norm": 93.35125079656054, "learning_rate": 1.851173017901682e-07, "logits/chosen": -1.2774831056594849, "logits/rejected": -1.3458952903747559, "logps/chosen": -134.04624938964844, "logps/rejected": -148.02565002441406, "loss": 0.6516, "rewards/accuracies": 0.5, "rewards/chosen": -0.0875653326511383, "rewards/margins": 0.029335327446460724, "rewards/rejected": -0.11690068244934082, "step": 386 }, { "epoch": 0.44936663047412234, "grad_norm": 94.38591902404973, "learning_rate": 1.8491384078812957e-07, "logits/chosen": -1.3489183187484741, "logits/rejected": -1.3692617416381836, "logps/chosen": -158.86729431152344, "logps/rejected": -175.22946166992188, "loss": 0.6085, "rewards/accuracies": 0.75, "rewards/chosen": 0.01937798410654068, "rewards/margins": 0.4567859470844269, "rewards/rejected": -0.4374079406261444, "step": 388 }, { "epoch": 0.4516829533116178, "grad_norm": 82.79409553577226, "learning_rate": 1.847091118522333e-07, "logits/chosen": -1.2354220151901245, "logits/rejected": -1.1955327987670898, "logps/chosen": -100.98146057128906, "logps/rejected": -106.97394561767578, "loss": 0.6118, "rewards/accuracies": 0.75, "rewards/chosen": 0.024054907262325287, "rewards/margins": 0.4244306981563568, "rewards/rejected": -0.40037575364112854, "step": 390 }, { "epoch": 0.4539992761491133, "grad_norm": 68.23646218496863, "learning_rate": 1.8450311803949288e-07, "logits/chosen": -1.4198896884918213, "logits/rejected": -1.339991807937622, "logps/chosen": -96.33162689208984, "logps/rejected": -106.24251556396484, "loss": 0.626, "rewards/accuracies": 0.71875, "rewards/chosen": 0.14840683341026306, "rewards/margins": 0.3869977295398712, "rewards/rejected": -0.23859092593193054, "step": 392 }, { "epoch": 0.4563155989866088, "grad_norm": 90.98509885957323, "learning_rate": 1.842958624258088e-07, "logits/chosen": -1.4057539701461792, "logits/rejected": -1.4758132696151733, "logps/chosen": -122.16340637207031, "logps/rejected": -123.98712158203125, "loss": 0.6429, "rewards/accuracies": 0.5, "rewards/chosen": 0.053804248571395874, "rewards/margins": 0.13191546499729156, "rewards/rejected": -0.0781112089753151, "step": 394 }, { "epoch": 0.4586319218241042, "grad_norm": 90.51866810043896, "learning_rate": 1.8408734810592286e-07, "logits/chosen": -1.3948010206222534, "logits/rejected": -1.4117646217346191, "logps/chosen": -170.54193115234375, "logps/rejected": -179.2427978515625, "loss": 0.5834, "rewards/accuracies": 0.71875, "rewards/chosen": 0.059171393513679504, "rewards/margins": 0.42534855008125305, "rewards/rejected": -0.36617720127105713, "step": 396 }, { "epoch": 0.4609482446615997, "grad_norm": 93.16409936228983, "learning_rate": 1.838775781933718e-07, "logits/chosen": -1.2591919898986816, "logits/rejected": -1.278662085533142, "logps/chosen": -133.6868133544922, "logps/rejected": -160.47731018066406, "loss": 0.6789, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12275616079568863, "rewards/margins": 0.09390115737915039, "rewards/rejected": -0.21665732562541962, "step": 398 }, { "epoch": 0.46326456749909517, "grad_norm": 85.0019450300031, "learning_rate": 1.8366655582044093e-07, "logits/chosen": -1.295358419418335, "logits/rejected": -1.3356658220291138, "logps/chosen": -82.7631607055664, "logps/rejected": -102.0246810913086, "loss": 0.6151, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2104763686656952, "rewards/margins": 0.3289705812931061, "rewards/rejected": -0.5394470691680908, "step": 400 }, { "epoch": 0.46326456749909517, "eval_logits/chosen": -1.3069441318511963, "eval_logits/rejected": -1.3023654222488403, "eval_logps/chosen": -140.2086944580078, "eval_logps/rejected": -139.57632446289062, "eval_loss": 0.6798678040504456, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.4577521085739136, "eval_rewards/margins": 0.10452325642108917, "eval_rewards/rejected": -0.5622754096984863, "eval_runtime": 26.7292, "eval_samples_per_second": 3.741, "eval_steps_per_second": 0.935, "step": 400 }, { "epoch": 0.46558089033659067, "grad_norm": 107.62190686868198, "learning_rate": 1.834542841381173e-07, "logits/chosen": -1.4000458717346191, "logits/rejected": -1.4169011116027832, "logps/chosen": -187.33409118652344, "logps/rejected": -207.20140075683594, "loss": 0.5555, "rewards/accuracies": 0.53125, "rewards/chosen": -0.11752481758594513, "rewards/margins": 0.4881589412689209, "rewards/rejected": -0.6056837439537048, "step": 402 }, { "epoch": 0.4678972131740861, "grad_norm": 81.84841168291128, "learning_rate": 1.8324076631604262e-07, "logits/chosen": -1.2451642751693726, "logits/rejected": -1.2964147329330444, "logps/chosen": -136.41270446777344, "logps/rejected": -156.53018188476562, "loss": 0.6282, "rewards/accuracies": 0.625, "rewards/chosen": -0.1907982975244522, "rewards/margins": 0.6749911308288574, "rewards/rejected": -0.8657894134521484, "step": 404 }, { "epoch": 0.4702135360115816, "grad_norm": 70.51232634632699, "learning_rate": 1.8302600554246598e-07, "logits/chosen": -1.2217371463775635, "logits/rejected": -1.2302532196044922, "logps/chosen": -109.1505355834961, "logps/rejected": -124.399169921875, "loss": 0.5908, "rewards/accuracies": 0.71875, "rewards/chosen": -0.011890493333339691, "rewards/margins": 0.41306906938552856, "rewards/rejected": -0.42495957016944885, "step": 406 }, { "epoch": 0.4725298588490771, "grad_norm": 157.73563743497198, "learning_rate": 1.8281000502419624e-07, "logits/chosen": -1.316713809967041, "logits/rejected": -1.3389533758163452, "logps/chosen": -123.63529968261719, "logps/rejected": -128.41409301757812, "loss": 0.6549, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4301506578922272, "rewards/margins": 0.16745811700820923, "rewards/rejected": -0.597608745098114, "step": 408 }, { "epoch": 0.47484618168657255, "grad_norm": 107.85099770446011, "learning_rate": 1.8259276798655412e-07, "logits/chosen": -1.3569673299789429, "logits/rejected": -1.3319692611694336, "logps/chosen": -149.51708984375, "logps/rejected": -185.8908233642578, "loss": 0.6863, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11509159207344055, "rewards/margins": 0.3641398549079895, "rewards/rejected": -0.47923144698143005, "step": 410 }, { "epoch": 0.47716250452406805, "grad_norm": 109.31239844961944, "learning_rate": 1.8237429767332405e-07, "logits/chosen": -1.3673866987228394, "logits/rejected": -1.4460492134094238, "logps/chosen": -144.90838623046875, "logps/rejected": -157.9684295654297, "loss": 0.6105, "rewards/accuracies": 0.9375, "rewards/chosen": -0.08836071193218231, "rewards/margins": 0.6367740631103516, "rewards/rejected": -0.7251348495483398, "step": 412 }, { "epoch": 0.4794788273615635, "grad_norm": 79.38161196529609, "learning_rate": 1.8215459734670573e-07, "logits/chosen": -1.341538667678833, "logits/rejected": -1.371129035949707, "logps/chosen": -135.0418243408203, "logps/rejected": -181.38201904296875, "loss": 0.6121, "rewards/accuracies": 0.8125, "rewards/chosen": 0.04946846514940262, "rewards/margins": 0.8221450448036194, "rewards/rejected": -0.7726765871047974, "step": 414 }, { "epoch": 0.481795150199059, "grad_norm": 98.8037188643182, "learning_rate": 1.8193367028726547e-07, "logits/chosen": -1.1779212951660156, "logits/rejected": -1.2224653959274292, "logps/chosen": -91.48204040527344, "logps/rejected": -109.18719482421875, "loss": 0.6932, "rewards/accuracies": 0.625, "rewards/chosen": 0.05973606929183006, "rewards/margins": 0.11246003955602646, "rewards/rejected": -0.0527239665389061, "step": 416 }, { "epoch": 0.4841114730365545, "grad_norm": 75.72657378652657, "learning_rate": 1.8171151979388712e-07, "logits/chosen": -1.2831331491470337, "logits/rejected": -1.3463534116744995, "logps/chosen": -155.19076538085938, "logps/rejected": -191.88758850097656, "loss": 0.612, "rewards/accuracies": 0.625, "rewards/chosen": -0.32773423194885254, "rewards/margins": 0.40678921341896057, "rewards/rejected": -0.7345234751701355, "step": 418 }, { "epoch": 0.48642779587404994, "grad_norm": 88.72825200499656, "learning_rate": 1.8148814918372285e-07, "logits/chosen": -1.2322022914886475, "logits/rejected": -1.2740528583526611, "logps/chosen": -125.58689880371094, "logps/rejected": -145.04537963867188, "loss": 0.6336, "rewards/accuracies": 0.5625, "rewards/chosen": -0.02973347157239914, "rewards/margins": 0.28559258580207825, "rewards/rejected": -0.3153260350227356, "step": 420 }, { "epoch": 0.48874411871154544, "grad_norm": 84.14984078776182, "learning_rate": 1.8126356179214365e-07, "logits/chosen": -1.3616023063659668, "logits/rejected": -1.3728755712509155, "logps/chosen": -113.55232238769531, "logps/rejected": -120.91179656982422, "loss": 0.6093, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14902538061141968, "rewards/margins": 0.14539653062820435, "rewards/rejected": -0.294421911239624, "step": 422 }, { "epoch": 0.4910604415490409, "grad_norm": 102.58844455062285, "learning_rate": 1.8103776097268942e-07, "logits/chosen": -1.3973523378372192, "logits/rejected": -1.4224525690078735, "logps/chosen": -146.35865783691406, "logps/rejected": -155.32872009277344, "loss": 0.5969, "rewards/accuracies": 0.6875, "rewards/chosen": -0.18615968525409698, "rewards/margins": 0.1702008694410324, "rewards/rejected": -0.3563604950904846, "step": 424 }, { "epoch": 0.4933767643865364, "grad_norm": 97.6281549014596, "learning_rate": 1.8081075009701908e-07, "logits/chosen": -1.3393031358718872, "logits/rejected": -1.3568938970565796, "logps/chosen": -156.75132751464844, "logps/rejected": -183.3557891845703, "loss": 0.5524, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006147988140583038, "rewards/margins": 0.6164807677268982, "rewards/rejected": -0.6103328466415405, "step": 426 }, { "epoch": 0.4956930872240318, "grad_norm": 79.28532180545582, "learning_rate": 1.8058253255486004e-07, "logits/chosen": -1.479441523551941, "logits/rejected": -1.455161213874817, "logps/chosen": -149.18377685546875, "logps/rejected": -175.40121459960938, "loss": 0.6324, "rewards/accuracies": 0.5, "rewards/chosen": -0.16930466890335083, "rewards/margins": 0.3928312659263611, "rewards/rejected": -0.5621359348297119, "step": 428 }, { "epoch": 0.4980094100615273, "grad_norm": 75.50103825872334, "learning_rate": 1.8035311175395766e-07, "logits/chosen": -1.279894232749939, "logits/rejected": -1.366225004196167, "logps/chosen": -149.7015838623047, "logps/rejected": -169.37600708007812, "loss": 0.634, "rewards/accuracies": 0.625, "rewards/chosen": 0.10414651781320572, "rewards/margins": 0.31277552247047424, "rewards/rejected": -0.20862898230552673, "step": 430 }, { "epoch": 0.5003257328990228, "grad_norm": 110.71149959510932, "learning_rate": 1.8012249112002445e-07, "logits/chosen": -1.3446143865585327, "logits/rejected": -1.346205234527588, "logps/chosen": -135.6072998046875, "logps/rejected": -148.6031951904297, "loss": 0.6534, "rewards/accuracies": 0.53125, "rewards/chosen": -0.04478984698653221, "rewards/margins": 0.20214848220348358, "rewards/rejected": -0.2469383329153061, "step": 432 }, { "epoch": 0.5026420557365183, "grad_norm": 99.01624284418935, "learning_rate": 1.7989067409668867e-07, "logits/chosen": -1.3353965282440186, "logits/rejected": -1.3816275596618652, "logps/chosen": -83.31758117675781, "logps/rejected": -101.72441101074219, "loss": 0.65, "rewards/accuracies": 0.75, "rewards/chosen": 0.16134825348854065, "rewards/margins": 0.24150311946868896, "rewards/rejected": -0.0801548883318901, "step": 434 }, { "epoch": 0.5049583785740137, "grad_norm": 85.01833595262721, "learning_rate": 1.7965766414544326e-07, "logits/chosen": -1.3208928108215332, "logits/rejected": -1.4323692321777344, "logps/chosen": -170.11387634277344, "logps/rejected": -190.21917724609375, "loss": 0.5937, "rewards/accuracies": 0.6875, "rewards/chosen": -0.06436862796545029, "rewards/margins": 0.4921523928642273, "rewards/rejected": -0.5565209984779358, "step": 436 }, { "epoch": 0.5072747014115092, "grad_norm": 78.22902080084621, "learning_rate": 1.794234647455938e-07, "logits/chosen": -1.5033388137817383, "logits/rejected": -1.4195587635040283, "logps/chosen": -167.2239227294922, "logps/rejected": -187.388427734375, "loss": 0.5993, "rewards/accuracies": 0.71875, "rewards/chosen": -0.00830845721065998, "rewards/margins": 0.36755993962287903, "rewards/rejected": -0.37586843967437744, "step": 438 }, { "epoch": 0.5095910242490047, "grad_norm": 93.03449604866357, "learning_rate": 1.7918807939420688e-07, "logits/chosen": -1.2785309553146362, "logits/rejected": -1.3855379819869995, "logps/chosen": -106.93773651123047, "logps/rejected": -136.5991668701172, "loss": 0.5876, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07631123065948486, "rewards/margins": 0.45985180139541626, "rewards/rejected": -0.3835405707359314, "step": 440 }, { "epoch": 0.5119073470865002, "grad_norm": 98.9210182597883, "learning_rate": 1.7895151160605755e-07, "logits/chosen": -1.4166314601898193, "logits/rejected": -1.3835158348083496, "logps/chosen": -187.5051727294922, "logps/rejected": -196.1830596923828, "loss": 0.5841, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13197794556617737, "rewards/margins": 0.36578553915023804, "rewards/rejected": -0.49776342511177063, "step": 442 }, { "epoch": 0.5142236699239957, "grad_norm": 83.90736102267026, "learning_rate": 1.7871376491357716e-07, "logits/chosen": -1.3803664445877075, "logits/rejected": -1.3876008987426758, "logps/chosen": -147.97230529785156, "logps/rejected": -158.1250762939453, "loss": 0.6266, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05537159740924835, "rewards/margins": 0.3145188093185425, "rewards/rejected": -0.2591472268104553, "step": 444 }, { "epoch": 0.5165399927614911, "grad_norm": 83.18792426148275, "learning_rate": 1.7847484286680036e-07, "logits/chosen": -1.2037944793701172, "logits/rejected": -1.3015272617340088, "logps/chosen": -116.46647644042969, "logps/rejected": -133.59059143066406, "loss": 0.6422, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2899719476699829, "rewards/margins": 0.47784021496772766, "rewards/rejected": -0.767812192440033, "step": 446 }, { "epoch": 0.5188563155989866, "grad_norm": 86.96480601224319, "learning_rate": 1.782347490333123e-07, "logits/chosen": -1.3997318744659424, "logits/rejected": -1.3888890743255615, "logps/chosen": -168.47235107421875, "logps/rejected": -173.81881713867188, "loss": 0.5883, "rewards/accuracies": 0.65625, "rewards/chosen": 0.2631508708000183, "rewards/margins": 0.40129777789115906, "rewards/rejected": -0.13814686238765717, "step": 448 }, { "epoch": 0.5211726384364821, "grad_norm": 68.31450376756777, "learning_rate": 1.7799348699819518e-07, "logits/chosen": -1.3524158000946045, "logits/rejected": -1.3299603462219238, "logps/chosen": -121.39910888671875, "logps/rejected": -131.10423278808594, "loss": 0.5874, "rewards/accuracies": 0.75, "rewards/chosen": 0.28284794092178345, "rewards/margins": 0.3859240412712097, "rewards/rejected": -0.10307610780000687, "step": 450 }, { "epoch": 0.5234889612739776, "grad_norm": 111.76852046416136, "learning_rate": 1.7775106036397474e-07, "logits/chosen": -1.2830047607421875, "logits/rejected": -1.3414244651794434, "logps/chosen": -130.662353515625, "logps/rejected": -179.69061279296875, "loss": 0.5992, "rewards/accuracies": 0.75, "rewards/chosen": 0.22120808064937592, "rewards/margins": 1.2555629014968872, "rewards/rejected": -1.0343549251556396, "step": 452 }, { "epoch": 0.525805284111473, "grad_norm": 67.76003255019495, "learning_rate": 1.775074727505667e-07, "logits/chosen": -1.1533057689666748, "logits/rejected": -1.294029951095581, "logps/chosen": -149.02322387695312, "logps/rejected": -172.8708953857422, "loss": 0.5608, "rewards/accuracies": 0.5, "rewards/chosen": -0.04835113137960434, "rewards/margins": 0.28162479400634766, "rewards/rejected": -0.3299759328365326, "step": 454 }, { "epoch": 0.5281216069489685, "grad_norm": 85.70884472679678, "learning_rate": 1.7726272779522228e-07, "logits/chosen": -1.2949302196502686, "logits/rejected": -1.387807846069336, "logps/chosen": -159.39170837402344, "logps/rejected": -189.28244018554688, "loss": 0.6753, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12795251607894897, "rewards/margins": 0.3282526135444641, "rewards/rejected": -0.20030008256435394, "step": 456 }, { "epoch": 0.530437929786464, "grad_norm": 69.96049512457706, "learning_rate": 1.7701682915247437e-07, "logits/chosen": -1.1357134580612183, "logits/rejected": -1.2111129760742188, "logps/chosen": -168.00326538085938, "logps/rejected": -185.22506713867188, "loss": 0.6102, "rewards/accuracies": 0.71875, "rewards/chosen": -0.46182161569595337, "rewards/margins": 0.4477265477180481, "rewards/rejected": -0.9095481634140015, "step": 458 }, { "epoch": 0.5327542526239595, "grad_norm": 101.3186304412605, "learning_rate": 1.7676978049408259e-07, "logits/chosen": -1.3433293104171753, "logits/rejected": -1.3274402618408203, "logps/chosen": -129.25802612304688, "logps/rejected": -149.58999633789062, "loss": 0.6877, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09710556268692017, "rewards/margins": 0.5576457977294922, "rewards/rejected": -0.6547513604164124, "step": 460 }, { "epoch": 0.535070575461455, "grad_norm": 101.53493027467981, "learning_rate": 1.7652158550897863e-07, "logits/chosen": -1.2119991779327393, "logits/rejected": -1.254407525062561, "logps/chosen": -124.32587432861328, "logps/rejected": -141.7906036376953, "loss": 0.6527, "rewards/accuracies": 0.625, "rewards/chosen": -0.07976742088794708, "rewards/margins": 0.11116158217191696, "rewards/rejected": -0.19092898070812225, "step": 462 }, { "epoch": 0.5373868982989504, "grad_norm": 72.67582060276438, "learning_rate": 1.7627224790321116e-07, "logits/chosen": -1.3650070428848267, "logits/rejected": -1.3934192657470703, "logps/chosen": -111.0053939819336, "logps/rejected": -128.06703186035156, "loss": 0.6384, "rewards/accuracies": 0.625, "rewards/chosen": 0.09877490997314453, "rewards/margins": 0.22652901709079742, "rewards/rejected": -0.1277541220188141, "step": 464 }, { "epoch": 0.5397032211364459, "grad_norm": 104.80291110492522, "learning_rate": 1.7602177139989042e-07, "logits/chosen": -1.2948188781738281, "logits/rejected": -1.3249576091766357, "logps/chosen": -113.75486755371094, "logps/rejected": -135.57427978515625, "loss": 0.6462, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04134136065840721, "rewards/margins": 0.30063849687576294, "rewards/rejected": -0.25929710268974304, "step": 466 }, { "epoch": 0.5420195439739414, "grad_norm": 78.82420405990091, "learning_rate": 1.7577015973913274e-07, "logits/chosen": -1.2992827892303467, "logits/rejected": -1.3570318222045898, "logps/chosen": -131.05203247070312, "logps/rejected": -151.40420532226562, "loss": 0.6198, "rewards/accuracies": 0.71875, "rewards/chosen": 0.12259967625141144, "rewards/margins": 0.3709834814071655, "rewards/rejected": -0.2483838051557541, "step": 468 }, { "epoch": 0.5443358668114369, "grad_norm": 72.33642230267687, "learning_rate": 1.755174166780045e-07, "logits/chosen": -1.1955764293670654, "logits/rejected": -1.304951786994934, "logps/chosen": -132.34945678710938, "logps/rejected": -160.3063201904297, "loss": 0.5581, "rewards/accuracies": 0.78125, "rewards/chosen": 0.14929035305976868, "rewards/margins": 0.9690365791320801, "rewards/rejected": -0.8197463154792786, "step": 470 }, { "epoch": 0.5466521896489324, "grad_norm": 95.99345130843376, "learning_rate": 1.7526354599046632e-07, "logits/chosen": -1.3738641738891602, "logits/rejected": -1.4558396339416504, "logps/chosen": -124.96098327636719, "logps/rejected": -148.17123413085938, "loss": 0.6421, "rewards/accuracies": 0.6875, "rewards/chosen": 0.09812385588884354, "rewards/margins": 0.24952289462089539, "rewards/rejected": -0.15139903128147125, "step": 472 }, { "epoch": 0.5489685124864278, "grad_norm": 80.16504208727451, "learning_rate": 1.7500855146731648e-07, "logits/chosen": -1.2267169952392578, "logits/rejected": -1.2515380382537842, "logps/chosen": -148.4540557861328, "logps/rejected": -180.935791015625, "loss": 0.6187, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06260286271572113, "rewards/margins": 1.7078866958618164, "rewards/rejected": -1.7704894542694092, "step": 474 }, { "epoch": 0.5512848353239233, "grad_norm": 113.11770155446688, "learning_rate": 1.747524369161343e-07, "logits/chosen": -1.3779189586639404, "logits/rejected": -1.3472117185592651, "logps/chosen": -137.42312622070312, "logps/rejected": -141.66329956054688, "loss": 0.6569, "rewards/accuracies": 0.59375, "rewards/chosen": -0.24386143684387207, "rewards/margins": 0.2617953419685364, "rewards/rejected": -0.5056568384170532, "step": 476 }, { "epoch": 0.5536011581614187, "grad_norm": 119.83970905986772, "learning_rate": 1.744952061612234e-07, "logits/chosen": -1.4478602409362793, "logits/rejected": -1.470253348350525, "logps/chosen": -162.07476806640625, "logps/rejected": -187.415283203125, "loss": 0.6087, "rewards/accuracies": 0.6875, "rewards/chosen": 0.03821418434381485, "rewards/margins": 0.7775447368621826, "rewards/rejected": -0.739330530166626, "step": 478 }, { "epoch": 0.5559174809989142, "grad_norm": 82.18148965783794, "learning_rate": 1.7423686304355468e-07, "logits/chosen": -1.4132378101348877, "logits/rejected": -1.4143118858337402, "logps/chosen": -135.87957763671875, "logps/rejected": -154.1642608642578, "loss": 0.604, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36500078439712524, "rewards/margins": 0.42764222621917725, "rewards/rejected": -0.7926430106163025, "step": 480 }, { "epoch": 0.5582338038364097, "grad_norm": 80.80323897214724, "learning_rate": 1.7397741142070867e-07, "logits/chosen": -1.3779712915420532, "logits/rejected": -1.3945672512054443, "logps/chosen": -172.9818115234375, "logps/rejected": -181.16062927246094, "loss": 0.5964, "rewards/accuracies": 0.6875, "rewards/chosen": 0.04211435094475746, "rewards/margins": 0.4079417586326599, "rewards/rejected": -0.36582741141319275, "step": 482 }, { "epoch": 0.5605501266739051, "grad_norm": 76.61028661180849, "learning_rate": 1.737168551668182e-07, "logits/chosen": -1.190808653831482, "logits/rejected": -1.271024465560913, "logps/chosen": -131.51797485351562, "logps/rejected": -167.06590270996094, "loss": 0.5975, "rewards/accuracies": 0.75, "rewards/chosen": -0.07135076820850372, "rewards/margins": 0.8735796213150024, "rewards/rejected": -0.9449302554130554, "step": 484 }, { "epoch": 0.5628664495114006, "grad_norm": 80.91548302041826, "learning_rate": 1.7345519817251053e-07, "logits/chosen": -1.3176366090774536, "logits/rejected": -1.331200122833252, "logps/chosen": -145.1810760498047, "logps/rejected": -171.1893768310547, "loss": 0.6177, "rewards/accuracies": 0.625, "rewards/chosen": -0.01799055188894272, "rewards/margins": 0.4896019399166107, "rewards/rejected": -0.507592499256134, "step": 486 }, { "epoch": 0.5651827723488961, "grad_norm": 89.40658710689003, "learning_rate": 1.7319244434484895e-07, "logits/chosen": -1.2093366384506226, "logits/rejected": -1.1616159677505493, "logps/chosen": -140.53761291503906, "logps/rejected": -141.9064483642578, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2011549472808838, "rewards/margins": 0.3880937099456787, "rewards/rejected": -0.5892486572265625, "step": 488 }, { "epoch": 0.5674990951863916, "grad_norm": 76.26303147749239, "learning_rate": 1.7292859760727492e-07, "logits/chosen": -1.2799924612045288, "logits/rejected": -1.296557903289795, "logps/chosen": -117.47547912597656, "logps/rejected": -129.87294006347656, "loss": 0.6132, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10736295580863953, "rewards/margins": 0.3569309115409851, "rewards/rejected": -0.464293897151947, "step": 490 }, { "epoch": 0.5698154180238871, "grad_norm": 95.13972864679343, "learning_rate": 1.7266366189954905e-07, "logits/chosen": -1.348731517791748, "logits/rejected": -1.3340685367584229, "logps/chosen": -150.54696655273438, "logps/rejected": -185.81204223632812, "loss": 0.6421, "rewards/accuracies": 0.59375, "rewards/chosen": -0.1124522015452385, "rewards/margins": 0.7442688941955566, "rewards/rejected": -0.856721043586731, "step": 492 }, { "epoch": 0.5721317408613825, "grad_norm": 106.06131234014966, "learning_rate": 1.7239764117769258e-07, "logits/chosen": -1.3093186616897583, "logits/rejected": -1.3834538459777832, "logps/chosen": -193.04637145996094, "logps/rejected": -233.44293212890625, "loss": 0.6109, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16808415949344635, "rewards/margins": 1.2206228971481323, "rewards/rejected": -1.388707160949707, "step": 494 }, { "epoch": 0.574448063698878, "grad_norm": 96.07655487217647, "learning_rate": 1.7213053941392816e-07, "logits/chosen": -1.330100417137146, "logits/rejected": -1.354781150817871, "logps/chosen": -157.2327880859375, "logps/rejected": -173.35081481933594, "loss": 0.572, "rewards/accuracies": 0.625, "rewards/chosen": 0.055093757808208466, "rewards/margins": 0.6396900415420532, "rewards/rejected": -0.5845962166786194, "step": 496 }, { "epoch": 0.5767643865363735, "grad_norm": 90.87905253835972, "learning_rate": 1.7186236059662046e-07, "logits/chosen": -1.4015512466430664, "logits/rejected": -1.4518334865570068, "logps/chosen": -132.65196228027344, "logps/rejected": -143.88650512695312, "loss": 0.6587, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07620470970869064, "rewards/margins": 0.09850985556840897, "rewards/rejected": -0.1747145652770996, "step": 498 }, { "epoch": 0.579080709373869, "grad_norm": 58.33509354958709, "learning_rate": 1.7159310873021693e-07, "logits/chosen": -1.464751124382019, "logits/rejected": -1.4334102869033813, "logps/chosen": -111.387939453125, "logps/rejected": -117.49159240722656, "loss": 0.5577, "rewards/accuracies": 0.78125, "rewards/chosen": 0.29490286111831665, "rewards/margins": 0.8083434700965881, "rewards/rejected": -0.5134405493736267, "step": 500 }, { "epoch": 0.579080709373869, "eval_logits/chosen": -1.3100072145462036, "eval_logits/rejected": -1.304487943649292, "eval_logps/chosen": -139.44586181640625, "eval_logps/rejected": -139.48992919921875, "eval_loss": 0.654407262802124, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.38146913051605225, "eval_rewards/margins": 0.17216716706752777, "eval_rewards/rejected": -0.5536363124847412, "eval_runtime": 24.0833, "eval_samples_per_second": 4.152, "eval_steps_per_second": 1.038, "step": 500 }, { "epoch": 0.5813970322113644, "grad_norm": 85.71410602529986, "learning_rate": 1.7132278783518754e-07, "logits/chosen": -1.2767977714538574, "logits/rejected": -1.3142091035842896, "logps/chosen": -132.83477783203125, "logps/rejected": -152.29600524902344, "loss": 0.6423, "rewards/accuracies": 0.4375, "rewards/chosen": -0.3583824038505554, "rewards/margins": 0.2208695262670517, "rewards/rejected": -0.5792520046234131, "step": 502 }, { "epoch": 0.5837133550488599, "grad_norm": 74.14800888172827, "learning_rate": 1.7105140194796522e-07, "logits/chosen": -1.3712527751922607, "logits/rejected": -1.425230860710144, "logps/chosen": -175.75039672851562, "logps/rejected": -202.72731018066406, "loss": 0.5921, "rewards/accuracies": 0.71875, "rewards/chosen": -0.03632951155304909, "rewards/margins": 0.9486851692199707, "rewards/rejected": -0.9850146174430847, "step": 504 }, { "epoch": 0.5860296778863554, "grad_norm": 71.00059592227518, "learning_rate": 1.707789551208852e-07, "logits/chosen": -1.2654979228973389, "logits/rejected": -1.3367087841033936, "logps/chosen": -107.92752075195312, "logps/rejected": -137.77261352539062, "loss": 0.5964, "rewards/accuracies": 0.75, "rewards/chosen": 0.22204995155334473, "rewards/margins": 0.47908443212509155, "rewards/rejected": -0.2570344805717468, "step": 506 }, { "epoch": 0.5883460007238509, "grad_norm": 80.94401296109848, "learning_rate": 1.705054514221248e-07, "logits/chosen": -1.359083652496338, "logits/rejected": -1.262428879737854, "logps/chosen": -128.09751892089844, "logps/rejected": -112.98042297363281, "loss": 0.5995, "rewards/accuracies": 0.71875, "rewards/chosen": -0.007966872304677963, "rewards/margins": 0.3093283772468567, "rewards/rejected": -0.31729522347450256, "step": 508 }, { "epoch": 0.5906623235613464, "grad_norm": 117.75539638738908, "learning_rate": 1.7023089493564246e-07, "logits/chosen": -1.3026072978973389, "logits/rejected": -1.3078409433364868, "logps/chosen": -157.6989288330078, "logps/rejected": -171.07347106933594, "loss": 0.6652, "rewards/accuracies": 0.65625, "rewards/chosen": -0.17687593400478363, "rewards/margins": 0.22165895998477936, "rewards/rejected": -0.398534893989563, "step": 510 }, { "epoch": 0.5929786463988418, "grad_norm": 86.54731628654646, "learning_rate": 1.6995528976111692e-07, "logits/chosen": -1.3644428253173828, "logits/rejected": -1.359837532043457, "logps/chosen": -118.70327758789062, "logps/rejected": -129.3509979248047, "loss": 0.6307, "rewards/accuracies": 0.625, "rewards/chosen": 0.16805267333984375, "rewards/margins": 0.45584040880203247, "rewards/rejected": -0.2877877354621887, "step": 512 }, { "epoch": 0.5952949692363373, "grad_norm": 87.28044950941617, "learning_rate": 1.6967864001388587e-07, "logits/chosen": -1.383012294769287, "logits/rejected": -1.372816562652588, "logps/chosen": -112.56473541259766, "logps/rejected": -113.43563842773438, "loss": 0.5892, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07795768231153488, "rewards/margins": 0.3844006359577179, "rewards/rejected": -0.30644291639328003, "step": 514 }, { "epoch": 0.5976112920738328, "grad_norm": 93.89694914049578, "learning_rate": 1.6940094982488465e-07, "logits/chosen": -1.3544152975082397, "logits/rejected": -1.4398796558380127, "logps/chosen": -174.69073486328125, "logps/rejected": -213.37953186035156, "loss": 0.6402, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2562227249145508, "rewards/margins": 0.7131789922714233, "rewards/rejected": -0.45695626735687256, "step": 516 }, { "epoch": 0.5999276149113283, "grad_norm": 93.38206179293249, "learning_rate": 1.6912222334058434e-07, "logits/chosen": -1.3199559450149536, "logits/rejected": -1.3303453922271729, "logps/chosen": -113.59899139404297, "logps/rejected": -145.6167449951172, "loss": 0.5803, "rewards/accuracies": 0.6875, "rewards/chosen": 0.020612459629774094, "rewards/margins": 0.46917960047721863, "rewards/rejected": -0.4485671818256378, "step": 518 }, { "epoch": 0.6022439377488238, "grad_norm": 105.46320125024906, "learning_rate": 1.6884246472293017e-07, "logits/chosen": -1.2990922927856445, "logits/rejected": -1.32880437374115, "logps/chosen": -156.3465576171875, "logps/rejected": -181.81884765625, "loss": 0.5906, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04846584051847458, "rewards/margins": 0.4274147152900696, "rewards/rejected": -0.3789488971233368, "step": 520 }, { "epoch": 0.6045602605863192, "grad_norm": 75.97476536999818, "learning_rate": 1.68561678149279e-07, "logits/chosen": -1.324131727218628, "logits/rejected": -1.3583768606185913, "logps/chosen": -158.01376342773438, "logps/rejected": -170.33180236816406, "loss": 0.62, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15412873029708862, "rewards/margins": 0.3743273615837097, "rewards/rejected": -0.5284560322761536, "step": 522 }, { "epoch": 0.6068765834238147, "grad_norm": 104.26484808062503, "learning_rate": 1.6827986781233728e-07, "logits/chosen": -1.244482159614563, "logits/rejected": -1.3129115104675293, "logps/chosen": -168.09619140625, "logps/rejected": -183.01235961914062, "loss": 0.6265, "rewards/accuracies": 0.6875, "rewards/chosen": -0.21132177114486694, "rewards/margins": 0.3104555606842041, "rewards/rejected": -0.521777331829071, "step": 524 }, { "epoch": 0.6091929062613102, "grad_norm": 114.78689524134293, "learning_rate": 1.6799703792009824e-07, "logits/chosen": -1.5139933824539185, "logits/rejected": -1.4369456768035889, "logps/chosen": -179.37973022460938, "logps/rejected": -178.68380737304688, "loss": 0.6447, "rewards/accuracies": 0.75, "rewards/chosen": 0.1471785008907318, "rewards/margins": 0.5135056376457214, "rewards/rejected": -0.366327166557312, "step": 526 }, { "epoch": 0.6115092290988057, "grad_norm": 71.37258403318782, "learning_rate": 1.6771319269577914e-07, "logits/chosen": -1.246570348739624, "logits/rejected": -1.2840875387191772, "logps/chosen": -128.31802368164062, "logps/rejected": -164.46771240234375, "loss": 0.5578, "rewards/accuracies": 0.625, "rewards/chosen": -0.00241958349943161, "rewards/margins": 0.5187560319900513, "rewards/rejected": -0.5211755633354187, "step": 528 }, { "epoch": 0.6138255519363012, "grad_norm": 69.57805371439099, "learning_rate": 1.6742833637775812e-07, "logits/chosen": -1.323167085647583, "logits/rejected": -1.3477709293365479, "logps/chosen": -146.45350646972656, "logps/rejected": -181.66311645507812, "loss": 0.5717, "rewards/accuracies": 0.6875, "rewards/chosen": -0.19867736101150513, "rewards/margins": 0.9410180449485779, "rewards/rejected": -1.1396952867507935, "step": 530 }, { "epoch": 0.6161418747737966, "grad_norm": 171.030005968529, "learning_rate": 1.6714247321951105e-07, "logits/chosen": -1.380966067314148, "logits/rejected": -1.4481279850006104, "logps/chosen": -164.24951171875, "logps/rejected": -182.36082458496094, "loss": 0.6188, "rewards/accuracies": 0.75, "rewards/chosen": -0.18503104150295258, "rewards/margins": 0.3646969497203827, "rewards/rejected": -0.5497279167175293, "step": 532 }, { "epoch": 0.6184581976112921, "grad_norm": 92.73704263508813, "learning_rate": 1.668556074895479e-07, "logits/chosen": -1.3130195140838623, "logits/rejected": -1.3079559803009033, "logps/chosen": -163.1666717529297, "logps/rejected": -171.744873046875, "loss": 0.6301, "rewards/accuracies": 0.625, "rewards/chosen": -0.020428307354450226, "rewards/margins": 0.3073387145996094, "rewards/rejected": -0.327767014503479, "step": 534 }, { "epoch": 0.6207745204487876, "grad_norm": 80.77491644213549, "learning_rate": 1.6656774347134907e-07, "logits/chosen": -1.3122167587280273, "logits/rejected": -1.3465042114257812, "logps/chosen": -122.46326446533203, "logps/rejected": -140.5079803466797, "loss": 0.651, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09150812029838562, "rewards/margins": 0.42649781703948975, "rewards/rejected": -0.518005907535553, "step": 536 }, { "epoch": 0.6230908432862831, "grad_norm": 84.46218412236821, "learning_rate": 1.6627888546330136e-07, "logits/chosen": -1.4094092845916748, "logits/rejected": -1.4629356861114502, "logps/chosen": -185.64651489257812, "logps/rejected": -204.06578063964844, "loss": 0.5885, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06263245642185211, "rewards/margins": 0.5876613855361938, "rewards/rejected": -0.6502938270568848, "step": 538 }, { "epoch": 0.6254071661237784, "grad_norm": 81.10930034348203, "learning_rate": 1.659890377786339e-07, "logits/chosen": -1.3104676008224487, "logits/rejected": -1.2645026445388794, "logps/chosen": -160.73683166503906, "logps/rejected": -208.7481689453125, "loss": 0.5968, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17388193309307098, "rewards/margins": 1.4726815223693848, "rewards/rejected": -1.6465635299682617, "step": 540 }, { "epoch": 0.627723488961274, "grad_norm": 70.09647822541486, "learning_rate": 1.656982047453536e-07, "logits/chosen": -1.3550140857696533, "logits/rejected": -1.3001039028167725, "logps/chosen": -152.96685791015625, "logps/rejected": -185.17835998535156, "loss": 0.5726, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005441240966320038, "rewards/margins": 0.716077983379364, "rewards/rejected": -0.7215193510055542, "step": 542 }, { "epoch": 0.6300398117987694, "grad_norm": 114.90916867192477, "learning_rate": 1.6540639070618066e-07, "logits/chosen": -1.3001914024353027, "logits/rejected": -1.3512235879898071, "logps/chosen": -162.81076049804688, "logps/rejected": -186.64080810546875, "loss": 0.5977, "rewards/accuracies": 0.78125, "rewards/chosen": -0.11464500427246094, "rewards/margins": 0.6204842329025269, "rewards/rejected": -0.735129177570343, "step": 544 }, { "epoch": 0.6323561346362649, "grad_norm": 114.79194762522887, "learning_rate": 1.6511360001848367e-07, "logits/chosen": -1.1840189695358276, "logits/rejected": -1.2202097177505493, "logps/chosen": -133.49606323242188, "logps/rejected": -157.7266387939453, "loss": 0.6263, "rewards/accuracies": 0.90625, "rewards/chosen": 0.1880410611629486, "rewards/margins": 0.6589545607566833, "rewards/rejected": -0.47091349959373474, "step": 546 }, { "epoch": 0.6346724574737604, "grad_norm": 93.62652784226147, "learning_rate": 1.6481983705421448e-07, "logits/chosen": -1.306709885597229, "logits/rejected": -1.378722071647644, "logps/chosen": -105.61914825439453, "logps/rejected": -134.32098388671875, "loss": 0.6102, "rewards/accuracies": 0.71875, "rewards/chosen": 0.14075130224227905, "rewards/margins": 0.543403685092926, "rewards/rejected": -0.4026523530483246, "step": 548 }, { "epoch": 0.6369887803112558, "grad_norm": 73.81068535979944, "learning_rate": 1.6452510619984298e-07, "logits/chosen": -1.2993462085723877, "logits/rejected": -1.3260908126831055, "logps/chosen": -113.44615936279297, "logps/rejected": -114.76972961425781, "loss": 0.6459, "rewards/accuracies": 0.625, "rewards/chosen": -0.17589446902275085, "rewards/margins": 0.14515961706638336, "rewards/rejected": -0.3210541009902954, "step": 550 }, { "epoch": 0.6393051031487513, "grad_norm": 67.36004183948565, "learning_rate": 1.642294118562917e-07, "logits/chosen": -1.365562915802002, "logits/rejected": -1.3586573600769043, "logps/chosen": -124.21674346923828, "logps/rejected": -129.41188049316406, "loss": 0.6016, "rewards/accuracies": 0.5625, "rewards/chosen": -0.026413168758153915, "rewards/margins": 0.1695682406425476, "rewards/rejected": -0.19598142802715302, "step": 552 }, { "epoch": 0.6416214259862468, "grad_norm": 109.88457230810822, "learning_rate": 1.6393275843886988e-07, "logits/chosen": -1.2051353454589844, "logits/rejected": -1.1777970790863037, "logps/chosen": -169.16192626953125, "logps/rejected": -165.87405395507812, "loss": 0.7026, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2536877989768982, "rewards/margins": -0.15165254473686218, "rewards/rejected": -0.10203523933887482, "step": 554 }, { "epoch": 0.6439377488237423, "grad_norm": 75.11287220032575, "learning_rate": 1.636351503772077e-07, "logits/chosen": -1.4200119972229004, "logits/rejected": -1.4631352424621582, "logps/chosen": -198.4412841796875, "logps/rejected": -217.67779541015625, "loss": 0.5673, "rewards/accuracies": 0.65625, "rewards/chosen": 0.11143307387828827, "rewards/margins": 0.5114270448684692, "rewards/rejected": -0.39999401569366455, "step": 556 }, { "epoch": 0.6462540716612378, "grad_norm": 76.6884503172935, "learning_rate": 1.6333659211519013e-07, "logits/chosen": -1.250978708267212, "logits/rejected": -1.3204269409179688, "logps/chosen": -122.6414794921875, "logps/rejected": -148.8828887939453, "loss": 0.577, "rewards/accuracies": 0.84375, "rewards/chosen": 0.0017823921516537666, "rewards/margins": 0.8125737905502319, "rewards/rejected": -0.8107913732528687, "step": 558 }, { "epoch": 0.6485703944987332, "grad_norm": 97.61750654608176, "learning_rate": 1.630370881108905e-07, "logits/chosen": -1.4659614562988281, "logits/rejected": -1.410871148109436, "logps/chosen": -158.7913055419922, "logps/rejected": -179.6865692138672, "loss": 0.6643, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09925530850887299, "rewards/margins": 0.4364185631275177, "rewards/rejected": -0.33716320991516113, "step": 560 }, { "epoch": 0.6508867173362287, "grad_norm": 81.27814399323799, "learning_rate": 1.627366428365039e-07, "logits/chosen": -1.3590463399887085, "logits/rejected": -1.402485966682434, "logps/chosen": -154.32345581054688, "logps/rejected": -171.76239013671875, "loss": 0.6142, "rewards/accuracies": 0.6875, "rewards/chosen": -0.35029542446136475, "rewards/margins": 0.5618267059326172, "rewards/rejected": -0.9121222496032715, "step": 562 }, { "epoch": 0.6532030401737242, "grad_norm": 94.5819842213993, "learning_rate": 1.6243526077828058e-07, "logits/chosen": -1.368080973625183, "logits/rejected": -1.3609378337860107, "logps/chosen": -144.53123474121094, "logps/rejected": -149.87338256835938, "loss": 0.6336, "rewards/accuracies": 0.90625, "rewards/chosen": 0.22654207050800323, "rewards/margins": 0.999248206615448, "rewards/rejected": -0.7727060914039612, "step": 564 }, { "epoch": 0.6555193630112197, "grad_norm": 77.01269214216319, "learning_rate": 1.6213294643645882e-07, "logits/chosen": -1.2939796447753906, "logits/rejected": -1.3303455114364624, "logps/chosen": -136.7617645263672, "logps/rejected": -174.4918670654297, "loss": 0.5761, "rewards/accuracies": 0.59375, "rewards/chosen": -0.4167702794075012, "rewards/margins": 0.5882107615470886, "rewards/rejected": -1.0049810409545898, "step": 566 }, { "epoch": 0.6578356858487152, "grad_norm": 93.03042212849894, "learning_rate": 1.618297043251977e-07, "logits/chosen": -1.346666932106018, "logits/rejected": -1.4271106719970703, "logps/chosen": -123.94332885742188, "logps/rejected": -144.5997772216797, "loss": 0.5749, "rewards/accuracies": 0.625, "rewards/chosen": -0.06060848757624626, "rewards/margins": 0.27999287843704224, "rewards/rejected": -0.3406013548374176, "step": 568 }, { "epoch": 0.6601520086862106, "grad_norm": 78.42710600355083, "learning_rate": 1.6152553897250987e-07, "logits/chosen": -1.1860871315002441, "logits/rejected": -1.2453413009643555, "logps/chosen": -119.57302856445312, "logps/rejected": -140.7362518310547, "loss": 0.5865, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14186443388462067, "rewards/margins": 0.509893000125885, "rewards/rejected": -0.6517573595046997, "step": 570 }, { "epoch": 0.6624683315237061, "grad_norm": 79.87983043947283, "learning_rate": 1.6122045492019374e-07, "logits/chosen": -1.2810924053192139, "logits/rejected": -1.3013286590576172, "logps/chosen": -124.72441101074219, "logps/rejected": -155.4320831298828, "loss": 0.6214, "rewards/accuracies": 0.71875, "rewards/chosen": -0.17904864251613617, "rewards/margins": 0.5865851640701294, "rewards/rejected": -0.7656337022781372, "step": 572 }, { "epoch": 0.6647846543612016, "grad_norm": 83.06538656604619, "learning_rate": 1.6091445672376577e-07, "logits/chosen": -1.2900563478469849, "logits/rejected": -1.3495041131973267, "logps/chosen": -132.9668426513672, "logps/rejected": -158.88734436035156, "loss": 0.72, "rewards/accuracies": 0.71875, "rewards/chosen": 0.11348069459199905, "rewards/margins": 0.5080840587615967, "rewards/rejected": -0.3946034610271454, "step": 574 }, { "epoch": 0.6671009771986971, "grad_norm": 81.11691381402433, "learning_rate": 1.6060754895239242e-07, "logits/chosen": -1.3639813661575317, "logits/rejected": -1.3099185228347778, "logps/chosen": -129.63088989257812, "logps/rejected": -138.73565673828125, "loss": 0.5229, "rewards/accuracies": 0.625, "rewards/chosen": -0.1598489135503769, "rewards/margins": 0.4970959722995758, "rewards/rejected": -0.6569448709487915, "step": 576 }, { "epoch": 0.6694173000361926, "grad_norm": 97.23417815050283, "learning_rate": 1.6029973618882188e-07, "logits/chosen": -1.4283655881881714, "logits/rejected": -1.4487836360931396, "logps/chosen": -133.85809326171875, "logps/rejected": -156.39669799804688, "loss": 0.5988, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003942415118217468, "rewards/margins": 0.2732374668121338, "rewards/rejected": -0.26929500699043274, "step": 578 }, { "epoch": 0.671733622873688, "grad_norm": 66.172594585089, "learning_rate": 1.599910230293158e-07, "logits/chosen": -1.2562668323516846, "logits/rejected": -1.3157635927200317, "logps/chosen": -166.57229614257812, "logps/rejected": -168.74867248535156, "loss": 0.5781, "rewards/accuracies": 0.78125, "rewards/chosen": 0.08036249130964279, "rewards/margins": 0.8519478440284729, "rewards/rejected": -0.7715852856636047, "step": 580 }, { "epoch": 0.6740499457111835, "grad_norm": 74.76192509954373, "learning_rate": 1.596814140835805e-07, "logits/chosen": -1.256306767463684, "logits/rejected": -1.3627066612243652, "logps/chosen": -163.5234375, "logps/rejected": -197.15310668945312, "loss": 0.5397, "rewards/accuracies": 0.8125, "rewards/chosen": 0.26100030541419983, "rewards/margins": 0.756481409072876, "rewards/rejected": -0.495481014251709, "step": 582 }, { "epoch": 0.676366268548679, "grad_norm": 70.48378313519777, "learning_rate": 1.5937091397469813e-07, "logits/chosen": -1.395608901977539, "logits/rejected": -1.4762039184570312, "logps/chosen": -147.21681213378906, "logps/rejected": -167.70651245117188, "loss": 0.6367, "rewards/accuracies": 0.53125, "rewards/chosen": 0.1700185090303421, "rewards/margins": 0.24376149475574493, "rewards/rejected": -0.07374300062656403, "step": 584 }, { "epoch": 0.6786825913861745, "grad_norm": 71.83056408251632, "learning_rate": 1.5905952733905773e-07, "logits/chosen": -1.3281779289245605, "logits/rejected": -1.37840735912323, "logps/chosen": -150.58189392089844, "logps/rejected": -174.99461364746094, "loss": 0.6312, "rewards/accuracies": 0.65625, "rewards/chosen": 0.022576339542865753, "rewards/margins": 0.32831788063049316, "rewards/rejected": -0.3057415187358856, "step": 586 }, { "epoch": 0.6809989142236699, "grad_norm": 83.87757428866225, "learning_rate": 1.5874725882628598e-07, "logits/chosen": -1.2740365266799927, "logits/rejected": -1.3645150661468506, "logps/chosen": -119.89724731445312, "logps/rejected": -143.9542694091797, "loss": 0.6469, "rewards/accuracies": 0.625, "rewards/chosen": -0.08816975355148315, "rewards/margins": 0.3666497468948364, "rewards/rejected": -0.4548195004463196, "step": 588 }, { "epoch": 0.6833152370611654, "grad_norm": 95.15763805819658, "learning_rate": 1.5843411309917773e-07, "logits/chosen": -1.1707677841186523, "logits/rejected": -1.2354707717895508, "logps/chosen": -137.05491638183594, "logps/rejected": -164.23329162597656, "loss": 0.6325, "rewards/accuracies": 0.75, "rewards/chosen": -0.04570431262254715, "rewards/margins": 0.837788999080658, "rewards/rejected": -0.8834933638572693, "step": 590 }, { "epoch": 0.6856315598986609, "grad_norm": 89.31999210372004, "learning_rate": 1.5812009483362641e-07, "logits/chosen": -1.3425350189208984, "logits/rejected": -1.3389382362365723, "logps/chosen": -114.12051391601562, "logps/rejected": -130.07469177246094, "loss": 0.6058, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10132614523172379, "rewards/margins": 0.26681679487228394, "rewards/rejected": -0.16549064218997955, "step": 592 }, { "epoch": 0.6879478827361564, "grad_norm": 86.55981599933118, "learning_rate": 1.5780520871855416e-07, "logits/chosen": -1.4263215065002441, "logits/rejected": -1.4661970138549805, "logps/chosen": -146.5399932861328, "logps/rejected": -162.22738647460938, "loss": 0.6227, "rewards/accuracies": 0.625, "rewards/chosen": 0.1263137012720108, "rewards/margins": 0.7135946750640869, "rewards/rejected": -0.5872809886932373, "step": 594 }, { "epoch": 0.6902642055736519, "grad_norm": 77.11466861321054, "learning_rate": 1.5748945945584194e-07, "logits/chosen": -1.1571879386901855, "logits/rejected": -1.2603471279144287, "logps/chosen": -138.8977813720703, "logps/rejected": -177.0740509033203, "loss": 0.6776, "rewards/accuracies": 0.625, "rewards/chosen": -0.02539961040019989, "rewards/margins": 0.39455336332321167, "rewards/rejected": -0.41995295882225037, "step": 596 }, { "epoch": 0.6925805284111473, "grad_norm": 81.2459965814331, "learning_rate": 1.5717285176025912e-07, "logits/chosen": -1.2991225719451904, "logits/rejected": -1.38021981716156, "logps/chosen": -151.61753845214844, "logps/rejected": -176.51548767089844, "loss": 0.5917, "rewards/accuracies": 0.78125, "rewards/chosen": 0.20256918668746948, "rewards/margins": 0.4604591131210327, "rewards/rejected": -0.25788992643356323, "step": 598 }, { "epoch": 0.6948968512486428, "grad_norm": 70.92510708156665, "learning_rate": 1.568553903593933e-07, "logits/chosen": -1.444725513458252, "logits/rejected": -1.3993281126022339, "logps/chosen": -111.9288330078125, "logps/rejected": -113.10049438476562, "loss": 0.6366, "rewards/accuracies": 0.53125, "rewards/chosen": -0.008920304477214813, "rewards/margins": 0.010683823376893997, "rewards/rejected": -0.019604135304689407, "step": 600 }, { "epoch": 0.6948968512486428, "eval_logits/chosen": -1.3430299758911133, "eval_logits/rejected": -1.3360421657562256, "eval_logps/chosen": -137.48741149902344, "eval_logps/rejected": -138.31024169921875, "eval_loss": 0.6260569095611572, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -0.185623899102211, "eval_rewards/margins": 0.25004515051841736, "eval_rewards/rejected": -0.43566906452178955, "eval_runtime": 24.9391, "eval_samples_per_second": 4.01, "eval_steps_per_second": 1.002, "step": 600 }, { "epoch": 0.6972131740861383, "grad_norm": 65.21565931204394, "learning_rate": 1.5653707999357954e-07, "logits/chosen": -1.246476411819458, "logits/rejected": -1.3448492288589478, "logps/chosen": -145.94290161132812, "logps/rejected": -167.75537109375, "loss": 0.5257, "rewards/accuracies": 0.71875, "rewards/chosen": 0.09868886321783066, "rewards/margins": 0.4459027051925659, "rewards/rejected": -0.3472138047218323, "step": 602 }, { "epoch": 0.6995294969236338, "grad_norm": 83.43610856907603, "learning_rate": 1.5621792541582965e-07, "logits/chosen": -1.3476929664611816, "logits/rejected": -1.3578147888183594, "logps/chosen": -147.28909301757812, "logps/rejected": -151.40380859375, "loss": 0.6078, "rewards/accuracies": 0.6875, "rewards/chosen": -0.13675910234451294, "rewards/margins": 0.5534166097640991, "rewards/rejected": -0.6901756525039673, "step": 604 }, { "epoch": 0.7018458197611293, "grad_norm": 83.4114443803894, "learning_rate": 1.558979313917613e-07, "logits/chosen": -1.3711118698120117, "logits/rejected": -1.4022268056869507, "logps/chosen": -147.4836883544922, "logps/rejected": -200.58444213867188, "loss": 0.5579, "rewards/accuracies": 0.75, "rewards/chosen": -0.20421257615089417, "rewards/margins": 1.2417640686035156, "rewards/rejected": -1.4459766149520874, "step": 606 }, { "epoch": 0.7041621425986246, "grad_norm": 86.87494281849429, "learning_rate": 1.5557710269952668e-07, "logits/chosen": -1.2246983051300049, "logits/rejected": -1.257164716720581, "logps/chosen": -126.96261596679688, "logps/rejected": -146.96583557128906, "loss": 0.618, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23703543841838837, "rewards/margins": 0.6981402635574341, "rewards/rejected": -0.9351757168769836, "step": 608 }, { "epoch": 0.7064784654361201, "grad_norm": 78.38055223849892, "learning_rate": 1.552554441297413e-07, "logits/chosen": -1.3179811239242554, "logits/rejected": -1.3713396787643433, "logps/chosen": -152.94192504882812, "logps/rejected": -176.10769653320312, "loss": 0.6449, "rewards/accuracies": 0.6875, "rewards/chosen": -0.025582734495401382, "rewards/margins": 0.6770236492156982, "rewards/rejected": -0.7026063799858093, "step": 610 }, { "epoch": 0.7087947882736156, "grad_norm": 74.133399784032, "learning_rate": 1.549329604854124e-07, "logits/chosen": -1.3148674964904785, "logits/rejected": -1.327743649482727, "logps/chosen": -139.4869384765625, "logps/rejected": -159.59814453125, "loss": 0.5677, "rewards/accuracies": 0.75, "rewards/chosen": -0.14542880654335022, "rewards/margins": 0.8610043525695801, "rewards/rejected": -1.0064332485198975, "step": 612 }, { "epoch": 0.7111111111111111, "grad_norm": 112.65020221864026, "learning_rate": 1.5460965658186714e-07, "logits/chosen": -1.3466724157333374, "logits/rejected": -1.3887736797332764, "logps/chosen": -151.4530029296875, "logps/rejected": -177.39657592773438, "loss": 0.6323, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09504815936088562, "rewards/margins": 0.42471885681152344, "rewards/rejected": -0.5197670459747314, "step": 614 }, { "epoch": 0.7134274339486066, "grad_norm": 76.25616510726566, "learning_rate": 1.54285537246681e-07, "logits/chosen": -1.24288010597229, "logits/rejected": -1.2770837545394897, "logps/chosen": -111.17274475097656, "logps/rejected": -145.47189331054688, "loss": 0.5485, "rewards/accuracies": 0.625, "rewards/chosen": 0.026386726647615433, "rewards/margins": 0.590622067451477, "rewards/rejected": -0.5642353296279907, "step": 616 }, { "epoch": 0.715743756786102, "grad_norm": 138.34950058984242, "learning_rate": 1.539606073196053e-07, "logits/chosen": -1.3290059566497803, "logits/rejected": -1.3147568702697754, "logps/chosen": -189.2161865234375, "logps/rejected": -208.13873291015625, "loss": 0.6596, "rewards/accuracies": 0.625, "rewards/chosen": -0.6345406174659729, "rewards/margins": 0.8859498500823975, "rewards/rejected": -1.5204904079437256, "step": 618 }, { "epoch": 0.7180600796235975, "grad_norm": 150.2269252956893, "learning_rate": 1.536348716524952e-07, "logits/chosen": -1.2324703931808472, "logits/rejected": -1.3258979320526123, "logps/chosen": -115.2970199584961, "logps/rejected": -129.09571838378906, "loss": 0.6866, "rewards/accuracies": 0.59375, "rewards/chosen": 0.001235203817486763, "rewards/margins": 0.2507448196411133, "rewards/rejected": -0.24950963258743286, "step": 620 }, { "epoch": 0.720376402461093, "grad_norm": 87.5580010456215, "learning_rate": 1.5330833510923716e-07, "logits/chosen": -1.257300615310669, "logits/rejected": -1.2330734729766846, "logps/chosen": -146.174072265625, "logps/rejected": -167.9958038330078, "loss": 0.5343, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09243424236774445, "rewards/margins": 0.7605526447296143, "rewards/rejected": -0.8529868125915527, "step": 622 }, { "epoch": 0.7226927252985885, "grad_norm": 80.10448855296569, "learning_rate": 1.529810025656764e-07, "logits/chosen": -1.3765792846679688, "logits/rejected": -1.3748364448547363, "logps/chosen": -108.47747802734375, "logps/rejected": -115.16337585449219, "loss": 0.5844, "rewards/accuracies": 0.625, "rewards/chosen": 0.10373257100582123, "rewards/margins": 0.365066796541214, "rewards/rejected": -0.26133421063423157, "step": 624 }, { "epoch": 0.7250090481360839, "grad_norm": 83.63175479164269, "learning_rate": 1.5265287890954386e-07, "logits/chosen": -1.2026071548461914, "logits/rejected": -1.2504820823669434, "logps/chosen": -132.87338256835938, "logps/rejected": -209.71510314941406, "loss": 0.5652, "rewards/accuracies": 0.71875, "rewards/chosen": -0.26427173614501953, "rewards/margins": 0.8983727693557739, "rewards/rejected": -1.1626445055007935, "step": 626 }, { "epoch": 0.7273253709735794, "grad_norm": 99.26984323238828, "learning_rate": 1.523239690403835e-07, "logits/chosen": -1.2745357751846313, "logits/rejected": -1.3479351997375488, "logps/chosen": -141.58119201660156, "logps/rejected": -179.41603088378906, "loss": 0.5868, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12304151058197021, "rewards/margins": 0.5740844011306763, "rewards/rejected": -0.6971259117126465, "step": 628 }, { "epoch": 0.7296416938110749, "grad_norm": 68.60907419248075, "learning_rate": 1.5199427786947898e-07, "logits/chosen": -1.2993189096450806, "logits/rejected": -1.268160104751587, "logps/chosen": -150.71392822265625, "logps/rejected": -175.40994262695312, "loss": 0.5011, "rewards/accuracies": 0.875, "rewards/chosen": -0.05146133154630661, "rewards/margins": 0.942064106464386, "rewards/rejected": -0.993525505065918, "step": 630 }, { "epoch": 0.7319580166485704, "grad_norm": 103.51182599026306, "learning_rate": 1.5166381031978043e-07, "logits/chosen": -1.2612452507019043, "logits/rejected": -1.313174843788147, "logps/chosen": -160.19854736328125, "logps/rejected": -210.183349609375, "loss": 0.5902, "rewards/accuracies": 0.6875, "rewards/chosen": -0.494692325592041, "rewards/margins": 1.7965799570083618, "rewards/rejected": -2.2912724018096924, "step": 632 }, { "epoch": 0.7342743394860659, "grad_norm": 62.22603011415776, "learning_rate": 1.5133257132583073e-07, "logits/chosen": -1.26754629611969, "logits/rejected": -1.2881284952163696, "logps/chosen": -159.5138702392578, "logps/rejected": -161.47116088867188, "loss": 0.5607, "rewards/accuracies": 0.75, "rewards/chosen": -0.3782161772251129, "rewards/margins": 0.5586279630661011, "rewards/rejected": -0.9368441700935364, "step": 634 }, { "epoch": 0.7365906623235613, "grad_norm": 96.15037328481357, "learning_rate": 1.5100056583369205e-07, "logits/chosen": -1.2539037466049194, "logits/rejected": -1.3549363613128662, "logps/chosen": -155.125732421875, "logps/rejected": -164.1871795654297, "loss": 0.5928, "rewards/accuracies": 0.65625, "rewards/chosen": -0.4689493477344513, "rewards/margins": 0.17761842906475067, "rewards/rejected": -0.6465678215026855, "step": 636 }, { "epoch": 0.7389069851610568, "grad_norm": 103.33944083718919, "learning_rate": 1.5066779880087195e-07, "logits/chosen": -1.2627050876617432, "logits/rejected": -1.29320228099823, "logps/chosen": -166.43197631835938, "logps/rejected": -198.9180450439453, "loss": 0.5905, "rewards/accuracies": 0.75, "rewards/chosen": 0.11970967054367065, "rewards/margins": 0.9318963289260864, "rewards/rejected": -0.8121867179870605, "step": 638 }, { "epoch": 0.7412233079985523, "grad_norm": 99.08292249787714, "learning_rate": 1.503342751962493e-07, "logits/chosen": -1.347541332244873, "logits/rejected": -1.396866798400879, "logps/chosen": -115.1908187866211, "logps/rejected": -140.8692169189453, "loss": 0.6687, "rewards/accuracies": 0.5625, "rewards/chosen": -0.448566734790802, "rewards/margins": 0.19808316230773926, "rewards/rejected": -0.6466498374938965, "step": 640 }, { "epoch": 0.7435396308360478, "grad_norm": 100.95301107774017, "learning_rate": 1.5e-07, "logits/chosen": -1.3385045528411865, "logits/rejected": -1.3928390741348267, "logps/chosen": -161.37307739257812, "logps/rejected": -180.67625427246094, "loss": 0.622, "rewards/accuracies": 0.5625, "rewards/chosen": -0.28616487979888916, "rewards/margins": 0.43838122487068176, "rewards/rejected": -0.7245461344718933, "step": 642 }, { "epoch": 0.7458559536735433, "grad_norm": 101.02506488024277, "learning_rate": 1.4966497820352286e-07, "logits/chosen": -1.2145590782165527, "logits/rejected": -1.2937660217285156, "logps/chosen": -158.820068359375, "logps/rejected": -178.55609130859375, "loss": 0.6347, "rewards/accuracies": 0.53125, "rewards/chosen": -0.7363303303718567, "rewards/margins": 0.2016220986843109, "rewards/rejected": -0.9379523992538452, "step": 644 }, { "epoch": 0.7481722765110387, "grad_norm": 87.43579693715355, "learning_rate": 1.493292148093649e-07, "logits/chosen": -1.3464206457138062, "logits/rejected": -1.3654189109802246, "logps/chosen": -151.60321044921875, "logps/rejected": -166.5888214111328, "loss": 0.5879, "rewards/accuracies": 0.6875, "rewards/chosen": -0.22216303646564484, "rewards/margins": 0.5710839629173279, "rewards/rejected": -0.7932470440864563, "step": 646 }, { "epoch": 0.7504885993485342, "grad_norm": 89.88093815940675, "learning_rate": 1.4899271483114656e-07, "logits/chosen": -1.2944061756134033, "logits/rejected": -1.2952131032943726, "logps/chosen": -87.6488037109375, "logps/rejected": -96.55010223388672, "loss": 0.8793, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36783769726753235, "rewards/margins": 0.2082243710756302, "rewards/rejected": -0.576062023639679, "step": 648 }, { "epoch": 0.7528049221860297, "grad_norm": 68.16473110876538, "learning_rate": 1.4865548329348716e-07, "logits/chosen": -1.2365565299987793, "logits/rejected": -1.2194610834121704, "logps/chosen": -101.74496459960938, "logps/rejected": -115.47993469238281, "loss": 0.5926, "rewards/accuracies": 0.78125, "rewards/chosen": 0.09891890734434128, "rewards/margins": 0.535317063331604, "rewards/rejected": -0.43639814853668213, "step": 650 }, { "epoch": 0.7551212450235252, "grad_norm": 126.64604196312857, "learning_rate": 1.4831752523192947e-07, "logits/chosen": -1.3086532354354858, "logits/rejected": -1.347178339958191, "logps/chosen": -123.16075134277344, "logps/rejected": -140.86134338378906, "loss": 0.6408, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06762253493070602, "rewards/margins": 0.4483943581581116, "rewards/rejected": -0.38077181577682495, "step": 652 }, { "epoch": 0.7574375678610207, "grad_norm": 87.13600344485694, "learning_rate": 1.4797884569286482e-07, "logits/chosen": -1.223707675933838, "logits/rejected": -1.3276126384735107, "logps/chosen": -154.83677673339844, "logps/rejected": -161.1993865966797, "loss": 0.561, "rewards/accuracies": 0.65625, "rewards/chosen": 0.12348895519971848, "rewards/margins": 0.5256034135818481, "rewards/rejected": -0.40211451053619385, "step": 654 }, { "epoch": 0.7597538906985161, "grad_norm": 77.97523563671065, "learning_rate": 1.476394497334577e-07, "logits/chosen": -1.2289892435073853, "logits/rejected": -1.2923333644866943, "logps/chosen": -133.74855041503906, "logps/rejected": -168.71890258789062, "loss": 0.5593, "rewards/accuracies": 0.625, "rewards/chosen": 0.054568979889154434, "rewards/margins": 1.1155064105987549, "rewards/rejected": -1.0609374046325684, "step": 656 }, { "epoch": 0.7620702135360116, "grad_norm": 86.95243414887648, "learning_rate": 1.4729934242157002e-07, "logits/chosen": -1.2043237686157227, "logits/rejected": -1.298114538192749, "logps/chosen": -147.36212158203125, "logps/rejected": -175.49537658691406, "loss": 0.5916, "rewards/accuracies": 0.84375, "rewards/chosen": 0.04841674864292145, "rewards/margins": 0.7525838613510132, "rewards/rejected": -0.7041671872138977, "step": 658 }, { "epoch": 0.7643865363735071, "grad_norm": 75.65437165493766, "learning_rate": 1.4695852883568577e-07, "logits/chosen": -1.296444058418274, "logits/rejected": -1.297696590423584, "logps/chosen": -139.0203399658203, "logps/rejected": -136.33407592773438, "loss": 0.664, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03904925286769867, "rewards/margins": 0.23676946759223938, "rewards/rejected": -0.27581873536109924, "step": 660 }, { "epoch": 0.7667028592110026, "grad_norm": 74.56863133576563, "learning_rate": 1.46617014064835e-07, "logits/chosen": -1.31462562084198, "logits/rejected": -1.3569142818450928, "logps/chosen": -84.67462158203125, "logps/rejected": -85.39293670654297, "loss": 0.6178, "rewards/accuracies": 0.71875, "rewards/chosen": 0.08590921759605408, "rewards/margins": 0.2730802893638611, "rewards/rejected": -0.1871711015701294, "step": 662 }, { "epoch": 0.7690191820484981, "grad_norm": 65.1421142926074, "learning_rate": 1.4627480320851773e-07, "logits/chosen": -1.2603397369384766, "logits/rejected": -1.3087056875228882, "logps/chosen": -94.25755310058594, "logps/rejected": -124.65604400634766, "loss": 0.5517, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13362671434879303, "rewards/margins": 0.27784639596939087, "rewards/rejected": -0.14421969652175903, "step": 664 }, { "epoch": 0.7713355048859935, "grad_norm": 71.68830752075247, "learning_rate": 1.459319013766281e-07, "logits/chosen": -1.2157174348831177, "logits/rejected": -1.2898728847503662, "logps/chosen": -115.97237396240234, "logps/rejected": -161.3004913330078, "loss": 0.5697, "rewards/accuracies": 0.5625, "rewards/chosen": 0.03516310453414917, "rewards/margins": 0.6528601050376892, "rewards/rejected": -0.6176969408988953, "step": 666 }, { "epoch": 0.773651827723489, "grad_norm": 79.36186918181832, "learning_rate": 1.4558831368937782e-07, "logits/chosen": -1.4229352474212646, "logits/rejected": -1.4072250127792358, "logps/chosen": -177.0735626220703, "logps/rejected": -197.98715209960938, "loss": 0.5665, "rewards/accuracies": 0.78125, "rewards/chosen": 0.10195475816726685, "rewards/margins": 0.687512993812561, "rewards/rejected": -0.5855582356452942, "step": 668 }, { "epoch": 0.7759681505609844, "grad_norm": 87.8388965497945, "learning_rate": 1.4524404527721975e-07, "logits/chosen": -1.3583612442016602, "logits/rejected": -1.389435052871704, "logps/chosen": -122.91793823242188, "logps/rejected": -143.0467071533203, "loss": 0.5898, "rewards/accuracies": 0.65625, "rewards/chosen": 0.04354669526219368, "rewards/margins": 0.41671624779701233, "rewards/rejected": -0.37316957116127014, "step": 670 }, { "epoch": 0.77828447339848, "grad_norm": 81.7257176752688, "learning_rate": 1.4489910128077144e-07, "logits/chosen": -1.247169017791748, "logits/rejected": -1.233970046043396, "logps/chosen": -160.17123413085938, "logps/rejected": -161.0812225341797, "loss": 0.5867, "rewards/accuracies": 0.625, "rewards/chosen": 0.1097157821059227, "rewards/margins": 0.3292488753795624, "rewards/rejected": -0.21953308582305908, "step": 672 }, { "epoch": 0.7806007962359753, "grad_norm": 82.58162746498225, "learning_rate": 1.4455348685073824e-07, "logits/chosen": -1.3045804500579834, "logits/rejected": -1.3406845331192017, "logps/chosen": -144.19696044921875, "logps/rejected": -166.00563049316406, "loss": 0.5414, "rewards/accuracies": 0.8125, "rewards/chosen": 0.17315542697906494, "rewards/margins": 0.8811919689178467, "rewards/rejected": -0.7080365419387817, "step": 674 }, { "epoch": 0.7829171190734708, "grad_norm": 70.90800914750002, "learning_rate": 1.4420720714783634e-07, "logits/chosen": -1.3308024406433105, "logits/rejected": -1.3253967761993408, "logps/chosen": -168.21568298339844, "logps/rejected": -170.13629150390625, "loss": 0.6002, "rewards/accuracies": 0.625, "rewards/chosen": 0.01426420547068119, "rewards/margins": 0.2677161991596222, "rewards/rejected": -0.25345203280448914, "step": 676 }, { "epoch": 0.7852334419109663, "grad_norm": 108.18465138210843, "learning_rate": 1.438602673427158e-07, "logits/chosen": -1.243054747581482, "logits/rejected": -1.284456729888916, "logps/chosen": -141.8882598876953, "logps/rejected": -142.9855499267578, "loss": 0.6341, "rewards/accuracies": 0.71875, "rewards/chosen": -0.468353807926178, "rewards/margins": 0.15223075449466705, "rewards/rejected": -0.6205846667289734, "step": 678 }, { "epoch": 0.7875497647484618, "grad_norm": 136.97239295907355, "learning_rate": 1.435126726158835e-07, "logits/chosen": -1.1808712482452393, "logits/rejected": -1.190900444984436, "logps/chosen": -108.58960723876953, "logps/rejected": -115.77843475341797, "loss": 0.6642, "rewards/accuracies": 0.5, "rewards/chosen": -0.1600310057401657, "rewards/margins": 0.3264339566230774, "rewards/rejected": -0.4864649176597595, "step": 680 }, { "epoch": 0.7898660875859573, "grad_norm": 77.26216998107256, "learning_rate": 1.431644281576254e-07, "logits/chosen": -1.352297067642212, "logits/rejected": -1.3604989051818848, "logps/chosen": -158.22714233398438, "logps/rejected": -179.4667510986328, "loss": 0.5683, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03663470223546028, "rewards/margins": 0.6152007579803467, "rewards/rejected": -0.5785660147666931, "step": 682 }, { "epoch": 0.7921824104234527, "grad_norm": 86.99102850306988, "learning_rate": 1.4281553916792933e-07, "logits/chosen": -1.3430233001708984, "logits/rejected": -1.3200087547302246, "logps/chosen": -177.13780212402344, "logps/rejected": -197.31582641601562, "loss": 0.5711, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08363021910190582, "rewards/margins": 0.7721484899520874, "rewards/rejected": -0.688518226146698, "step": 684 }, { "epoch": 0.7944987332609482, "grad_norm": 95.74498787413943, "learning_rate": 1.4246601085640734e-07, "logits/chosen": -1.3332422971725464, "logits/rejected": -1.3727816343307495, "logps/chosen": -146.84754943847656, "logps/rejected": -159.5667724609375, "loss": 0.5676, "rewards/accuracies": 0.71875, "rewards/chosen": -0.121219202876091, "rewards/margins": 0.5295783877372742, "rewards/rejected": -0.6507976055145264, "step": 686 }, { "epoch": 0.7968150560984437, "grad_norm": 77.80530486828445, "learning_rate": 1.421158484422177e-07, "logits/chosen": -1.1991747617721558, "logits/rejected": -1.193981409072876, "logps/chosen": -111.72264099121094, "logps/rejected": -151.81735229492188, "loss": 0.6152, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2821239233016968, "rewards/margins": 0.8215171098709106, "rewards/rejected": -1.1036410331726074, "step": 688 }, { "epoch": 0.7991313789359392, "grad_norm": 78.69284003499466, "learning_rate": 1.417650571539872e-07, "logits/chosen": -1.184325933456421, "logits/rejected": -1.220619559288025, "logps/chosen": -117.9516372680664, "logps/rejected": -150.95596313476562, "loss": 0.6134, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1240064799785614, "rewards/margins": 0.604076087474823, "rewards/rejected": -0.7280825972557068, "step": 690 }, { "epoch": 0.8014477017734347, "grad_norm": 101.94725129563409, "learning_rate": 1.4141364222973293e-07, "logits/chosen": -1.2512197494506836, "logits/rejected": -1.2510242462158203, "logps/chosen": -105.66606140136719, "logps/rejected": -126.79011535644531, "loss": 0.6816, "rewards/accuracies": 0.6875, "rewards/chosen": 0.07451184839010239, "rewards/margins": 0.3946147859096527, "rewards/rejected": -0.3201029300689697, "step": 692 }, { "epoch": 0.8037640246109301, "grad_norm": 92.22432532521196, "learning_rate": 1.410616089167842e-07, "logits/chosen": -1.3338427543640137, "logits/rejected": -1.3960174322128296, "logps/chosen": -151.6918487548828, "logps/rejected": -172.7644805908203, "loss": 0.6545, "rewards/accuracies": 0.78125, "rewards/chosen": 0.015777569264173508, "rewards/margins": 0.5224604606628418, "rewards/rejected": -0.5066828727722168, "step": 694 }, { "epoch": 0.8060803474484256, "grad_norm": 78.94912552978236, "learning_rate": 1.40708962471704e-07, "logits/chosen": -1.3187270164489746, "logits/rejected": -1.273497223854065, "logps/chosen": -149.56173706054688, "logps/rejected": -159.30621337890625, "loss": 0.5606, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10468573868274689, "rewards/margins": 0.5250222682952881, "rewards/rejected": -0.6297080516815186, "step": 696 }, { "epoch": 0.8083966702859211, "grad_norm": 73.36588542547142, "learning_rate": 1.4035570816021066e-07, "logits/chosen": -1.1806639432907104, "logits/rejected": -1.2237762212753296, "logps/chosen": -124.83197784423828, "logps/rejected": -137.32818603515625, "loss": 0.5717, "rewards/accuracies": 0.75, "rewards/chosen": 0.13686442375183105, "rewards/margins": 0.6119322180747986, "rewards/rejected": -0.47506779432296753, "step": 698 }, { "epoch": 0.8107129931234166, "grad_norm": 90.26959143913993, "learning_rate": 1.4000185125709917e-07, "logits/chosen": -1.2998504638671875, "logits/rejected": -1.2823896408081055, "logps/chosen": -145.33433532714844, "logps/rejected": -154.925048828125, "loss": 0.53, "rewards/accuracies": 0.71875, "rewards/chosen": -0.1620834469795227, "rewards/margins": 0.8069512844085693, "rewards/rejected": -0.969034731388092, "step": 700 }, { "epoch": 0.8107129931234166, "eval_logits/chosen": -1.284436821937561, "eval_logits/rejected": -1.280255913734436, "eval_logps/chosen": -139.673828125, "eval_logps/rejected": -140.73330688476562, "eval_loss": 0.6434417963027954, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -0.4042653739452362, "eval_rewards/margins": 0.2737090587615967, "eval_rewards/rejected": -0.6779744625091553, "eval_runtime": 28.09, "eval_samples_per_second": 3.56, "eval_steps_per_second": 0.89, "step": 700 }, { "epoch": 0.8130293159609121, "grad_norm": 76.53295510100874, "learning_rate": 1.3964739704616247e-07, "logits/chosen": -1.2752941846847534, "logits/rejected": -1.3037452697753906, "logps/chosen": -121.41841125488281, "logps/rejected": -145.7820281982422, "loss": 0.5615, "rewards/accuracies": 0.71875, "rewards/chosen": -0.14446553587913513, "rewards/margins": 0.4738205671310425, "rewards/rejected": -0.6182860732078552, "step": 702 }, { "epoch": 0.8153456387984075, "grad_norm": 83.60642302801956, "learning_rate": 1.3929235082011233e-07, "logits/chosen": -1.296051025390625, "logits/rejected": -1.2584261894226074, "logps/chosen": -147.13363647460938, "logps/rejected": -150.47975158691406, "loss": 0.6121, "rewards/accuracies": 0.75, "rewards/chosen": -0.1639641672372818, "rewards/margins": 0.1966288536787033, "rewards/rejected": -0.3605930507183075, "step": 704 }, { "epoch": 0.817661961635903, "grad_norm": 80.9906935823332, "learning_rate": 1.3893671788050073e-07, "logits/chosen": -1.3390685319900513, "logits/rejected": -1.3612732887268066, "logps/chosen": -128.30557250976562, "logps/rejected": -151.2054443359375, "loss": 0.5976, "rewards/accuracies": 0.6875, "rewards/chosen": -0.28982260823249817, "rewards/margins": 0.5823266506195068, "rewards/rejected": -0.8721492886543274, "step": 706 }, { "epoch": 0.8199782844733985, "grad_norm": 79.56491935334587, "learning_rate": 1.385805035376403e-07, "logits/chosen": -1.2172850370407104, "logits/rejected": -1.2152363061904907, "logps/chosen": -127.4556884765625, "logps/rejected": -150.25283813476562, "loss": 0.6058, "rewards/accuracies": 0.71875, "rewards/chosen": -0.030050382018089294, "rewards/margins": 0.7187869548797607, "rewards/rejected": -0.7488372921943665, "step": 708 }, { "epoch": 0.822294607310894, "grad_norm": 94.61612804665803, "learning_rate": 1.3822371311052523e-07, "logits/chosen": -1.4023680686950684, "logits/rejected": -1.3874592781066895, "logps/chosen": -137.8409423828125, "logps/rejected": -148.14865112304688, "loss": 0.6506, "rewards/accuracies": 0.65625, "rewards/chosen": -0.055325593799352646, "rewards/margins": 0.21838925778865814, "rewards/rejected": -0.2737148702144623, "step": 710 }, { "epoch": 0.8246109301483895, "grad_norm": 128.27117224074266, "learning_rate": 1.3786635192675182e-07, "logits/chosen": -1.2412676811218262, "logits/rejected": -1.2453944683074951, "logps/chosen": -107.39035034179688, "logps/rejected": -125.48077392578125, "loss": 0.6535, "rewards/accuracies": 0.65625, "rewards/chosen": -0.47688642144203186, "rewards/margins": 0.8052099943161011, "rewards/rejected": -1.2820963859558105, "step": 712 }, { "epoch": 0.8269272529858849, "grad_norm": 81.45696425151283, "learning_rate": 1.375084253224389e-07, "logits/chosen": -1.2930258512496948, "logits/rejected": -1.275715708732605, "logps/chosen": -85.20941162109375, "logps/rejected": -94.45440673828125, "loss": 0.6138, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1915649175643921, "rewards/margins": 0.5591136813163757, "rewards/rejected": -0.7506786584854126, "step": 714 }, { "epoch": 0.8292435758233804, "grad_norm": 68.28421801876017, "learning_rate": 1.371499386421481e-07, "logits/chosen": -1.4285151958465576, "logits/rejected": -1.4581302404403687, "logps/chosen": -144.27658081054688, "logps/rejected": -158.56509399414062, "loss": 0.5704, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10951147228479385, "rewards/margins": 0.6330095529556274, "rewards/rejected": -0.5234980583190918, "step": 716 }, { "epoch": 0.8315598986608759, "grad_norm": 58.53704567635433, "learning_rate": 1.3679089723880426e-07, "logits/chosen": -1.2971611022949219, "logits/rejected": -1.349431037902832, "logps/chosen": -132.24440002441406, "logps/rejected": -157.0482940673828, "loss": 0.5487, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03527238965034485, "rewards/margins": 0.778007447719574, "rewards/rejected": -0.7427350878715515, "step": 718 }, { "epoch": 0.8338762214983714, "grad_norm": 90.86882777075694, "learning_rate": 1.3643130647361515e-07, "logits/chosen": -1.223427414894104, "logits/rejected": -1.2997663021087646, "logps/chosen": -155.99490356445312, "logps/rejected": -184.7327880859375, "loss": 0.6357, "rewards/accuracies": 0.75, "rewards/chosen": -0.13655239343643188, "rewards/margins": 0.6724637746810913, "rewards/rejected": -0.8090161085128784, "step": 720 }, { "epoch": 0.8361925443358668, "grad_norm": 71.10479530353953, "learning_rate": 1.3607117171599178e-07, "logits/chosen": -1.432951807975769, "logits/rejected": -1.4361122846603394, "logps/chosen": -135.4175567626953, "logps/rejected": -137.8448944091797, "loss": 0.5987, "rewards/accuracies": 0.625, "rewards/chosen": -0.10024284571409225, "rewards/margins": 0.49338966608047485, "rewards/rejected": -0.5936326384544373, "step": 722 }, { "epoch": 0.8385088671733623, "grad_norm": 73.24864095064638, "learning_rate": 1.3571049834346796e-07, "logits/chosen": -1.266755223274231, "logits/rejected": -1.3649208545684814, "logps/chosen": -91.91627502441406, "logps/rejected": -137.7756805419922, "loss": 0.5877, "rewards/accuracies": 0.65625, "rewards/chosen": -0.03640535846352577, "rewards/margins": 0.47152870893478394, "rewards/rejected": -0.5079340934753418, "step": 724 }, { "epoch": 0.8408251900108578, "grad_norm": 84.42887169617501, "learning_rate": 1.3534929174162017e-07, "logits/chosen": -1.3389887809753418, "logits/rejected": -1.3608206510543823, "logps/chosen": -138.79879760742188, "logps/rejected": -148.29205322265625, "loss": 0.6315, "rewards/accuracies": 0.71875, "rewards/chosen": -0.13554176688194275, "rewards/margins": 0.4595597982406616, "rewards/rejected": -0.595101535320282, "step": 726 }, { "epoch": 0.8431415128483533, "grad_norm": 70.90955463261469, "learning_rate": 1.34987557303987e-07, "logits/chosen": -1.3609716892242432, "logits/rejected": -1.4318090677261353, "logps/chosen": -112.85796356201172, "logps/rejected": -138.29444885253906, "loss": 0.5358, "rewards/accuracies": 0.71875, "rewards/chosen": -0.0836462751030922, "rewards/margins": 0.7275322079658508, "rewards/rejected": -0.8111785650253296, "step": 728 }, { "epoch": 0.8454578356858488, "grad_norm": 114.31329319517884, "learning_rate": 1.3462530043198871e-07, "logits/chosen": -1.3260619640350342, "logits/rejected": -1.4074249267578125, "logps/chosen": -145.60401916503906, "logps/rejected": -165.5455780029297, "loss": 0.6309, "rewards/accuracies": 0.78125, "rewards/chosen": 0.12638582289218903, "rewards/margins": 0.7593765258789062, "rewards/rejected": -0.6329907178878784, "step": 730 }, { "epoch": 0.8477741585233441, "grad_norm": 109.74072899186363, "learning_rate": 1.342625265348466e-07, "logits/chosen": -1.3791258335113525, "logits/rejected": -1.4346139430999756, "logps/chosen": -178.01446533203125, "logps/rejected": -226.8712158203125, "loss": 0.6573, "rewards/accuracies": 0.65625, "rewards/chosen": -0.36265602707862854, "rewards/margins": 0.6506404280662537, "rewards/rejected": -1.0132964849472046, "step": 732 }, { "epoch": 0.8500904813608396, "grad_norm": 104.29186913107871, "learning_rate": 1.3389924102950213e-07, "logits/chosen": -1.3433104753494263, "logits/rejected": -1.349946141242981, "logps/chosen": -140.24569702148438, "logps/rejected": -149.5057373046875, "loss": 0.5876, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04752161353826523, "rewards/margins": 0.3977040648460388, "rewards/rejected": -0.44522568583488464, "step": 734 }, { "epoch": 0.8524068041983351, "grad_norm": 81.62196698132477, "learning_rate": 1.3353544934053615e-07, "logits/chosen": -1.3260061740875244, "logits/rejected": -1.3429383039474487, "logps/chosen": -127.91857147216797, "logps/rejected": -150.63392639160156, "loss": 0.5993, "rewards/accuracies": 0.71875, "rewards/chosen": -0.27976539731025696, "rewards/margins": 0.31496569514274597, "rewards/rejected": -0.5947310924530029, "step": 736 }, { "epoch": 0.8547231270358306, "grad_norm": 88.51535267570014, "learning_rate": 1.331711569000878e-07, "logits/chosen": -1.385151982307434, "logits/rejected": -1.3968982696533203, "logps/chosen": -128.69894409179688, "logps/rejected": -145.05084228515625, "loss": 0.6054, "rewards/accuracies": 0.625, "rewards/chosen": 0.024006934836506844, "rewards/margins": 0.4703417718410492, "rewards/rejected": -0.4463347792625427, "step": 738 }, { "epoch": 0.8570394498733261, "grad_norm": 83.96836445474686, "learning_rate": 1.3280636914777344e-07, "logits/chosen": -1.2993488311767578, "logits/rejected": -1.3133535385131836, "logps/chosen": -133.5054168701172, "logps/rejected": -148.28396606445312, "loss": 0.557, "rewards/accuracies": 0.59375, "rewards/chosen": -0.11238986253738403, "rewards/margins": 0.47711944580078125, "rewards/rejected": -0.5895093083381653, "step": 740 }, { "epoch": 0.8593557727108215, "grad_norm": 83.73076557575334, "learning_rate": 1.3244109153060547e-07, "logits/chosen": -1.3348900079727173, "logits/rejected": -1.3360458612442017, "logps/chosen": -141.75746154785156, "logps/rejected": -149.81410217285156, "loss": 0.6232, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05470200255513191, "rewards/margins": 0.1306239813566208, "rewards/rejected": -0.1853259950876236, "step": 742 }, { "epoch": 0.861672095548317, "grad_norm": 96.3207173875703, "learning_rate": 1.320753295029109e-07, "logits/chosen": -1.199431300163269, "logits/rejected": -1.2418042421340942, "logps/chosen": -101.57156372070312, "logps/rejected": -137.4820098876953, "loss": 0.6712, "rewards/accuracies": 0.71875, "rewards/chosen": -0.06123688444495201, "rewards/margins": 0.6316696405410767, "rewards/rejected": -0.6929064989089966, "step": 744 }, { "epoch": 0.8639884183858125, "grad_norm": 100.3243313129826, "learning_rate": 1.3170908852625012e-07, "logits/chosen": -1.3424568176269531, "logits/rejected": -1.3905185461044312, "logps/chosen": -149.32872009277344, "logps/rejected": -220.64122009277344, "loss": 0.5537, "rewards/accuracies": 0.71875, "rewards/chosen": -0.28289514780044556, "rewards/margins": 0.7176157236099243, "rewards/rejected": -1.000510811805725, "step": 746 }, { "epoch": 0.866304741223308, "grad_norm": 70.91956724923047, "learning_rate": 1.313423740693349e-07, "logits/chosen": -1.2025208473205566, "logits/rejected": -1.266943335533142, "logps/chosen": -152.86985778808594, "logps/rejected": -173.8192138671875, "loss": 0.6223, "rewards/accuracies": 0.5625, "rewards/chosen": -0.04741968959569931, "rewards/margins": 0.35572314262390137, "rewards/rejected": -0.4031428098678589, "step": 748 }, { "epoch": 0.8686210640608035, "grad_norm": 75.21549346823406, "learning_rate": 1.3097519160794723e-07, "logits/chosen": -1.3221216201782227, "logits/rejected": -1.326745629310608, "logps/chosen": -105.75431823730469, "logps/rejected": -113.87251281738281, "loss": 0.5687, "rewards/accuracies": 0.78125, "rewards/chosen": -0.058674052357673645, "rewards/margins": 0.6489120125770569, "rewards/rejected": -0.7075860500335693, "step": 750 }, { "epoch": 0.8709373868982989, "grad_norm": 74.6437050803779, "learning_rate": 1.306075466248574e-07, "logits/chosen": -1.3908584117889404, "logits/rejected": -1.3725515604019165, "logps/chosen": -156.27899169921875, "logps/rejected": -151.42625427246094, "loss": 0.5536, "rewards/accuracies": 0.71875, "rewards/chosen": 0.030293064191937447, "rewards/margins": 0.33356091380119324, "rewards/rejected": -0.30326780676841736, "step": 752 }, { "epoch": 0.8732537097357944, "grad_norm": 86.78294546912407, "learning_rate": 1.302394446097418e-07, "logits/chosen": -1.3920029401779175, "logits/rejected": -1.4190560579299927, "logps/chosen": -131.45228576660156, "logps/rejected": -169.52101135253906, "loss": 0.6078, "rewards/accuracies": 0.59375, "rewards/chosen": -0.09971225261688232, "rewards/margins": 0.5380171537399292, "rewards/rejected": -0.6377294659614563, "step": 754 }, { "epoch": 0.8755700325732899, "grad_norm": 74.55059453725265, "learning_rate": 1.2987089105910155e-07, "logits/chosen": -1.239861011505127, "logits/rejected": -1.2780354022979736, "logps/chosen": -157.7389373779297, "logps/rejected": -199.31524658203125, "loss": 0.5538, "rewards/accuracies": 0.78125, "rewards/chosen": 0.019644632935523987, "rewards/margins": 1.3704544305801392, "rewards/rejected": -1.3508098125457764, "step": 756 }, { "epoch": 0.8778863554107854, "grad_norm": 76.3646913429212, "learning_rate": 1.2950189147617987e-07, "logits/chosen": -1.484637975692749, "logits/rejected": -1.451521873474121, "logps/chosen": -216.90333557128906, "logps/rejected": -216.04563903808594, "loss": 0.5957, "rewards/accuracies": 0.625, "rewards/chosen": -0.4405128061771393, "rewards/margins": 0.5718247294425964, "rewards/rejected": -1.012337565422058, "step": 758 }, { "epoch": 0.8802026782482808, "grad_norm": 81.06900576809731, "learning_rate": 1.2913245137088022e-07, "logits/chosen": -1.4075809717178345, "logits/rejected": -1.446765422821045, "logps/chosen": -182.1182861328125, "logps/rejected": -211.40109252929688, "loss": 0.5711, "rewards/accuracies": 0.71875, "rewards/chosen": -0.01186487078666687, "rewards/margins": 0.6298959255218506, "rewards/rejected": -0.6417607665061951, "step": 760 }, { "epoch": 0.8825190010857763, "grad_norm": 105.35884658750386, "learning_rate": 1.2876257625968397e-07, "logits/chosen": -1.2835676670074463, "logits/rejected": -1.3046995401382446, "logps/chosen": -132.78152465820312, "logps/rejected": -161.31594848632812, "loss": 0.5776, "rewards/accuracies": 0.78125, "rewards/chosen": -0.09922360628843307, "rewards/margins": 0.8614255785942078, "rewards/rejected": -0.9606491327285767, "step": 762 }, { "epoch": 0.8848353239232718, "grad_norm": 79.20062356379438, "learning_rate": 1.283922716655679e-07, "logits/chosen": -1.1998169422149658, "logits/rejected": -1.2543299198150635, "logps/chosen": -123.684326171875, "logps/rejected": -152.342041015625, "loss": 0.5882, "rewards/accuracies": 0.6875, "rewards/chosen": -0.04377639666199684, "rewards/margins": 0.5460759997367859, "rewards/rejected": -0.5898523926734924, "step": 764 }, { "epoch": 0.8871516467607673, "grad_norm": 83.51961253986426, "learning_rate": 1.2802154311792196e-07, "logits/chosen": -1.1967344284057617, "logits/rejected": -1.2713446617126465, "logps/chosen": -139.49871826171875, "logps/rejected": -159.91653442382812, "loss": 0.609, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07481690496206284, "rewards/margins": 0.5285822153091431, "rewards/rejected": -0.6033991575241089, "step": 766 }, { "epoch": 0.8894679695982628, "grad_norm": 87.62676156964712, "learning_rate": 1.276503961524665e-07, "logits/chosen": -1.3275455236434937, "logits/rejected": -1.4026497602462769, "logps/chosen": -129.25172424316406, "logps/rejected": -143.60403442382812, "loss": 0.6153, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25011736154556274, "rewards/margins": 0.5108656883239746, "rewards/rejected": -0.7609830498695374, "step": 768 }, { "epoch": 0.8917842924357582, "grad_norm": 79.69384964872866, "learning_rate": 1.2727883631116967e-07, "logits/chosen": -1.250293493270874, "logits/rejected": -1.371201753616333, "logps/chosen": -165.1368865966797, "logps/rejected": -193.74635314941406, "loss": 0.5857, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06070555001497269, "rewards/margins": 0.8822014331817627, "rewards/rejected": -0.94290691614151, "step": 770 }, { "epoch": 0.8941006152732537, "grad_norm": 208.13208814234412, "learning_rate": 1.2690686914216473e-07, "logits/chosen": -1.389907956123352, "logits/rejected": -1.372930645942688, "logps/chosen": -154.89971923828125, "logps/rejected": -180.29736328125, "loss": 0.7413, "rewards/accuracies": 0.53125, "rewards/chosen": -0.334947407245636, "rewards/margins": 0.8569056391716003, "rewards/rejected": -1.1918531656265259, "step": 772 }, { "epoch": 0.8964169381107492, "grad_norm": 87.24757168930763, "learning_rate": 1.2653450019966719e-07, "logits/chosen": -1.2602267265319824, "logits/rejected": -1.2742539644241333, "logps/chosen": -108.18531036376953, "logps/rejected": -128.6900634765625, "loss": 0.5774, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06359031796455383, "rewards/margins": 0.7088490128517151, "rewards/rejected": -0.7724392414093018, "step": 774 }, { "epoch": 0.8987332609482447, "grad_norm": 83.80268761754905, "learning_rate": 1.2616173504389172e-07, "logits/chosen": -1.2021056413650513, "logits/rejected": -1.2368252277374268, "logps/chosen": -141.87701416015625, "logps/rejected": -154.36065673828125, "loss": 0.6236, "rewards/accuracies": 0.5, "rewards/chosen": -0.3400351405143738, "rewards/margins": 0.41063636541366577, "rewards/rejected": -0.7506715655326843, "step": 776 }, { "epoch": 0.9010495837857402, "grad_norm": 153.2532920637624, "learning_rate": 1.2578857924096933e-07, "logits/chosen": -1.448024034500122, "logits/rejected": -1.3988113403320312, "logps/chosen": -154.5913848876953, "logps/rejected": -158.92227172851562, "loss": 0.6673, "rewards/accuracies": 0.6875, "rewards/chosen": -0.05520397424697876, "rewards/margins": 0.35100066661834717, "rewards/rejected": -0.4062046408653259, "step": 778 }, { "epoch": 0.9033659066232356, "grad_norm": 71.28047852511637, "learning_rate": 1.2541503836286425e-07, "logits/chosen": -1.3942607641220093, "logits/rejected": -1.349011778831482, "logps/chosen": -173.88368225097656, "logps/rejected": -171.93663024902344, "loss": 0.5815, "rewards/accuracies": 0.84375, "rewards/chosen": -0.220017209649086, "rewards/margins": 0.7204620242118835, "rewards/rejected": -0.9404792785644531, "step": 780 }, { "epoch": 0.9056822294607311, "grad_norm": 111.86020202854063, "learning_rate": 1.250411179872905e-07, "logits/chosen": -1.2979280948638916, "logits/rejected": -1.3246498107910156, "logps/chosen": -132.1781463623047, "logps/rejected": -142.88433837890625, "loss": 0.5809, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1429004967212677, "rewards/margins": 0.4971502721309662, "rewards/rejected": -0.6400507688522339, "step": 782 }, { "epoch": 0.9079985522982266, "grad_norm": 72.6068979409082, "learning_rate": 1.246668236976288e-07, "logits/chosen": -1.365678310394287, "logits/rejected": -1.2877997159957886, "logps/chosen": -132.5888671875, "logps/rejected": -125.15145874023438, "loss": 0.6167, "rewards/accuracies": 0.5625, "rewards/chosen": -0.05012892931699753, "rewards/margins": 0.14901477098464966, "rewards/rejected": -0.19914370775222778, "step": 784 }, { "epoch": 0.9103148751357221, "grad_norm": 77.28231066519794, "learning_rate": 1.2429216108284333e-07, "logits/chosen": -1.241924524307251, "logits/rejected": -1.2669358253479004, "logps/chosen": -170.2476348876953, "logps/rejected": -176.51695251464844, "loss": 0.6174, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2486531287431717, "rewards/margins": 0.7952675223350525, "rewards/rejected": -1.0439205169677734, "step": 786 }, { "epoch": 0.9126311979732176, "grad_norm": 77.68314015122489, "learning_rate": 1.2391713573739784e-07, "logits/chosen": -1.5044151544570923, "logits/rejected": -1.38852059841156, "logps/chosen": -156.23318481445312, "logps/rejected": -150.65298461914062, "loss": 0.5763, "rewards/accuracies": 0.5, "rewards/chosen": -0.028940770775079727, "rewards/margins": 0.28721868991851807, "rewards/rejected": -0.3161594271659851, "step": 788 }, { "epoch": 0.914947520810713, "grad_norm": 74.30222390288303, "learning_rate": 1.235417532611725e-07, "logits/chosen": -1.4984527826309204, "logits/rejected": -1.5665603876113892, "logps/chosen": -166.45755004882812, "logps/rejected": -199.21507263183594, "loss": 0.6066, "rewards/accuracies": 0.65625, "rewards/chosen": -0.00897267460823059, "rewards/margins": 0.5808253288269043, "rewards/rejected": -0.5897979736328125, "step": 790 }, { "epoch": 0.9172638436482085, "grad_norm": 73.07953128202571, "learning_rate": 1.2316601925938025e-07, "logits/chosen": -1.3732727766036987, "logits/rejected": -1.444867730140686, "logps/chosen": -126.97662353515625, "logps/rejected": -142.06971740722656, "loss": 0.5687, "rewards/accuracies": 0.65625, "rewards/chosen": -0.05256509408354759, "rewards/margins": 0.28611257672309875, "rewards/rejected": -0.33867767453193665, "step": 792 }, { "epoch": 0.919580166485704, "grad_norm": 95.48085015281728, "learning_rate": 1.2278993934248278e-07, "logits/chosen": -1.258427381515503, "logits/rejected": -1.275139570236206, "logps/chosen": -110.3523178100586, "logps/rejected": -123.8600082397461, "loss": 0.692, "rewards/accuracies": 0.6875, "rewards/chosen": -0.11288494616746902, "rewards/margins": 0.2848440706729889, "rewards/rejected": -0.3977290093898773, "step": 794 }, { "epoch": 0.9218964893231995, "grad_norm": 58.55738169440391, "learning_rate": 1.2241351912610725e-07, "logits/chosen": -1.2354912757873535, "logits/rejected": -1.2666208744049072, "logps/chosen": -107.17646026611328, "logps/rejected": -116.17412567138672, "loss": 0.6002, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3244227468967438, "rewards/margins": 0.10813488066196442, "rewards/rejected": -0.4325576424598694, "step": 796 }, { "epoch": 0.924212812160695, "grad_norm": 79.9120853192708, "learning_rate": 1.2203676423096197e-07, "logits/chosen": -1.2370004653930664, "logits/rejected": -1.348116159439087, "logps/chosen": -173.00152587890625, "logps/rejected": -202.92649841308594, "loss": 0.5644, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16612032055854797, "rewards/margins": 0.47205114364624023, "rewards/rejected": -0.6381714344024658, "step": 798 }, { "epoch": 0.9265291349981903, "grad_norm": 70.53190608979963, "learning_rate": 1.2165968028275275e-07, "logits/chosen": -1.3343358039855957, "logits/rejected": -1.339569330215454, "logps/chosen": -177.5317840576172, "logps/rejected": -194.53817749023438, "loss": 0.5761, "rewards/accuracies": 0.625, "rewards/chosen": -0.04334399104118347, "rewards/margins": 0.66258704662323, "rewards/rejected": -0.7059310078620911, "step": 800 }, { "epoch": 0.9265291349981903, "eval_logits/chosen": -1.319779396057129, "eval_logits/rejected": -1.3124960660934448, "eval_logps/chosen": -139.39353942871094, "eval_logps/rejected": -140.94285583496094, "eval_loss": 0.6185734272003174, "eval_rewards/accuracies": 0.6399999856948853, "eval_rewards/chosen": -0.37623491883277893, "eval_rewards/margins": 0.32269400358200073, "eval_rewards/rejected": -0.6989290118217468, "eval_runtime": 29.0213, "eval_samples_per_second": 3.446, "eval_steps_per_second": 0.861, "step": 800 }, { "epoch": 0.9288454578356858, "grad_norm": 80.16286794415615, "learning_rate": 1.212822729120989e-07, "logits/chosen": -1.4367756843566895, "logits/rejected": -1.3805702924728394, "logps/chosen": -171.19290161132812, "logps/rejected": -173.32327270507812, "loss": 0.6077, "rewards/accuracies": 0.625, "rewards/chosen": -0.2906244695186615, "rewards/margins": 0.5511162281036377, "rewards/rejected": -0.8417407274246216, "step": 802 }, { "epoch": 0.9311617806731813, "grad_norm": 73.70947981864272, "learning_rate": 1.209045477544489e-07, "logits/chosen": -1.219347357749939, "logits/rejected": -1.3135026693344116, "logps/chosen": -125.98372650146484, "logps/rejected": -145.56602478027344, "loss": 0.5605, "rewards/accuracies": 0.75, "rewards/chosen": -0.173287034034729, "rewards/margins": 0.5581431984901428, "rewards/rejected": -0.7314302325248718, "step": 804 }, { "epoch": 0.9334781035106768, "grad_norm": 103.71883284173158, "learning_rate": 1.2052651044999658e-07, "logits/chosen": -1.2322499752044678, "logits/rejected": -1.3243571519851685, "logps/chosen": -103.25643157958984, "logps/rejected": -140.8385009765625, "loss": 0.5835, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15573744475841522, "rewards/margins": 0.8674708008766174, "rewards/rejected": -1.0232083797454834, "step": 806 }, { "epoch": 0.9357944263481722, "grad_norm": 62.392714879995744, "learning_rate": 1.201481666435967e-07, "logits/chosen": -1.3585785627365112, "logits/rejected": -1.3998768329620361, "logps/chosen": -153.4759521484375, "logps/rejected": -162.12579345703125, "loss": 0.5661, "rewards/accuracies": 0.6875, "rewards/chosen": -0.23147635161876678, "rewards/margins": 0.3873007893562317, "rewards/rejected": -0.6187771558761597, "step": 808 }, { "epoch": 0.9381107491856677, "grad_norm": 92.78640979629462, "learning_rate": 1.1976952198468065e-07, "logits/chosen": -1.3264836072921753, "logits/rejected": -1.3320363759994507, "logps/chosen": -168.32054138183594, "logps/rejected": -156.73651123046875, "loss": 0.5762, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20424926280975342, "rewards/margins": 0.26160287857055664, "rewards/rejected": -0.46585214138031006, "step": 810 }, { "epoch": 0.9404270720231632, "grad_norm": 81.23624930191687, "learning_rate": 1.1939058212717224e-07, "logits/chosen": -1.25242280960083, "logits/rejected": -1.2761772871017456, "logps/chosen": -142.50938415527344, "logps/rejected": -162.35137939453125, "loss": 0.6122, "rewards/accuracies": 0.59375, "rewards/chosen": -0.40755337476730347, "rewards/margins": 0.8417052030563354, "rewards/rejected": -1.2492586374282837, "step": 812 }, { "epoch": 0.9427433948606587, "grad_norm": 93.63860671562202, "learning_rate": 1.1901135272940319e-07, "logits/chosen": -1.3451721668243408, "logits/rejected": -1.4240652322769165, "logps/chosen": -166.04376220703125, "logps/rejected": -201.26220703125, "loss": 0.5927, "rewards/accuracies": 0.75, "rewards/chosen": -0.3375451862812042, "rewards/margins": 0.7751595377922058, "rewards/rejected": -1.112704873085022, "step": 814 }, { "epoch": 0.9450597176981542, "grad_norm": 74.49560053095384, "learning_rate": 1.1863183945402853e-07, "logits/chosen": -1.399809718132019, "logits/rejected": -1.4244718551635742, "logps/chosen": -142.70849609375, "logps/rejected": -154.3977508544922, "loss": 0.6206, "rewards/accuracies": 0.59375, "rewards/chosen": -0.27791112661361694, "rewards/margins": 0.23856940865516663, "rewards/rejected": -0.5164804458618164, "step": 816 }, { "epoch": 0.9473760405356496, "grad_norm": 76.5360836871227, "learning_rate": 1.1825204796794222e-07, "logits/chosen": -1.2848563194274902, "logits/rejected": -1.3460373878479004, "logps/chosen": -126.32781982421875, "logps/rejected": -146.41732788085938, "loss": 0.5922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.04460408538579941, "rewards/margins": 0.6903345584869385, "rewards/rejected": -0.7349386215209961, "step": 818 }, { "epoch": 0.9496923633731451, "grad_norm": 69.37848187371887, "learning_rate": 1.1787198394219248e-07, "logits/chosen": -1.212153434753418, "logits/rejected": -1.2568764686584473, "logps/chosen": -112.97404479980469, "logps/rejected": -127.22880554199219, "loss": 0.5922, "rewards/accuracies": 0.65625, "rewards/chosen": -0.2542499303817749, "rewards/margins": 0.43802982568740845, "rewards/rejected": -0.6922798156738281, "step": 820 }, { "epoch": 0.9520086862106406, "grad_norm": 74.78466523725046, "learning_rate": 1.1749165305189708e-07, "logits/chosen": -1.2366949319839478, "logits/rejected": -1.3136887550354004, "logps/chosen": -135.05006408691406, "logps/rejected": -167.1379852294922, "loss": 0.6245, "rewards/accuracies": 0.875, "rewards/chosen": -0.1665336787700653, "rewards/margins": 0.8222386837005615, "rewards/rejected": -0.9887723922729492, "step": 822 }, { "epoch": 0.9543250090481361, "grad_norm": 114.72850198751108, "learning_rate": 1.1711106097615862e-07, "logits/chosen": -1.2289320230484009, "logits/rejected": -1.2598837614059448, "logps/chosen": -175.93649291992188, "logps/rejected": -174.3079376220703, "loss": 0.7136, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8970119953155518, "rewards/margins": -0.25789961218833923, "rewards/rejected": -0.6391124725341797, "step": 824 }, { "epoch": 0.9566413318856316, "grad_norm": 82.7729985201964, "learning_rate": 1.1673021339797967e-07, "logits/chosen": -1.3273519277572632, "logits/rejected": -1.309187412261963, "logps/chosen": -123.46469116210938, "logps/rejected": -143.95350646972656, "loss": 0.6453, "rewards/accuracies": 0.75, "rewards/chosen": -0.04680802673101425, "rewards/margins": 0.8337461352348328, "rewards/rejected": -0.88055419921875, "step": 826 }, { "epoch": 0.958957654723127, "grad_norm": 93.78229578972356, "learning_rate": 1.16349116004178e-07, "logits/chosen": -1.235456943511963, "logits/rejected": -1.2821218967437744, "logps/chosen": -108.3389892578125, "logps/rejected": -153.83152770996094, "loss": 0.5369, "rewards/accuracies": 0.65625, "rewards/chosen": -0.07939951866865158, "rewards/margins": 0.8632293939590454, "rewards/rejected": -0.942628800868988, "step": 828 }, { "epoch": 0.9612739775606225, "grad_norm": 82.41065568509865, "learning_rate": 1.1596777448530169e-07, "logits/chosen": -1.2206637859344482, "logits/rejected": -1.2953643798828125, "logps/chosen": -127.601318359375, "logps/rejected": -152.67910766601562, "loss": 0.569, "rewards/accuracies": 0.75, "rewards/chosen": 0.004549721255898476, "rewards/margins": 1.1071059703826904, "rewards/rejected": -1.1025562286376953, "step": 830 }, { "epoch": 0.963590300398118, "grad_norm": 73.37771598718861, "learning_rate": 1.15586194535544e-07, "logits/chosen": -1.3030959367752075, "logits/rejected": -1.3053159713745117, "logps/chosen": -145.50299072265625, "logps/rejected": -202.24827575683594, "loss": 0.5484, "rewards/accuracies": 0.71875, "rewards/chosen": -0.16388273239135742, "rewards/margins": 2.732254981994629, "rewards/rejected": -2.8961377143859863, "step": 832 }, { "epoch": 0.9659066232356135, "grad_norm": 77.35826158650924, "learning_rate": 1.1520438185265846e-07, "logits/chosen": -1.3323960304260254, "logits/rejected": -1.3863334655761719, "logps/chosen": -129.73374938964844, "logps/rejected": -150.87632751464844, "loss": 0.6108, "rewards/accuracies": 0.625, "rewards/chosen": -0.013788050971925259, "rewards/margins": 0.6643078923225403, "rewards/rejected": -0.6780959367752075, "step": 834 }, { "epoch": 0.968222946073109, "grad_norm": 71.75741636541773, "learning_rate": 1.1482234213787383e-07, "logits/chosen": -1.404686689376831, "logits/rejected": -1.4628666639328003, "logps/chosen": -105.55296325683594, "logps/rejected": -159.8828582763672, "loss": 0.5356, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02817944809794426, "rewards/margins": 0.7608631253242493, "rewards/rejected": -0.7326837182044983, "step": 836 }, { "epoch": 0.9705392689106044, "grad_norm": 70.54976379165657, "learning_rate": 1.1444008109580882e-07, "logits/chosen": -1.1081938743591309, "logits/rejected": -1.19542396068573, "logps/chosen": -109.79875946044922, "logps/rejected": -166.45217895507812, "loss": 0.5609, "rewards/accuracies": 0.75, "rewards/chosen": -0.2566208839416504, "rewards/margins": 0.9902852773666382, "rewards/rejected": -1.2469063997268677, "step": 838 }, { "epoch": 0.9728555917480999, "grad_norm": 68.85449968110935, "learning_rate": 1.1405760443438712e-07, "logits/chosen": -1.4356677532196045, "logits/rejected": -1.4843618869781494, "logps/chosen": -160.30462646484375, "logps/rejected": -174.40020751953125, "loss": 0.5569, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0007999027147889137, "rewards/margins": 0.5300293564796448, "rewards/rejected": -0.5308292508125305, "step": 840 }, { "epoch": 0.9751719145855954, "grad_norm": 87.62002715815144, "learning_rate": 1.1367491786475194e-07, "logits/chosen": -1.3982141017913818, "logits/rejected": -1.436221957206726, "logps/chosen": -142.02593994140625, "logps/rejected": -171.52378845214844, "loss": 0.5838, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1319505125284195, "rewards/margins": 0.5477418303489685, "rewards/rejected": -0.6796923279762268, "step": 842 }, { "epoch": 0.9774882374230909, "grad_norm": 82.61971695055652, "learning_rate": 1.1329202710118086e-07, "logits/chosen": -1.2287306785583496, "logits/rejected": -1.2248528003692627, "logps/chosen": -153.746826171875, "logps/rejected": -183.39047241210938, "loss": 0.5654, "rewards/accuracies": 0.625, "rewards/chosen": -0.13021187484264374, "rewards/margins": 0.843558132648468, "rewards/rejected": -0.9737700819969177, "step": 844 }, { "epoch": 0.9798045602605863, "grad_norm": 70.54919389801739, "learning_rate": 1.1290893786100058e-07, "logits/chosen": -1.2976385354995728, "logits/rejected": -1.2402337789535522, "logps/chosen": -132.8706817626953, "logps/rejected": -156.9254608154297, "loss": 0.5145, "rewards/accuracies": 0.78125, "rewards/chosen": -0.42142388224601746, "rewards/margins": 0.9857786297798157, "rewards/rejected": -1.4072024822235107, "step": 846 }, { "epoch": 0.9821208830980818, "grad_norm": 81.68100060471126, "learning_rate": 1.1252565586450131e-07, "logits/chosen": -1.2878555059432983, "logits/rejected": -1.337039828300476, "logps/chosen": -193.6406707763672, "logps/rejected": -220.8046112060547, "loss": 0.5546, "rewards/accuracies": 0.75, "rewards/chosen": -0.4571903347969055, "rewards/margins": 0.6591623425483704, "rewards/rejected": -1.1163526773452759, "step": 848 }, { "epoch": 0.9844372059355773, "grad_norm": 93.0455963225426, "learning_rate": 1.1214218683485157e-07, "logits/chosen": -1.3210810422897339, "logits/rejected": -1.2823312282562256, "logps/chosen": -187.9322509765625, "logps/rejected": -193.3695068359375, "loss": 0.5531, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4511820077896118, "rewards/margins": 0.7341474294662476, "rewards/rejected": -1.1853294372558594, "step": 850 }, { "epoch": 0.9867535287730728, "grad_norm": 99.45149290799081, "learning_rate": 1.1175853649801273e-07, "logits/chosen": -1.2975995540618896, "logits/rejected": -1.3011971712112427, "logps/chosen": -134.58206176757812, "logps/rejected": -155.63638305664062, "loss": 0.6041, "rewards/accuracies": 0.65625, "rewards/chosen": -0.1595260500907898, "rewards/margins": 0.8379107713699341, "rewards/rejected": -0.9974368810653687, "step": 852 }, { "epoch": 0.9890698516105683, "grad_norm": 103.0078434001977, "learning_rate": 1.1137471058265329e-07, "logits/chosen": -1.2952150106430054, "logits/rejected": -1.3017812967300415, "logps/chosen": -148.48033142089844, "logps/rejected": -152.46951293945312, "loss": 0.5662, "rewards/accuracies": 0.59375, "rewards/chosen": -0.3374045491218567, "rewards/margins": 0.7678672671318054, "rewards/rejected": -1.105271816253662, "step": 854 }, { "epoch": 0.9913861744480637, "grad_norm": 132.30637646730733, "learning_rate": 1.1099071482006359e-07, "logits/chosen": -1.3640661239624023, "logits/rejected": -1.4117590188980103, "logps/chosen": -170.84063720703125, "logps/rejected": -178.2880096435547, "loss": 0.6467, "rewards/accuracies": 0.46875, "rewards/chosen": -0.8598957061767578, "rewards/margins": 0.3869331181049347, "rewards/rejected": -1.2468287944793701, "step": 856 }, { "epoch": 0.9937024972855591, "grad_norm": 106.55866485365634, "learning_rate": 1.1060655494407012e-07, "logits/chosen": -1.2328029870986938, "logits/rejected": -1.2614901065826416, "logps/chosen": -125.70144653320312, "logps/rejected": -148.56796264648438, "loss": 0.6174, "rewards/accuracies": 0.5625, "rewards/chosen": -0.24408888816833496, "rewards/margins": 0.35338127613067627, "rewards/rejected": -0.5974701642990112, "step": 858 }, { "epoch": 0.9960188201230546, "grad_norm": 92.00339838498802, "learning_rate": 1.102222366909499e-07, "logits/chosen": -1.125497579574585, "logits/rejected": -1.1907844543457031, "logps/chosen": -168.60272216796875, "logps/rejected": -184.2926025390625, "loss": 0.5696, "rewards/accuracies": 0.71875, "rewards/chosen": -0.504738450050354, "rewards/margins": 0.6949242353439331, "rewards/rejected": -1.199662685394287, "step": 860 }, { "epoch": 0.9983351429605501, "grad_norm": 95.4684250342951, "learning_rate": 1.0983776579934481e-07, "logits/chosen": -1.2691606283187866, "logits/rejected": -1.2375824451446533, "logps/chosen": -130.89102172851562, "logps/rejected": -144.6857452392578, "loss": 0.5968, "rewards/accuracies": 0.75, "rewards/chosen": -0.2888919413089752, "rewards/margins": 1.0368549823760986, "rewards/rejected": -1.3257468938827515, "step": 862 }, { "epoch": 1.0006514657980456, "grad_norm": 74.43449432333051, "learning_rate": 1.09453148010176e-07, "logits/chosen": -1.3525639772415161, "logits/rejected": -1.3091143369674683, "logps/chosen": -155.21115112304688, "logps/rejected": -155.0146942138672, "loss": 0.5571, "rewards/accuracies": 0.6875, "rewards/chosen": -0.15043863654136658, "rewards/margins": 0.7910802364349365, "rewards/rejected": -0.9415189027786255, "step": 864 }, { "epoch": 1.002967788635541, "grad_norm": 53.052072782970285, "learning_rate": 1.09068389066558e-07, "logits/chosen": -1.264554738998413, "logits/rejected": -1.3304630517959595, "logps/chosen": -138.06777954101562, "logps/rejected": -179.81643676757812, "loss": 0.4282, "rewards/accuracies": 0.75, "rewards/chosen": 0.0020294710993766785, "rewards/margins": 0.9756675362586975, "rewards/rejected": -0.9736379981040955, "step": 866 }, { "epoch": 1.0052841114730366, "grad_norm": 54.29727997773151, "learning_rate": 1.0868349471371314e-07, "logits/chosen": -1.2127741575241089, "logits/rejected": -1.2704501152038574, "logps/chosen": -142.79518127441406, "logps/rejected": -189.8652801513672, "loss": 0.4317, "rewards/accuracies": 0.78125, "rewards/chosen": -0.13422907888889313, "rewards/margins": 1.5447545051574707, "rewards/rejected": -1.678983449935913, "step": 868 }, { "epoch": 1.007600434310532, "grad_norm": 56.12931427605149, "learning_rate": 1.0829847069888565e-07, "logits/chosen": -1.2043306827545166, "logits/rejected": -1.2564079761505127, "logps/chosen": -111.9096908569336, "logps/rejected": -146.1145782470703, "loss": 0.463, "rewards/accuracies": 0.84375, "rewards/chosen": -0.0683784931898117, "rewards/margins": 0.8024751543998718, "rewards/rejected": -0.8708536624908447, "step": 870 }, { "epoch": 1.0099167571480274, "grad_norm": 51.89999261447005, "learning_rate": 1.0791332277125587e-07, "logits/chosen": -1.3936680555343628, "logits/rejected": -1.435255527496338, "logps/chosen": -146.84841918945312, "logps/rejected": -182.092041015625, "loss": 0.4189, "rewards/accuracies": 0.78125, "rewards/chosen": 0.1668233424425125, "rewards/margins": 1.2165982723236084, "rewards/rejected": -1.0497750043869019, "step": 872 }, { "epoch": 1.012233079985523, "grad_norm": 59.36325847754287, "learning_rate": 1.075280566818544e-07, "logits/chosen": -1.2359968423843384, "logits/rejected": -1.1934764385223389, "logps/chosen": -142.46665954589844, "logps/rejected": -150.53485107421875, "loss": 0.4029, "rewards/accuracies": 0.75, "rewards/chosen": 0.20680281519889832, "rewards/margins": 0.8232614994049072, "rewards/rejected": -0.6164587736129761, "step": 874 }, { "epoch": 1.0145494028230184, "grad_norm": 59.925719498657855, "learning_rate": 1.0714267818347629e-07, "logits/chosen": -1.351982831954956, "logits/rejected": -1.404510259628296, "logps/chosen": -119.25331115722656, "logps/rejected": -147.39370727539062, "loss": 0.4487, "rewards/accuracies": 0.84375, "rewards/chosen": -0.004702276550233364, "rewards/margins": 1.059253454208374, "rewards/rejected": -1.0639557838439941, "step": 876 }, { "epoch": 1.016865725660514, "grad_norm": 63.18524537189573, "learning_rate": 1.0675719303059492e-07, "logits/chosen": -1.3256399631500244, "logits/rejected": -1.351796269416809, "logps/chosen": -153.5157928466797, "logps/rejected": -202.40603637695312, "loss": 0.4485, "rewards/accuracies": 0.78125, "rewards/chosen": 0.10010495781898499, "rewards/margins": 1.2945441007614136, "rewards/rejected": -1.1944392919540405, "step": 878 }, { "epoch": 1.0191820484980094, "grad_norm": 56.63348082432867, "learning_rate": 1.0637160697927649e-07, "logits/chosen": -1.359007716178894, "logits/rejected": -1.3727188110351562, "logps/chosen": -164.15415954589844, "logps/rejected": -177.09884643554688, "loss": 0.4274, "rewards/accuracies": 0.8125, "rewards/chosen": -0.14782752096652985, "rewards/margins": 1.1597306728363037, "rewards/rejected": -1.307558298110962, "step": 880 }, { "epoch": 1.0214983713355048, "grad_norm": 47.49406015149724, "learning_rate": 1.059859257870936e-07, "logits/chosen": -1.2314316034317017, "logits/rejected": -1.3156919479370117, "logps/chosen": -99.96794891357422, "logps/rejected": -125.84103393554688, "loss": 0.4242, "rewards/accuracies": 0.78125, "rewards/chosen": 0.19646473228931427, "rewards/margins": 0.9843569993972778, "rewards/rejected": -0.78789222240448, "step": 882 }, { "epoch": 1.0238146941730004, "grad_norm": 45.83894890851922, "learning_rate": 1.0560015521303953e-07, "logits/chosen": -1.3569869995117188, "logits/rejected": -1.397745132446289, "logps/chosen": -140.70188903808594, "logps/rejected": -169.63076782226562, "loss": 0.3978, "rewards/accuracies": 0.8125, "rewards/chosen": 0.11355403065681458, "rewards/margins": 1.1430623531341553, "rewards/rejected": -1.029508352279663, "step": 884 }, { "epoch": 1.0261310170104958, "grad_norm": 54.040039551724966, "learning_rate": 1.0521430101744238e-07, "logits/chosen": -1.4817092418670654, "logits/rejected": -1.434765100479126, "logps/chosen": -151.61231994628906, "logps/rejected": -144.6493682861328, "loss": 0.4342, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10845161974430084, "rewards/margins": 0.5633866786956787, "rewards/rejected": -0.4549350440502167, "step": 886 }, { "epoch": 1.0284473398479914, "grad_norm": 51.70194540758909, "learning_rate": 1.0482836896187861e-07, "logits/chosen": -1.207306981086731, "logits/rejected": -1.2579807043075562, "logps/chosen": -122.11793518066406, "logps/rejected": -145.04310607910156, "loss": 0.4389, "rewards/accuracies": 0.65625, "rewards/chosen": -0.15163260698318481, "rewards/margins": 0.5189149379730225, "rewards/rejected": -0.6705474853515625, "step": 888 }, { "epoch": 1.0307636626854868, "grad_norm": 61.2930931985187, "learning_rate": 1.044423648090875e-07, "logits/chosen": -1.3574914932250977, "logits/rejected": -1.445235013961792, "logps/chosen": -143.323974609375, "logps/rejected": -170.43026733398438, "loss": 0.4619, "rewards/accuracies": 0.84375, "rewards/chosen": 0.22330205142498016, "rewards/margins": 1.1949002742767334, "rewards/rejected": -0.971598207950592, "step": 890 }, { "epoch": 1.0330799855229822, "grad_norm": 52.83359389131165, "learning_rate": 1.0405629432288488e-07, "logits/chosen": -1.2581863403320312, "logits/rejected": -1.2796192169189453, "logps/chosen": -130.67315673828125, "logps/rejected": -158.39817810058594, "loss": 0.4437, "rewards/accuracies": 0.8125, "rewards/chosen": 2.531241625547409e-05, "rewards/margins": 1.0420689582824707, "rewards/rejected": -1.0420435667037964, "step": 892 }, { "epoch": 1.0353963083604778, "grad_norm": 54.354875880802155, "learning_rate": 1.036701632680769e-07, "logits/chosen": -1.2646857500076294, "logits/rejected": -1.2912808656692505, "logps/chosen": -143.00558471679688, "logps/rejected": -181.23011779785156, "loss": 0.4284, "rewards/accuracies": 0.90625, "rewards/chosen": -0.12129174172878265, "rewards/margins": 1.4072792530059814, "rewards/rejected": -1.5285711288452148, "step": 894 }, { "epoch": 1.0377126311979732, "grad_norm": 103.14604966973508, "learning_rate": 1.0328397741037426e-07, "logits/chosen": -1.3190052509307861, "logits/rejected": -1.2969970703125, "logps/chosen": -107.70980072021484, "logps/rejected": -113.95564270019531, "loss": 0.5385, "rewards/accuracies": 0.75, "rewards/chosen": -0.04918019846081734, "rewards/margins": 0.5177782773971558, "rewards/rejected": -0.566958487033844, "step": 896 }, { "epoch": 1.0400289540354688, "grad_norm": 43.79052026196426, "learning_rate": 1.0289774251630601e-07, "logits/chosen": -1.3973785638809204, "logits/rejected": -1.444900393486023, "logps/chosen": -96.34955596923828, "logps/rejected": -113.84618377685547, "loss": 0.3983, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06471097469329834, "rewards/margins": 0.8261978626251221, "rewards/rejected": -0.7614869475364685, "step": 898 }, { "epoch": 1.0423452768729642, "grad_norm": 49.65418562930932, "learning_rate": 1.0251146435313328e-07, "logits/chosen": -1.3165615797042847, "logits/rejected": -1.294682502746582, "logps/chosen": -149.90878295898438, "logps/rejected": -151.5213623046875, "loss": 0.4286, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07793111354112625, "rewards/margins": 0.9900121092796326, "rewards/rejected": -1.0679432153701782, "step": 900 }, { "epoch": 1.0423452768729642, "eval_logits/chosen": -1.267106533050537, "eval_logits/rejected": -1.2631834745407104, "eval_logps/chosen": -143.7148895263672, "eval_logps/rejected": -145.94981384277344, "eval_loss": 0.6367580890655518, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.8083698153495789, "eval_rewards/margins": 0.3912562429904938, "eval_rewards/rejected": -1.1996259689331055, "eval_runtime": 28.4118, "eval_samples_per_second": 3.52, "eval_steps_per_second": 0.88, "step": 900 }, { "epoch": 1.0446615997104596, "grad_norm": 50.09581799755012, "learning_rate": 1.0212514868876336e-07, "logits/chosen": -1.2866871356964111, "logits/rejected": -1.2886251211166382, "logps/chosen": -134.37945556640625, "logps/rejected": -155.17453002929688, "loss": 0.4296, "rewards/accuracies": 0.875, "rewards/chosen": -0.19163765013217926, "rewards/margins": 1.8016985654830933, "rewards/rejected": -1.9933364391326904, "step": 902 }, { "epoch": 1.0469779225479552, "grad_norm": 94.55399619359126, "learning_rate": 1.0173880129166357e-07, "logits/chosen": -1.26423978805542, "logits/rejected": -1.3786003589630127, "logps/chosen": -168.92079162597656, "logps/rejected": -219.23475646972656, "loss": 0.3919, "rewards/accuracies": 0.84375, "rewards/chosen": -0.48411691188812256, "rewards/margins": 2.5960710048675537, "rewards/rejected": -3.0801875591278076, "step": 904 }, { "epoch": 1.0492942453854506, "grad_norm": 59.87919429479282, "learning_rate": 1.0135242793077495e-07, "logits/chosen": -1.3900208473205566, "logits/rejected": -1.4745222330093384, "logps/chosen": -162.7766571044922, "logps/rejected": -199.24411010742188, "loss": 0.4539, "rewards/accuracies": 0.8125, "rewards/chosen": 0.10406473278999329, "rewards/margins": 1.1120522022247314, "rewards/rejected": -1.007987380027771, "step": 906 }, { "epoch": 1.0516105682229462, "grad_norm": 55.07389049227209, "learning_rate": 1.0096603437542632e-07, "logits/chosen": -1.2171599864959717, "logits/rejected": -1.2780107259750366, "logps/chosen": -178.72128295898438, "logps/rejected": -219.81561279296875, "loss": 0.4137, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1644848883152008, "rewards/margins": 1.632908821105957, "rewards/rejected": -1.797393798828125, "step": 908 }, { "epoch": 1.0539268910604416, "grad_norm": 64.00644608976998, "learning_rate": 1.0057962639524798e-07, "logits/chosen": -1.193771481513977, "logits/rejected": -1.2244319915771484, "logps/chosen": -165.76658630371094, "logps/rejected": -193.3189697265625, "loss": 0.4492, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1277046501636505, "rewards/margins": 0.9276853799819946, "rewards/rejected": -1.0553901195526123, "step": 910 }, { "epoch": 1.056243213897937, "grad_norm": 116.71156999784645, "learning_rate": 1.0019320976008566e-07, "logits/chosen": -1.1768372058868408, "logits/rejected": -1.2854583263397217, "logps/chosen": -163.41737365722656, "logps/rejected": -220.3693084716797, "loss": 0.4746, "rewards/accuracies": 0.75, "rewards/chosen": -0.866296112537384, "rewards/margins": 1.5394386053085327, "rewards/rejected": -2.4057345390319824, "step": 912 }, { "epoch": 1.0585595367354326, "grad_norm": 54.66065249280564, "learning_rate": 9.980679023991435e-08, "logits/chosen": -1.1219855546951294, "logits/rejected": -1.1816667318344116, "logps/chosen": -140.3036651611328, "logps/rejected": -150.07461547851562, "loss": 0.48, "rewards/accuracies": 0.71875, "rewards/chosen": -0.544008195400238, "rewards/margins": 1.0536366701126099, "rewards/rejected": -1.5976448059082031, "step": 914 }, { "epoch": 1.060875859572928, "grad_norm": 53.241869392230555, "learning_rate": 9.942037360475204e-08, "logits/chosen": -1.1496176719665527, "logits/rejected": -1.234816074371338, "logps/chosen": -115.39232635498047, "logps/rejected": -157.61270141601562, "loss": 0.436, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24051757156848907, "rewards/margins": 1.0525996685028076, "rewards/rejected": -1.2931171655654907, "step": 916 }, { "epoch": 1.0631921824104236, "grad_norm": 58.28950080838779, "learning_rate": 9.90339656245737e-08, "logits/chosen": -1.2064344882965088, "logits/rejected": -1.2606756687164307, "logps/chosen": -131.2918243408203, "logps/rejected": -159.1555938720703, "loss": 0.4701, "rewards/accuracies": 0.75, "rewards/chosen": -0.12312920391559601, "rewards/margins": 1.0161489248275757, "rewards/rejected": -1.1392781734466553, "step": 918 }, { "epoch": 1.065508505247919, "grad_norm": 55.39446729410911, "learning_rate": 9.864757206922503e-08, "logits/chosen": -1.2960537672042847, "logits/rejected": -1.363585352897644, "logps/chosen": -111.44401550292969, "logps/rejected": -134.6551513671875, "loss": 0.4533, "rewards/accuracies": 0.78125, "rewards/chosen": 0.02558358758687973, "rewards/margins": 1.097936749458313, "rewards/rejected": -1.0723532438278198, "step": 920 }, { "epoch": 1.0678248280854143, "grad_norm": 42.05258080368828, "learning_rate": 9.826119870833642e-08, "logits/chosen": -1.2935595512390137, "logits/rejected": -1.2712563276290894, "logps/chosen": -148.2998504638672, "logps/rejected": -173.95767211914062, "loss": 0.4009, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07796184718608856, "rewards/margins": 1.0437368154525757, "rewards/rejected": -1.1216987371444702, "step": 922 }, { "epoch": 1.07014115092291, "grad_norm": 53.13674529496238, "learning_rate": 9.787485131123664e-08, "logits/chosen": -1.1843931674957275, "logits/rejected": -1.1805684566497803, "logps/chosen": -131.55718994140625, "logps/rejected": -137.18283081054688, "loss": 0.4561, "rewards/accuracies": 0.75, "rewards/chosen": -0.24045638740062714, "rewards/margins": 1.1393637657165527, "rewards/rejected": -1.3798199892044067, "step": 924 }, { "epoch": 1.0724574737604053, "grad_norm": 65.20870154743594, "learning_rate": 9.748853564686674e-08, "logits/chosen": -1.2203890085220337, "logits/rejected": -1.2355417013168335, "logps/chosen": -138.7787628173828, "logps/rejected": -155.23309326171875, "loss": 0.4613, "rewards/accuracies": 0.8125, "rewards/chosen": -0.11271880567073822, "rewards/margins": 0.8596581220626831, "rewards/rejected": -0.9723768830299377, "step": 926 }, { "epoch": 1.0747737965979007, "grad_norm": 54.50345618528414, "learning_rate": 9.7102257483694e-08, "logits/chosen": -1.339476227760315, "logits/rejected": -1.2760694026947021, "logps/chosen": -124.2467269897461, "logps/rejected": -126.01527404785156, "loss": 0.4199, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29890668392181396, "rewards/margins": 0.8146764039993286, "rewards/rejected": -1.1135830879211426, "step": 928 }, { "epoch": 1.0770901194353963, "grad_norm": 65.98457223669924, "learning_rate": 9.671602258962574e-08, "logits/chosen": -1.2611260414123535, "logits/rejected": -1.3340933322906494, "logps/chosen": -146.05282592773438, "logps/rejected": -190.32269287109375, "loss": 0.4587, "rewards/accuracies": 0.71875, "rewards/chosen": -0.10131717473268509, "rewards/margins": 1.1271921396255493, "rewards/rejected": -1.2285093069076538, "step": 930 }, { "epoch": 1.0794064422728917, "grad_norm": 61.15769059560887, "learning_rate": 9.632983673192313e-08, "logits/chosen": -1.0492568016052246, "logits/rejected": -1.0924451351165771, "logps/chosen": -121.42367553710938, "logps/rejected": -128.29689025878906, "loss": 0.4544, "rewards/accuracies": 0.65625, "rewards/chosen": -0.37191495299339294, "rewards/margins": 0.5627624988555908, "rewards/rejected": -0.9346774220466614, "step": 932 }, { "epoch": 1.0817227651103873, "grad_norm": 57.03166805866933, "learning_rate": 9.594370567711511e-08, "logits/chosen": -1.2661980390548706, "logits/rejected": -1.3073248863220215, "logps/chosen": -182.6976776123047, "logps/rejected": -234.59210205078125, "loss": 0.4014, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2322644144296646, "rewards/margins": 1.5935356616973877, "rewards/rejected": -1.8258000612258911, "step": 934 }, { "epoch": 1.0840390879478827, "grad_norm": 82.40499356623086, "learning_rate": 9.555763519091248e-08, "logits/chosen": -1.315928339958191, "logits/rejected": -1.2872464656829834, "logps/chosen": -175.55258178710938, "logps/rejected": -189.99154663085938, "loss": 0.4182, "rewards/accuracies": 0.96875, "rewards/chosen": -0.45597535371780396, "rewards/margins": 1.532588005065918, "rewards/rejected": -1.9885634183883667, "step": 936 }, { "epoch": 1.0863554107853781, "grad_norm": 50.520663776823014, "learning_rate": 9.517163103812138e-08, "logits/chosen": -1.1214520931243896, "logits/rejected": -1.1749013662338257, "logps/chosen": -144.7432098388672, "logps/rejected": -201.30775451660156, "loss": 0.4199, "rewards/accuracies": 0.875, "rewards/chosen": -0.43661484122276306, "rewards/margins": 2.3110551834106445, "rewards/rejected": -2.7476699352264404, "step": 938 }, { "epoch": 1.0886717336228737, "grad_norm": 54.03127984334986, "learning_rate": 9.478569898255764e-08, "logits/chosen": -1.3171803951263428, "logits/rejected": -1.2946100234985352, "logps/chosen": -136.1056365966797, "logps/rejected": -154.28529357910156, "loss": 0.4366, "rewards/accuracies": 0.8125, "rewards/chosen": -0.203598752617836, "rewards/margins": 1.1319787502288818, "rewards/rejected": -1.3355774879455566, "step": 940 }, { "epoch": 1.0909880564603691, "grad_norm": 56.859372680461554, "learning_rate": 9.439984478696047e-08, "logits/chosen": -1.2367607355117798, "logits/rejected": -1.296886682510376, "logps/chosen": -155.44183349609375, "logps/rejected": -188.44387817382812, "loss": 0.4481, "rewards/accuracies": 0.75, "rewards/chosen": -0.2627331018447876, "rewards/margins": 1.0997084379196167, "rewards/rejected": -1.3624415397644043, "step": 942 }, { "epoch": 1.0933043792978647, "grad_norm": 54.353753502221494, "learning_rate": 9.401407421290643e-08, "logits/chosen": -1.3422706127166748, "logits/rejected": -1.387117862701416, "logps/chosen": -155.46054077148438, "logps/rejected": -194.65390014648438, "loss": 0.4562, "rewards/accuracies": 0.9375, "rewards/chosen": -0.03363298624753952, "rewards/margins": 1.2595181465148926, "rewards/rejected": -1.2931511402130127, "step": 944 }, { "epoch": 1.0956207021353601, "grad_norm": 101.24958194897121, "learning_rate": 9.362839302072353e-08, "logits/chosen": -1.2621257305145264, "logits/rejected": -1.27442467212677, "logps/chosen": -156.4713134765625, "logps/rejected": -171.198974609375, "loss": 0.4625, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4368613362312317, "rewards/margins": 0.8723641633987427, "rewards/rejected": -1.3092255592346191, "step": 946 }, { "epoch": 1.0979370249728555, "grad_norm": 62.41659804612414, "learning_rate": 9.324280696940505e-08, "logits/chosen": -1.2930984497070312, "logits/rejected": -1.3466129302978516, "logps/chosen": -179.30697631835938, "logps/rejected": -208.09213256835938, "loss": 0.415, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1444242000579834, "rewards/margins": 1.228155493736267, "rewards/rejected": -1.3725796937942505, "step": 948 }, { "epoch": 1.1002533478103511, "grad_norm": 50.935316539617354, "learning_rate": 9.285732181652373e-08, "logits/chosen": -1.124991536140442, "logits/rejected": -1.1405972242355347, "logps/chosen": -112.41706848144531, "logps/rejected": -129.70425415039062, "loss": 0.4202, "rewards/accuracies": 0.84375, "rewards/chosen": -0.024788763374090195, "rewards/margins": 0.9119303226470947, "rewards/rejected": -0.9367191195487976, "step": 950 }, { "epoch": 1.1025696706478465, "grad_norm": 54.50260501747858, "learning_rate": 9.24719433181456e-08, "logits/chosen": -1.2003628015518188, "logits/rejected": -1.3279610872268677, "logps/chosen": -140.67198181152344, "logps/rejected": -192.48184204101562, "loss": 0.4364, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13002000749111176, "rewards/margins": 1.472712755203247, "rewards/rejected": -1.6027326583862305, "step": 952 }, { "epoch": 1.104885993485342, "grad_norm": 56.17036108462227, "learning_rate": 9.208667722874413e-08, "logits/chosen": -1.194599986076355, "logits/rejected": -1.151258111000061, "logps/chosen": -121.25810241699219, "logps/rejected": -155.66912841796875, "loss": 0.4123, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1690177619457245, "rewards/margins": 1.9425208568572998, "rewards/rejected": -2.1115384101867676, "step": 954 }, { "epoch": 1.1072023163228375, "grad_norm": 55.589916323854546, "learning_rate": 9.170152930111436e-08, "logits/chosen": -1.1914734840393066, "logits/rejected": -1.1516671180725098, "logps/chosen": -143.017578125, "logps/rejected": -154.72305297851562, "loss": 0.4257, "rewards/accuracies": 0.75, "rewards/chosen": -0.14978399872779846, "rewards/margins": 0.947557806968689, "rewards/rejected": -1.097341775894165, "step": 956 }, { "epoch": 1.1095186391603329, "grad_norm": 56.27177250398282, "learning_rate": 9.131650528628687e-08, "logits/chosen": -1.0663305521011353, "logits/rejected": -1.1537647247314453, "logps/chosen": -79.43197631835938, "logps/rejected": -117.70744323730469, "loss": 0.4493, "rewards/accuracies": 0.84375, "rewards/chosen": -0.39384108781814575, "rewards/margins": 1.1493465900421143, "rewards/rejected": -1.5431876182556152, "step": 958 }, { "epoch": 1.1118349619978285, "grad_norm": 50.22529256717393, "learning_rate": 9.093161093344198e-08, "logits/chosen": -1.1022623777389526, "logits/rejected": -1.1051996946334839, "logps/chosen": -109.7437973022461, "logps/rejected": -133.01646423339844, "loss": 0.4091, "rewards/accuracies": 0.8125, "rewards/chosen": -0.02206096053123474, "rewards/margins": 1.2954108715057373, "rewards/rejected": -1.3174718618392944, "step": 960 }, { "epoch": 1.1141512848353239, "grad_norm": 88.42504035499209, "learning_rate": 9.054685198982399e-08, "logits/chosen": -1.1374421119689941, "logits/rejected": -1.2034603357315063, "logps/chosen": -129.9014892578125, "logps/rejected": -211.5654296875, "loss": 0.4603, "rewards/accuracies": 0.875, "rewards/chosen": -0.3869945704936981, "rewards/margins": 2.0602149963378906, "rewards/rejected": -2.447209358215332, "step": 962 }, { "epoch": 1.1164676076728195, "grad_norm": 65.27721318276478, "learning_rate": 9.016223420065518e-08, "logits/chosen": -1.3324483633041382, "logits/rejected": -1.4159456491470337, "logps/chosen": -174.03768920898438, "logps/rejected": -237.87066650390625, "loss": 0.4097, "rewards/accuracies": 0.96875, "rewards/chosen": -0.043332889676094055, "rewards/margins": 2.060105800628662, "rewards/rejected": -2.10343861579895, "step": 964 }, { "epoch": 1.1187839305103149, "grad_norm": 48.0388212936529, "learning_rate": 8.977776330905011e-08, "logits/chosen": -1.2459259033203125, "logits/rejected": -1.2535409927368164, "logps/chosen": -134.60635375976562, "logps/rejected": -150.04879760742188, "loss": 0.3575, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1754188984632492, "rewards/margins": 1.841836929321289, "rewards/rejected": -1.6664178371429443, "step": 966 }, { "epoch": 1.1211002533478103, "grad_norm": 49.09469637878338, "learning_rate": 8.939344505592989e-08, "logits/chosen": -1.3384616374969482, "logits/rejected": -1.2898532152175903, "logps/chosen": -147.48941040039062, "logps/rejected": -144.66139221191406, "loss": 0.4146, "rewards/accuracies": 0.875, "rewards/chosen": -0.24728082120418549, "rewards/margins": 0.9114809632301331, "rewards/rejected": -1.1587618589401245, "step": 968 }, { "epoch": 1.1234165761853059, "grad_norm": 54.96606295613573, "learning_rate": 8.900928517993643e-08, "logits/chosen": -1.1945394277572632, "logits/rejected": -1.2390636205673218, "logps/chosen": -158.14991760253906, "logps/rejected": -179.98683166503906, "loss": 0.4435, "rewards/accuracies": 0.84375, "rewards/chosen": -0.09271246939897537, "rewards/margins": 1.089883804321289, "rewards/rejected": -1.1825964450836182, "step": 970 }, { "epoch": 1.1257328990228013, "grad_norm": 61.437597331796724, "learning_rate": 8.862528941734674e-08, "logits/chosen": -1.2488527297973633, "logits/rejected": -1.281649112701416, "logps/chosen": -103.41644287109375, "logps/rejected": -122.69953155517578, "loss": 0.4408, "rewards/accuracies": 0.8125, "rewards/chosen": -0.064623162150383, "rewards/margins": 0.6563097238540649, "rewards/rejected": -0.7209329605102539, "step": 972 }, { "epoch": 1.1280492218602967, "grad_norm": 76.02385869949369, "learning_rate": 8.824146350198727e-08, "logits/chosen": -1.2774415016174316, "logits/rejected": -1.2861087322235107, "logps/chosen": -147.76309204101562, "logps/rejected": -194.18563842773438, "loss": 0.4387, "rewards/accuracies": 0.96875, "rewards/chosen": -0.05790720880031586, "rewards/margins": 1.9285999536514282, "rewards/rejected": -1.9865069389343262, "step": 974 }, { "epoch": 1.1303655446977923, "grad_norm": 40.25949311574699, "learning_rate": 8.785781316514841e-08, "logits/chosen": -1.0914101600646973, "logits/rejected": -1.1336119174957275, "logps/chosen": -150.45669555664062, "logps/rejected": -161.0519256591797, "loss": 0.4057, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3045538067817688, "rewards/margins": 1.2223623991012573, "rewards/rejected": -1.5269161462783813, "step": 976 }, { "epoch": 1.1326818675352877, "grad_norm": 76.14381445978157, "learning_rate": 8.747434413549869e-08, "logits/chosen": -1.1906322240829468, "logits/rejected": -1.218949317932129, "logps/chosen": -95.68942260742188, "logps/rejected": -113.86444854736328, "loss": 0.4352, "rewards/accuracies": 0.875, "rewards/chosen": -0.290942907333374, "rewards/margins": 0.9066973328590393, "rewards/rejected": -1.1976401805877686, "step": 978 }, { "epoch": 1.1349981903727833, "grad_norm": 47.11748284842515, "learning_rate": 8.709106213899941e-08, "logits/chosen": -1.1749687194824219, "logits/rejected": -1.2918882369995117, "logps/chosen": -157.98822021484375, "logps/rejected": -198.7054443359375, "loss": 0.3629, "rewards/accuracies": 0.84375, "rewards/chosen": -0.028000857681035995, "rewards/margins": 1.3484855890274048, "rewards/rejected": -1.3764865398406982, "step": 980 }, { "epoch": 1.1373145132102787, "grad_norm": 50.77441047309427, "learning_rate": 8.670797289881914e-08, "logits/chosen": -1.1183217763900757, "logits/rejected": -1.1116249561309814, "logps/chosen": -108.6583023071289, "logps/rejected": -129.86685180664062, "loss": 0.4299, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2509170472621918, "rewards/margins": 1.379974603652954, "rewards/rejected": -1.6308915615081787, "step": 982 }, { "epoch": 1.139630836047774, "grad_norm": 56.33912942728816, "learning_rate": 8.632508213524807e-08, "logits/chosen": -1.2698084115982056, "logits/rejected": -1.2346493005752563, "logps/chosen": -151.91213989257812, "logps/rejected": -182.04408264160156, "loss": 0.4173, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37673044204711914, "rewards/margins": 1.7139312028884888, "rewards/rejected": -2.0906617641448975, "step": 984 }, { "epoch": 1.1419471588852697, "grad_norm": 56.345779338414744, "learning_rate": 8.594239556561289e-08, "logits/chosen": -1.3207218647003174, "logits/rejected": -1.3042893409729004, "logps/chosen": -163.16778564453125, "logps/rejected": -167.49691772460938, "loss": 0.4225, "rewards/accuracies": 0.875, "rewards/chosen": -0.24410971999168396, "rewards/margins": 1.5222946405410767, "rewards/rejected": -1.766404390335083, "step": 986 }, { "epoch": 1.144263481722765, "grad_norm": 76.91184841650933, "learning_rate": 8.555991890419115e-08, "logits/chosen": -1.1845864057540894, "logits/rejected": -1.1347922086715698, "logps/chosen": -102.96405792236328, "logps/rejected": -115.64076232910156, "loss": 0.5116, "rewards/accuracies": 0.84375, "rewards/chosen": -0.14511136710643768, "rewards/margins": 1.0828670263290405, "rewards/rejected": -1.2279783487319946, "step": 988 }, { "epoch": 1.1465798045602607, "grad_norm": 51.33786523502233, "learning_rate": 8.517765786212616e-08, "logits/chosen": -1.2392418384552002, "logits/rejected": -1.221925139427185, "logps/chosen": -129.43197631835938, "logps/rejected": -154.31982421875, "loss": 0.4278, "rewards/accuracies": 0.8125, "rewards/chosen": -0.057925283908843994, "rewards/margins": 1.4328621625900269, "rewards/rejected": -1.4907875061035156, "step": 990 }, { "epoch": 1.148896127397756, "grad_norm": 59.37742447491477, "learning_rate": 8.479561814734156e-08, "logits/chosen": -1.2217767238616943, "logits/rejected": -1.221116065979004, "logps/chosen": -150.78811645507812, "logps/rejected": -162.00537109375, "loss": 0.4339, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2516019940376282, "rewards/margins": 0.8484339118003845, "rewards/rejected": -1.1000357866287231, "step": 992 }, { "epoch": 1.1512124502352514, "grad_norm": 60.815493241098714, "learning_rate": 8.441380546445602e-08, "logits/chosen": -1.2922110557556152, "logits/rejected": -1.3142673969268799, "logps/chosen": -134.7477264404297, "logps/rejected": -167.12527465820312, "loss": 0.4186, "rewards/accuracies": 0.75, "rewards/chosen": -0.3243613541126251, "rewards/margins": 1.269153118133545, "rewards/rejected": -1.5935145616531372, "step": 994 }, { "epoch": 1.153528773072747, "grad_norm": 70.98343857349872, "learning_rate": 8.403222551469832e-08, "logits/chosen": -1.3424263000488281, "logits/rejected": -1.3853594064712524, "logps/chosen": -148.0376739501953, "logps/rejected": -170.81878662109375, "loss": 0.4703, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4359724521636963, "rewards/margins": 1.1364283561706543, "rewards/rejected": -1.5724008083343506, "step": 996 }, { "epoch": 1.1558450959102424, "grad_norm": 65.83403886075305, "learning_rate": 8.365088399582203e-08, "logits/chosen": -1.127755880355835, "logits/rejected": -1.1841005086898804, "logps/chosen": -156.42657470703125, "logps/rejected": -193.5218505859375, "loss": 0.3891, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4664873480796814, "rewards/margins": 1.631252408027649, "rewards/rejected": -2.0977399349212646, "step": 998 }, { "epoch": 1.158161418747738, "grad_norm": 52.29302766074037, "learning_rate": 8.326978660202032e-08, "logits/chosen": -1.0610343217849731, "logits/rejected": -1.1935327053070068, "logps/chosen": -104.4008560180664, "logps/rejected": -121.67472076416016, "loss": 0.407, "rewards/accuracies": 0.75, "rewards/chosen": -0.2705633044242859, "rewards/margins": 0.650943398475647, "rewards/rejected": -0.9215067028999329, "step": 1000 }, { "epoch": 1.158161418747738, "eval_logits/chosen": -1.2268643379211426, "eval_logits/rejected": -1.223351240158081, "eval_logps/chosen": -144.15550231933594, "eval_logps/rejected": -147.5272979736328, "eval_loss": 0.6345099806785583, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.8524341583251953, "eval_rewards/margins": 0.5049389004707336, "eval_rewards/rejected": -1.3573729991912842, "eval_runtime": 28.7122, "eval_samples_per_second": 3.483, "eval_steps_per_second": 0.871, "step": 1000 }, { "epoch": 1.1604777415852334, "grad_norm": 91.54622696958829, "learning_rate": 8.28889390238414e-08, "logits/chosen": -1.2319010496139526, "logits/rejected": -1.223869800567627, "logps/chosen": -122.68306732177734, "logps/rejected": -140.53533935546875, "loss": 0.4749, "rewards/accuracies": 0.90625, "rewards/chosen": -0.04680479317903519, "rewards/margins": 1.1401985883712769, "rewards/rejected": -1.1870033740997314, "step": 1002 }, { "epoch": 1.1627940644227288, "grad_norm": 61.19183925834221, "learning_rate": 8.250834694810293e-08, "logits/chosen": -1.0691426992416382, "logits/rejected": -1.0059016942977905, "logps/chosen": -115.401611328125, "logps/rejected": -114.543701171875, "loss": 0.4939, "rewards/accuracies": 0.65625, "rewards/chosen": -0.6190845966339111, "rewards/margins": 0.5029242634773254, "rewards/rejected": -1.1220089197158813, "step": 1004 }, { "epoch": 1.1651103872602244, "grad_norm": 53.73987538281513, "learning_rate": 8.212801605780752e-08, "logits/chosen": -1.211553692817688, "logits/rejected": -1.1825435161590576, "logps/chosen": -103.79405212402344, "logps/rejected": -126.38043975830078, "loss": 0.386, "rewards/accuracies": 0.84375, "rewards/chosen": -0.19683058559894562, "rewards/margins": 1.2718679904937744, "rewards/rejected": -1.4686983823776245, "step": 1006 }, { "epoch": 1.1674267100977198, "grad_norm": 57.8834509748926, "learning_rate": 8.174795203205778e-08, "logits/chosen": -1.1160019636154175, "logits/rejected": -1.201751947402954, "logps/chosen": -131.49493408203125, "logps/rejected": -171.06527709960938, "loss": 0.3941, "rewards/accuracies": 0.8125, "rewards/chosen": -0.40603625774383545, "rewards/margins": 1.6363184452056885, "rewards/rejected": -2.0423548221588135, "step": 1008 }, { "epoch": 1.1697430329352154, "grad_norm": 63.46818839405486, "learning_rate": 8.136816054597151e-08, "logits/chosen": -1.3906997442245483, "logits/rejected": -1.4111884832382202, "logps/chosen": -143.24041748046875, "logps/rejected": -157.61639404296875, "loss": 0.4229, "rewards/accuracies": 0.84375, "rewards/chosen": -0.07397030293941498, "rewards/margins": 1.1699588298797607, "rewards/rejected": -1.2439290285110474, "step": 1010 }, { "epoch": 1.1720593557727108, "grad_norm": 52.90184460499552, "learning_rate": 8.098864727059684e-08, "logits/chosen": -1.1468557119369507, "logits/rejected": -1.2187554836273193, "logps/chosen": -106.27362060546875, "logps/rejected": -137.64309692382812, "loss": 0.4588, "rewards/accuracies": 0.78125, "rewards/chosen": -0.41298946738243103, "rewards/margins": 0.8978220820426941, "rewards/rejected": -1.3108115196228027, "step": 1012 }, { "epoch": 1.1743756786102062, "grad_norm": 71.5431594089592, "learning_rate": 8.060941787282773e-08, "logits/chosen": -1.068381428718567, "logits/rejected": -1.041067123413086, "logps/chosen": -144.31198120117188, "logps/rejected": -153.92282104492188, "loss": 0.4285, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33827927708625793, "rewards/margins": 1.2933235168457031, "rewards/rejected": -1.6316028833389282, "step": 1014 }, { "epoch": 1.1766920014477018, "grad_norm": 52.80631636688983, "learning_rate": 8.023047801531934e-08, "logits/chosen": -1.2445508241653442, "logits/rejected": -1.2780728340148926, "logps/chosen": -137.19337463378906, "logps/rejected": -179.17239379882812, "loss": 0.4182, "rewards/accuracies": 0.90625, "rewards/chosen": -0.407768189907074, "rewards/margins": 1.586388111114502, "rewards/rejected": -1.9941563606262207, "step": 1016 }, { "epoch": 1.1790083242851972, "grad_norm": 56.13591725864127, "learning_rate": 7.985183335640331e-08, "logits/chosen": -1.3068925142288208, "logits/rejected": -1.2794513702392578, "logps/chosen": -165.38331604003906, "logps/rejected": -195.83282470703125, "loss": 0.4634, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3045884668827057, "rewards/margins": 1.1379314661026, "rewards/rejected": -1.442520022392273, "step": 1018 }, { "epoch": 1.1813246471226928, "grad_norm": 54.085935777341916, "learning_rate": 7.947348955000344e-08, "logits/chosen": -1.2266862392425537, "logits/rejected": -1.2392640113830566, "logps/chosen": -174.76828002929688, "logps/rejected": -200.17649841308594, "loss": 0.4668, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4116063714027405, "rewards/margins": 1.3313807249069214, "rewards/rejected": -1.7429871559143066, "step": 1020 }, { "epoch": 1.1836409699601882, "grad_norm": 54.470511127245786, "learning_rate": 7.909545224555113e-08, "logits/chosen": -1.265533208847046, "logits/rejected": -1.2974027395248413, "logps/chosen": -175.73451232910156, "logps/rejected": -197.0155029296875, "loss": 0.394, "rewards/accuracies": 0.84375, "rewards/chosen": -0.36628925800323486, "rewards/margins": 1.0738152265548706, "rewards/rejected": -1.4401044845581055, "step": 1022 }, { "epoch": 1.1859572927976836, "grad_norm": 51.97167413329414, "learning_rate": 7.871772708790113e-08, "logits/chosen": -1.3699404001235962, "logits/rejected": -1.3305433988571167, "logps/chosen": -138.38400268554688, "logps/rejected": -147.48019409179688, "loss": 0.4392, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3684295415878296, "rewards/margins": 1.235937476158142, "rewards/rejected": -1.6043670177459717, "step": 1024 }, { "epoch": 1.1882736156351792, "grad_norm": 71.10686737396831, "learning_rate": 7.834031971724727e-08, "logits/chosen": -1.3444818258285522, "logits/rejected": -1.2173364162445068, "logps/chosen": -186.4178466796875, "logps/rejected": -190.82876586914062, "loss": 0.3623, "rewards/accuracies": 0.90625, "rewards/chosen": -0.35608479380607605, "rewards/margins": 1.72867751121521, "rewards/rejected": -2.0847623348236084, "step": 1026 }, { "epoch": 1.1905899384726746, "grad_norm": 57.74529922643325, "learning_rate": 7.796323576903802e-08, "logits/chosen": -1.2337273359298706, "logits/rejected": -1.1950453519821167, "logps/chosen": -117.95428466796875, "logps/rejected": -130.00286865234375, "loss": 0.4105, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5730584263801575, "rewards/margins": 1.2603018283843994, "rewards/rejected": -1.8333604335784912, "step": 1028 }, { "epoch": 1.1929062613101702, "grad_norm": 48.05824759856328, "learning_rate": 7.758648087389277e-08, "logits/chosen": -1.1776765584945679, "logits/rejected": -1.138443946838379, "logps/chosen": -154.47654724121094, "logps/rejected": -178.0967254638672, "loss": 0.4069, "rewards/accuracies": 0.84375, "rewards/chosen": -0.35402223467826843, "rewards/margins": 1.7749781608581543, "rewards/rejected": -2.129000186920166, "step": 1030 }, { "epoch": 1.1952225841476656, "grad_norm": 60.882004275691166, "learning_rate": 7.721006065751722e-08, "logits/chosen": -1.232604742050171, "logits/rejected": -1.1865416765213013, "logps/chosen": -126.62928771972656, "logps/rejected": -172.99932861328125, "loss": 0.4158, "rewards/accuracies": 0.71875, "rewards/chosen": -0.36373910307884216, "rewards/margins": 1.6464154720306396, "rewards/rejected": -2.0101544857025146, "step": 1032 }, { "epoch": 1.197538906985161, "grad_norm": 70.29624608969998, "learning_rate": 7.683398074061978e-08, "logits/chosen": -1.06423819065094, "logits/rejected": -1.11081063747406, "logps/chosen": -159.3987579345703, "logps/rejected": -209.29507446289062, "loss": 0.4477, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6383198499679565, "rewards/margins": 1.1516997814178467, "rewards/rejected": -1.7900197505950928, "step": 1034 }, { "epoch": 1.1998552298226566, "grad_norm": 69.02503899807796, "learning_rate": 7.645824673882748e-08, "logits/chosen": -1.2093576192855835, "logits/rejected": -1.295418620109558, "logps/chosen": -149.6958770751953, "logps/rejected": -182.64175415039062, "loss": 0.4361, "rewards/accuracies": 0.84375, "rewards/chosen": -0.20852233469486237, "rewards/margins": 1.81901216506958, "rewards/rejected": -2.0275347232818604, "step": 1036 }, { "epoch": 1.202171552660152, "grad_norm": 55.284909574315805, "learning_rate": 7.608286426260219e-08, "logits/chosen": -1.1127514839172363, "logits/rejected": -1.133642554283142, "logps/chosen": -142.2635040283203, "logps/rejected": -179.11532592773438, "loss": 0.4666, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3077201247215271, "rewards/margins": 1.6411253213882446, "rewards/rejected": -1.948845386505127, "step": 1038 }, { "epoch": 1.2044878754976476, "grad_norm": 49.756140445716696, "learning_rate": 7.570783891715665e-08, "logits/chosen": -1.173255443572998, "logits/rejected": -1.2683913707733154, "logps/chosen": -106.72663116455078, "logps/rejected": -135.44651794433594, "loss": 0.4929, "rewards/accuracies": 0.6875, "rewards/chosen": -0.030147109180688858, "rewards/margins": 0.8438385128974915, "rewards/rejected": -0.8739855885505676, "step": 1040 }, { "epoch": 1.206804198335143, "grad_norm": 60.06794739596405, "learning_rate": 7.533317630237115e-08, "logits/chosen": -1.3435733318328857, "logits/rejected": -1.312391996383667, "logps/chosen": -163.0519256591797, "logps/rejected": -166.5930938720703, "loss": 0.4324, "rewards/accuracies": 0.71875, "rewards/chosen": -0.11638803780078888, "rewards/margins": 0.7535598278045654, "rewards/rejected": -0.8699477910995483, "step": 1042 }, { "epoch": 1.2091205211726384, "grad_norm": 56.931564962745455, "learning_rate": 7.49588820127095e-08, "logits/chosen": -1.1953742504119873, "logits/rejected": -1.1958227157592773, "logps/chosen": -118.95218658447266, "logps/rejected": -140.38002014160156, "loss": 0.4184, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22595052421092987, "rewards/margins": 0.9052475690841675, "rewards/rejected": -1.1311979293823242, "step": 1044 }, { "epoch": 1.211436844010134, "grad_norm": 55.662571403764815, "learning_rate": 7.458496163713574e-08, "logits/chosen": -1.2039445638656616, "logits/rejected": -1.2152304649353027, "logps/chosen": -109.94539642333984, "logps/rejected": -135.84727478027344, "loss": 0.4141, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2002752423286438, "rewards/margins": 1.1491786241531372, "rewards/rejected": -1.3494538068771362, "step": 1046 }, { "epoch": 1.2137531668476294, "grad_norm": 49.82091399401212, "learning_rate": 7.421142075903066e-08, "logits/chosen": -1.2191132307052612, "logits/rejected": -1.2416399717330933, "logps/chosen": -226.6573944091797, "logps/rejected": -276.5934143066406, "loss": 0.3952, "rewards/accuracies": 0.9375, "rewards/chosen": -0.4292265772819519, "rewards/margins": 2.0150585174560547, "rewards/rejected": -2.4442849159240723, "step": 1048 }, { "epoch": 1.216069489685125, "grad_norm": 64.17855659320382, "learning_rate": 7.38382649561083e-08, "logits/chosen": -1.1202154159545898, "logits/rejected": -1.2247982025146484, "logps/chosen": -111.54693603515625, "logps/rejected": -145.57147216796875, "loss": 0.4106, "rewards/accuracies": 0.875, "rewards/chosen": -0.25748300552368164, "rewards/margins": 1.1120051145553589, "rewards/rejected": -1.369488000869751, "step": 1050 }, { "epoch": 1.2183858125226203, "grad_norm": 65.69606574367761, "learning_rate": 7.346549980033283e-08, "logits/chosen": -1.3523645401000977, "logits/rejected": -1.351737380027771, "logps/chosen": -136.0108184814453, "logps/rejected": -162.3245849609375, "loss": 0.4704, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3510608375072479, "rewards/margins": 1.473578691482544, "rewards/rejected": -1.8246395587921143, "step": 1052 }, { "epoch": 1.2207021353601157, "grad_norm": 55.82580920258548, "learning_rate": 7.309313085783524e-08, "logits/chosen": -1.2405411005020142, "logits/rejected": -1.2566791772842407, "logps/chosen": -156.658447265625, "logps/rejected": -178.74977111816406, "loss": 0.4462, "rewards/accuracies": 0.8125, "rewards/chosen": -0.173320010304451, "rewards/margins": 1.349287748336792, "rewards/rejected": -1.522607684135437, "step": 1054 }, { "epoch": 1.2230184581976113, "grad_norm": 48.17933326291119, "learning_rate": 7.272116368883032e-08, "logits/chosen": -1.2654701471328735, "logits/rejected": -1.2995151281356812, "logps/chosen": -113.21552276611328, "logps/rejected": -131.8874053955078, "loss": 0.4331, "rewards/accuracies": 0.75, "rewards/chosen": -0.33794450759887695, "rewards/margins": 0.8526211977005005, "rewards/rejected": -1.190565824508667, "step": 1056 }, { "epoch": 1.2253347810351067, "grad_norm": 43.469270032927135, "learning_rate": 7.234960384753352e-08, "logits/chosen": -1.330349087715149, "logits/rejected": -1.3346906900405884, "logps/chosen": -144.82508850097656, "logps/rejected": -143.67098999023438, "loss": 0.4426, "rewards/accuracies": 0.71875, "rewards/chosen": -0.05265193432569504, "rewards/margins": 0.768354594707489, "rewards/rejected": -0.8210065364837646, "step": 1058 }, { "epoch": 1.2276511038726023, "grad_norm": 63.94785332530619, "learning_rate": 7.197845688207805e-08, "logits/chosen": -1.2484495639801025, "logits/rejected": -1.325204849243164, "logps/chosen": -157.0628662109375, "logps/rejected": -188.86407470703125, "loss": 0.4299, "rewards/accuracies": 0.78125, "rewards/chosen": -0.21731358766555786, "rewards/margins": 0.9957473278045654, "rewards/rejected": -1.213060975074768, "step": 1060 }, { "epoch": 1.2299674267100977, "grad_norm": 58.10630261522442, "learning_rate": 7.160772833443211e-08, "logits/chosen": -1.1713765859603882, "logits/rejected": -1.224376916885376, "logps/chosen": -111.91516876220703, "logps/rejected": -145.06793212890625, "loss": 0.4057, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3621346950531006, "rewards/margins": 1.1002415418624878, "rewards/rejected": -1.462376356124878, "step": 1062 }, { "epoch": 1.2322837495475931, "grad_norm": 64.07992580986834, "learning_rate": 7.123742374031605e-08, "logits/chosen": -1.3821399211883545, "logits/rejected": -1.3516349792480469, "logps/chosen": -150.6949920654297, "logps/rejected": -168.636474609375, "loss": 0.4631, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31529155373573303, "rewards/margins": 1.3215854167938232, "rewards/rejected": -1.6368769407272339, "step": 1064 }, { "epoch": 1.2346000723850887, "grad_norm": 83.72009451188586, "learning_rate": 7.086754862911981e-08, "logits/chosen": -1.2367409467697144, "logits/rejected": -1.33730947971344, "logps/chosen": -136.8096160888672, "logps/rejected": -179.91090393066406, "loss": 0.4679, "rewards/accuracies": 0.78125, "rewards/chosen": -0.29408732056617737, "rewards/margins": 1.3833246231079102, "rewards/rejected": -1.6774119138717651, "step": 1066 }, { "epoch": 1.2369163952225841, "grad_norm": 52.574273936246385, "learning_rate": 7.049810852382013e-08, "logits/chosen": -1.2584588527679443, "logits/rejected": -1.301451563835144, "logps/chosen": -143.51211547851562, "logps/rejected": -171.96951293945312, "loss": 0.3966, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2525397539138794, "rewards/margins": 1.394914150238037, "rewards/rejected": -1.647454023361206, "step": 1068 }, { "epoch": 1.2392327180600797, "grad_norm": 56.969381805967295, "learning_rate": 7.012910894089845e-08, "logits/chosen": -1.1986093521118164, "logits/rejected": -1.2569533586502075, "logps/chosen": -126.60311126708984, "logps/rejected": -163.20584106445312, "loss": 0.4396, "rewards/accuracies": 0.75, "rewards/chosen": -0.25415244698524475, "rewards/margins": 1.2877380847930908, "rewards/rejected": -1.5418905019760132, "step": 1070 }, { "epoch": 1.2415490408975751, "grad_norm": 60.76903483748122, "learning_rate": 6.976055539025817e-08, "logits/chosen": -1.146588921546936, "logits/rejected": -1.2753336429595947, "logps/chosen": -130.54420471191406, "logps/rejected": -164.23394775390625, "loss": 0.4353, "rewards/accuracies": 0.75, "rewards/chosen": -0.2767847180366516, "rewards/margins": 1.2378323078155518, "rewards/rejected": -1.5146170854568481, "step": 1072 }, { "epoch": 1.2438653637350705, "grad_norm": 94.702744651523, "learning_rate": 6.939245337514263e-08, "logits/chosen": -1.2317594289779663, "logits/rejected": -1.2677679061889648, "logps/chosen": -126.01373291015625, "logps/rejected": -153.23544311523438, "loss": 0.4175, "rewards/accuracies": 0.75, "rewards/chosen": 0.04374053329229355, "rewards/margins": 1.1703221797943115, "rewards/rejected": -1.1265815496444702, "step": 1074 }, { "epoch": 1.2461816865725661, "grad_norm": 52.89885223375742, "learning_rate": 6.902480839205276e-08, "logits/chosen": -1.180016279220581, "logits/rejected": -1.2080872058868408, "logps/chosen": -201.50587463378906, "logps/rejected": -249.22219848632812, "loss": 0.3789, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3483750522136688, "rewards/margins": 2.1332645416259766, "rewards/rejected": -2.481639862060547, "step": 1076 }, { "epoch": 1.2484980094100615, "grad_norm": 54.80164326787735, "learning_rate": 6.865762593066514e-08, "logits/chosen": -0.9807424545288086, "logits/rejected": -1.0156656503677368, "logps/chosen": -100.35003662109375, "logps/rejected": -127.42887878417969, "loss": 0.4184, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5549271106719971, "rewards/margins": 1.5452109575271606, "rewards/rejected": -2.1001381874084473, "step": 1078 }, { "epoch": 1.2508143322475571, "grad_norm": 53.45042262080674, "learning_rate": 6.82909114737499e-08, "logits/chosen": -1.283860206604004, "logits/rejected": -1.298774242401123, "logps/chosen": -192.04173278808594, "logps/rejected": -203.33514404296875, "loss": 0.4287, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3390694260597229, "rewards/margins": 1.2199491262435913, "rewards/rejected": -1.5590184926986694, "step": 1080 }, { "epoch": 1.2531306550850525, "grad_norm": 62.048531646505914, "learning_rate": 6.792467049708906e-08, "logits/chosen": -1.3052669763565063, "logits/rejected": -1.3348180055618286, "logps/chosen": -188.72085571289062, "logps/rejected": -217.85693359375, "loss": 0.42, "rewards/accuracies": 0.78125, "rewards/chosen": -0.23418346047401428, "rewards/margins": 1.5606598854064941, "rewards/rejected": -1.794843316078186, "step": 1082 }, { "epoch": 1.255446977922548, "grad_norm": 53.407667874895914, "learning_rate": 6.755890846939453e-08, "logits/chosen": -1.1453837156295776, "logits/rejected": -1.2053775787353516, "logps/chosen": -99.1140365600586, "logps/rejected": -125.2704849243164, "loss": 0.4225, "rewards/accuracies": 0.75, "rewards/chosen": -0.2742993235588074, "rewards/margins": 0.9954284429550171, "rewards/rejected": -1.2697277069091797, "step": 1084 }, { "epoch": 1.2577633007600435, "grad_norm": 65.0600936041139, "learning_rate": 6.719363085222656e-08, "logits/chosen": -1.278088092803955, "logits/rejected": -1.439531922340393, "logps/chosen": -156.99533081054688, "logps/rejected": -208.32614135742188, "loss": 0.4391, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1764114499092102, "rewards/margins": 1.526625156402588, "rewards/rejected": -1.7030367851257324, "step": 1086 }, { "epoch": 1.260079623597539, "grad_norm": 67.09113553951667, "learning_rate": 6.682884309991223e-08, "logits/chosen": -1.2901434898376465, "logits/rejected": -1.2637979984283447, "logps/chosen": -188.45367431640625, "logps/rejected": -209.62330627441406, "loss": 0.4598, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1700163036584854, "rewards/margins": 1.609601378440857, "rewards/rejected": -1.7796176671981812, "step": 1088 }, { "epoch": 1.2623959464350345, "grad_norm": 62.43345962867118, "learning_rate": 6.646455065946386e-08, "logits/chosen": -1.2528804540634155, "logits/rejected": -1.285109043121338, "logps/chosen": -151.94007873535156, "logps/rejected": -183.51473999023438, "loss": 0.4645, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4142577052116394, "rewards/margins": 1.807876706123352, "rewards/rejected": -2.222134590148926, "step": 1090 }, { "epoch": 1.2647122692725299, "grad_norm": 68.06328374523846, "learning_rate": 6.610075897049787e-08, "logits/chosen": -1.1376911401748657, "logits/rejected": -1.1852926015853882, "logps/chosen": -133.3196563720703, "logps/rejected": -151.58944702148438, "loss": 0.4271, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2607348561286926, "rewards/margins": 1.1624417304992676, "rewards/rejected": -1.423176646232605, "step": 1092 }, { "epoch": 1.2670285921100253, "grad_norm": 54.82252614834041, "learning_rate": 6.573747346515338e-08, "logits/chosen": -1.1859668493270874, "logits/rejected": -1.2004412412643433, "logps/chosen": -98.60173034667969, "logps/rejected": -108.228759765625, "loss": 0.427, "rewards/accuracies": 0.84375, "rewards/chosen": 0.049306582659482956, "rewards/margins": 0.7941040396690369, "rewards/rejected": -0.7447974681854248, "step": 1094 }, { "epoch": 1.2693449149475209, "grad_norm": 57.11140925441194, "learning_rate": 6.537469956801128e-08, "logits/chosen": -1.2591499090194702, "logits/rejected": -1.2756586074829102, "logps/chosen": -172.0859832763672, "logps/rejected": -197.0950469970703, "loss": 0.4584, "rewards/accuracies": 0.84375, "rewards/chosen": 0.014586816541850567, "rewards/margins": 1.5466443300247192, "rewards/rejected": -1.532057523727417, "step": 1096 }, { "epoch": 1.2716612377850163, "grad_norm": 67.16427104689467, "learning_rate": 6.501244269601301e-08, "logits/chosen": -1.1831741333007812, "logits/rejected": -1.2193033695220947, "logps/chosen": -137.45553588867188, "logps/rejected": -160.08860778808594, "loss": 0.4516, "rewards/accuracies": 0.78125, "rewards/chosen": -0.44443202018737793, "rewards/margins": 1.2256213426589966, "rewards/rejected": -1.670053243637085, "step": 1098 }, { "epoch": 1.2739775606225119, "grad_norm": 62.25777011992104, "learning_rate": 6.465070825837984e-08, "logits/chosen": -1.4166083335876465, "logits/rejected": -1.315108299255371, "logps/chosen": -205.66421508789062, "logps/rejected": -192.6638946533203, "loss": 0.4758, "rewards/accuracies": 0.84375, "rewards/chosen": -0.35738953948020935, "rewards/margins": 1.106539011001587, "rewards/rejected": -1.463928461074829, "step": 1100 }, { "epoch": 1.2739775606225119, "eval_logits/chosen": -1.2365539073944092, "eval_logits/rejected": -1.2307358980178833, "eval_logps/chosen": -141.82875061035156, "eval_logps/rejected": -145.88864135742188, "eval_loss": 0.6021616458892822, "eval_rewards/accuracies": 0.6800000071525574, "eval_rewards/chosen": -0.6197583079338074, "eval_rewards/margins": 0.573750913143158, "eval_rewards/rejected": -1.1935091018676758, "eval_runtime": 26.1434, "eval_samples_per_second": 3.825, "eval_steps_per_second": 0.956, "step": 1100 }, { "epoch": 1.2762938834600073, "grad_norm": 79.32910060699834, "learning_rate": 6.428950165653203e-08, "logits/chosen": -1.1748266220092773, "logits/rejected": -1.21055269241333, "logps/chosen": -180.78173828125, "logps/rejected": -216.40310668945312, "loss": 0.407, "rewards/accuracies": 0.875, "rewards/chosen": -0.34648019075393677, "rewards/margins": 1.1977996826171875, "rewards/rejected": -1.5442798137664795, "step": 1102 }, { "epoch": 1.2786102062975027, "grad_norm": 57.00819560561195, "learning_rate": 6.392882828400824e-08, "logits/chosen": -1.20145583152771, "logits/rejected": -1.2809226512908936, "logps/chosen": -129.96795654296875, "logps/rejected": -172.04791259765625, "loss": 0.3894, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0752699226140976, "rewards/margins": 1.4692878723144531, "rewards/rejected": -1.544557809829712, "step": 1104 }, { "epoch": 1.2809265291349983, "grad_norm": 89.86303419517047, "learning_rate": 6.356869352638487e-08, "logits/chosen": -1.1432220935821533, "logits/rejected": -1.130506157875061, "logps/chosen": -121.71344757080078, "logps/rejected": -145.05398559570312, "loss": 0.4827, "rewards/accuracies": 0.75, "rewards/chosen": -0.3815082311630249, "rewards/margins": 1.2451475858688354, "rewards/rejected": -1.6266558170318604, "step": 1106 }, { "epoch": 1.2832428519724937, "grad_norm": 55.089745486178494, "learning_rate": 6.320910276119576e-08, "logits/chosen": -1.2031654119491577, "logits/rejected": -1.2435033321380615, "logps/chosen": -124.21519470214844, "logps/rejected": -163.48570251464844, "loss": 0.3949, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23762348294258118, "rewards/margins": 1.7965894937515259, "rewards/rejected": -2.034213066101074, "step": 1108 }, { "epoch": 1.2855591748099893, "grad_norm": 55.670403843867525, "learning_rate": 6.285006135785188e-08, "logits/chosen": -1.2561604976654053, "logits/rejected": -1.3285340070724487, "logps/chosen": -183.5081024169922, "logps/rejected": -211.64520263671875, "loss": 0.3639, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07203777134418488, "rewards/margins": 1.2524206638336182, "rewards/rejected": -1.324458360671997, "step": 1110 }, { "epoch": 1.2878754976474847, "grad_norm": 64.9709430383997, "learning_rate": 6.24915746775611e-08, "logits/chosen": -1.3581562042236328, "logits/rejected": -1.395113468170166, "logps/chosen": -129.36273193359375, "logps/rejected": -150.3099822998047, "loss": 0.4628, "rewards/accuracies": 0.875, "rewards/chosen": -0.04725523293018341, "rewards/margins": 0.9219971299171448, "rewards/rejected": -0.9692522883415222, "step": 1112 }, { "epoch": 1.29019182048498, "grad_norm": 64.29245714266342, "learning_rate": 6.213364807324817e-08, "logits/chosen": -1.3286385536193848, "logits/rejected": -1.327454686164856, "logps/chosen": -144.95196533203125, "logps/rejected": -151.3776092529297, "loss": 0.4389, "rewards/accuracies": 0.875, "rewards/chosen": 0.029904872179031372, "rewards/margins": 0.8001270890235901, "rewards/rejected": -0.7702221274375916, "step": 1114 }, { "epoch": 1.2925081433224754, "grad_norm": 58.190371163079014, "learning_rate": 6.177628688947478e-08, "logits/chosen": -1.234309196472168, "logits/rejected": -1.350053310394287, "logps/chosen": -127.14999389648438, "logps/rejected": -157.17440795898438, "loss": 0.4114, "rewards/accuracies": 0.84375, "rewards/chosen": 0.16708944737911224, "rewards/margins": 1.1116842031478882, "rewards/rejected": -0.9445948004722595, "step": 1116 }, { "epoch": 1.294824466159971, "grad_norm": 54.20945636543121, "learning_rate": 6.141949646235971e-08, "logits/chosen": -1.3086659908294678, "logits/rejected": -1.3124091625213623, "logps/chosen": -99.00717163085938, "logps/rejected": -106.53187561035156, "loss": 0.413, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06069256365299225, "rewards/margins": 0.8353962898254395, "rewards/rejected": -0.8960888385772705, "step": 1118 }, { "epoch": 1.2971407889974667, "grad_norm": 91.99220363408854, "learning_rate": 6.106328211949928e-08, "logits/chosen": -1.2780444622039795, "logits/rejected": -1.2977427244186401, "logps/chosen": -132.8006591796875, "logps/rejected": -153.55703735351562, "loss": 0.4044, "rewards/accuracies": 0.90625, "rewards/chosen": -0.38342520594596863, "rewards/margins": 1.2909700870513916, "rewards/rejected": -1.6743953227996826, "step": 1120 }, { "epoch": 1.299457111834962, "grad_norm": 56.80422831250697, "learning_rate": 6.070764917988767e-08, "logits/chosen": -1.3879225254058838, "logits/rejected": -1.3974205255508423, "logps/chosen": -159.0819854736328, "logps/rejected": -178.7696533203125, "loss": 0.4403, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07067500054836273, "rewards/margins": 0.9483537673950195, "rewards/rejected": -1.0190287828445435, "step": 1122 }, { "epoch": 1.3017734346724574, "grad_norm": 65.5448850259776, "learning_rate": 6.035260295383755e-08, "logits/chosen": -1.3123283386230469, "logits/rejected": -1.2638592720031738, "logps/chosen": -155.3396453857422, "logps/rejected": -169.42770385742188, "loss": 0.3825, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5618670582771301, "rewards/margins": 0.81768798828125, "rewards/rejected": -1.3795549869537354, "step": 1124 }, { "epoch": 1.3040897575099528, "grad_norm": 57.561302666900545, "learning_rate": 5.999814874290083e-08, "logits/chosen": -1.1903706789016724, "logits/rejected": -1.2088305950164795, "logps/chosen": -99.7565689086914, "logps/rejected": -109.8669204711914, "loss": 0.4488, "rewards/accuracies": 0.875, "rewards/chosen": -0.09332703053951263, "rewards/margins": 0.9215379357337952, "rewards/rejected": -1.0148649215698242, "step": 1126 }, { "epoch": 1.3064060803474484, "grad_norm": 48.59107200050167, "learning_rate": 5.964429183978934e-08, "logits/chosen": -1.235561728477478, "logits/rejected": -1.266747236251831, "logps/chosen": -166.07763671875, "logps/rejected": -201.61143493652344, "loss": 0.4768, "rewards/accuracies": 0.875, "rewards/chosen": 0.0017203092575073242, "rewards/margins": 1.7348185777664185, "rewards/rejected": -1.7330982685089111, "step": 1128 }, { "epoch": 1.308722403184944, "grad_norm": 69.70400295630186, "learning_rate": 5.9291037528296004e-08, "logits/chosen": -1.2883453369140625, "logits/rejected": -1.3554822206497192, "logps/chosen": -177.18875122070312, "logps/rejected": -240.52230834960938, "loss": 0.437, "rewards/accuracies": 0.875, "rewards/chosen": -0.42891791462898254, "rewards/margins": 1.278578281402588, "rewards/rejected": -1.7074964046478271, "step": 1130 }, { "epoch": 1.3110387260224394, "grad_norm": 46.3561129999744, "learning_rate": 5.8938391083215836e-08, "logits/chosen": -1.127896785736084, "logits/rejected": -1.1460002660751343, "logps/chosen": -87.72144317626953, "logps/rejected": -90.4254379272461, "loss": 0.4029, "rewards/accuracies": 0.90625, "rewards/chosen": -0.16617347300052643, "rewards/margins": 0.7372673749923706, "rewards/rejected": -0.9034408926963806, "step": 1132 }, { "epoch": 1.3133550488599348, "grad_norm": 73.59830775445285, "learning_rate": 5.858635777026705e-08, "logits/chosen": -1.1450217962265015, "logits/rejected": -1.199432134628296, "logps/chosen": -133.6552734375, "logps/rejected": -164.2638397216797, "loss": 0.4896, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4337310791015625, "rewards/margins": 1.023172378540039, "rewards/rejected": -1.4569035768508911, "step": 1134 }, { "epoch": 1.3156713716974302, "grad_norm": 58.55089972225911, "learning_rate": 5.823494284601282e-08, "logits/chosen": -1.2085198163986206, "logits/rejected": -1.2123228311538696, "logps/chosen": -70.654052734375, "logps/rejected": -79.30513000488281, "loss": 0.4436, "rewards/accuracies": 0.65625, "rewards/chosen": -0.11036411672830582, "rewards/margins": 0.5570423603057861, "rewards/rejected": -0.667406439781189, "step": 1136 }, { "epoch": 1.3179876945349258, "grad_norm": 59.23557069037413, "learning_rate": 5.78841515577823e-08, "logits/chosen": -1.2972431182861328, "logits/rejected": -1.280043125152588, "logps/chosen": -177.51788330078125, "logps/rejected": -195.36822509765625, "loss": 0.4073, "rewards/accuracies": 0.8125, "rewards/chosen": -0.28714144229888916, "rewards/margins": 1.2412432432174683, "rewards/rejected": -1.5283846855163574, "step": 1138 }, { "epoch": 1.3203040173724212, "grad_norm": 53.15802234553906, "learning_rate": 5.753398914359266e-08, "logits/chosen": -1.1981011629104614, "logits/rejected": -1.1449341773986816, "logps/chosen": -145.66134643554688, "logps/rejected": -182.11788940429688, "loss": 0.4173, "rewards/accuracies": 0.90625, "rewards/chosen": -0.47839483618736267, "rewards/margins": 1.4796185493469238, "rewards/rejected": -1.9580132961273193, "step": 1140 }, { "epoch": 1.3226203402099168, "grad_norm": 62.503806777381996, "learning_rate": 5.718446083207067e-08, "logits/chosen": -1.1653722524642944, "logits/rejected": -1.1504828929901123, "logps/chosen": -127.88018035888672, "logps/rejected": -162.83746337890625, "loss": 0.4462, "rewards/accuracies": 0.6875, "rewards/chosen": -0.29053574800491333, "rewards/margins": 1.4083887338638306, "rewards/rejected": -1.6989246606826782, "step": 1142 }, { "epoch": 1.3249366630474122, "grad_norm": 60.87169400038914, "learning_rate": 5.6835571842374596e-08, "logits/chosen": -1.2247257232666016, "logits/rejected": -1.211700677871704, "logps/chosen": -116.61417388916016, "logps/rejected": -123.75843811035156, "loss": 0.4267, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37912824749946594, "rewards/margins": 0.8145177364349365, "rewards/rejected": -1.1936458349227905, "step": 1144 }, { "epoch": 1.3272529858849076, "grad_norm": 47.22854815508047, "learning_rate": 5.64873273841165e-08, "logits/chosen": -1.2592945098876953, "logits/rejected": -1.3327893018722534, "logps/chosen": -81.57473754882812, "logps/rejected": -102.12101745605469, "loss": 0.4355, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2720547318458557, "rewards/margins": 0.8181528449058533, "rewards/rejected": -1.090207576751709, "step": 1146 }, { "epoch": 1.3295693087224032, "grad_norm": 52.172801221764196, "learning_rate": 5.613973265728415e-08, "logits/chosen": -1.348939299583435, "logits/rejected": -1.3633886575698853, "logps/chosen": -161.54638671875, "logps/rejected": -189.577880859375, "loss": 0.4636, "rewards/accuracies": 0.8125, "rewards/chosen": -0.327782541513443, "rewards/margins": 1.0368468761444092, "rewards/rejected": -1.3646295070648193, "step": 1148 }, { "epoch": 1.3318856315598986, "grad_norm": 56.430234636979094, "learning_rate": 5.5792792852163686e-08, "logits/chosen": -1.2496174573898315, "logits/rejected": -1.2435290813446045, "logps/chosen": -155.9248046875, "logps/rejected": -224.99728393554688, "loss": 0.3778, "rewards/accuracies": 0.8125, "rewards/chosen": -0.453360378742218, "rewards/margins": 2.788325548171997, "rewards/rejected": -3.2416858673095703, "step": 1150 }, { "epoch": 1.3342019543973942, "grad_norm": 49.140687823021224, "learning_rate": 5.544651314926175e-08, "logits/chosen": -1.1513588428497314, "logits/rejected": -1.1785624027252197, "logps/chosen": -132.36451721191406, "logps/rejected": -139.9801025390625, "loss": 0.3794, "rewards/accuracies": 0.90625, "rewards/chosen": -0.17863646149635315, "rewards/margins": 0.9919829368591309, "rewards/rejected": -1.1706193685531616, "step": 1152 }, { "epoch": 1.3365182772348896, "grad_norm": 54.74690607277333, "learning_rate": 5.510089871922854e-08, "logits/chosen": -1.1350257396697998, "logits/rejected": -1.091831922531128, "logps/chosen": -140.16416931152344, "logps/rejected": -163.96957397460938, "loss": 0.4229, "rewards/accuracies": 0.84375, "rewards/chosen": -0.26455140113830566, "rewards/margins": 1.4111688137054443, "rewards/rejected": -1.67572021484375, "step": 1154 }, { "epoch": 1.338834600072385, "grad_norm": 58.027061721474524, "learning_rate": 5.475595472278023e-08, "logits/chosen": -1.1636797189712524, "logits/rejected": -1.2959442138671875, "logps/chosen": -131.42010498046875, "logps/rejected": -175.03054809570312, "loss": 0.4132, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11369796097278595, "rewards/margins": 1.662842035293579, "rewards/rejected": -1.7765400409698486, "step": 1156 }, { "epoch": 1.3411509229098806, "grad_norm": 67.45272251248225, "learning_rate": 5.4411686310622186e-08, "logits/chosen": -1.178033471107483, "logits/rejected": -1.1936701536178589, "logps/chosen": -162.93310546875, "logps/rejected": -208.5606231689453, "loss": 0.4718, "rewards/accuracies": 0.75, "rewards/chosen": -0.7663465738296509, "rewards/margins": 1.6512737274169922, "rewards/rejected": -2.4176201820373535, "step": 1158 }, { "epoch": 1.343467245747376, "grad_norm": 59.19683202042164, "learning_rate": 5.406809862337187e-08, "logits/chosen": -1.3426570892333984, "logits/rejected": -1.3180351257324219, "logps/chosen": -160.33921813964844, "logps/rejected": -176.23568725585938, "loss": 0.4398, "rewards/accuracies": 0.78125, "rewards/chosen": -0.15457448363304138, "rewards/margins": 1.2215690612792969, "rewards/rejected": -1.376143455505371, "step": 1160 }, { "epoch": 1.3457835685848716, "grad_norm": 56.807281430093354, "learning_rate": 5.3725196791482266e-08, "logits/chosen": -1.1778029203414917, "logits/rejected": -1.2173116207122803, "logps/chosen": -166.3194580078125, "logps/rejected": -187.35598754882812, "loss": 0.4569, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2830876111984253, "rewards/margins": 1.4655952453613281, "rewards/rejected": -1.7486828565597534, "step": 1162 }, { "epoch": 1.348099891422367, "grad_norm": 67.01942867566683, "learning_rate": 5.338298593516499e-08, "logits/chosen": -1.1145247220993042, "logits/rejected": -1.1617120504379272, "logps/chosen": -102.82472229003906, "logps/rejected": -128.9154052734375, "loss": 0.4307, "rewards/accuracies": 0.75, "rewards/chosen": -0.4769600033760071, "rewards/margins": 0.9643608331680298, "rewards/rejected": -1.4413208961486816, "step": 1164 }, { "epoch": 1.3504162142598624, "grad_norm": 51.57005647269304, "learning_rate": 5.3041471164314214e-08, "logits/chosen": -1.308578610420227, "logits/rejected": -1.3058252334594727, "logps/chosen": -160.94886779785156, "logps/rejected": -172.73074340820312, "loss": 0.4133, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3100912868976593, "rewards/margins": 1.440206527709961, "rewards/rejected": -1.7502976655960083, "step": 1166 }, { "epoch": 1.352732537097358, "grad_norm": 53.72829765043651, "learning_rate": 5.2700657578429995e-08, "logits/chosen": -1.1010003089904785, "logits/rejected": -1.1325709819793701, "logps/chosen": -119.34065246582031, "logps/rejected": -141.7773895263672, "loss": 0.3688, "rewards/accuracies": 0.875, "rewards/chosen": -0.09922142326831818, "rewards/margins": 0.9483239650726318, "rewards/rejected": -1.0475454330444336, "step": 1168 }, { "epoch": 1.3550488599348534, "grad_norm": 72.40229211731818, "learning_rate": 5.236055026654231e-08, "logits/chosen": -1.1682099103927612, "logits/rejected": -1.2344272136688232, "logps/chosen": -149.36631774902344, "logps/rejected": -195.6129608154297, "loss": 0.4075, "rewards/accuracies": 0.8125, "rewards/chosen": -0.36407387256622314, "rewards/margins": 1.1139262914657593, "rewards/rejected": -1.478000283241272, "step": 1170 }, { "epoch": 1.357365182772349, "grad_norm": 65.62146445760688, "learning_rate": 5.202115430713518e-08, "logits/chosen": -1.290632724761963, "logits/rejected": -1.3468101024627686, "logps/chosen": -148.1609344482422, "logps/rejected": -171.7547607421875, "loss": 0.434, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13772422075271606, "rewards/margins": 1.1604384183883667, "rewards/rejected": -1.2981626987457275, "step": 1172 }, { "epoch": 1.3596815056098444, "grad_norm": 73.65392956429626, "learning_rate": 5.168247476807053e-08, "logits/chosen": -1.2010302543640137, "logits/rejected": -1.1821855306625366, "logps/chosen": -169.35751342773438, "logps/rejected": -201.23187255859375, "loss": 0.4576, "rewards/accuracies": 0.6875, "rewards/chosen": -0.726340651512146, "rewards/margins": 2.0060853958129883, "rewards/rejected": -2.732426404953003, "step": 1174 }, { "epoch": 1.3619978284473397, "grad_norm": 67.23562182380863, "learning_rate": 5.1344516706512834e-08, "logits/chosen": -1.1466736793518066, "logits/rejected": -1.1924691200256348, "logps/chosen": -120.49522399902344, "logps/rejected": -158.2263946533203, "loss": 0.4164, "rewards/accuracies": 0.8125, "rewards/chosen": -0.15503083169460297, "rewards/margins": 1.3568706512451172, "rewards/rejected": -1.5119014978408813, "step": 1176 }, { "epoch": 1.3643141512848354, "grad_norm": 53.40303785033479, "learning_rate": 5.100728516885342e-08, "logits/chosen": -1.2885719537734985, "logits/rejected": -1.3625785112380981, "logps/chosen": -129.93800354003906, "logps/rejected": -161.71005249023438, "loss": 0.4342, "rewards/accuracies": 0.84375, "rewards/chosen": 0.009626433253288269, "rewards/margins": 0.8826741576194763, "rewards/rejected": -0.8730477094650269, "step": 1178 }, { "epoch": 1.3666304741223307, "grad_norm": 49.42170731373946, "learning_rate": 5.067078519063513e-08, "logits/chosen": -1.167036771774292, "logits/rejected": -1.1623289585113525, "logps/chosen": -138.56105041503906, "logps/rejected": -148.9010009765625, "loss": 0.4115, "rewards/accuracies": 0.6875, "rewards/chosen": -0.1745842695236206, "rewards/margins": 0.8778476715087891, "rewards/rejected": -1.0524319410324097, "step": 1180 }, { "epoch": 1.3689467969598264, "grad_norm": 56.18353960346196, "learning_rate": 5.033502179647713e-08, "logits/chosen": -1.1595622301101685, "logits/rejected": -1.1569626331329346, "logps/chosen": -115.88796997070312, "logps/rejected": -131.73147583007812, "loss": 0.4146, "rewards/accuracies": 0.75, "rewards/chosen": -0.11277991533279419, "rewards/margins": 1.4318398237228394, "rewards/rejected": -1.5446196794509888, "step": 1182 }, { "epoch": 1.3712631197973217, "grad_norm": 59.72143529349983, "learning_rate": 5.000000000000002e-08, "logits/chosen": -1.15464448928833, "logits/rejected": -1.1753931045532227, "logps/chosen": -119.97201538085938, "logps/rejected": -145.59030151367188, "loss": 0.4074, "rewards/accuracies": 0.8125, "rewards/chosen": -0.31518158316612244, "rewards/margins": 0.9257540106773376, "rewards/rejected": -1.2409354448318481, "step": 1184 }, { "epoch": 1.3735794426348171, "grad_norm": 62.14848354015407, "learning_rate": 4.9665724803750756e-08, "logits/chosen": -1.1488442420959473, "logits/rejected": -1.1463112831115723, "logps/chosen": -158.23907470703125, "logps/rejected": -191.94760131835938, "loss": 0.4367, "rewards/accuracies": 0.75, "rewards/chosen": 0.018879923969507217, "rewards/margins": 1.4537690877914429, "rewards/rejected": -1.4348890781402588, "step": 1186 }, { "epoch": 1.3758957654723127, "grad_norm": 49.492612484233774, "learning_rate": 4.933220119912801e-08, "logits/chosen": -1.1548792123794556, "logits/rejected": -1.169516682624817, "logps/chosen": -130.5518035888672, "logps/rejected": -147.64028930664062, "loss": 0.4145, "rewards/accuracies": 0.78125, "rewards/chosen": -0.40217429399490356, "rewards/margins": 0.7372788786888123, "rewards/rejected": -1.1394531726837158, "step": 1188 }, { "epoch": 1.3782120883098081, "grad_norm": 60.24929695435592, "learning_rate": 4.899943416630794e-08, "logits/chosen": -1.092994213104248, "logits/rejected": -1.1242940425872803, "logps/chosen": -128.6440887451172, "logps/rejected": -153.77162170410156, "loss": 0.4132, "rewards/accuracies": 0.84375, "rewards/chosen": -0.29642218351364136, "rewards/margins": 0.9425420165061951, "rewards/rejected": -1.238964319229126, "step": 1190 }, { "epoch": 1.3805284111473037, "grad_norm": 54.83654894795225, "learning_rate": 4.86674286741693e-08, "logits/chosen": -1.0443332195281982, "logits/rejected": -1.1355880498886108, "logps/chosen": -136.2052764892578, "logps/rejected": -195.44418334960938, "loss": 0.4547, "rewards/accuracies": 0.84375, "rewards/chosen": -0.20831860601902008, "rewards/margins": 1.922303318977356, "rewards/rejected": -2.130621910095215, "step": 1192 }, { "epoch": 1.3828447339847991, "grad_norm": 47.639304202736085, "learning_rate": 4.833618968021956e-08, "logits/chosen": -1.1499733924865723, "logits/rejected": -1.150397539138794, "logps/chosen": -144.41278076171875, "logps/rejected": -184.17367553710938, "loss": 0.4156, "rewards/accuracies": 0.84375, "rewards/chosen": -0.29232925176620483, "rewards/margins": 1.5826106071472168, "rewards/rejected": -1.8749399185180664, "step": 1194 }, { "epoch": 1.3851610568222945, "grad_norm": 56.85736200158409, "learning_rate": 4.800572213052101e-08, "logits/chosen": -1.2480294704437256, "logits/rejected": -1.2443549633026123, "logps/chosen": -137.35763549804688, "logps/rejected": -159.02120971679688, "loss": 0.4256, "rewards/accuracies": 0.8125, "rewards/chosen": -0.35943007469177246, "rewards/margins": 1.1223219633102417, "rewards/rejected": -1.4817521572113037, "step": 1196 }, { "epoch": 1.3874773796597901, "grad_norm": 60.652803586637134, "learning_rate": 4.767603095961652e-08, "logits/chosen": -1.1794486045837402, "logits/rejected": -1.2305172681808472, "logps/chosen": -114.67503356933594, "logps/rejected": -131.82688903808594, "loss": 0.4436, "rewards/accuracies": 0.71875, "rewards/chosen": -0.19165346026420593, "rewards/margins": 0.7199077010154724, "rewards/rejected": -0.911561131477356, "step": 1198 }, { "epoch": 1.3897937024972855, "grad_norm": 67.23416627527426, "learning_rate": 4.734712109045615e-08, "logits/chosen": -1.231626033782959, "logits/rejected": -1.2483142614364624, "logps/chosen": -152.45318603515625, "logps/rejected": -177.7398681640625, "loss": 0.4415, "rewards/accuracies": 0.875, "rewards/chosen": -0.22800426185131073, "rewards/margins": 1.2493054866790771, "rewards/rejected": -1.4773095846176147, "step": 1200 }, { "epoch": 1.3897937024972855, "eval_logits/chosen": -1.2304502725601196, "eval_logits/rejected": -1.2247934341430664, "eval_logps/chosen": -142.80154418945312, "eval_logps/rejected": -147.39393615722656, "eval_loss": 0.5958621501922607, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.717038094997406, "eval_rewards/margins": 0.6269991993904114, "eval_rewards/rejected": -1.3440372943878174, "eval_runtime": 27.927, "eval_samples_per_second": 3.581, "eval_steps_per_second": 0.895, "step": 1200 }, { "epoch": 1.3921100253347811, "grad_norm": 65.19042296707491, "learning_rate": 4.70189974343236e-08, "logits/chosen": -1.2034111022949219, "logits/rejected": -1.2350322008132935, "logps/chosen": -121.24800872802734, "logps/rejected": -157.0181427001953, "loss": 0.389, "rewards/accuracies": 0.875, "rewards/chosen": -0.05601517856121063, "rewards/margins": 1.3547430038452148, "rewards/rejected": -1.4107582569122314, "step": 1202 }, { "epoch": 1.3944263481722765, "grad_norm": 53.022218131457315, "learning_rate": 4.669166489076283e-08, "logits/chosen": -1.1960185766220093, "logits/rejected": -1.2297847270965576, "logps/chosen": -147.0485382080078, "logps/rejected": -177.79165649414062, "loss": 0.3619, "rewards/accuracies": 0.84375, "rewards/chosen": -0.17010337114334106, "rewards/margins": 1.6398181915283203, "rewards/rejected": -1.8099215030670166, "step": 1204 }, { "epoch": 1.396742671009772, "grad_norm": 51.968933285305766, "learning_rate": 4.636512834750479e-08, "logits/chosen": -1.1823878288269043, "logits/rejected": -1.1940593719482422, "logps/chosen": -165.0301055908203, "logps/rejected": -192.39288330078125, "loss": 0.3919, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2851025462150574, "rewards/margins": 1.3472487926483154, "rewards/rejected": -1.6323513984680176, "step": 1206 }, { "epoch": 1.3990589938472675, "grad_norm": 70.67490625711265, "learning_rate": 4.6039392680394705e-08, "logits/chosen": -1.1791980266571045, "logits/rejected": -1.2245995998382568, "logps/chosen": -144.21392822265625, "logps/rejected": -177.3154754638672, "loss": 0.4364, "rewards/accuracies": 0.78125, "rewards/chosen": -0.42649248242378235, "rewards/margins": 1.543526291847229, "rewards/rejected": -1.9700188636779785, "step": 1208 }, { "epoch": 1.401375316684763, "grad_norm": 73.92072443542028, "learning_rate": 4.5714462753319025e-08, "logits/chosen": -1.1179996728897095, "logits/rejected": -1.1189749240875244, "logps/chosen": -176.80593872070312, "logps/rejected": -192.028564453125, "loss": 0.3727, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0178598165512085, "rewards/margins": 1.1851011514663696, "rewards/rejected": -2.202960968017578, "step": 1210 }, { "epoch": 1.4036916395222585, "grad_norm": 77.10308260730022, "learning_rate": 4.539034341813285e-08, "logits/chosen": -1.1620545387268066, "logits/rejected": -1.1280796527862549, "logps/chosen": -148.05978393554688, "logps/rejected": -170.8760986328125, "loss": 0.4417, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6205931901931763, "rewards/margins": 0.9719365835189819, "rewards/rejected": -1.5925298929214478, "step": 1212 }, { "epoch": 1.406007962359754, "grad_norm": 50.96217046302329, "learning_rate": 4.50670395145876e-08, "logits/chosen": -1.2354716062545776, "logits/rejected": -1.2715418338775635, "logps/chosen": -197.6702423095703, "logps/rejected": -234.24781799316406, "loss": 0.416, "rewards/accuracies": 0.8125, "rewards/chosen": -0.38070574402809143, "rewards/margins": 1.9455530643463135, "rewards/rejected": -2.326258897781372, "step": 1214 }, { "epoch": 1.4083242851972493, "grad_norm": 60.92763813478579, "learning_rate": 4.474455587025869e-08, "logits/chosen": -1.2651827335357666, "logits/rejected": -1.2328460216522217, "logps/chosen": -184.45912170410156, "logps/rejected": -197.0801239013672, "loss": 0.3824, "rewards/accuracies": 0.875, "rewards/chosen": -0.584815502166748, "rewards/margins": 1.4052482843399048, "rewards/rejected": -1.9900637865066528, "step": 1216 }, { "epoch": 1.410640608034745, "grad_norm": 63.32401043366279, "learning_rate": 4.4422897300473315e-08, "logits/chosen": -1.3017557859420776, "logits/rejected": -1.2864493131637573, "logps/chosen": -124.99530792236328, "logps/rejected": -148.06082153320312, "loss": 0.3893, "rewards/accuracies": 0.875, "rewards/chosen": -0.2559690475463867, "rewards/margins": 1.1134750843048096, "rewards/rejected": -1.3694441318511963, "step": 1218 }, { "epoch": 1.4129569308722403, "grad_norm": 53.98313466834894, "learning_rate": 4.4102068608238685e-08, "logits/chosen": -1.230201244354248, "logits/rejected": -1.2914016246795654, "logps/chosen": -171.6476287841797, "logps/rejected": -189.03538513183594, "loss": 0.4396, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6881774067878723, "rewards/margins": 0.9222534894943237, "rewards/rejected": -1.6104308366775513, "step": 1220 }, { "epoch": 1.415273253709736, "grad_norm": 62.04575224759104, "learning_rate": 4.3782074584170346e-08, "logits/chosen": -1.214889645576477, "logits/rejected": -1.1926887035369873, "logps/chosen": -92.45348358154297, "logps/rejected": -105.89907836914062, "loss": 0.4592, "rewards/accuracies": 0.71875, "rewards/chosen": -0.3113905191421509, "rewards/margins": 0.9316319823265076, "rewards/rejected": -1.2430225610733032, "step": 1222 }, { "epoch": 1.4175895765472313, "grad_norm": 71.40412638311601, "learning_rate": 4.34629200064205e-08, "logits/chosen": -1.1961030960083008, "logits/rejected": -1.2155812978744507, "logps/chosen": -170.02508544921875, "logps/rejected": -210.41592407226562, "loss": 0.4078, "rewards/accuracies": 0.84375, "rewards/chosen": -0.856181263923645, "rewards/margins": 1.6567095518112183, "rewards/rejected": -2.5128908157348633, "step": 1224 }, { "epoch": 1.4199058993847267, "grad_norm": 57.3723784957176, "learning_rate": 4.314460964060672e-08, "logits/chosen": -1.1995205879211426, "logits/rejected": -1.306661605834961, "logps/chosen": -158.38504028320312, "logps/rejected": -204.92669677734375, "loss": 0.4169, "rewards/accuracies": 0.75, "rewards/chosen": -0.4841935634613037, "rewards/margins": 1.3874504566192627, "rewards/rejected": -1.8716439008712769, "step": 1226 }, { "epoch": 1.4222222222222223, "grad_norm": 51.26821420581517, "learning_rate": 4.2827148239740875e-08, "logits/chosen": -1.1911481618881226, "logits/rejected": -1.2252691984176636, "logps/chosen": -149.15687561035156, "logps/rejected": -168.63299560546875, "loss": 0.3545, "rewards/accuracies": 0.84375, "rewards/chosen": 0.06090724095702171, "rewards/margins": 1.1758480072021484, "rewards/rejected": -1.114940881729126, "step": 1228 }, { "epoch": 1.4245385450597177, "grad_norm": 59.18176805705484, "learning_rate": 4.251054054415808e-08, "logits/chosen": -1.2053039073944092, "logits/rejected": -1.26601243019104, "logps/chosen": -124.14604187011719, "logps/rejected": -133.4225616455078, "loss": 0.5098, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6462215781211853, "rewards/margins": 0.5801703333854675, "rewards/rejected": -1.2263920307159424, "step": 1230 }, { "epoch": 1.4268548678972133, "grad_norm": 59.54196160924442, "learning_rate": 4.219479128144583e-08, "logits/chosen": -1.1135673522949219, "logits/rejected": -1.1362190246582031, "logps/chosen": -168.93687438964844, "logps/rejected": -226.7947235107422, "loss": 0.4097, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9602434039115906, "rewards/margins": 2.3180015087127686, "rewards/rejected": -3.278245210647583, "step": 1232 }, { "epoch": 1.4291711907347087, "grad_norm": 56.10855119361768, "learning_rate": 4.187990516637361e-08, "logits/chosen": -1.3297936916351318, "logits/rejected": -1.3411719799041748, "logps/chosen": -160.88560485839844, "logps/rejected": -200.78402709960938, "loss": 0.4441, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3891071081161499, "rewards/margins": 1.3050543069839478, "rewards/rejected": -1.6941611766815186, "step": 1234 }, { "epoch": 1.431487513572204, "grad_norm": 63.476091563634505, "learning_rate": 4.156588690082229e-08, "logits/chosen": -1.2897419929504395, "logits/rejected": -1.2645704746246338, "logps/chosen": -194.49856567382812, "logps/rejected": -206.52557373046875, "loss": 0.4213, "rewards/accuracies": 0.875, "rewards/chosen": -0.4005884528160095, "rewards/margins": 1.0420472621917725, "rewards/rejected": -1.4426357746124268, "step": 1236 }, { "epoch": 1.4338038364096997, "grad_norm": 64.48058226856965, "learning_rate": 4.125274117371401e-08, "logits/chosen": -1.2089612483978271, "logits/rejected": -1.1553493738174438, "logps/chosen": -168.6042938232422, "logps/rejected": -183.2893524169922, "loss": 0.4148, "rewards/accuracies": 0.75, "rewards/chosen": -0.5347051024436951, "rewards/margins": 1.0337334871292114, "rewards/rejected": -1.5684385299682617, "step": 1238 }, { "epoch": 1.436120159247195, "grad_norm": 66.24493235516152, "learning_rate": 4.094047266094225e-08, "logits/chosen": -1.1589419841766357, "logits/rejected": -1.2252132892608643, "logps/chosen": -121.9848403930664, "logps/rejected": -142.36962890625, "loss": 0.4173, "rewards/accuracies": 0.8125, "rewards/chosen": -0.47534143924713135, "rewards/margins": 0.8780463933944702, "rewards/rejected": -1.353387713432312, "step": 1240 }, { "epoch": 1.4384364820846907, "grad_norm": 67.05029588369739, "learning_rate": 4.062908602530186e-08, "logits/chosen": -1.2876590490341187, "logits/rejected": -1.2815279960632324, "logps/chosen": -125.93046569824219, "logps/rejected": -150.712890625, "loss": 0.4631, "rewards/accuracies": 0.78125, "rewards/chosen": -0.10989043861627579, "rewards/margins": 1.1261409521102905, "rewards/rejected": -1.2360315322875977, "step": 1242 }, { "epoch": 1.440752804922186, "grad_norm": 72.90739630648197, "learning_rate": 4.031858591641948e-08, "logits/chosen": -1.2130520343780518, "logits/rejected": -1.2716223001480103, "logps/chosen": -173.3944549560547, "logps/rejected": -214.772705078125, "loss": 0.4677, "rewards/accuracies": 0.90625, "rewards/chosen": -0.10449859499931335, "rewards/margins": 1.398302435874939, "rewards/rejected": -1.5028009414672852, "step": 1244 }, { "epoch": 1.4430691277596814, "grad_norm": 53.298120215279184, "learning_rate": 4.000897697068417e-08, "logits/chosen": -1.2028003931045532, "logits/rejected": -1.2138316631317139, "logps/chosen": -153.28302001953125, "logps/rejected": -213.52073669433594, "loss": 0.4146, "rewards/accuracies": 0.8125, "rewards/chosen": -0.801752507686615, "rewards/margins": 2.9270172119140625, "rewards/rejected": -3.7287697792053223, "step": 1246 }, { "epoch": 1.445385450597177, "grad_norm": 65.76302946420111, "learning_rate": 3.970026381117813e-08, "logits/chosen": -1.2451378107070923, "logits/rejected": -1.2875595092773438, "logps/chosen": -148.5320587158203, "logps/rejected": -175.6725616455078, "loss": 0.4043, "rewards/accuracies": 0.84375, "rewards/chosen": -0.14613214135169983, "rewards/margins": 1.1353800296783447, "rewards/rejected": -1.2815121412277222, "step": 1248 }, { "epoch": 1.4477017734346724, "grad_norm": 60.49195421908293, "learning_rate": 3.93924510476076e-08, "logits/chosen": -1.1492172479629517, "logits/rejected": -1.1965818405151367, "logps/chosen": -182.40252685546875, "logps/rejected": -212.2118682861328, "loss": 0.3858, "rewards/accuracies": 0.90625, "rewards/chosen": -0.37888991832733154, "rewards/margins": 1.5161809921264648, "rewards/rejected": -1.8950707912445068, "step": 1250 }, { "epoch": 1.450018096272168, "grad_norm": 58.233537771407626, "learning_rate": 3.9085543276234246e-08, "logits/chosen": -1.165425419807434, "logits/rejected": -1.2350255250930786, "logps/chosen": -173.421630859375, "logps/rejected": -217.11502075195312, "loss": 0.395, "rewards/accuracies": 0.875, "rewards/chosen": -0.4715797007083893, "rewards/margins": 1.3812612295150757, "rewards/rejected": -1.8528410196304321, "step": 1252 }, { "epoch": 1.4523344191096634, "grad_norm": 56.04571021565187, "learning_rate": 3.8779545079806244e-08, "logits/chosen": -1.2306987047195435, "logits/rejected": -1.2661701440811157, "logps/chosen": -158.64395141601562, "logps/rejected": -163.93408203125, "loss": 0.4244, "rewards/accuracies": 0.78125, "rewards/chosen": -0.8128874897956848, "rewards/margins": 1.2007758617401123, "rewards/rejected": -2.0136635303497314, "step": 1254 }, { "epoch": 1.4546507419471588, "grad_norm": 56.87831566114625, "learning_rate": 3.847446102749009e-08, "logits/chosen": -1.2190110683441162, "logits/rejected": -1.2493897676467896, "logps/chosen": -165.40530395507812, "logps/rejected": -196.738037109375, "loss": 0.4215, "rewards/accuracies": 0.875, "rewards/chosen": -0.03989005833864212, "rewards/margins": 1.3989439010620117, "rewards/rejected": -1.4388341903686523, "step": 1256 }, { "epoch": 1.4569670647846544, "grad_norm": 50.40656011526863, "learning_rate": 3.817029567480228e-08, "logits/chosen": -1.1830198764801025, "logits/rejected": -1.1742397546768188, "logps/chosen": -136.02699279785156, "logps/rejected": -155.1636199951172, "loss": 0.4128, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21887874603271484, "rewards/margins": 1.521628737449646, "rewards/rejected": -1.7405076026916504, "step": 1258 }, { "epoch": 1.4592833876221498, "grad_norm": 57.01905107655371, "learning_rate": 3.7867053563541195e-08, "logits/chosen": -0.9882857799530029, "logits/rejected": -0.9833186864852905, "logps/chosen": -144.3942413330078, "logps/rejected": -162.635498046875, "loss": 0.3927, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3512459993362427, "rewards/margins": 1.1930773258209229, "rewards/rejected": -1.544323205947876, "step": 1260 }, { "epoch": 1.4615997104596454, "grad_norm": 59.931801696827705, "learning_rate": 3.756473922171941e-08, "logits/chosen": -1.2637214660644531, "logits/rejected": -1.3037135601043701, "logps/chosen": -177.24203491210938, "logps/rejected": -203.71690368652344, "loss": 0.4226, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2562026083469391, "rewards/margins": 1.4607383012771606, "rewards/rejected": -1.716940999031067, "step": 1262 }, { "epoch": 1.4639160332971408, "grad_norm": 61.29727113734488, "learning_rate": 3.726335716349611e-08, "logits/chosen": -1.0998203754425049, "logits/rejected": -1.1926376819610596, "logps/chosen": -140.4119415283203, "logps/rejected": -204.82289123535156, "loss": 0.3865, "rewards/accuracies": 0.875, "rewards/chosen": -0.2265455424785614, "rewards/margins": 1.8351831436157227, "rewards/rejected": -2.0617284774780273, "step": 1264 }, { "epoch": 1.4662323561346362, "grad_norm": 52.33658884441956, "learning_rate": 3.696291188910954e-08, "logits/chosen": -1.334247350692749, "logits/rejected": -1.2990857362747192, "logps/chosen": -211.4185791015625, "logps/rejected": -239.36732482910156, "loss": 0.4207, "rewards/accuracies": 0.8125, "rewards/chosen": -0.521153450012207, "rewards/margins": 1.7276406288146973, "rewards/rejected": -2.2487940788269043, "step": 1266 }, { "epoch": 1.4685486789721318, "grad_norm": 55.99376810708936, "learning_rate": 3.666340788480986e-08, "logits/chosen": -1.2197870016098022, "logits/rejected": -1.2413604259490967, "logps/chosen": -152.36419677734375, "logps/rejected": -199.79225158691406, "loss": 0.4336, "rewards/accuracies": 0.78125, "rewards/chosen": -0.41305410861968994, "rewards/margins": 1.4322149753570557, "rewards/rejected": -1.845268964767456, "step": 1268 }, { "epoch": 1.4708650018096272, "grad_norm": 48.41879744298975, "learning_rate": 3.636484962279226e-08, "logits/chosen": -1.319197416305542, "logits/rejected": -1.4153599739074707, "logps/chosen": -138.763427734375, "logps/rejected": -166.7461395263672, "loss": 0.3822, "rewards/accuracies": 0.875, "rewards/chosen": 0.17914807796478271, "rewards/margins": 1.2840946912765503, "rewards/rejected": -1.1049466133117676, "step": 1270 }, { "epoch": 1.4731813246471228, "grad_norm": 61.194603768661054, "learning_rate": 3.6067241561130114e-08, "logits/chosen": -1.2439590692520142, "logits/rejected": -1.183296799659729, "logps/chosen": -141.58709716796875, "logps/rejected": -141.97996520996094, "loss": 0.409, "rewards/accuracies": 0.8125, "rewards/chosen": 0.017182359471917152, "rewards/margins": 0.84832763671875, "rewards/rejected": -0.8311452269554138, "step": 1272 }, { "epoch": 1.4754976474846182, "grad_norm": 77.92386005527986, "learning_rate": 3.5770588143708315e-08, "logits/chosen": -1.2311725616455078, "logits/rejected": -1.2159764766693115, "logps/chosen": -155.29443359375, "logps/rejected": -178.33035278320312, "loss": 0.4345, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0964183509349823, "rewards/margins": 1.3752447366714478, "rewards/rejected": -1.278826355934143, "step": 1274 }, { "epoch": 1.4778139703221136, "grad_norm": 59.69529021775264, "learning_rate": 3.5474893800157e-08, "logits/chosen": -1.2245632410049438, "logits/rejected": -1.216321349143982, "logps/chosen": -146.5887451171875, "logps/rejected": -148.22378540039062, "loss": 0.4262, "rewards/accuracies": 0.78125, "rewards/chosen": -0.34651532769203186, "rewards/margins": 0.8327122926712036, "rewards/rejected": -1.179227590560913, "step": 1276 }, { "epoch": 1.4801302931596092, "grad_norm": 75.38031016102305, "learning_rate": 3.5180162945785554e-08, "logits/chosen": -1.2859201431274414, "logits/rejected": -1.3334230184555054, "logps/chosen": -122.11463928222656, "logps/rejected": -150.13040161132812, "loss": 0.4738, "rewards/accuracies": 0.875, "rewards/chosen": -0.1191844493150711, "rewards/margins": 1.2066011428833008, "rewards/rejected": -1.3257856369018555, "step": 1278 }, { "epoch": 1.4824466159971046, "grad_norm": 70.9685656111324, "learning_rate": 3.488639998151633e-08, "logits/chosen": -1.1641169786453247, "logits/rejected": -1.2118412256240845, "logps/chosen": -177.6825714111328, "logps/rejected": -220.609619140625, "loss": 0.452, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5036740303039551, "rewards/margins": 2.158677101135254, "rewards/rejected": -2.662351131439209, "step": 1280 }, { "epoch": 1.4847629388346002, "grad_norm": 56.844036643554055, "learning_rate": 3.45936092938193e-08, "logits/chosen": -1.2509068250656128, "logits/rejected": -1.3152053356170654, "logps/chosen": -156.49288940429688, "logps/rejected": -175.0799102783203, "loss": 0.4025, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10208474099636078, "rewards/margins": 1.4737757444381714, "rewards/rejected": -1.5758603811264038, "step": 1282 }, { "epoch": 1.4870792616720956, "grad_norm": 63.04162868208357, "learning_rate": 3.4301795254646396e-08, "logits/chosen": -1.1289265155792236, "logits/rejected": -1.1296483278274536, "logps/chosen": -92.31997680664062, "logps/rejected": -125.47240447998047, "loss": 0.4319, "rewards/accuracies": 0.9375, "rewards/chosen": -0.2308368980884552, "rewards/margins": 1.562235951423645, "rewards/rejected": -1.7930728197097778, "step": 1284 }, { "epoch": 1.489395584509591, "grad_norm": 53.7512285130868, "learning_rate": 3.4010962221366125e-08, "logits/chosen": -1.2739626169204712, "logits/rejected": -1.3384690284729004, "logps/chosen": -133.05880737304688, "logps/rejected": -184.07568359375, "loss": 0.4018, "rewards/accuracies": 0.875, "rewards/chosen": -0.1107388436794281, "rewards/margins": 1.6195769309997559, "rewards/rejected": -1.7303158044815063, "step": 1286 }, { "epoch": 1.4917119073470864, "grad_norm": 69.01008947683725, "learning_rate": 3.3721114536698635e-08, "logits/chosen": -1.3090903759002686, "logits/rejected": -1.3190556764602661, "logps/chosen": -157.00119018554688, "logps/rejected": -156.64285278320312, "loss": 0.435, "rewards/accuracies": 0.84375, "rewards/chosen": -0.31143975257873535, "rewards/margins": 0.8066626787185669, "rewards/rejected": -1.1181025505065918, "step": 1288 }, { "epoch": 1.494028230184582, "grad_norm": 50.25709967553361, "learning_rate": 3.343225652865095e-08, "logits/chosen": -1.1199434995651245, "logits/rejected": -1.2268508672714233, "logps/chosen": -122.3958740234375, "logps/rejected": -160.63177490234375, "loss": 0.3879, "rewards/accuracies": 0.84375, "rewards/chosen": -0.30211785435676575, "rewards/margins": 1.6221270561218262, "rewards/rejected": -1.9242448806762695, "step": 1290 }, { "epoch": 1.4963445530220776, "grad_norm": 52.88445623952638, "learning_rate": 3.3144392510452125e-08, "logits/chosen": -1.2343542575836182, "logits/rejected": -1.3168511390686035, "logps/chosen": -92.0683822631836, "logps/rejected": -115.30619812011719, "loss": 0.4227, "rewards/accuracies": 0.8125, "rewards/chosen": -0.022456973791122437, "rewards/margins": 1.0297763347625732, "rewards/rejected": -1.052233338356018, "step": 1292 }, { "epoch": 1.498660875859573, "grad_norm": 58.861501596082704, "learning_rate": 3.285752678048892e-08, "logits/chosen": -1.1001089811325073, "logits/rejected": -1.200326919555664, "logps/chosen": -122.45906829833984, "logps/rejected": -142.00970458984375, "loss": 0.3886, "rewards/accuracies": 0.75, "rewards/chosen": -0.28310656547546387, "rewards/margins": 0.9431065320968628, "rewards/rejected": -1.226212978363037, "step": 1294 }, { "epoch": 1.5009771986970684, "grad_norm": 56.84546595956147, "learning_rate": 3.2571663622241875e-08, "logits/chosen": -1.2022103071212769, "logits/rejected": -1.2175830602645874, "logps/chosen": -152.65174865722656, "logps/rejected": -191.4637908935547, "loss": 0.4328, "rewards/accuracies": 0.84375, "rewards/chosen": -0.23669162392616272, "rewards/margins": 1.503944993019104, "rewards/rejected": -1.7406367063522339, "step": 1296 }, { "epoch": 1.5032935215345637, "grad_norm": 75.28276112082713, "learning_rate": 3.2286807304220874e-08, "logits/chosen": -1.2572470903396606, "logits/rejected": -1.3082858324050903, "logps/chosen": -178.1489715576172, "logps/rejected": -204.2378692626953, "loss": 0.3727, "rewards/accuracies": 0.875, "rewards/chosen": -0.29775533080101013, "rewards/margins": 1.6656044721603394, "rewards/rejected": -1.9633598327636719, "step": 1298 }, { "epoch": 1.5056098443720594, "grad_norm": 58.03906016771635, "learning_rate": 3.200296207990174e-08, "logits/chosen": -1.234593391418457, "logits/rejected": -1.1559195518493652, "logps/chosen": -146.87179565429688, "logps/rejected": -168.06884765625, "loss": 0.4228, "rewards/accuracies": 0.75, "rewards/chosen": -0.2712811231613159, "rewards/margins": 1.2952864170074463, "rewards/rejected": -1.5665674209594727, "step": 1300 }, { "epoch": 1.5056098443720594, "eval_logits/chosen": -1.2275954484939575, "eval_logits/rejected": -1.222235918045044, "eval_logps/chosen": -142.21490478515625, "eval_logps/rejected": -147.02261352539062, "eval_loss": 0.5889570713043213, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -0.6583734154701233, "eval_rewards/margins": 0.6485320329666138, "eval_rewards/rejected": -1.3069055080413818, "eval_runtime": 28.0973, "eval_samples_per_second": 3.559, "eval_steps_per_second": 0.89, "step": 1300 }, { "epoch": 1.507926167209555, "grad_norm": 53.8054068622811, "learning_rate": 3.172013218766273e-08, "logits/chosen": -1.1949838399887085, "logits/rejected": -1.1276732683181763, "logps/chosen": -116.6605453491211, "logps/rejected": -129.42701721191406, "loss": 0.4054, "rewards/accuracies": 0.84375, "rewards/chosen": -0.26029157638549805, "rewards/margins": 0.894656777381897, "rewards/rejected": -1.1549484729766846, "step": 1302 }, { "epoch": 1.5102424900470504, "grad_norm": 53.44469670494882, "learning_rate": 3.143832185072103e-08, "logits/chosen": -1.330610752105713, "logits/rejected": -1.3352289199829102, "logps/chosen": -129.06997680664062, "logps/rejected": -138.2757568359375, "loss": 0.4228, "rewards/accuracies": 0.8125, "rewards/chosen": -0.19665758311748505, "rewards/margins": 0.8899365067481995, "rewards/rejected": -1.0865941047668457, "step": 1304 }, { "epoch": 1.5125588128845457, "grad_norm": 52.12869653110471, "learning_rate": 3.115753527706986e-08, "logits/chosen": -1.2492622137069702, "logits/rejected": -1.2376924753189087, "logps/chosen": -176.64134216308594, "logps/rejected": -197.2509307861328, "loss": 0.3503, "rewards/accuracies": 0.90625, "rewards/chosen": -0.33254703879356384, "rewards/margins": 1.2227458953857422, "rewards/rejected": -1.5552929639816284, "step": 1306 }, { "epoch": 1.5148751357220411, "grad_norm": 76.85376628040396, "learning_rate": 3.087777665941565e-08, "logits/chosen": -1.0722814798355103, "logits/rejected": -1.1189197301864624, "logps/chosen": -142.5889892578125, "logps/rejected": -179.30929565429688, "loss": 0.425, "rewards/accuracies": 0.9375, "rewards/chosen": -0.607629120349884, "rewards/margins": 1.3540990352630615, "rewards/rejected": -1.9617282152175903, "step": 1308 }, { "epoch": 1.5171914585595367, "grad_norm": 53.40350152738603, "learning_rate": 3.059905017511536e-08, "logits/chosen": -1.289185881614685, "logits/rejected": -1.3002314567565918, "logps/chosen": -155.68594360351562, "logps/rejected": -181.0437774658203, "loss": 0.4321, "rewards/accuracies": 0.78125, "rewards/chosen": -0.1934920847415924, "rewards/margins": 1.0485948324203491, "rewards/rejected": -1.2420868873596191, "step": 1310 }, { "epoch": 1.5195077813970324, "grad_norm": 62.10414133443873, "learning_rate": 3.032135998611409e-08, "logits/chosen": -1.2625949382781982, "logits/rejected": -1.2948625087738037, "logps/chosen": -120.27640533447266, "logps/rejected": -124.78179931640625, "loss": 0.4706, "rewards/accuracies": 0.75, "rewards/chosen": -0.41021931171417236, "rewards/margins": 0.7083292603492737, "rewards/rejected": -1.1185486316680908, "step": 1312 }, { "epoch": 1.5218241042345277, "grad_norm": 53.355625531453235, "learning_rate": 3.004471023888307e-08, "logits/chosen": -1.202606201171875, "logits/rejected": -1.2765260934829712, "logps/chosen": -154.8223876953125, "logps/rejected": -174.3974609375, "loss": 0.3724, "rewards/accuracies": 0.90625, "rewards/chosen": -0.02067536488175392, "rewards/margins": 1.3669226169586182, "rewards/rejected": -1.3875980377197266, "step": 1314 }, { "epoch": 1.5241404270720231, "grad_norm": 59.47198871147131, "learning_rate": 2.9769105064357537e-08, "logits/chosen": -1.3064639568328857, "logits/rejected": -1.2530848979949951, "logps/chosen": -183.5660858154297, "logps/rejected": -209.7610626220703, "loss": 0.4216, "rewards/accuracies": 0.84375, "rewards/chosen": -0.21822071075439453, "rewards/margins": 1.0935778617858887, "rewards/rejected": -1.3117986917495728, "step": 1316 }, { "epoch": 1.5264567499095185, "grad_norm": 65.91690062559164, "learning_rate": 2.949454857787519e-08, "logits/chosen": -1.238956093788147, "logits/rejected": -1.218425989151001, "logps/chosen": -214.0928955078125, "logps/rejected": -242.11489868164062, "loss": 0.431, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3267192244529724, "rewards/margins": 2.120006799697876, "rewards/rejected": -2.446725606918335, "step": 1318 }, { "epoch": 1.5287730727470141, "grad_norm": 79.84542515208742, "learning_rate": 2.9221044879114775e-08, "logits/chosen": -1.251328468322754, "logits/rejected": -1.3467867374420166, "logps/chosen": -177.4748077392578, "logps/rejected": -217.42910766601562, "loss": 0.4328, "rewards/accuracies": 0.90625, "rewards/chosen": -0.36167412996292114, "rewards/margins": 1.5050268173217773, "rewards/rejected": -1.8667008876800537, "step": 1320 }, { "epoch": 1.5310893955845097, "grad_norm": 49.694028003635175, "learning_rate": 2.8948598052034777e-08, "logits/chosen": -1.3321678638458252, "logits/rejected": -1.336784839630127, "logps/chosen": -151.92041015625, "logps/rejected": -168.43971252441406, "loss": 0.4527, "rewards/accuracies": 0.8125, "rewards/chosen": -0.07541098445653915, "rewards/margins": 1.295732021331787, "rewards/rejected": -1.371143102645874, "step": 1322 }, { "epoch": 1.5334057184220051, "grad_norm": 53.56450173273162, "learning_rate": 2.867721216481246e-08, "logits/chosen": -1.278252363204956, "logits/rejected": -1.380322813987732, "logps/chosen": -117.78822326660156, "logps/rejected": -142.52127075195312, "loss": 0.4433, "rewards/accuracies": 0.8125, "rewards/chosen": -0.0004923827946186066, "rewards/margins": 0.94923996925354, "rewards/rejected": -0.94973224401474, "step": 1324 }, { "epoch": 1.5357220412595005, "grad_norm": 61.396961223172646, "learning_rate": 2.8406891269783073e-08, "logits/chosen": -1.1902800798416138, "logits/rejected": -1.3021633625030518, "logps/chosen": -158.06736755371094, "logps/rejected": -229.27200317382812, "loss": 0.4586, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1848677396774292, "rewards/margins": 1.0514723062515259, "rewards/rejected": -1.236340045928955, "step": 1326 }, { "epoch": 1.538038364096996, "grad_norm": 60.56277160405794, "learning_rate": 2.813763940337952e-08, "logits/chosen": -1.2507346868515015, "logits/rejected": -1.2003321647644043, "logps/chosen": -127.8934097290039, "logps/rejected": -149.8028564453125, "loss": 0.4295, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22713351249694824, "rewards/margins": 1.3372572660446167, "rewards/rejected": -1.5643908977508545, "step": 1328 }, { "epoch": 1.5403546869344915, "grad_norm": 49.96943093434472, "learning_rate": 2.7869460586071868e-08, "logits/chosen": -1.290654182434082, "logits/rejected": -1.226252794265747, "logps/chosen": -154.7710418701172, "logps/rejected": -160.32904052734375, "loss": 0.4118, "rewards/accuracies": 0.96875, "rewards/chosen": 0.027480699121952057, "rewards/margins": 0.9481416940689087, "rewards/rejected": -0.9206609129905701, "step": 1330 }, { "epoch": 1.5426710097719871, "grad_norm": 59.55494034761672, "learning_rate": 2.7602358822307413e-08, "logits/chosen": -1.2469313144683838, "logits/rejected": -1.2422947883605957, "logps/chosen": -134.33392333984375, "logps/rejected": -147.20509338378906, "loss": 0.4157, "rewards/accuracies": 0.875, "rewards/chosen": -0.2177230417728424, "rewards/margins": 1.3250346183776855, "rewards/rejected": -1.5427578687667847, "step": 1332 }, { "epoch": 1.5449873326094825, "grad_norm": 50.53246486906999, "learning_rate": 2.733633810045094e-08, "logits/chosen": -1.2038668394088745, "logits/rejected": -1.2241549491882324, "logps/chosen": -134.41952514648438, "logps/rejected": -164.3434600830078, "loss": 0.3798, "rewards/accuracies": 0.90625, "rewards/chosen": -0.30086490511894226, "rewards/margins": 1.5149168968200684, "rewards/rejected": -1.8157817125320435, "step": 1334 }, { "epoch": 1.547303655446978, "grad_norm": 64.60352169707365, "learning_rate": 2.7071402392725096e-08, "logits/chosen": -1.223114013671875, "logits/rejected": -1.2082433700561523, "logps/chosen": -165.63157653808594, "logps/rejected": -190.38706970214844, "loss": 0.4127, "rewards/accuracies": 0.875, "rewards/chosen": -0.4103352725505829, "rewards/margins": 1.6495842933654785, "rewards/rejected": -2.0599193572998047, "step": 1336 }, { "epoch": 1.5496199782844733, "grad_norm": 49.10542820839143, "learning_rate": 2.6807555655151025e-08, "logits/chosen": -1.2764735221862793, "logits/rejected": -1.3327206373214722, "logps/chosen": -134.23565673828125, "logps/rejected": -163.20474243164062, "loss": 0.3816, "rewards/accuracies": 0.78125, "rewards/chosen": -0.16112415492534637, "rewards/margins": 1.4172645807266235, "rewards/rejected": -1.5783886909484863, "step": 1338 }, { "epoch": 1.551936301121969, "grad_norm": 50.20237542030147, "learning_rate": 2.6544801827489482e-08, "logits/chosen": -1.2196974754333496, "logits/rejected": -1.1965646743774414, "logps/chosen": -129.3689422607422, "logps/rejected": -142.672119140625, "loss": 0.367, "rewards/accuracies": 0.84375, "rewards/chosen": -0.257286012172699, "rewards/margins": 1.1139978170394897, "rewards/rejected": -1.3712838888168335, "step": 1340 }, { "epoch": 1.5542526239594645, "grad_norm": 59.30212350967455, "learning_rate": 2.6283144833181782e-08, "logits/chosen": -1.1602783203125, "logits/rejected": -1.1242191791534424, "logps/chosen": -120.19819641113281, "logps/rejected": -154.9192352294922, "loss": 0.44, "rewards/accuracies": 0.875, "rewards/chosen": -0.2868664264678955, "rewards/margins": 1.309116244316101, "rewards/rejected": -1.5959827899932861, "step": 1342 }, { "epoch": 1.55656894679696, "grad_norm": 55.629592680909475, "learning_rate": 2.6022588579291327e-08, "logits/chosen": -1.1773267984390259, "logits/rejected": -1.1578972339630127, "logps/chosen": -120.31849670410156, "logps/rejected": -141.1858367919922, "loss": 0.4489, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2234707921743393, "rewards/margins": 0.9735670685768127, "rewards/rejected": -1.1970378160476685, "step": 1344 }, { "epoch": 1.5588852696344553, "grad_norm": 58.52065186564036, "learning_rate": 2.5763136956445342e-08, "logits/chosen": -1.13753342628479, "logits/rejected": -1.1701133251190186, "logps/chosen": -139.2621612548828, "logps/rejected": -194.79722595214844, "loss": 0.4361, "rewards/accuracies": 0.75, "rewards/chosen": -0.3876231908798218, "rewards/margins": 1.8029227256774902, "rewards/rejected": -2.1905460357666016, "step": 1346 }, { "epoch": 1.5612015924719507, "grad_norm": 53.922591259946095, "learning_rate": 2.5504793838776582e-08, "logits/chosen": -1.2545125484466553, "logits/rejected": -1.330094814300537, "logps/chosen": -150.107421875, "logps/rejected": -178.18336486816406, "loss": 0.3999, "rewards/accuracies": 0.8125, "rewards/chosen": -0.33850181102752686, "rewards/margins": 1.4819968938827515, "rewards/rejected": -1.8204987049102783, "step": 1348 }, { "epoch": 1.5635179153094463, "grad_norm": 50.688082246559354, "learning_rate": 2.5247563083865697e-08, "logits/chosen": -1.2270935773849487, "logits/rejected": -1.2281591892242432, "logps/chosen": -151.96878051757812, "logps/rejected": -163.12692260742188, "loss": 0.3785, "rewards/accuracies": 0.78125, "rewards/chosen": -0.06753317266702652, "rewards/margins": 0.8644936084747314, "rewards/rejected": -0.9320268630981445, "step": 1350 }, { "epoch": 1.565834238146942, "grad_norm": 48.380975770041516, "learning_rate": 2.4991448532683525e-08, "logits/chosen": -1.08161461353302, "logits/rejected": -1.0994572639465332, "logps/chosen": -72.54817962646484, "logps/rejected": -86.65010833740234, "loss": 0.4118, "rewards/accuracies": 0.8125, "rewards/chosen": 0.03128306567668915, "rewards/margins": 1.0805275440216064, "rewards/rejected": -1.0492445230484009, "step": 1352 }, { "epoch": 1.5681505609844373, "grad_norm": 56.228286429005685, "learning_rate": 2.4736454009533657e-08, "logits/chosen": -1.110878586769104, "logits/rejected": -1.1308159828186035, "logps/chosen": -91.99187469482422, "logps/rejected": -103.77259826660156, "loss": 0.4426, "rewards/accuracies": 0.71875, "rewards/chosen": -0.12472671270370483, "rewards/margins": 0.7570998668670654, "rewards/rejected": -0.8818265199661255, "step": 1354 }, { "epoch": 1.5704668838219327, "grad_norm": 45.1944314609051, "learning_rate": 2.4482583321995476e-08, "logits/chosen": -1.1095668077468872, "logits/rejected": -1.0675170421600342, "logps/chosen": -136.01248168945312, "logps/rejected": -142.30926513671875, "loss": 0.4013, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3576383888721466, "rewards/margins": 1.211026668548584, "rewards/rejected": -1.5686650276184082, "step": 1356 }, { "epoch": 1.572783206659428, "grad_norm": 69.47752257662052, "learning_rate": 2.4229840260867286e-08, "logits/chosen": -1.1537913084030151, "logits/rejected": -1.1009234189987183, "logps/chosen": -154.23377990722656, "logps/rejected": -169.52256774902344, "loss": 0.4161, "rewards/accuracies": 0.84375, "rewards/chosen": -0.404860258102417, "rewards/margins": 1.2964788675308228, "rewards/rejected": -1.7013392448425293, "step": 1358 }, { "epoch": 1.5750995294969237, "grad_norm": 57.67713141201682, "learning_rate": 2.3978228600109563e-08, "logits/chosen": -1.2760729789733887, "logits/rejected": -1.3226953744888306, "logps/chosen": -157.44998168945312, "logps/rejected": -199.5008087158203, "loss": 0.4445, "rewards/accuracies": 0.84375, "rewards/chosen": -0.27148348093032837, "rewards/margins": 1.2586216926574707, "rewards/rejected": -1.5301051139831543, "step": 1360 }, { "epoch": 1.577415852334419, "grad_norm": 63.00178299040625, "learning_rate": 2.372775209678881e-08, "logits/chosen": -1.2917158603668213, "logits/rejected": -1.2442728281021118, "logps/chosen": -135.3437957763672, "logps/rejected": -150.4008026123047, "loss": 0.4729, "rewards/accuracies": 0.75, "rewards/chosen": -0.21689726412296295, "rewards/margins": 0.914188802242279, "rewards/rejected": -1.1310861110687256, "step": 1362 }, { "epoch": 1.5797321751719147, "grad_norm": 54.9545294199578, "learning_rate": 2.347841449102136e-08, "logits/chosen": -1.16354501247406, "logits/rejected": -1.2029650211334229, "logps/chosen": -127.01541137695312, "logps/rejected": -160.94837951660156, "loss": 0.4415, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2984340488910675, "rewards/margins": 1.5368034839630127, "rewards/rejected": -1.8352375030517578, "step": 1364 }, { "epoch": 1.58204849800941, "grad_norm": 59.15647045553974, "learning_rate": 2.3230219505917424e-08, "logits/chosen": -1.1245296001434326, "logits/rejected": -1.1090185642242432, "logps/chosen": -88.6761245727539, "logps/rejected": -99.77947235107422, "loss": 0.4674, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2688758969306946, "rewards/margins": 1.0267162322998047, "rewards/rejected": -1.295592188835144, "step": 1366 }, { "epoch": 1.5843648208469054, "grad_norm": 95.242652077602, "learning_rate": 2.2983170847525635e-08, "logits/chosen": -1.2544560432434082, "logits/rejected": -1.334200382232666, "logps/chosen": -132.92308044433594, "logps/rejected": -153.16336059570312, "loss": 0.4863, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2278953194618225, "rewards/margins": 0.9455510973930359, "rewards/rejected": -1.1734462976455688, "step": 1368 }, { "epoch": 1.586681143684401, "grad_norm": 81.42436059360928, "learning_rate": 2.2737272204777737e-08, "logits/chosen": -1.0738383531570435, "logits/rejected": -1.143731951713562, "logps/chosen": -123.22402954101562, "logps/rejected": -167.40036010742188, "loss": 0.4226, "rewards/accuracies": 0.9375, "rewards/chosen": -0.27396196126937866, "rewards/margins": 1.5770013332366943, "rewards/rejected": -1.8509632349014282, "step": 1370 }, { "epoch": 1.5889974665218964, "grad_norm": 55.49979161745584, "learning_rate": 2.249252724943336e-08, "logits/chosen": -1.2299567461013794, "logits/rejected": -1.2440650463104248, "logps/chosen": -123.21504974365234, "logps/rejected": -143.9616241455078, "loss": 0.4755, "rewards/accuracies": 0.75, "rewards/chosen": -0.491690993309021, "rewards/margins": 0.894086480140686, "rewards/rejected": -1.385777473449707, "step": 1372 }, { "epoch": 1.591313789359392, "grad_norm": 54.18985696863257, "learning_rate": 2.2248939636025264e-08, "logits/chosen": -1.203713297843933, "logits/rejected": -1.2125518321990967, "logps/chosen": -176.7722930908203, "logps/rejected": -228.38929748535156, "loss": 0.408, "rewards/accuracies": 0.875, "rewards/chosen": -0.5514904260635376, "rewards/margins": 3.2685635089874268, "rewards/rejected": -3.820053815841675, "step": 1374 }, { "epoch": 1.5936301121968874, "grad_norm": 54.36252764320043, "learning_rate": 2.200651300180483e-08, "logits/chosen": -1.2815027236938477, "logits/rejected": -1.318671703338623, "logps/chosen": -130.49078369140625, "logps/rejected": -142.1308135986328, "loss": 0.3952, "rewards/accuracies": 0.90625, "rewards/chosen": -0.029716283082962036, "rewards/margins": 0.9699075222015381, "rewards/rejected": -0.999623715877533, "step": 1376 }, { "epoch": 1.5959464350343828, "grad_norm": 96.21574776555832, "learning_rate": 2.1765250966687687e-08, "logits/chosen": -1.3336846828460693, "logits/rejected": -1.3228704929351807, "logps/chosen": -176.54737854003906, "logps/rejected": -209.42909240722656, "loss": 0.4583, "rewards/accuracies": 0.71875, "rewards/chosen": -0.7003276348114014, "rewards/margins": 1.229347586631775, "rewards/rejected": -1.9296752214431763, "step": 1378 }, { "epoch": 1.5982627578718784, "grad_norm": 71.93512521949452, "learning_rate": 2.1525157133199633e-08, "logits/chosen": -1.08268404006958, "logits/rejected": -1.1782095432281494, "logps/chosen": -122.80490112304688, "logps/rejected": -159.9678955078125, "loss": 0.4574, "rewards/accuracies": 0.8125, "rewards/chosen": -0.40625882148742676, "rewards/margins": 1.3748208284378052, "rewards/rejected": -1.7810795307159424, "step": 1380 }, { "epoch": 1.6005790807093738, "grad_norm": 70.14564165621101, "learning_rate": 2.1286235086422843e-08, "logits/chosen": -1.3746612071990967, "logits/rejected": -1.3713853359222412, "logps/chosen": -178.19586181640625, "logps/rejected": -212.018310546875, "loss": 0.3689, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2453938126564026, "rewards/margins": 1.7820340394973755, "rewards/rejected": -2.027428150177002, "step": 1382 }, { "epoch": 1.6028954035468694, "grad_norm": 53.127835579709945, "learning_rate": 2.1048488393942455e-08, "logits/chosen": -1.17882239818573, "logits/rejected": -1.1624205112457275, "logps/chosen": -111.90765380859375, "logps/rejected": -135.91148376464844, "loss": 0.475, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25234535336494446, "rewards/margins": 1.1534082889556885, "rewards/rejected": -1.4057536125183105, "step": 1384 }, { "epoch": 1.6052117263843648, "grad_norm": 59.635014558757035, "learning_rate": 2.0811920605793122e-08, "logits/chosen": -1.1906554698944092, "logits/rejected": -1.2808858156204224, "logps/chosen": -129.28529357910156, "logps/rejected": -157.04090881347656, "loss": 0.3961, "rewards/accuracies": 0.75, "rewards/chosen": 0.06318804621696472, "rewards/margins": 1.2911432981491089, "rewards/rejected": -1.2279552221298218, "step": 1386 }, { "epoch": 1.6075280492218602, "grad_norm": 57.41476924308661, "learning_rate": 2.0576535254406157e-08, "logits/chosen": -1.1566798686981201, "logits/rejected": -1.1978453397750854, "logps/chosen": -153.1495819091797, "logps/rejected": -175.12283325195312, "loss": 0.3893, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4101862907409668, "rewards/margins": 1.2702761888504028, "rewards/rejected": -1.6804625988006592, "step": 1388 }, { "epoch": 1.6098443720593558, "grad_norm": 46.94707676426246, "learning_rate": 2.0342335854556736e-08, "logits/chosen": -1.3085883855819702, "logits/rejected": -1.2242193222045898, "logps/chosen": -173.11146545410156, "logps/rejected": -201.7664031982422, "loss": 0.3882, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6821246147155762, "rewards/margins": 1.9655760526657104, "rewards/rejected": -2.6477010250091553, "step": 1390 }, { "epoch": 1.6121606948968512, "grad_norm": 51.39756905271222, "learning_rate": 2.0109325903311324e-08, "logits/chosen": -1.1785892248153687, "logits/rejected": -1.2349551916122437, "logps/chosen": -173.98049926757812, "logps/rejected": -225.72409057617188, "loss": 0.378, "rewards/accuracies": 0.875, "rewards/chosen": -0.16729062795639038, "rewards/margins": 1.8970539569854736, "rewards/rejected": -2.064344644546509, "step": 1392 }, { "epoch": 1.6144770177343468, "grad_norm": 75.45520614404798, "learning_rate": 1.9877508879975557e-08, "logits/chosen": -1.2368243932724, "logits/rejected": -1.1864349842071533, "logps/chosen": -157.86318969726562, "logps/rejected": -164.7779083251953, "loss": 0.4102, "rewards/accuracies": 0.875, "rewards/chosen": -0.416544646024704, "rewards/margins": 1.2882412672042847, "rewards/rejected": -1.7047858238220215, "step": 1394 }, { "epoch": 1.6167933405718422, "grad_norm": 54.69368738030066, "learning_rate": 1.9646888246042337e-08, "logits/chosen": -1.2882027626037598, "logits/rejected": -1.2957350015640259, "logps/chosen": -168.46693420410156, "logps/rejected": -184.96092224121094, "loss": 0.3796, "rewards/accuracies": 0.71875, "rewards/chosen": -0.5366595983505249, "rewards/margins": 1.4451313018798828, "rewards/rejected": -1.9817910194396973, "step": 1396 }, { "epoch": 1.6191096634093376, "grad_norm": 47.07150629441627, "learning_rate": 1.941746744513999e-08, "logits/chosen": -1.2016966342926025, "logits/rejected": -1.2090229988098145, "logps/chosen": -119.58673095703125, "logps/rejected": -141.86593627929688, "loss": 0.3931, "rewards/accuracies": 0.875, "rewards/chosen": -0.3934524655342102, "rewards/margins": 1.4040158987045288, "rewards/rejected": -1.7974684238433838, "step": 1398 }, { "epoch": 1.6214259862468332, "grad_norm": 62.032269869178876, "learning_rate": 1.918924990298091e-08, "logits/chosen": -1.2298343181610107, "logits/rejected": -1.233276128768921, "logps/chosen": -160.66769409179688, "logps/rejected": -165.26895141601562, "loss": 0.4199, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6389051079750061, "rewards/margins": 0.8895078897476196, "rewards/rejected": -1.52841317653656, "step": 1400 }, { "epoch": 1.6214259862468332, "eval_logits/chosen": -1.2133427858352661, "eval_logits/rejected": -1.2084039449691772, "eval_logps/chosen": -144.74742126464844, "eval_logps/rejected": -149.5865478515625, "eval_loss": 0.6033037304878235, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.9116251468658447, "eval_rewards/margins": 0.6516737341880798, "eval_rewards/rejected": -1.5632988214492798, "eval_runtime": 24.3113, "eval_samples_per_second": 4.113, "eval_steps_per_second": 1.028, "step": 1400 }, { "epoch": 1.6237423090843286, "grad_norm": 86.9760098456171, "learning_rate": 1.8962239027310577e-08, "logits/chosen": -1.1873736381530762, "logits/rejected": -1.2712754011154175, "logps/chosen": -150.010498046875, "logps/rejected": -177.68890380859375, "loss": 0.4684, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3790132403373718, "rewards/margins": 1.0307780504226685, "rewards/rejected": -1.409791350364685, "step": 1402 }, { "epoch": 1.6260586319218242, "grad_norm": 54.339526567430624, "learning_rate": 1.8736438207856377e-08, "logits/chosen": -1.346308708190918, "logits/rejected": -1.3275481462478638, "logps/chosen": -172.08343505859375, "logps/rejected": -188.81179809570312, "loss": 0.4124, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4836081564426422, "rewards/margins": 1.5920302867889404, "rewards/rejected": -2.07563853263855, "step": 1404 }, { "epoch": 1.6283749547593196, "grad_norm": 50.44666894199177, "learning_rate": 1.851185081627714e-08, "logits/chosen": -1.313905954360962, "logits/rejected": -1.3108646869659424, "logps/chosen": -150.0095977783203, "logps/rejected": -163.9129180908203, "loss": 0.4744, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6312530040740967, "rewards/margins": 0.8648273348808289, "rewards/rejected": -1.4960802793502808, "step": 1406 }, { "epoch": 1.630691277596815, "grad_norm": 63.4120821056028, "learning_rate": 1.8288480206112877e-08, "logits/chosen": -1.2368450164794922, "logits/rejected": -1.222330927848816, "logps/chosen": -114.20048522949219, "logps/rejected": -153.55780029296875, "loss": 0.4453, "rewards/accuracies": 0.75, "rewards/chosen": -0.13375352323055267, "rewards/margins": 1.81991446018219, "rewards/rejected": -1.9536678791046143, "step": 1408 }, { "epoch": 1.6330076004343104, "grad_norm": 57.27120326939087, "learning_rate": 1.806632971273454e-08, "logits/chosen": -1.3121931552886963, "logits/rejected": -1.3301172256469727, "logps/chosen": -140.49411010742188, "logps/rejected": -147.28729248046875, "loss": 0.4135, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3088449537754059, "rewards/margins": 0.9551953673362732, "rewards/rejected": -1.264040470123291, "step": 1410 }, { "epoch": 1.635323923271806, "grad_norm": 58.835101317691525, "learning_rate": 1.7845402653294262e-08, "logits/chosen": -1.1533010005950928, "logits/rejected": -1.2099212408065796, "logps/chosen": -161.35955810546875, "logps/rejected": -182.40274047851562, "loss": 0.3941, "rewards/accuracies": 0.96875, "rewards/chosen": -0.4202643036842346, "rewards/margins": 1.3078231811523438, "rewards/rejected": -1.7280876636505127, "step": 1412 }, { "epoch": 1.6376402461093016, "grad_norm": 62.35716207894577, "learning_rate": 1.762570232667595e-08, "logits/chosen": -1.164352297782898, "logits/rejected": -1.2668792009353638, "logps/chosen": -121.197021484375, "logps/rejected": -175.3108367919922, "loss": 0.4048, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2763165235519409, "rewards/margins": 1.5692354440689087, "rewards/rejected": -1.8455519676208496, "step": 1414 }, { "epoch": 1.639956568946797, "grad_norm": 72.03828023987269, "learning_rate": 1.7407232013445893e-08, "logits/chosen": -1.120769739151001, "logits/rejected": -1.203442096710205, "logps/chosen": -147.95274353027344, "logps/rejected": -192.7701873779297, "loss": 0.4188, "rewards/accuracies": 0.9375, "rewards/chosen": -0.37782442569732666, "rewards/margins": 1.473639965057373, "rewards/rejected": -1.8514643907546997, "step": 1416 }, { "epoch": 1.6422728917842924, "grad_norm": 66.4702023164338, "learning_rate": 1.7189994975803758e-08, "logits/chosen": -1.1487022638320923, "logits/rejected": -1.2365858554840088, "logps/chosen": -117.31167602539062, "logps/rejected": -146.18536376953125, "loss": 0.4393, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3540950417518616, "rewards/margins": 0.7364134192466736, "rewards/rejected": -1.0905084609985352, "step": 1418 }, { "epoch": 1.6445892146217878, "grad_norm": 78.71556855680205, "learning_rate": 1.6973994457534023e-08, "logits/chosen": -1.2775179147720337, "logits/rejected": -1.3115909099578857, "logps/chosen": -164.73760986328125, "logps/rejected": -190.9739227294922, "loss": 0.506, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6548792123794556, "rewards/margins": 1.3077069520950317, "rewards/rejected": -1.9625860452651978, "step": 1420 }, { "epoch": 1.6469055374592834, "grad_norm": 62.03650354576015, "learning_rate": 1.6759233683957396e-08, "logits/chosen": -1.280670166015625, "logits/rejected": -1.270959734916687, "logps/chosen": -174.10838317871094, "logps/rejected": -190.41744995117188, "loss": 0.417, "rewards/accuracies": 0.90625, "rewards/chosen": -0.3348293602466583, "rewards/margins": 1.4488041400909424, "rewards/rejected": -1.7836335897445679, "step": 1422 }, { "epoch": 1.649221860296779, "grad_norm": 57.86020225721166, "learning_rate": 1.6545715861882702e-08, "logits/chosen": -1.101415753364563, "logits/rejected": -1.1186178922653198, "logps/chosen": -142.97349548339844, "logps/rejected": -186.55322265625, "loss": 0.4078, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5462976098060608, "rewards/margins": 2.0124125480651855, "rewards/rejected": -2.5587100982666016, "step": 1424 }, { "epoch": 1.6515381831342744, "grad_norm": 48.695250546687646, "learning_rate": 1.6333444179559074e-08, "logits/chosen": -1.2034971714019775, "logits/rejected": -1.2503241300582886, "logps/chosen": -169.5352020263672, "logps/rejected": -214.4050750732422, "loss": 0.4054, "rewards/accuracies": 0.875, "rewards/chosen": -0.5561625361442566, "rewards/margins": 2.379465341567993, "rewards/rejected": -2.9356279373168945, "step": 1426 }, { "epoch": 1.6538545059717698, "grad_norm": 58.54466931855239, "learning_rate": 1.6122421806628207e-08, "logits/chosen": -1.2484573125839233, "logits/rejected": -1.298370599746704, "logps/chosen": -224.2542266845703, "logps/rejected": -239.31918334960938, "loss": 0.3847, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5975647568702698, "rewards/margins": 2.2055599689483643, "rewards/rejected": -2.8031251430511475, "step": 1428 }, { "epoch": 1.6561708288092651, "grad_norm": 59.92256675039104, "learning_rate": 1.5912651894077167e-08, "logits/chosen": -1.2150397300720215, "logits/rejected": -1.1968854665756226, "logps/chosen": -150.77381896972656, "logps/rejected": -189.05838012695312, "loss": 0.4165, "rewards/accuracies": 0.9375, "rewards/chosen": -0.49205654859542847, "rewards/margins": 2.0640861988067627, "rewards/rejected": -2.556142807006836, "step": 1430 }, { "epoch": 1.6584871516467607, "grad_norm": 59.42932647345947, "learning_rate": 1.57041375741912e-08, "logits/chosen": -1.196043610572815, "logits/rejected": -1.2252073287963867, "logps/chosen": -145.3686065673828, "logps/rejected": -144.02952575683594, "loss": 0.389, "rewards/accuracies": 0.8125, "rewards/chosen": -0.924052894115448, "rewards/margins": 0.9492592215538025, "rewards/rejected": -1.8733121156692505, "step": 1432 }, { "epoch": 1.6608034744842564, "grad_norm": 59.90616287675461, "learning_rate": 1.5496881960507124e-08, "logits/chosen": -1.1822700500488281, "logits/rejected": -1.1750373840332031, "logps/chosen": -134.07861328125, "logps/rejected": -201.8026123046875, "loss": 0.382, "rewards/accuracies": 0.875, "rewards/chosen": -0.24189743399620056, "rewards/margins": 2.8620426654815674, "rewards/rejected": -3.103940010070801, "step": 1434 }, { "epoch": 1.6631197973217517, "grad_norm": 57.85512148826699, "learning_rate": 1.529088814776668e-08, "logits/chosen": -1.2793397903442383, "logits/rejected": -1.2699894905090332, "logps/chosen": -182.90478515625, "logps/rejected": -223.45468139648438, "loss": 0.3792, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6292093992233276, "rewards/margins": 1.7043105363845825, "rewards/rejected": -2.33351993560791, "step": 1436 }, { "epoch": 1.6654361201592471, "grad_norm": 80.74607876180737, "learning_rate": 1.508615921187044e-08, "logits/chosen": -1.049709677696228, "logits/rejected": -1.1322901248931885, "logps/chosen": -109.7282943725586, "logps/rejected": -159.00778198242188, "loss": 0.3868, "rewards/accuracies": 0.90625, "rewards/chosen": -0.34554052352905273, "rewards/margins": 1.7658944129943848, "rewards/rejected": -2.1114349365234375, "step": 1438 }, { "epoch": 1.6677524429967425, "grad_norm": 66.72584214795384, "learning_rate": 1.4882698209831779e-08, "logits/chosen": -1.1424648761749268, "logits/rejected": -1.1974608898162842, "logps/chosen": -97.99867248535156, "logps/rejected": -134.96340942382812, "loss": 0.4193, "rewards/accuracies": 0.71875, "rewards/chosen": -0.35685765743255615, "rewards/margins": 1.5133432149887085, "rewards/rejected": -1.8702008724212646, "step": 1440 }, { "epoch": 1.6700687658342381, "grad_norm": 78.22654470786401, "learning_rate": 1.4680508179731343e-08, "logits/chosen": -1.2903565168380737, "logits/rejected": -1.3427797555923462, "logps/chosen": -164.58184814453125, "logps/rejected": -196.1396484375, "loss": 0.4565, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2896607220172882, "rewards/margins": 1.235891580581665, "rewards/rejected": -1.5255522727966309, "step": 1442 }, { "epoch": 1.6723850886717337, "grad_norm": 64.64622163037477, "learning_rate": 1.4479592140671548e-08, "logits/chosen": -1.3046828508377075, "logits/rejected": -1.2628744840621948, "logps/chosen": -194.83673095703125, "logps/rejected": -228.6671905517578, "loss": 0.4481, "rewards/accuracies": 0.78125, "rewards/chosen": -0.3258281648159027, "rewards/margins": 1.5874714851379395, "rewards/rejected": -1.913299560546875, "step": 1444 }, { "epoch": 1.6747014115092291, "grad_norm": 54.707446026319836, "learning_rate": 1.4279953092731633e-08, "logits/chosen": -1.3320201635360718, "logits/rejected": -1.3739019632339478, "logps/chosen": -184.18093872070312, "logps/rejected": -212.85275268554688, "loss": 0.3619, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21321099996566772, "rewards/margins": 1.5299046039581299, "rewards/rejected": -1.7431155443191528, "step": 1446 }, { "epoch": 1.6770177343467245, "grad_norm": 82.01457178127649, "learning_rate": 1.4081594016922772e-08, "logits/chosen": -1.3037813901901245, "logits/rejected": -1.2591630220413208, "logps/chosen": -190.23208618164062, "logps/rejected": -201.10235595703125, "loss": 0.4678, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4632962942123413, "rewards/margins": 0.94648677110672, "rewards/rejected": -1.409783124923706, "step": 1448 }, { "epoch": 1.67933405718422, "grad_norm": 55.804214360937905, "learning_rate": 1.3884517875143542e-08, "logits/chosen": -1.2149088382720947, "logits/rejected": -1.2506321668624878, "logps/chosen": -134.70565795898438, "logps/rejected": -170.83775329589844, "loss": 0.4304, "rewards/accuracies": 0.84375, "rewards/chosen": -0.13964088261127472, "rewards/margins": 1.4484151601791382, "rewards/rejected": -1.588055968284607, "step": 1450 }, { "epoch": 1.6816503800217155, "grad_norm": 45.37776187937181, "learning_rate": 1.3688727610135841e-08, "logits/chosen": -1.182690978050232, "logits/rejected": -1.128598928451538, "logps/chosen": -166.2299041748047, "logps/rejected": -189.45960998535156, "loss": 0.3818, "rewards/accuracies": 0.90625, "rewards/chosen": -0.45779427886009216, "rewards/margins": 1.7794036865234375, "rewards/rejected": -2.2371981143951416, "step": 1452 }, { "epoch": 1.6839667028592111, "grad_norm": 52.071114969595605, "learning_rate": 1.3494226145440767e-08, "logits/chosen": -1.129225492477417, "logits/rejected": -1.1527059078216553, "logps/chosen": -131.0524139404297, "logps/rejected": -152.54283142089844, "loss": 0.3594, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4113074541091919, "rewards/margins": 1.4839246273040771, "rewards/rejected": -1.8952319622039795, "step": 1454 }, { "epoch": 1.6862830256967065, "grad_norm": 47.42528149711889, "learning_rate": 1.3301016385355091e-08, "logits/chosen": -1.2042808532714844, "logits/rejected": -1.194934368133545, "logps/chosen": -156.91098022460938, "logps/rejected": -188.40354919433594, "loss": 0.3751, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7726644277572632, "rewards/margins": 1.8851245641708374, "rewards/rejected": -2.6577892303466797, "step": 1456 }, { "epoch": 1.688599348534202, "grad_norm": 61.34540856220831, "learning_rate": 1.3109101214887864e-08, "logits/chosen": -1.2197272777557373, "logits/rejected": -1.1938438415527344, "logps/chosen": -133.5988311767578, "logps/rejected": -192.86639404296875, "loss": 0.41, "rewards/accuracies": 0.875, "rewards/chosen": -0.4758758246898651, "rewards/margins": 3.0053532123565674, "rewards/rejected": -3.4812285900115967, "step": 1458 }, { "epoch": 1.6909156713716973, "grad_norm": 54.8216979898567, "learning_rate": 1.2918483499717237e-08, "logits/chosen": -1.1394593715667725, "logits/rejected": -1.1835401058197021, "logps/chosen": -122.25736999511719, "logps/rejected": -133.27230834960938, "loss": 0.3597, "rewards/accuracies": 0.90625, "rewards/chosen": -0.4267653822898865, "rewards/margins": 1.0661195516586304, "rewards/rejected": -1.4928849935531616, "step": 1460 }, { "epoch": 1.693231994209193, "grad_norm": 62.86307508634661, "learning_rate": 1.2729166086147803e-08, "logits/chosen": -1.2270219326019287, "logits/rejected": -1.2750426530838013, "logps/chosen": -128.8585968017578, "logps/rejected": -152.87161254882812, "loss": 0.4384, "rewards/accuracies": 0.84375, "rewards/chosen": -0.016907572746276855, "rewards/margins": 1.5406163930892944, "rewards/rejected": -1.5575240850448608, "step": 1462 }, { "epoch": 1.6955483170466885, "grad_norm": 87.68154889445682, "learning_rate": 1.2541151801068072e-08, "logits/chosen": -1.1000306606292725, "logits/rejected": -1.1639959812164307, "logps/chosen": -173.75588989257812, "logps/rejected": -201.65650939941406, "loss": 0.4866, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0096687078475952, "rewards/margins": 1.2543641328811646, "rewards/rejected": -2.2640328407287598, "step": 1464 }, { "epoch": 1.697864639884184, "grad_norm": 52.24912842261691, "learning_rate": 1.2354443451908202e-08, "logits/chosen": -1.2294087409973145, "logits/rejected": -1.2400881052017212, "logps/chosen": -153.24412536621094, "logps/rejected": -166.8737030029297, "loss": 0.3901, "rewards/accuracies": 0.78125, "rewards/chosen": -0.4041752815246582, "rewards/margins": 1.1100343465805054, "rewards/rejected": -1.5142096281051636, "step": 1466 }, { "epoch": 1.7001809627216793, "grad_norm": 59.60290932317476, "learning_rate": 1.2169043826598057e-08, "logits/chosen": -1.1557788848876953, "logits/rejected": -1.2175214290618896, "logps/chosen": -141.86160278320312, "logps/rejected": -166.4970245361328, "loss": 0.4357, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4277328848838806, "rewards/margins": 1.5622481107711792, "rewards/rejected": -1.989980936050415, "step": 1468 }, { "epoch": 1.7024972855591747, "grad_norm": 56.1236013000123, "learning_rate": 1.1984955693525788e-08, "logits/chosen": -1.224461317062378, "logits/rejected": -1.1498862504959106, "logps/chosen": -118.77024841308594, "logps/rejected": -134.83253479003906, "loss": 0.4148, "rewards/accuracies": 0.9375, "rewards/chosen": -0.11248160153627396, "rewards/margins": 1.6419700384140015, "rewards/rejected": -1.7544519901275635, "step": 1470 }, { "epoch": 1.7048136083966703, "grad_norm": 74.87499789321072, "learning_rate": 1.180218180149617e-08, "logits/chosen": -1.3031408786773682, "logits/rejected": -1.2794002294540405, "logps/chosen": -166.36143493652344, "logps/rejected": -182.16787719726562, "loss": 0.4709, "rewards/accuracies": 0.75, "rewards/chosen": -0.30539318919181824, "rewards/margins": 1.1033700704574585, "rewards/rejected": -1.4087631702423096, "step": 1472 }, { "epoch": 1.707129931234166, "grad_norm": 55.679405729529364, "learning_rate": 1.1620724879689791e-08, "logits/chosen": -1.2027887105941772, "logits/rejected": -1.2204790115356445, "logps/chosen": -131.13787841796875, "logps/rejected": -150.597900390625, "loss": 0.3978, "rewards/accuracies": 0.71875, "rewards/chosen": -0.2811795473098755, "rewards/margins": 0.9655717015266418, "rewards/rejected": -1.2467511892318726, "step": 1474 }, { "epoch": 1.7094462540716613, "grad_norm": 59.35657673584767, "learning_rate": 1.1440587637622256e-08, "logits/chosen": -1.2350002527236938, "logits/rejected": -1.3663004636764526, "logps/chosen": -94.2022705078125, "logps/rejected": -113.4736099243164, "loss": 0.4273, "rewards/accuracies": 0.8125, "rewards/chosen": -0.37293869256973267, "rewards/margins": 0.5190171599388123, "rewards/rejected": -0.8919559121131897, "step": 1476 }, { "epoch": 1.7117625769091567, "grad_norm": 58.75339969893179, "learning_rate": 1.1261772765103682e-08, "logits/chosen": -1.2917990684509277, "logits/rejected": -1.3643466234207153, "logps/chosen": -115.38711547851562, "logps/rejected": -147.71717834472656, "loss": 0.4022, "rewards/accuracies": 0.875, "rewards/chosen": -0.19546058773994446, "rewards/margins": 1.1290830373764038, "rewards/rejected": -1.3245434761047363, "step": 1478 }, { "epoch": 1.714078899746652, "grad_norm": 65.37238691562818, "learning_rate": 1.108428293219854e-08, "logits/chosen": -1.2615394592285156, "logits/rejected": -1.2974615097045898, "logps/chosen": -134.23512268066406, "logps/rejected": -218.10336303710938, "loss": 0.414, "rewards/accuracies": 0.90625, "rewards/chosen": -0.6633899807929993, "rewards/margins": 2.063319206237793, "rewards/rejected": -2.7267091274261475, "step": 1480 }, { "epoch": 1.7163952225841477, "grad_norm": 46.08904659289323, "learning_rate": 1.0908120789185837e-08, "logits/chosen": -1.1299887895584106, "logits/rejected": -1.0999367237091064, "logps/chosen": -147.2964324951172, "logps/rejected": -166.0971221923828, "loss": 0.3567, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3920423686504364, "rewards/margins": 1.2661586999893188, "rewards/rejected": -1.658200979232788, "step": 1482 }, { "epoch": 1.7187115454216433, "grad_norm": 62.032458754200285, "learning_rate": 1.0733288966519516e-08, "logits/chosen": -1.258570909500122, "logits/rejected": -1.253230333328247, "logps/chosen": -97.19083404541016, "logps/rejected": -109.28732299804688, "loss": 0.4554, "rewards/accuracies": 0.75, "rewards/chosen": 0.02817530930042267, "rewards/margins": 0.9206870794296265, "rewards/rejected": -0.8925117254257202, "step": 1484 }, { "epoch": 1.7210278682591387, "grad_norm": 75.38001959898925, "learning_rate": 1.0559790074789132e-08, "logits/chosen": -1.2418440580368042, "logits/rejected": -1.3091630935668945, "logps/chosen": -140.07681274414062, "logps/rejected": -164.11033630371094, "loss": 0.4497, "rewards/accuracies": 0.84375, "rewards/chosen": -0.10434143245220184, "rewards/margins": 1.5909518003463745, "rewards/rejected": -1.6952931880950928, "step": 1486 }, { "epoch": 1.723344191096634, "grad_norm": 60.00004527174649, "learning_rate": 1.0387626704680896e-08, "logits/chosen": -1.1610029935836792, "logits/rejected": -1.1764705181121826, "logps/chosen": -153.56924438476562, "logps/rejected": -186.0706329345703, "loss": 0.4105, "rewards/accuracies": 0.875, "rewards/chosen": -0.09610500931739807, "rewards/margins": 1.3976409435272217, "rewards/rejected": -1.4937461614608765, "step": 1488 }, { "epoch": 1.7256605139341294, "grad_norm": 54.45806934963145, "learning_rate": 1.0216801426939093e-08, "logits/chosen": -1.230536699295044, "logits/rejected": -1.2105457782745361, "logps/chosen": -138.13816833496094, "logps/rejected": -160.49887084960938, "loss": 0.451, "rewards/accuracies": 0.8125, "rewards/chosen": -0.30948516726493835, "rewards/margins": 1.360965371131897, "rewards/rejected": -1.6704505681991577, "step": 1490 }, { "epoch": 1.727976836771625, "grad_norm": 82.82636273947331, "learning_rate": 1.0047316792327498e-08, "logits/chosen": -1.2326574325561523, "logits/rejected": -1.3276634216308594, "logps/chosen": -173.7947235107422, "logps/rejected": -209.5412139892578, "loss": 0.3827, "rewards/accuracies": 0.875, "rewards/chosen": -0.604800820350647, "rewards/margins": 1.53623366355896, "rewards/rejected": -2.1410343647003174, "step": 1492 }, { "epoch": 1.7302931596091207, "grad_norm": 69.61022757488912, "learning_rate": 9.879175331591472e-09, "logits/chosen": -1.1758224964141846, "logits/rejected": -1.201228380203247, "logps/chosen": -147.6432342529297, "logps/rejected": -187.960693359375, "loss": 0.3883, "rewards/accuracies": 0.78125, "rewards/chosen": -0.6493701934814453, "rewards/margins": 1.607360601425171, "rewards/rejected": -2.256730794906616, "step": 1494 }, { "epoch": 1.732609482446616, "grad_norm": 46.06958168770852, "learning_rate": 9.712379555420092e-09, "logits/chosen": -1.1795316934585571, "logits/rejected": -1.285264492034912, "logps/chosen": -119.38116455078125, "logps/rejected": -151.4436798095703, "loss": 0.403, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10696488618850708, "rewards/margins": 1.2390841245651245, "rewards/rejected": -1.3460490703582764, "step": 1496 }, { "epoch": 1.7349258052841114, "grad_norm": 57.95589326263925, "learning_rate": 9.546931954408621e-09, "logits/chosen": -1.1415185928344727, "logits/rejected": -1.1725091934204102, "logps/chosen": -115.87825775146484, "logps/rejected": -149.04075622558594, "loss": 0.5099, "rewards/accuracies": 0.875, "rewards/chosen": -0.27941668033599854, "rewards/margins": 1.3693090677261353, "rewards/rejected": -1.6487256288528442, "step": 1498 }, { "epoch": 1.7372421281216068, "grad_norm": 55.43462235715989, "learning_rate": 9.382834999021372e-09, "logits/chosen": -1.2606816291809082, "logits/rejected": -1.2870656251907349, "logps/chosen": -125.43834686279297, "logps/rejected": -157.20016479492188, "loss": 0.4188, "rewards/accuracies": 0.75, "rewards/chosen": -0.31777307391166687, "rewards/margins": 0.9490638375282288, "rewards/rejected": -1.2668367624282837, "step": 1500 }, { "epoch": 1.7372421281216068, "eval_logits/chosen": -1.2174957990646362, "eval_logits/rejected": -1.2125576734542847, "eval_logps/chosen": -143.9083251953125, "eval_logps/rejected": -148.98919677734375, "eval_loss": 0.5947905778884888, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": -0.8277125358581543, "eval_rewards/margins": 0.675851047039032, "eval_rewards/rejected": -1.5035635232925415, "eval_runtime": 23.0512, "eval_samples_per_second": 4.338, "eval_steps_per_second": 1.085, "step": 1500 }, { "epoch": 1.7395584509591024, "grad_norm": 55.81693817141226, "learning_rate": 9.220091139554887e-09, "logits/chosen": -1.1932331323623657, "logits/rejected": -1.1756948232650757, "logps/chosen": -103.76750183105469, "logps/rejected": -129.5191650390625, "loss": 0.4117, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2089885175228119, "rewards/margins": 0.9426325559616089, "rewards/rejected": -1.1516211032867432, "step": 1502 }, { "epoch": 1.741874773796598, "grad_norm": 62.56378010964504, "learning_rate": 9.05870280610117e-09, "logits/chosen": -1.2296499013900757, "logits/rejected": -1.1992714405059814, "logps/chosen": -123.05607604980469, "logps/rejected": -130.20932006835938, "loss": 0.4251, "rewards/accuracies": 0.8125, "rewards/chosen": -0.41981515288352966, "rewards/margins": 0.8822442293167114, "rewards/rejected": -1.3020594120025635, "step": 1504 }, { "epoch": 1.7441910966340934, "grad_norm": 81.02596584931305, "learning_rate": 8.898672408511553e-09, "logits/chosen": -1.2401373386383057, "logits/rejected": -1.17184317111969, "logps/chosen": -168.21986389160156, "logps/rejected": -162.71383666992188, "loss": 0.4402, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5058491230010986, "rewards/margins": 1.3156105279922485, "rewards/rejected": -1.8214595317840576, "step": 1506 }, { "epoch": 1.7465074194715888, "grad_norm": 56.282435640432055, "learning_rate": 8.740002336360686e-09, "logits/chosen": -1.351073980331421, "logits/rejected": -1.4128607511520386, "logps/chosen": -152.2001190185547, "logps/rejected": -191.74932861328125, "loss": 0.4226, "rewards/accuracies": 0.90625, "rewards/chosen": -0.2529861629009247, "rewards/margins": 1.520397424697876, "rewards/rejected": -1.773383617401123, "step": 1508 }, { "epoch": 1.7488237423090842, "grad_norm": 49.81866700382731, "learning_rate": 8.582694958910807e-09, "logits/chosen": -1.2254369258880615, "logits/rejected": -1.2749468088150024, "logps/chosen": -182.1556854248047, "logps/rejected": -216.18411254882812, "loss": 0.3902, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6539211869239807, "rewards/margins": 1.5710985660552979, "rewards/rejected": -2.225019693374634, "step": 1510 }, { "epoch": 1.7511400651465798, "grad_norm": 52.34711654194476, "learning_rate": 8.426752625076373e-09, "logits/chosen": -1.2552261352539062, "logits/rejected": -1.3420953750610352, "logps/chosen": -186.13934326171875, "logps/rejected": -285.68804931640625, "loss": 0.3499, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5289927124977112, "rewards/margins": 3.851708173751831, "rewards/rejected": -4.380701065063477, "step": 1512 }, { "epoch": 1.7534563879840754, "grad_norm": 52.389521034020916, "learning_rate": 8.272177663389046e-09, "logits/chosen": -1.1967260837554932, "logits/rejected": -1.1967551708221436, "logps/chosen": -217.95095825195312, "logps/rejected": -236.05380249023438, "loss": 0.3782, "rewards/accuracies": 0.9375, "rewards/chosen": -0.3343973159790039, "rewards/margins": 1.9475483894348145, "rewards/rejected": -2.2819457054138184, "step": 1514 }, { "epoch": 1.7557727108215708, "grad_norm": 55.99063639865476, "learning_rate": 8.118972381962851e-09, "logits/chosen": -1.1716736555099487, "logits/rejected": -1.2387813329696655, "logps/chosen": -161.53382873535156, "logps/rejected": -189.86782836914062, "loss": 0.3571, "rewards/accuracies": 0.875, "rewards/chosen": -0.7027201652526855, "rewards/margins": 1.415562629699707, "rewards/rejected": -2.1182827949523926, "step": 1516 }, { "epoch": 1.7580890336590662, "grad_norm": 60.98846930904024, "learning_rate": 7.967139068459726e-09, "logits/chosen": -1.1493229866027832, "logits/rejected": -1.146936297416687, "logps/chosen": -128.60189819335938, "logps/rejected": -160.15321350097656, "loss": 0.3883, "rewards/accuracies": 0.90625, "rewards/chosen": -0.36255598068237305, "rewards/margins": 1.3770678043365479, "rewards/rejected": -1.7396236658096313, "step": 1518 }, { "epoch": 1.7604053564965616, "grad_norm": 176.68417193498476, "learning_rate": 7.81667999005543e-09, "logits/chosen": -1.3078656196594238, "logits/rejected": -1.3120546340942383, "logps/chosen": -179.33438110351562, "logps/rejected": -186.1118621826172, "loss": 0.4993, "rewards/accuracies": 0.78125, "rewards/chosen": -0.22156819701194763, "rewards/margins": 0.9435240030288696, "rewards/rejected": -1.16509211063385, "step": 1520 }, { "epoch": 1.7627216793340572, "grad_norm": 51.91509618352603, "learning_rate": 7.6675973934056e-09, "logits/chosen": -1.0760035514831543, "logits/rejected": -1.1664559841156006, "logps/chosen": -136.77081298828125, "logps/rejected": -175.25926208496094, "loss": 0.4397, "rewards/accuracies": 0.875, "rewards/chosen": -0.2903676927089691, "rewards/margins": 1.1137360334396362, "rewards/rejected": -1.4041036367416382, "step": 1522 }, { "epoch": 1.7650380021715528, "grad_norm": 59.67977888278656, "learning_rate": 7.51989350461224e-09, "logits/chosen": -1.2789033651351929, "logits/rejected": -1.3050099611282349, "logps/chosen": -145.60400390625, "logps/rejected": -156.1199951171875, "loss": 0.4456, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3298056125640869, "rewards/margins": 0.7980384826660156, "rewards/rejected": -1.1278440952301025, "step": 1524 }, { "epoch": 1.7673543250090482, "grad_norm": 50.021597145613505, "learning_rate": 7.373570529190498e-09, "logits/chosen": -1.2145639657974243, "logits/rejected": -1.2063783407211304, "logps/chosen": -162.4467315673828, "logps/rejected": -196.65577697753906, "loss": 0.3705, "rewards/accuracies": 0.75, "rewards/chosen": -0.2843925654888153, "rewards/margins": 1.6666440963745117, "rewards/rejected": -1.9510366916656494, "step": 1526 }, { "epoch": 1.7696706478465436, "grad_norm": 62.64810414938643, "learning_rate": 7.228630652035717e-09, "logits/chosen": -1.2518867254257202, "logits/rejected": -1.1622406244277954, "logps/chosen": -169.1246337890625, "logps/rejected": -189.02169799804688, "loss": 0.3845, "rewards/accuracies": 0.9375, "rewards/chosen": -0.23387570679187775, "rewards/margins": 1.9417215585708618, "rewards/rejected": -2.1755971908569336, "step": 1528 }, { "epoch": 1.771986970684039, "grad_norm": 56.92633456964183, "learning_rate": 7.08507603739078e-09, "logits/chosen": -1.2512166500091553, "logits/rejected": -1.2901430130004883, "logps/chosen": -162.3385772705078, "logps/rejected": -193.41940307617188, "loss": 0.4376, "rewards/accuracies": 0.875, "rewards/chosen": -0.40012550354003906, "rewards/margins": 1.2529363632202148, "rewards/rejected": -1.653061866760254, "step": 1530 }, { "epoch": 1.7743032935215346, "grad_norm": 76.55205063599432, "learning_rate": 6.942908828813876e-09, "logits/chosen": -1.323652982711792, "logits/rejected": -1.2494463920593262, "logps/chosen": -192.47853088378906, "logps/rejected": -190.863037109375, "loss": 0.3828, "rewards/accuracies": 0.78125, "rewards/chosen": -0.46822619438171387, "rewards/margins": 1.0589056015014648, "rewards/rejected": -1.5271317958831787, "step": 1532 }, { "epoch": 1.77661961635903, "grad_norm": 58.83800639947019, "learning_rate": 6.802131149146373e-09, "logits/chosen": -1.254701018333435, "logits/rejected": -1.2828840017318726, "logps/chosen": -144.4114990234375, "logps/rejected": -155.79037475585938, "loss": 0.374, "rewards/accuracies": 0.84375, "rewards/chosen": -0.3679081201553345, "rewards/margins": 1.2586240768432617, "rewards/rejected": -1.6265323162078857, "step": 1534 }, { "epoch": 1.7789359391965256, "grad_norm": 68.11680500501693, "learning_rate": 6.662745100481271e-09, "logits/chosen": -1.1942329406738281, "logits/rejected": -1.289471983909607, "logps/chosen": -103.48456573486328, "logps/rejected": -110.4488754272461, "loss": 0.3996, "rewards/accuracies": 0.71875, "rewards/chosen": -0.294292151927948, "rewards/margins": 0.796977162361145, "rewards/rejected": -1.0912692546844482, "step": 1536 }, { "epoch": 1.781252262034021, "grad_norm": 60.49404420207855, "learning_rate": 6.5247527641316465e-09, "logits/chosen": -1.2070562839508057, "logits/rejected": -1.2548003196716309, "logps/chosen": -156.2920379638672, "logps/rejected": -177.38890075683594, "loss": 0.4381, "rewards/accuracies": 0.90625, "rewards/chosen": -0.03995545208454132, "rewards/margins": 1.5764446258544922, "rewards/rejected": -1.616400122642517, "step": 1538 }, { "epoch": 1.7835685848715164, "grad_norm": 58.839873216052546, "learning_rate": 6.388156200599726e-09, "logits/chosen": -1.1764907836914062, "logits/rejected": -1.2363911867141724, "logps/chosen": -138.08792114257812, "logps/rejected": -156.8811492919922, "loss": 0.4437, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5578911304473877, "rewards/margins": 0.9130861163139343, "rewards/rejected": -1.4709770679473877, "step": 1540 }, { "epoch": 1.785884907709012, "grad_norm": 54.830012236677426, "learning_rate": 6.2529574495459815e-09, "logits/chosen": -1.2464194297790527, "logits/rejected": -1.2431282997131348, "logps/chosen": -146.96051025390625, "logps/rejected": -175.07481384277344, "loss": 0.386, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2085748016834259, "rewards/margins": 1.4820951223373413, "rewards/rejected": -1.6906698942184448, "step": 1542 }, { "epoch": 1.7882012305465074, "grad_norm": 53.964612199236846, "learning_rate": 6.119158529758817e-09, "logits/chosen": -1.2010880708694458, "logits/rejected": -1.252152681350708, "logps/chosen": -125.40618896484375, "logps/rejected": -147.0822296142578, "loss": 0.4294, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6274422407150269, "rewards/margins": 1.2626943588256836, "rewards/rejected": -1.8901365995407104, "step": 1544 }, { "epoch": 1.790517553384003, "grad_norm": 61.93429459296764, "learning_rate": 5.986761439124288e-09, "logits/chosen": -1.0499889850616455, "logits/rejected": -1.0637288093566895, "logps/chosen": -145.4034881591797, "logps/rejected": -170.8926239013672, "loss": 0.4097, "rewards/accuracies": 0.90625, "rewards/chosen": -1.0535945892333984, "rewards/margins": 1.4541335105895996, "rewards/rejected": -2.507727861404419, "step": 1546 }, { "epoch": 1.7928338762214984, "grad_norm": 63.39263653013202, "learning_rate": 5.855768154596363e-09, "logits/chosen": -1.2247127294540405, "logits/rejected": -1.287811040878296, "logps/chosen": -128.2520751953125, "logps/rejected": -145.6575927734375, "loss": 0.4182, "rewards/accuracies": 0.8125, "rewards/chosen": -0.20227603614330292, "rewards/margins": 0.8022910356521606, "rewards/rejected": -1.0045669078826904, "step": 1548 }, { "epoch": 1.7951501990589938, "grad_norm": 53.1979380263347, "learning_rate": 5.726180632167354e-09, "logits/chosen": -1.2052092552185059, "logits/rejected": -1.2159252166748047, "logps/chosen": -151.31918334960938, "logps/rejected": -184.94479370117188, "loss": 0.4075, "rewards/accuracies": 0.8125, "rewards/chosen": -0.06491108983755112, "rewards/margins": 1.7111616134643555, "rewards/rejected": -1.7760728597640991, "step": 1550 }, { "epoch": 1.7974665218964894, "grad_norm": 53.36509041975899, "learning_rate": 5.5980008068387655e-09, "logits/chosen": -1.192318081855774, "logits/rejected": -1.2422665357589722, "logps/chosen": -158.2563934326172, "logps/rejected": -208.37709045410156, "loss": 0.3783, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3992021381855011, "rewards/margins": 1.9673078060150146, "rewards/rejected": -2.3665099143981934, "step": 1552 }, { "epoch": 1.7997828447339848, "grad_norm": 57.43435085106451, "learning_rate": 5.471230592592313e-09, "logits/chosen": -1.2281129360198975, "logits/rejected": -1.1943424940109253, "logps/chosen": -132.93118286132812, "logps/rejected": -142.4124755859375, "loss": 0.3999, "rewards/accuracies": 0.84375, "rewards/chosen": -0.2500740587711334, "rewards/margins": 0.9634323120117188, "rewards/rejected": -1.2135063409805298, "step": 1554 }, { "epoch": 1.8020991675714804, "grad_norm": 116.95703791110742, "learning_rate": 5.345871882361397e-09, "logits/chosen": -1.222663402557373, "logits/rejected": -1.2307226657867432, "logps/chosen": -195.47381591796875, "logps/rejected": -213.84588623046875, "loss": 0.5455, "rewards/accuracies": 0.71875, "rewards/chosen": -1.06570303440094, "rewards/margins": 1.080770492553711, "rewards/rejected": -2.1464734077453613, "step": 1556 }, { "epoch": 1.8044154904089758, "grad_norm": 54.00119490171407, "learning_rate": 5.221926548002875e-09, "logits/chosen": -1.1924062967300415, "logits/rejected": -1.269582748413086, "logps/chosen": -165.26943969726562, "logps/rejected": -179.38568115234375, "loss": 0.4258, "rewards/accuracies": 0.78125, "rewards/chosen": 0.21030552685260773, "rewards/margins": 1.1077656745910645, "rewards/rejected": -0.8974601030349731, "step": 1558 }, { "epoch": 1.8067318132464711, "grad_norm": 47.637201993987425, "learning_rate": 5.099396440269033e-09, "logits/chosen": -1.1668461561203003, "logits/rejected": -1.1675832271575928, "logps/chosen": -132.747314453125, "logps/rejected": -197.4693603515625, "loss": 0.3714, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2991000711917877, "rewards/margins": 2.7344629764556885, "rewards/rejected": -3.0335628986358643, "step": 1560 }, { "epoch": 1.8090481360839668, "grad_norm": 73.3924537450436, "learning_rate": 4.978283388780002e-09, "logits/chosen": -1.2106759548187256, "logits/rejected": -1.3471499681472778, "logps/chosen": -172.1467742919922, "logps/rejected": -206.36143493652344, "loss": 0.3895, "rewards/accuracies": 0.8125, "rewards/chosen": -0.703016996383667, "rewards/margins": 1.2060117721557617, "rewards/rejected": -1.9090288877487183, "step": 1562 }, { "epoch": 1.8113644589214621, "grad_norm": 90.51012356320436, "learning_rate": 4.858589201996432e-09, "logits/chosen": -1.0378146171569824, "logits/rejected": -1.1732603311538696, "logps/chosen": -141.2643280029297, "logps/rejected": -164.4271697998047, "loss": 0.4848, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8795535564422607, "rewards/margins": 0.9949630498886108, "rewards/rejected": -1.874516487121582, "step": 1564 }, { "epoch": 1.8136807817589577, "grad_norm": 56.01168000132923, "learning_rate": 4.740315667192441e-09, "logits/chosen": -1.1176464557647705, "logits/rejected": -1.2511212825775146, "logps/chosen": -103.09518432617188, "logps/rejected": -135.39122009277344, "loss": 0.4393, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2941249907016754, "rewards/margins": 0.9569557905197144, "rewards/rejected": -1.2510807514190674, "step": 1566 }, { "epoch": 1.8159971045964531, "grad_norm": 48.16666589487942, "learning_rate": 4.623464550429002e-09, "logits/chosen": -1.102777361869812, "logits/rejected": -1.1394641399383545, "logps/chosen": -111.80138397216797, "logps/rejected": -145.2130126953125, "loss": 0.4561, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7176414728164673, "rewards/margins": 1.2812902927398682, "rewards/rejected": -1.998931646347046, "step": 1568 }, { "epoch": 1.8183134274339485, "grad_norm": 50.97749085021057, "learning_rate": 4.508037596527525e-09, "logits/chosen": -1.1966917514801025, "logits/rejected": -1.2247413396835327, "logps/chosen": -114.48523712158203, "logps/rejected": -128.622802734375, "loss": 0.3797, "rewards/accuracies": 0.78125, "rewards/chosen": -0.19608543813228607, "rewards/margins": 0.9697508215904236, "rewards/rejected": -1.1658360958099365, "step": 1570 }, { "epoch": 1.8206297502714441, "grad_norm": 54.387837657286084, "learning_rate": 4.39403652904381e-09, "logits/chosen": -1.1147388219833374, "logits/rejected": -1.1594665050506592, "logps/chosen": -114.78770446777344, "logps/rejected": -155.918701171875, "loss": 0.3961, "rewards/accuracies": 0.9375, "rewards/chosen": -0.18746113777160645, "rewards/margins": 1.8321788311004639, "rewards/rejected": -2.0196399688720703, "step": 1572 }, { "epoch": 1.8229460731089395, "grad_norm": 62.271451803387365, "learning_rate": 4.2814630502422845e-09, "logits/chosen": -1.1847018003463745, "logits/rejected": -1.1410635709762573, "logps/chosen": -178.85458374023438, "logps/rejected": -211.72219848632812, "loss": 0.4279, "rewards/accuracies": 0.75, "rewards/chosen": -0.22784435749053955, "rewards/margins": 2.5258147716522217, "rewards/rejected": -2.7536590099334717, "step": 1574 }, { "epoch": 1.8252623959464351, "grad_norm": 57.174518848316346, "learning_rate": 4.170318841070708e-09, "logits/chosen": -1.120819330215454, "logits/rejected": -1.1634063720703125, "logps/chosen": -140.26319885253906, "logps/rejected": -207.86880493164062, "loss": 0.4442, "rewards/accuracies": 0.84375, "rewards/chosen": -0.25584009289741516, "rewards/margins": 1.6964097023010254, "rewards/rejected": -1.9522497653961182, "step": 1576 }, { "epoch": 1.8275787187839305, "grad_norm": 57.29425789262467, "learning_rate": 4.060605561134889e-09, "logits/chosen": -1.3027273416519165, "logits/rejected": -1.2673333883285522, "logps/chosen": -170.17152404785156, "logps/rejected": -188.33880615234375, "loss": 0.4311, "rewards/accuracies": 0.84375, "rewards/chosen": -0.42000892758369446, "rewards/margins": 1.3922333717346191, "rewards/rejected": -1.8122422695159912, "step": 1578 }, { "epoch": 1.829895041621426, "grad_norm": 65.96165236575662, "learning_rate": 3.952324848674004e-09, "logits/chosen": -1.1435868740081787, "logits/rejected": -1.2456907033920288, "logps/chosen": -118.90472412109375, "logps/rejected": -160.82818603515625, "loss": 0.3963, "rewards/accuracies": 0.875, "rewards/chosen": -0.47032859921455383, "rewards/margins": 1.457180142402649, "rewards/rejected": -1.9275087118148804, "step": 1580 }, { "epoch": 1.8322113644589213, "grad_norm": 62.96392877654251, "learning_rate": 3.8454783205361774e-09, "logits/chosen": -1.2181570529937744, "logits/rejected": -1.2495853900909424, "logps/chosen": -195.74876403808594, "logps/rejected": -292.0626220703125, "loss": 0.4112, "rewards/accuracies": 0.8125, "rewards/chosen": -0.3267236649990082, "rewards/margins": 4.259873390197754, "rewards/rejected": -4.586597442626953, "step": 1582 }, { "epoch": 1.834527687296417, "grad_norm": 58.298377548314235, "learning_rate": 3.740067572154238e-09, "logits/chosen": -1.292594075202942, "logits/rejected": -1.3315826654434204, "logps/chosen": -154.32740783691406, "logps/rejected": -175.98606872558594, "loss": 0.4088, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2856728136539459, "rewards/margins": 1.166110634803772, "rewards/rejected": -1.4517834186553955, "step": 1584 }, { "epoch": 1.8368440101339125, "grad_norm": 59.34894292485851, "learning_rate": 3.6360941775219534e-09, "logits/chosen": -1.2552549839019775, "logits/rejected": -1.3246078491210938, "logps/chosen": -165.2515869140625, "logps/rejected": -189.0300750732422, "loss": 0.3893, "rewards/accuracies": 0.84375, "rewards/chosen": -0.31748124957084656, "rewards/margins": 1.6454672813415527, "rewards/rejected": -1.9629485607147217, "step": 1586 }, { "epoch": 1.839160332971408, "grad_norm": 72.01635850106565, "learning_rate": 3.53355968917054e-09, "logits/chosen": -1.1828457117080688, "logits/rejected": -1.1595231294631958, "logps/chosen": -188.94688415527344, "logps/rejected": -214.75, "loss": 0.3615, "rewards/accuracies": 0.84375, "rewards/chosen": -0.9493909478187561, "rewards/margins": 1.7555177211761475, "rewards/rejected": -2.704908609390259, "step": 1588 }, { "epoch": 1.8414766558089033, "grad_norm": 62.6493622965171, "learning_rate": 3.432465638145443e-09, "logits/chosen": -1.2264246940612793, "logits/rejected": -1.2353841066360474, "logps/chosen": -195.12002563476562, "logps/rejected": -207.8896942138672, "loss": 0.3766, "rewards/accuracies": 0.78125, "rewards/chosen": -0.44329333305358887, "rewards/margins": 1.4592864513397217, "rewards/rejected": -1.9025800228118896, "step": 1590 }, { "epoch": 1.8437929786463987, "grad_norm": 90.93634487708707, "learning_rate": 3.3328135339834917e-09, "logits/chosen": -1.2629611492156982, "logits/rejected": -1.249568223953247, "logps/chosen": -190.23126220703125, "logps/rejected": -229.70721435546875, "loss": 0.4445, "rewards/accuracies": 0.84375, "rewards/chosen": -0.7300775647163391, "rewards/margins": 1.7166606187820435, "rewards/rejected": -2.4467382431030273, "step": 1592 }, { "epoch": 1.8461093014838943, "grad_norm": 70.40594487298823, "learning_rate": 3.234604864690349e-09, "logits/chosen": -1.1563414335250854, "logits/rejected": -1.1132121086120605, "logps/chosen": -112.93782806396484, "logps/rejected": -115.19876861572266, "loss": 0.4635, "rewards/accuracies": 0.75, "rewards/chosen": -0.16091413795948029, "rewards/margins": 1.0079346895217896, "rewards/rejected": -1.1688487529754639, "step": 1594 }, { "epoch": 1.84842562432139, "grad_norm": 70.46959430003976, "learning_rate": 3.13784109671833e-09, "logits/chosen": -1.1287944316864014, "logits/rejected": -1.1954846382141113, "logps/chosen": -145.52749633789062, "logps/rejected": -168.68896484375, "loss": 0.4428, "rewards/accuracies": 0.84375, "rewards/chosen": -0.435101717710495, "rewards/margins": 1.1136534214019775, "rewards/rejected": -1.548755168914795, "step": 1596 }, { "epoch": 1.8507419471588853, "grad_norm": 63.53152924846159, "learning_rate": 3.0425236749444307e-09, "logits/chosen": -1.1079940795898438, "logits/rejected": -1.1838057041168213, "logps/chosen": -104.38517761230469, "logps/rejected": -126.13815307617188, "loss": 0.4119, "rewards/accuracies": 0.8125, "rewards/chosen": -0.20333430171012878, "rewards/margins": 1.2202249765396118, "rewards/rejected": -1.4235591888427734, "step": 1598 }, { "epoch": 1.8530582699963807, "grad_norm": 66.75495278471351, "learning_rate": 2.9486540226488555e-09, "logits/chosen": -1.1984293460845947, "logits/rejected": -1.1811829805374146, "logps/chosen": -107.63018035888672, "logps/rejected": -130.51141357421875, "loss": 0.4185, "rewards/accuracies": 0.75, "rewards/chosen": -0.44386693835258484, "rewards/margins": 1.2538187503814697, "rewards/rejected": -1.697685718536377, "step": 1600 }, { "epoch": 1.8530582699963807, "eval_logits/chosen": -1.2096275091171265, "eval_logits/rejected": -1.2041908502578735, "eval_logps/chosen": -144.02456665039062, "eval_logps/rejected": -149.35797119140625, "eval_loss": 0.59078049659729, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -0.8393388986587524, "eval_rewards/margins": 0.7011021375656128, "eval_rewards/rejected": -1.5404411554336548, "eval_runtime": 26.1157, "eval_samples_per_second": 3.829, "eval_steps_per_second": 0.957, "step": 1600 }, { "epoch": 1.855374592833876, "grad_norm": 51.25806551926982, "learning_rate": 2.856233541493691e-09, "logits/chosen": -1.1180177927017212, "logits/rejected": -1.153393268585205, "logps/chosen": -144.27127075195312, "logps/rejected": -172.8219757080078, "loss": 0.4103, "rewards/accuracies": 0.875, "rewards/chosen": -0.5550628900527954, "rewards/margins": 1.737399697303772, "rewards/rejected": -2.2924625873565674, "step": 1602 }, { "epoch": 1.8576909156713717, "grad_norm": 44.574245908096394, "learning_rate": 2.7652636115019554e-09, "logits/chosen": -1.2830660343170166, "logits/rejected": -1.3382513523101807, "logps/chosen": -186.8584747314453, "logps/rejected": -214.39341735839844, "loss": 0.336, "rewards/accuracies": 0.90625, "rewards/chosen": 0.011630617082118988, "rewards/margins": 2.068033218383789, "rewards/rejected": -2.0564029216766357, "step": 1604 }, { "epoch": 1.8600072385088673, "grad_norm": 88.24929156836725, "learning_rate": 2.6757455910370487e-09, "logits/chosen": -1.1889640092849731, "logits/rejected": -1.178146481513977, "logps/chosen": -169.42417907714844, "logps/rejected": -202.3689422607422, "loss": 0.453, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4166257083415985, "rewards/margins": 1.3977904319763184, "rewards/rejected": -1.8144161701202393, "step": 1606 }, { "epoch": 1.8623235613463627, "grad_norm": 54.823245822280576, "learning_rate": 2.5876808167825005e-09, "logits/chosen": -1.2597419023513794, "logits/rejected": -1.193768858909607, "logps/chosen": -93.74658203125, "logps/rejected": -92.6616439819336, "loss": 0.3924, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25879502296447754, "rewards/margins": 0.4778652489185333, "rewards/rejected": -0.7366602420806885, "step": 1608 }, { "epoch": 1.864639884183858, "grad_norm": 74.02926136780609, "learning_rate": 2.5010706037218885e-09, "logits/chosen": -1.2314317226409912, "logits/rejected": -1.2886399030685425, "logps/chosen": -163.94842529296875, "logps/rejected": -195.11390686035156, "loss": 0.3971, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2574860155582428, "rewards/margins": 1.3789194822311401, "rewards/rejected": -1.6364054679870605, "step": 1610 }, { "epoch": 1.8669562070213535, "grad_norm": 54.30860117915708, "learning_rate": 2.4159162451193094e-09, "logits/chosen": -1.0902681350708008, "logits/rejected": -1.112775206565857, "logps/chosen": -140.3288116455078, "logps/rejected": -188.05210876464844, "loss": 0.4003, "rewards/accuracies": 0.90625, "rewards/chosen": -0.1892092376947403, "rewards/margins": 1.8315831422805786, "rewards/rejected": -2.0207924842834473, "step": 1612 }, { "epoch": 1.869272529858849, "grad_norm": 62.18685455386846, "learning_rate": 2.3322190125000475e-09, "logits/chosen": -1.0966382026672363, "logits/rejected": -1.1658515930175781, "logps/chosen": -116.89921569824219, "logps/rejected": -151.35015869140625, "loss": 0.4112, "rewards/accuracies": 0.75, "rewards/chosen": -0.4243711233139038, "rewards/margins": 1.4443333148956299, "rewards/rejected": -1.8687043190002441, "step": 1614 }, { "epoch": 1.8715888526963447, "grad_norm": 55.591911669551806, "learning_rate": 2.24998015563157e-09, "logits/chosen": -1.1172374486923218, "logits/rejected": -1.1134474277496338, "logps/chosen": -106.15010070800781, "logps/rejected": -122.0849609375, "loss": 0.42, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5767372250556946, "rewards/margins": 0.9947463274002075, "rewards/rejected": -1.5714833736419678, "step": 1616 }, { "epoch": 1.87390517553384, "grad_norm": 69.67114883544231, "learning_rate": 2.169200902504842e-09, "logits/chosen": -1.291989803314209, "logits/rejected": -1.3668596744537354, "logps/chosen": -140.99050903320312, "logps/rejected": -164.55636596679688, "loss": 0.4102, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1543090045452118, "rewards/margins": 1.0028785467147827, "rewards/rejected": -1.1571874618530273, "step": 1618 }, { "epoch": 1.8762214983713354, "grad_norm": 49.47929313355962, "learning_rate": 2.0898824593160503e-09, "logits/chosen": -1.1244778633117676, "logits/rejected": -1.1484088897705078, "logps/chosen": -115.74507904052734, "logps/rejected": -133.63916015625, "loss": 0.3943, "rewards/accuracies": 0.71875, "rewards/chosen": -0.4605577886104584, "rewards/margins": 0.8462937474250793, "rewards/rejected": -1.3068513870239258, "step": 1620 }, { "epoch": 1.8785378212088308, "grad_norm": 51.80324368762919, "learning_rate": 2.012026010448542e-09, "logits/chosen": -1.0954941511154175, "logits/rejected": -1.160184383392334, "logps/chosen": -131.16983032226562, "logps/rejected": -178.2694549560547, "loss": 0.4011, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1698540300130844, "rewards/margins": 1.7133314609527588, "rewards/rejected": -1.8831853866577148, "step": 1622 }, { "epoch": 1.8808541440463264, "grad_norm": 61.62870922068943, "learning_rate": 1.935632718455171e-09, "logits/chosen": -1.167246699333191, "logits/rejected": -1.2080024480819702, "logps/chosen": -154.55758666992188, "logps/rejected": -188.28201293945312, "loss": 0.3673, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5035812854766846, "rewards/margins": 1.6042046546936035, "rewards/rejected": -2.107785701751709, "step": 1624 }, { "epoch": 1.883170466883822, "grad_norm": 66.67930280370663, "learning_rate": 1.860703724040935e-09, "logits/chosen": -1.1037479639053345, "logits/rejected": -1.0890851020812988, "logps/chosen": -151.0048370361328, "logps/rejected": -170.82943725585938, "loss": 0.4514, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4693964123725891, "rewards/margins": 0.7652498483657837, "rewards/rejected": -1.2346463203430176, "step": 1626 }, { "epoch": 1.8854867897213174, "grad_norm": 64.45068485207041, "learning_rate": 1.7872401460458874e-09, "logits/chosen": -1.1636077165603638, "logits/rejected": -1.2113772630691528, "logps/chosen": -128.23899841308594, "logps/rejected": -151.98800659179688, "loss": 0.4074, "rewards/accuracies": 0.75, "rewards/chosen": -0.15583369135856628, "rewards/margins": 1.3744523525238037, "rewards/rejected": -1.5302859544754028, "step": 1628 }, { "epoch": 1.8878031125588128, "grad_norm": 60.44416949725557, "learning_rate": 1.7152430814285302e-09, "logits/chosen": -1.2152669429779053, "logits/rejected": -1.225684404373169, "logps/chosen": -145.33447265625, "logps/rejected": -170.64059448242188, "loss": 0.4299, "rewards/accuracies": 0.875, "rewards/chosen": -0.09620651602745056, "rewards/margins": 1.3490362167358398, "rewards/rejected": -1.4452428817749023, "step": 1630 }, { "epoch": 1.8901194353963082, "grad_norm": 46.81333988402518, "learning_rate": 1.6447136052493704e-09, "logits/chosen": -1.169386863708496, "logits/rejected": -1.248286247253418, "logps/chosen": -163.55990600585938, "logps/rejected": -216.13929748535156, "loss": 0.3547, "rewards/accuracies": 1.0, "rewards/chosen": -0.5737725496292114, "rewards/margins": 1.9429514408111572, "rewards/rejected": -2.516724109649658, "step": 1632 }, { "epoch": 1.8924357582338038, "grad_norm": 76.34596117355929, "learning_rate": 1.5756527706548561e-09, "logits/chosen": -1.2912683486938477, "logits/rejected": -1.2929950952529907, "logps/chosen": -192.79061889648438, "logps/rejected": -216.2474365234375, "loss": 0.4647, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6607008576393127, "rewards/margins": 1.0471830368041992, "rewards/rejected": -1.7078838348388672, "step": 1634 }, { "epoch": 1.8947520810712994, "grad_norm": 50.49082900430503, "learning_rate": 1.5080616088616882e-09, "logits/chosen": -1.1574630737304688, "logits/rejected": -1.2174662351608276, "logps/chosen": -105.63571166992188, "logps/rejected": -138.35476684570312, "loss": 0.4099, "rewards/accuracies": 0.84375, "rewards/chosen": -0.4468805193901062, "rewards/margins": 1.2918894290924072, "rewards/rejected": -1.7387701272964478, "step": 1636 }, { "epoch": 1.8970684039087948, "grad_norm": 75.9093198815468, "learning_rate": 1.4419411291413885e-09, "logits/chosen": -1.1982598304748535, "logits/rejected": -1.1942592859268188, "logps/chosen": -191.63914489746094, "logps/rejected": -217.94439697265625, "loss": 0.4268, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6322917938232422, "rewards/margins": 1.844104528427124, "rewards/rejected": -2.4763965606689453, "step": 1638 }, { "epoch": 1.8993847267462902, "grad_norm": 58.34682402089141, "learning_rate": 1.3772923188052787e-09, "logits/chosen": -1.0590007305145264, "logits/rejected": -1.0922439098358154, "logps/chosen": -90.13923645019531, "logps/rejected": -109.63917541503906, "loss": 0.4401, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1902827024459839, "rewards/margins": 1.1345347166061401, "rewards/rejected": -1.3248172998428345, "step": 1640 }, { "epoch": 1.9017010495837856, "grad_norm": 60.17667883042894, "learning_rate": 1.3141161431896808e-09, "logits/chosen": -1.292588472366333, "logits/rejected": -1.2948338985443115, "logps/chosen": -141.24307250976562, "logps/rejected": -183.1881866455078, "loss": 0.4093, "rewards/accuracies": 0.84375, "rewards/chosen": -0.06378068029880524, "rewards/margins": 1.4755961894989014, "rewards/rejected": -1.539376974105835, "step": 1642 }, { "epoch": 1.9040173724212812, "grad_norm": 46.419475472935424, "learning_rate": 1.2524135456415286e-09, "logits/chosen": -1.3303675651550293, "logits/rejected": -1.3957011699676514, "logps/chosen": -197.4314727783203, "logps/rejected": -230.6079559326172, "loss": 0.3864, "rewards/accuracies": 0.84375, "rewards/chosen": -0.5181460380554199, "rewards/margins": 1.6645830869674683, "rewards/rejected": -2.1827290058135986, "step": 1644 }, { "epoch": 1.9063336952587768, "grad_norm": 63.51262338219536, "learning_rate": 1.1921854475043125e-09, "logits/chosen": -1.3057444095611572, "logits/rejected": -1.3598231077194214, "logps/chosen": -140.8748779296875, "logps/rejected": -162.47158813476562, "loss": 0.438, "rewards/accuracies": 0.8125, "rewards/chosen": -0.24470748007297516, "rewards/margins": 1.0919952392578125, "rewards/rejected": -1.3367027044296265, "step": 1646 }, { "epoch": 1.9086500180962722, "grad_norm": 59.16346811149136, "learning_rate": 1.133432748104257e-09, "logits/chosen": -1.2664871215820312, "logits/rejected": -1.2415859699249268, "logps/chosen": -168.35910034179688, "logps/rejected": -186.53665161132812, "loss": 0.3824, "rewards/accuracies": 0.75, "rewards/chosen": -0.3128247559070587, "rewards/margins": 1.2026112079620361, "rewards/rejected": -1.5154359340667725, "step": 1648 }, { "epoch": 1.9109663409337676, "grad_norm": 72.05923858294557, "learning_rate": 1.0761563247369322e-09, "logits/chosen": -1.2224782705307007, "logits/rejected": -1.2964767217636108, "logps/chosen": -149.58824157714844, "logps/rejected": -172.06092834472656, "loss": 0.4327, "rewards/accuracies": 0.8125, "rewards/chosen": -0.10623270273208618, "rewards/margins": 1.0352814197540283, "rewards/rejected": -1.1415140628814697, "step": 1650 }, { "epoch": 1.913282663771263, "grad_norm": 72.25189940099223, "learning_rate": 1.0203570326541622e-09, "logits/chosen": -1.0865113735198975, "logits/rejected": -1.1271172761917114, "logps/chosen": -103.49948120117188, "logps/rejected": -108.95231628417969, "loss": 0.4442, "rewards/accuracies": 0.8125, "rewards/chosen": -0.29463884234428406, "rewards/margins": 0.5380735397338867, "rewards/rejected": -0.8327123522758484, "step": 1652 }, { "epoch": 1.9155989866087586, "grad_norm": 58.87468402600475, "learning_rate": 9.660357050512158e-10, "logits/chosen": -1.1705281734466553, "logits/rejected": -1.213219165802002, "logps/chosen": -151.6047821044922, "logps/rejected": -167.16241455078125, "loss": 0.4755, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2385808229446411, "rewards/margins": 0.9532268047332764, "rewards/rejected": -2.191807746887207, "step": 1654 }, { "epoch": 1.9179153094462542, "grad_norm": 61.13297680975046, "learning_rate": 9.131931530544146e-10, "logits/chosen": -1.1430917978286743, "logits/rejected": -1.1427133083343506, "logps/chosen": -152.1170654296875, "logps/rejected": -184.9088592529297, "loss": 0.4595, "rewards/accuracies": 0.8125, "rewards/chosen": -0.958507239818573, "rewards/margins": 1.5942294597625732, "rewards/rejected": -2.552736759185791, "step": 1656 }, { "epoch": 1.9202316322837496, "grad_norm": 52.83655360025748, "learning_rate": 8.618301657089877e-10, "logits/chosen": -1.190130352973938, "logits/rejected": -1.2142915725708008, "logps/chosen": -161.54466247558594, "logps/rejected": -182.72946166992188, "loss": 0.3894, "rewards/accuracies": 0.84375, "rewards/chosen": -0.6212272047996521, "rewards/margins": 1.291230320930481, "rewards/rejected": -1.9124574661254883, "step": 1658 }, { "epoch": 1.922547955121245, "grad_norm": 110.57140073148696, "learning_rate": 8.119475099673035e-10, "logits/chosen": -1.1131267547607422, "logits/rejected": -1.2174605131149292, "logps/chosen": -157.0382843017578, "logps/rejected": -187.213134765625, "loss": 0.4364, "rewards/accuracies": 0.78125, "rewards/chosen": -0.5017825961112976, "rewards/margins": 0.9430520534515381, "rewards/rejected": -1.444834589958191, "step": 1660 }, { "epoch": 1.9248642779587404, "grad_norm": 53.02147974570211, "learning_rate": 7.635459306773784e-10, "logits/chosen": -1.1875760555267334, "logits/rejected": -1.2432739734649658, "logps/chosen": -147.28131103515625, "logps/rejected": -169.49566650390625, "loss": 0.43, "rewards/accuracies": 0.84375, "rewards/chosen": -0.40067267417907715, "rewards/margins": 1.1304292678833008, "rewards/rejected": -1.531101942062378, "step": 1662 }, { "epoch": 1.927180600796236, "grad_norm": 64.80466315551176, "learning_rate": 7.166261505718418e-10, "logits/chosen": -1.222117304801941, "logits/rejected": -1.2081456184387207, "logps/chosen": -157.27780151367188, "logps/rejected": -173.7355499267578, "loss": 0.3734, "rewards/accuracies": 0.84375, "rewards/chosen": -0.33614563941955566, "rewards/margins": 1.0949398279190063, "rewards/rejected": -1.431085467338562, "step": 1664 }, { "epoch": 1.9294969236337316, "grad_norm": 53.26534606993647, "learning_rate": 6.711888702570556e-10, "logits/chosen": -1.2844552993774414, "logits/rejected": -1.3053499460220337, "logps/chosen": -167.5966033935547, "logps/rejected": -168.5198974609375, "loss": 0.4155, "rewards/accuracies": 0.71875, "rewards/chosen": -0.15978145599365234, "rewards/margins": 1.0049140453338623, "rewards/rejected": -1.1646955013275146, "step": 1666 }, { "epoch": 1.931813246471227, "grad_norm": 44.85242610407527, "learning_rate": 6.272347682026779e-10, "logits/chosen": -1.0309640169143677, "logits/rejected": -1.134310007095337, "logps/chosen": -111.43970489501953, "logps/rejected": -144.99891662597656, "loss": 0.3869, "rewards/accuracies": 0.78125, "rewards/chosen": -0.24206304550170898, "rewards/margins": 1.2624578475952148, "rewards/rejected": -1.5045208930969238, "step": 1668 }, { "epoch": 1.9341295693087224, "grad_norm": 47.25828490220509, "learning_rate": 5.847645007315937e-10, "logits/chosen": -1.1614665985107422, "logits/rejected": -1.254847764968872, "logps/chosen": -137.72381591796875, "logps/rejected": -143.80300903320312, "loss": 0.4304, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6075265407562256, "rewards/margins": 0.8250117301940918, "rewards/rejected": -1.4325382709503174, "step": 1670 }, { "epoch": 1.9364458921462178, "grad_norm": 47.60965224972502, "learning_rate": 5.437787020100115e-10, "logits/chosen": -1.2086517810821533, "logits/rejected": -1.1928253173828125, "logps/chosen": -162.096923828125, "logps/rejected": -178.24951171875, "loss": 0.3871, "rewards/accuracies": 0.875, "rewards/chosen": -0.37640607357025146, "rewards/margins": 1.857001781463623, "rewards/rejected": -2.233407974243164, "step": 1672 }, { "epoch": 1.9387622149837134, "grad_norm": 51.270284022735154, "learning_rate": 5.042779840380595e-10, "logits/chosen": -1.2238942384719849, "logits/rejected": -1.195109486579895, "logps/chosen": -119.4543685913086, "logps/rejected": -133.65127563476562, "loss": 0.4054, "rewards/accuracies": 0.65625, "rewards/chosen": -0.3657826781272888, "rewards/margins": 0.8839918375015259, "rewards/rejected": -1.249774694442749, "step": 1674 }, { "epoch": 1.941078537821209, "grad_norm": 58.278892064452975, "learning_rate": 4.662629366406601e-10, "logits/chosen": -1.2001346349716187, "logits/rejected": -1.1202467679977417, "logps/chosen": -128.72256469726562, "logps/rejected": -139.1036376953125, "loss": 0.4051, "rewards/accuracies": 0.8125, "rewards/chosen": -0.27061766386032104, "rewards/margins": 1.1671736240386963, "rewards/rejected": -1.437791347503662, "step": 1676 }, { "epoch": 1.9433948606587044, "grad_norm": 46.614126047623216, "learning_rate": 4.2973412745864744e-10, "logits/chosen": -1.167816162109375, "logits/rejected": -1.1992610692977905, "logps/chosen": -129.0093536376953, "logps/rejected": -169.50445556640625, "loss": 0.3959, "rewards/accuracies": 0.96875, "rewards/chosen": -0.2619994878768921, "rewards/margins": 1.6201242208480835, "rewards/rejected": -1.8821238279342651, "step": 1678 }, { "epoch": 1.9457111834961998, "grad_norm": 75.8267220593344, "learning_rate": 3.946921019403859e-10, "logits/chosen": -1.2351601123809814, "logits/rejected": -1.3069424629211426, "logps/chosen": -130.69961547851562, "logps/rejected": -150.25662231445312, "loss": 0.4755, "rewards/accuracies": 0.59375, "rewards/chosen": -0.33375632762908936, "rewards/margins": 0.7270826697349548, "rewards/rejected": -1.0608389377593994, "step": 1680 }, { "epoch": 1.9480275063336951, "grad_norm": 57.21603903329357, "learning_rate": 3.61137383333554e-10, "logits/chosen": -1.1741724014282227, "logits/rejected": -1.158752679824829, "logps/chosen": -158.3549346923828, "logps/rejected": -179.17391967773438, "loss": 0.393, "rewards/accuracies": 0.75, "rewards/chosen": -0.5816279649734497, "rewards/margins": 1.2226191759109497, "rewards/rejected": -1.8042471408843994, "step": 1682 }, { "epoch": 1.9503438291711908, "grad_norm": 56.808299639154505, "learning_rate": 3.2907047267736186e-10, "logits/chosen": -1.240709900856018, "logits/rejected": -1.223825454711914, "logps/chosen": -163.18728637695312, "logps/rejected": -198.8317413330078, "loss": 0.3877, "rewards/accuracies": 0.78125, "rewards/chosen": -0.28957706689834595, "rewards/margins": 1.6465396881103516, "rewards/rejected": -1.9361168146133423, "step": 1684 }, { "epoch": 1.9526601520086864, "grad_norm": 52.80591879977487, "learning_rate": 2.9849184879506827e-10, "logits/chosen": -1.1730728149414062, "logits/rejected": -1.1454265117645264, "logps/chosen": -136.7293701171875, "logps/rejected": -151.21914672851562, "loss": 0.3944, "rewards/accuracies": 0.78125, "rewards/chosen": -0.25426185131073, "rewards/margins": 1.232313871383667, "rewards/rejected": -1.4865756034851074, "step": 1686 }, { "epoch": 1.9549764748461818, "grad_norm": 75.29247959139558, "learning_rate": 2.6940196828681983e-10, "logits/chosen": -1.093564510345459, "logits/rejected": -1.0963504314422607, "logps/chosen": -189.71131896972656, "logps/rejected": -234.08973693847656, "loss": 0.4931, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0700315237045288, "rewards/margins": 1.948075532913208, "rewards/rejected": -3.0181069374084473, "step": 1688 }, { "epoch": 1.9572927976836771, "grad_norm": 70.12384040466765, "learning_rate": 2.418012655228452e-10, "logits/chosen": -1.2431126832962036, "logits/rejected": -1.2657066583633423, "logps/chosen": -99.85934448242188, "logps/rejected": -121.7491226196289, "loss": 0.426, "rewards/accuracies": 0.75, "rewards/chosen": -0.3242354393005371, "rewards/margins": 0.8404097557067871, "rewards/rejected": -1.1646450757980347, "step": 1690 }, { "epoch": 1.9596091205211725, "grad_norm": 50.23873116829206, "learning_rate": 2.1569015263697143e-10, "logits/chosen": -1.2286624908447266, "logits/rejected": -1.2570605278015137, "logps/chosen": -145.4255828857422, "logps/rejected": -194.1035919189453, "loss": 0.4198, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5209024548530579, "rewards/margins": 1.7652302980422974, "rewards/rejected": -2.286133050918579, "step": 1692 }, { "epoch": 1.9619254433586681, "grad_norm": 54.25222586547325, "learning_rate": 1.9106901952045119e-10, "logits/chosen": -1.2050321102142334, "logits/rejected": -1.2619915008544922, "logps/chosen": -179.3286590576172, "logps/rejected": -227.4953155517578, "loss": 0.4315, "rewards/accuracies": 0.9375, "rewards/chosen": -0.5577185750007629, "rewards/margins": 1.80524742603302, "rewards/rejected": -2.3629660606384277, "step": 1694 }, { "epoch": 1.9642417661961638, "grad_norm": 61.997046360221496, "learning_rate": 1.6793823381614501e-10, "logits/chosen": -1.294581651687622, "logits/rejected": -1.247463583946228, "logps/chosen": -144.93246459960938, "logps/rejected": -168.00628662109375, "loss": 0.444, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0931825190782547, "rewards/margins": 1.4015557765960693, "rewards/rejected": -1.4947383403778076, "step": 1696 }, { "epoch": 1.9665580890336591, "grad_norm": 82.5016461876686, "learning_rate": 1.4629814091307036e-10, "logits/chosen": -1.2317255735397339, "logits/rejected": -1.2456412315368652, "logps/chosen": -156.1422576904297, "logps/rejected": -151.87107849121094, "loss": 0.417, "rewards/accuracies": 0.75, "rewards/chosen": -0.4730032682418823, "rewards/margins": 1.0971145629882812, "rewards/rejected": -1.570117712020874, "step": 1698 }, { "epoch": 1.9688744118711545, "grad_norm": 58.642096283997354, "learning_rate": 1.261490639411833e-10, "logits/chosen": -1.1768873929977417, "logits/rejected": -1.2999684810638428, "logps/chosen": -109.31826782226562, "logps/rejected": -134.15371704101562, "loss": 0.3986, "rewards/accuracies": 0.78125, "rewards/chosen": -0.2187974750995636, "rewards/margins": 1.001755714416504, "rewards/rejected": -1.2205531597137451, "step": 1700 }, { "epoch": 1.9688744118711545, "eval_logits/chosen": -1.215119481086731, "eval_logits/rejected": -1.2099292278289795, "eval_logps/chosen": -144.09429931640625, "eval_logps/rejected": -149.15176391601562, "eval_loss": 0.5949785113334656, "eval_rewards/accuracies": 0.7599999904632568, "eval_rewards/chosen": -0.8463126420974731, "eval_rewards/margins": 0.6735073924064636, "eval_rewards/rejected": -1.5198200941085815, "eval_runtime": 22.9886, "eval_samples_per_second": 4.35, "eval_steps_per_second": 1.087, "step": 1700 }, { "epoch": 1.97119073470865, "grad_norm": 55.677396344114726, "learning_rate": 1.0749130376659366e-10, "logits/chosen": -1.2230623960494995, "logits/rejected": -1.163780689239502, "logps/chosen": -164.1004638671875, "logps/rejected": -179.64088439941406, "loss": 0.3997, "rewards/accuracies": 0.84375, "rewards/chosen": -0.1451684981584549, "rewards/margins": 1.5081610679626465, "rewards/rejected": -1.653329610824585, "step": 1702 }, { "epoch": 1.9735070575461455, "grad_norm": 71.96257699524784, "learning_rate": 9.032513898705741e-11, "logits/chosen": -1.2779675722122192, "logits/rejected": -1.2883471250534058, "logps/chosen": -133.36380004882812, "logps/rejected": -153.81741333007812, "loss": 0.467, "rewards/accuracies": 0.8125, "rewards/chosen": -0.13987727463245392, "rewards/margins": 1.16806161403656, "rewards/rejected": -1.307938814163208, "step": 1704 }, { "epoch": 1.975823380383641, "grad_norm": 49.123608283002156, "learning_rate": 7.465082592782445e-11, "logits/chosen": -1.207802414894104, "logits/rejected": -1.1620241403579712, "logps/chosen": -164.59759521484375, "logps/rejected": -208.24905395507812, "loss": 0.3896, "rewards/accuracies": 0.90625, "rewards/chosen": -0.5080645084381104, "rewards/margins": 2.8074049949645996, "rewards/rejected": -3.315469980239868, "step": 1706 }, { "epoch": 1.9781397032211365, "grad_norm": 51.95036099086183, "learning_rate": 6.04685986378195e-11, "logits/chosen": -1.25301992893219, "logits/rejected": -1.2303074598312378, "logps/chosen": -160.34934997558594, "logps/rejected": -162.81515502929688, "loss": 0.3678, "rewards/accuracies": 0.875, "rewards/chosen": -0.21462872624397278, "rewards/margins": 1.2838810682296753, "rewards/rejected": -1.4985097646713257, "step": 1708 }, { "epoch": 1.980456026058632, "grad_norm": 55.67392221664361, "learning_rate": 4.777866888611148e-11, "logits/chosen": -1.1367592811584473, "logits/rejected": -1.2029287815093994, "logps/chosen": -170.26568603515625, "logps/rejected": -203.4815216064453, "loss": 0.3303, "rewards/accuracies": 0.90625, "rewards/chosen": -0.21711598336696625, "rewards/margins": 1.6773165464401245, "rewards/rejected": -1.8944324254989624, "step": 1710 }, { "epoch": 1.9827723488961273, "grad_norm": 77.61747497941103, "learning_rate": 3.658122615880499e-11, "logits/chosen": -1.198671817779541, "logits/rejected": -1.1844216585159302, "logps/chosen": -188.25486755371094, "logps/rejected": -188.4522247314453, "loss": 0.4122, "rewards/accuracies": 0.75, "rewards/chosen": -0.35006386041641235, "rewards/margins": 1.136628270149231, "rewards/rejected": -1.4866920709609985, "step": 1712 }, { "epoch": 1.985088671733623, "grad_norm": 72.68985609286901, "learning_rate": 2.687643765615366e-11, "logits/chosen": -1.2630504369735718, "logits/rejected": -1.1126054525375366, "logps/chosen": -170.22010803222656, "logps/rejected": -162.9827880859375, "loss": 0.4154, "rewards/accuracies": 0.8125, "rewards/chosen": -0.1824585199356079, "rewards/margins": 1.2966349124908447, "rewards/rejected": -1.479093313217163, "step": 1714 }, { "epoch": 1.9874049945711183, "grad_norm": 57.324360587243646, "learning_rate": 1.8664448290106606e-11, "logits/chosen": -1.1106977462768555, "logits/rejected": -1.1625827550888062, "logps/chosen": -127.45535278320312, "logps/rejected": -168.37313842773438, "loss": 0.4271, "rewards/accuracies": 0.90625, "rewards/chosen": -0.41478192806243896, "rewards/margins": 1.4449265003204346, "rewards/rejected": -1.8597084283828735, "step": 1716 }, { "epoch": 1.989721317408614, "grad_norm": 64.91297510204477, "learning_rate": 1.1945380682132355e-11, "logits/chosen": -1.3344897031784058, "logits/rejected": -1.3637080192565918, "logps/chosen": -155.83399963378906, "logps/rejected": -174.8062744140625, "loss": 0.4455, "rewards/accuracies": 0.8125, "rewards/chosen": -0.2664361596107483, "rewards/margins": 1.3320696353912354, "rewards/rejected": -1.5985058546066284, "step": 1718 }, { "epoch": 1.9920376402461093, "grad_norm": 55.133928870906985, "learning_rate": 6.719335161364803e-12, "logits/chosen": -1.210727334022522, "logits/rejected": -1.2122191190719604, "logps/chosen": -138.89447021484375, "logps/rejected": -185.5248565673828, "loss": 0.407, "rewards/accuracies": 0.84375, "rewards/chosen": -0.43058472871780396, "rewards/margins": 2.1915080547332764, "rewards/rejected": -2.6220927238464355, "step": 1720 }, { "epoch": 1.9943539630836047, "grad_norm": 59.1945232431432, "learning_rate": 2.9863897631488e-12, "logits/chosen": -1.1495387554168701, "logits/rejected": -1.2423110008239746, "logps/chosen": -168.22439575195312, "logps/rejected": -205.48471069335938, "loss": 0.3821, "rewards/accuracies": 0.875, "rewards/chosen": -0.808469295501709, "rewards/margins": 1.811312198638916, "rewards/rejected": -2.619781494140625, "step": 1722 }, { "epoch": 1.9966702859211003, "grad_norm": 50.35911703570131, "learning_rate": 7.466002278522232e-13, "logits/chosen": -1.1888153553009033, "logits/rejected": -1.1903085708618164, "logps/chosen": -183.95315551757812, "logps/rejected": -213.3731689453125, "loss": 0.3585, "rewards/accuracies": 0.90625, "rewards/chosen": -0.34876811504364014, "rewards/margins": 1.8841259479522705, "rewards/rejected": -2.232893943786621, "step": 1724 }, { "epoch": 1.9989866087585957, "grad_norm": 63.15894144617586, "learning_rate": 0.0, "logits/chosen": -1.270525574684143, "logits/rejected": -1.1932576894760132, "logps/chosen": -133.14906311035156, "logps/rejected": -160.24928283691406, "loss": 0.4361, "rewards/accuracies": 0.875, "rewards/chosen": -0.38358262181282043, "rewards/margins": 1.584211826324463, "rewards/rejected": -1.967794418334961, "step": 1726 } ], "logging_steps": 2, "max_steps": 1726, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }