{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3474484256243214, "eval_steps": 100, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0023163228374954757, "grad_norm": 107.28771573687173, "learning_rate": 4e-09, "logits/chosen": -1.124485731124878, "logits/rejected": -1.2086994647979736, "logps/chosen": -156.01296997070312, "logps/rejected": -208.54991149902344, "loss": 0.6233, "rewards/accuracies": 0.59375, "rewards/chosen": 0.07398775964975357, "rewards/margins": 0.2667727768421173, "rewards/rejected": -0.19278500974178314, "step": 2 }, { "epoch": 0.0046326456749909515, "grad_norm": 88.08812844747567, "learning_rate": 8e-09, "logits/chosen": -1.2528715133666992, "logits/rejected": -1.2828329801559448, "logps/chosen": -132.4397430419922, "logps/rejected": -172.45309448242188, "loss": 0.7004, "rewards/accuracies": 0.71875, "rewards/chosen": 0.17529334127902985, "rewards/margins": 0.35418701171875, "rewards/rejected": -0.17889368534088135, "step": 4 }, { "epoch": 0.006948968512486428, "grad_norm": 91.60766301044009, "learning_rate": 1.1999999999999998e-08, "logits/chosen": -1.2843875885009766, "logits/rejected": -1.280484914779663, "logps/chosen": -152.78907775878906, "logps/rejected": -152.23944091796875, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -0.041480474174022675, "rewards/margins": 0.14676526188850403, "rewards/rejected": -0.1882457733154297, "step": 6 }, { "epoch": 0.009265291349981903, "grad_norm": 98.62741523160896, "learning_rate": 1.6e-08, "logits/chosen": -1.398779273033142, "logits/rejected": -1.4048051834106445, "logps/chosen": -169.36114501953125, "logps/rejected": -187.72393798828125, "loss": 0.6494, "rewards/accuracies": 0.78125, "rewards/chosen": 0.06773313134908676, "rewards/margins": 0.3909590244293213, "rewards/rejected": -0.32322588562965393, "step": 8 }, { "epoch": 0.01158161418747738, "grad_norm": 103.80152080031493, "learning_rate": 2e-08, "logits/chosen": -1.2080261707305908, "logits/rejected": -1.287600040435791, "logps/chosen": -143.546875, "logps/rejected": -167.290283203125, "loss": 0.714, "rewards/accuracies": 0.5, "rewards/chosen": 0.024446196854114532, "rewards/margins": 0.028562966734170914, "rewards/rejected": -0.004116774536669254, "step": 10 }, { "epoch": 0.013897937024972856, "grad_norm": 102.45515514662566, "learning_rate": 2.3999999999999997e-08, "logits/chosen": -1.3106777667999268, "logits/rejected": -1.2940219640731812, "logps/chosen": -146.94171142578125, "logps/rejected": -157.75350952148438, "loss": 0.6722, "rewards/accuracies": 0.5625, "rewards/chosen": 0.09310206025838852, "rewards/margins": 0.06749637424945831, "rewards/rejected": 0.0256056971848011, "step": 12 }, { "epoch": 0.01621425986246833, "grad_norm": 98.56963705951901, "learning_rate": 2.8000000000000003e-08, "logits/chosen": -1.2968071699142456, "logits/rejected": -1.3280229568481445, "logps/chosen": -124.2802505493164, "logps/rejected": -141.4438018798828, "loss": 0.6939, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11542778462171555, "rewards/margins": 0.1968868374824524, "rewards/rejected": -0.08145906031131744, "step": 14 }, { "epoch": 0.018530582699963806, "grad_norm": 126.73593067586688, "learning_rate": 3.2e-08, "logits/chosen": -1.2126774787902832, "logits/rejected": -1.3230162858963013, "logps/chosen": -143.24264526367188, "logps/rejected": -150.46603393554688, "loss": 0.6388, "rewards/accuracies": 0.8125, "rewards/chosen": 0.02438124269247055, "rewards/margins": 0.4613185226917267, "rewards/rejected": -0.43693727254867554, "step": 16 }, { "epoch": 0.020846905537459284, "grad_norm": 125.55466393152622, "learning_rate": 3.6e-08, "logits/chosen": -1.1398718357086182, "logits/rejected": -1.2096312046051025, "logps/chosen": -168.44686889648438, "logps/rejected": -209.51075744628906, "loss": 0.5937, "rewards/accuracies": 0.71875, "rewards/chosen": 0.058702100068330765, "rewards/margins": 0.36269423365592957, "rewards/rejected": -0.3039921820163727, "step": 18 }, { "epoch": 0.02316322837495476, "grad_norm": 99.14617505813268, "learning_rate": 4e-08, "logits/chosen": -1.18150794506073, "logits/rejected": -1.2542356252670288, "logps/chosen": -167.63018798828125, "logps/rejected": -192.35606384277344, "loss": 0.6576, "rewards/accuracies": 0.46875, "rewards/chosen": -0.0018138289451599121, "rewards/margins": 0.2771007716655731, "rewards/rejected": -0.27891457080841064, "step": 20 }, { "epoch": 0.025479551212450234, "grad_norm": 91.07306017372633, "learning_rate": 4.4e-08, "logits/chosen": -1.2071597576141357, "logits/rejected": -1.182037115097046, "logps/chosen": -144.98622131347656, "logps/rejected": -141.1741180419922, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": 0.08638761937618256, "rewards/margins": 0.27829694747924805, "rewards/rejected": -0.1919093132019043, "step": 22 }, { "epoch": 0.027795874049945712, "grad_norm": 89.9597440051718, "learning_rate": 4.799999999999999e-08, "logits/chosen": -1.31764554977417, "logits/rejected": -1.3588660955429077, "logps/chosen": -88.85529327392578, "logps/rejected": -98.12319946289062, "loss": 0.6819, "rewards/accuracies": 0.5, "rewards/chosen": 0.034286849200725555, "rewards/margins": -0.03185552358627319, "rewards/rejected": 0.06614237278699875, "step": 24 }, { "epoch": 0.030112196887441187, "grad_norm": 103.03824203236205, "learning_rate": 5.2e-08, "logits/chosen": -1.2300812005996704, "logits/rejected": -1.3013834953308105, "logps/chosen": -136.9016571044922, "logps/rejected": -158.10443115234375, "loss": 0.6651, "rewards/accuracies": 0.5, "rewards/chosen": 0.02496076375246048, "rewards/margins": 0.11402938514947891, "rewards/rejected": -0.08906861394643784, "step": 26 }, { "epoch": 0.03242851972493666, "grad_norm": 82.73233095035695, "learning_rate": 5.6000000000000005e-08, "logits/chosen": -1.2323240041732788, "logits/rejected": -1.279797911643982, "logps/chosen": -129.50372314453125, "logps/rejected": -149.10198974609375, "loss": 0.6957, "rewards/accuracies": 0.625, "rewards/chosen": 0.01655733771622181, "rewards/margins": 0.28547346591949463, "rewards/rejected": -0.26891613006591797, "step": 28 }, { "epoch": 0.03474484256243214, "grad_norm": 81.35170136123334, "learning_rate": 6e-08, "logits/chosen": -1.2328161001205444, "logits/rejected": -1.2630647420883179, "logps/chosen": -115.19596862792969, "logps/rejected": -122.42259216308594, "loss": 0.6297, "rewards/accuracies": 0.65625, "rewards/chosen": 0.05066928267478943, "rewards/margins": 0.29739153385162354, "rewards/rejected": -0.24672222137451172, "step": 30 }, { "epoch": 0.03706116539992761, "grad_norm": 97.64254508804386, "learning_rate": 6.4e-08, "logits/chosen": -1.203454852104187, "logits/rejected": -1.2570163011550903, "logps/chosen": -130.0023193359375, "logps/rejected": -140.43641662597656, "loss": 0.6388, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22588226199150085, "rewards/margins": 0.21626117825508118, "rewards/rejected": 0.009621085599064827, "step": 32 }, { "epoch": 0.03937748823742309, "grad_norm": 77.47976131311218, "learning_rate": 6.8e-08, "logits/chosen": -1.3255200386047363, "logits/rejected": -1.34698486328125, "logps/chosen": -154.41970825195312, "logps/rejected": -185.44137573242188, "loss": 0.6644, "rewards/accuracies": 0.71875, "rewards/chosen": 0.219247967004776, "rewards/margins": 0.5436465740203857, "rewards/rejected": -0.32439863681793213, "step": 34 }, { "epoch": 0.04169381107491857, "grad_norm": 116.81809749704041, "learning_rate": 7.2e-08, "logits/chosen": -1.172755479812622, "logits/rejected": -1.2560861110687256, "logps/chosen": -140.07249450683594, "logps/rejected": -150.0348358154297, "loss": 0.6966, "rewards/accuracies": 0.625, "rewards/chosen": -0.06599292159080505, "rewards/margins": 0.17668266594409943, "rewards/rejected": -0.2426755726337433, "step": 36 }, { "epoch": 0.04401013391241404, "grad_norm": 106.17036044375986, "learning_rate": 7.599999999999999e-08, "logits/chosen": -1.2809267044067383, "logits/rejected": -1.3268154859542847, "logps/chosen": -150.99420166015625, "logps/rejected": -156.5166473388672, "loss": 0.6641, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0937371775507927, "rewards/margins": 0.24054178595542908, "rewards/rejected": -0.14680461585521698, "step": 38 }, { "epoch": 0.04632645674990952, "grad_norm": 92.41470133544318, "learning_rate": 8e-08, "logits/chosen": -1.117331862449646, "logits/rejected": -1.1234540939331055, "logps/chosen": -116.87013244628906, "logps/rejected": -136.917724609375, "loss": 0.6733, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0025692852213978767, "rewards/margins": 0.18151307106018066, "rewards/rejected": -0.17894375324249268, "step": 40 }, { "epoch": 0.048642779587405, "grad_norm": 93.22753554673558, "learning_rate": 8.4e-08, "logits/chosen": -1.2923729419708252, "logits/rejected": -1.297027587890625, "logps/chosen": -166.103759765625, "logps/rejected": -181.45489501953125, "loss": 0.7232, "rewards/accuracies": 0.71875, "rewards/chosen": 0.10465113818645477, "rewards/margins": 0.18375912308692932, "rewards/rejected": -0.07910798490047455, "step": 42 }, { "epoch": 0.05095910242490047, "grad_norm": 106.85916991971588, "learning_rate": 8.8e-08, "logits/chosen": -1.1490933895111084, "logits/rejected": -1.1849195957183838, "logps/chosen": -128.70228576660156, "logps/rejected": -132.4073944091797, "loss": 0.6279, "rewards/accuracies": 0.71875, "rewards/chosen": 0.020340707153081894, "rewards/margins": 0.1922486573457718, "rewards/rejected": -0.1719079315662384, "step": 44 }, { "epoch": 0.053275425262395947, "grad_norm": 97.83169947406368, "learning_rate": 9.2e-08, "logits/chosen": -1.172762393951416, "logits/rejected": -1.2331753969192505, "logps/chosen": -158.21144104003906, "logps/rejected": -166.85243225097656, "loss": 0.678, "rewards/accuracies": 0.46875, "rewards/chosen": -0.032968997955322266, "rewards/margins": -0.025485830381512642, "rewards/rejected": -0.007483180612325668, "step": 46 }, { "epoch": 0.055591748099891425, "grad_norm": 95.31491574095924, "learning_rate": 9.599999999999999e-08, "logits/chosen": -1.1868703365325928, "logits/rejected": -1.1828659772872925, "logps/chosen": -124.27775573730469, "logps/rejected": -134.80194091796875, "loss": 0.6496, "rewards/accuracies": 0.65625, "rewards/chosen": 0.07946968823671341, "rewards/margins": 0.2973785996437073, "rewards/rejected": -0.21790890395641327, "step": 48 }, { "epoch": 0.057908070937386896, "grad_norm": 85.1406868444302, "learning_rate": 1e-07, "logits/chosen": -1.2733350992202759, "logits/rejected": -1.259924054145813, "logps/chosen": -98.49462890625, "logps/rejected": -97.88270568847656, "loss": 0.6637, "rewards/accuracies": 0.625, "rewards/chosen": 0.0715535506606102, "rewards/margins": 0.20892760157585144, "rewards/rejected": -0.13737404346466064, "step": 50 }, { "epoch": 0.060224393774882375, "grad_norm": 125.25778955296133, "learning_rate": 1.04e-07, "logits/chosen": -1.149878740310669, "logits/rejected": -1.2266716957092285, "logps/chosen": -148.4358367919922, "logps/rejected": -194.48550415039062, "loss": 0.6064, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20534949004650116, "rewards/margins": 0.5171413421630859, "rewards/rejected": -0.31179192662239075, "step": 52 }, { "epoch": 0.06254071661237785, "grad_norm": 85.1210580902555, "learning_rate": 1.08e-07, "logits/chosen": -1.167691946029663, "logits/rejected": -1.2391445636749268, "logps/chosen": -89.7966537475586, "logps/rejected": -90.55988311767578, "loss": 0.652, "rewards/accuracies": 0.53125, "rewards/chosen": 0.041167087852954865, "rewards/margins": 0.031343974173069, "rewards/rejected": 0.009823101572692394, "step": 54 }, { "epoch": 0.06485703944987332, "grad_norm": 113.7038348505988, "learning_rate": 1.1200000000000001e-07, "logits/chosen": -1.1915769577026367, "logits/rejected": -1.2897090911865234, "logps/chosen": -128.1698760986328, "logps/rejected": -162.2020721435547, "loss": 0.6575, "rewards/accuracies": 0.59375, "rewards/chosen": 0.04976767301559448, "rewards/margins": 0.225263312458992, "rewards/rejected": -0.17549563944339752, "step": 56 }, { "epoch": 0.0671733622873688, "grad_norm": 101.75813362292396, "learning_rate": 1.1599999999999999e-07, "logits/chosen": -1.1701388359069824, "logits/rejected": -1.2366788387298584, "logps/chosen": -125.07969665527344, "logps/rejected": -143.41500854492188, "loss": 0.6856, "rewards/accuracies": 0.46875, "rewards/chosen": 0.10177399218082428, "rewards/margins": -0.0010812487453222275, "rewards/rejected": 0.10285523533821106, "step": 58 }, { "epoch": 0.06948968512486428, "grad_norm": 126.89350965014833, "learning_rate": 1.2e-07, "logits/chosen": -1.0981509685516357, "logits/rejected": -1.0755817890167236, "logps/chosen": -97.16602325439453, "logps/rejected": -108.92754364013672, "loss": 0.6917, "rewards/accuracies": 0.53125, "rewards/chosen": 0.06149844080209732, "rewards/margins": 0.26494595408439636, "rewards/rejected": -0.20344750583171844, "step": 60 }, { "epoch": 0.07180600796235975, "grad_norm": 104.04126418655044, "learning_rate": 1.24e-07, "logits/chosen": -1.349946141242981, "logits/rejected": -1.3422037363052368, "logps/chosen": -196.04966735839844, "logps/rejected": -191.64556884765625, "loss": 0.6639, "rewards/accuracies": 0.65625, "rewards/chosen": -0.06520761549472809, "rewards/margins": 0.08913937956094742, "rewards/rejected": -0.1543469876050949, "step": 62 }, { "epoch": 0.07412233079985522, "grad_norm": 88.57358643267023, "learning_rate": 1.28e-07, "logits/chosen": -1.301114559173584, "logits/rejected": -1.3226174116134644, "logps/chosen": -193.7246856689453, "logps/rejected": -175.90676879882812, "loss": 0.7147, "rewards/accuracies": 0.65625, "rewards/chosen": -0.012029323726892471, "rewards/margins": 0.13698697090148926, "rewards/rejected": -0.14901632070541382, "step": 64 }, { "epoch": 0.07643865363735071, "grad_norm": 88.52081037901337, "learning_rate": 1.32e-07, "logits/chosen": -1.2255011796951294, "logits/rejected": -1.1852511167526245, "logps/chosen": -126.35673522949219, "logps/rejected": -129.8230743408203, "loss": 0.6276, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08929681777954102, "rewards/margins": 0.20648589730262756, "rewards/rejected": -0.29578274488449097, "step": 66 }, { "epoch": 0.07875497647484618, "grad_norm": 99.62619229467569, "learning_rate": 1.36e-07, "logits/chosen": -1.2398712635040283, "logits/rejected": -1.2752665281295776, "logps/chosen": -120.943115234375, "logps/rejected": -146.46226501464844, "loss": 0.6609, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15306755900382996, "rewards/margins": 0.28030622005462646, "rewards/rejected": -0.12723864614963531, "step": 68 }, { "epoch": 0.08107129931234165, "grad_norm": 123.76564817180537, "learning_rate": 1.3999999999999998e-07, "logits/chosen": -1.2972962856292725, "logits/rejected": -1.2252490520477295, "logps/chosen": -225.8544921875, "logps/rejected": -202.05828857421875, "loss": 0.6665, "rewards/accuracies": 0.59375, "rewards/chosen": -0.04239021614193916, "rewards/margins": 0.21172578632831573, "rewards/rejected": -0.2541159689426422, "step": 70 }, { "epoch": 0.08338762214983714, "grad_norm": 76.83779340876126, "learning_rate": 1.44e-07, "logits/chosen": -1.2308259010314941, "logits/rejected": -1.2874609231948853, "logps/chosen": -122.35316467285156, "logps/rejected": -140.2095489501953, "loss": 0.6382, "rewards/accuracies": 0.40625, "rewards/chosen": -0.03576444834470749, "rewards/margins": 0.04666022211313248, "rewards/rejected": -0.08242467045783997, "step": 72 }, { "epoch": 0.08570394498733261, "grad_norm": 99.14395654649428, "learning_rate": 1.48e-07, "logits/chosen": -1.2242047786712646, "logits/rejected": -1.1648895740509033, "logps/chosen": -129.6905975341797, "logps/rejected": -149.36679077148438, "loss": 0.7052, "rewards/accuracies": 0.46875, "rewards/chosen": -0.054999418556690216, "rewards/margins": -0.0660078227519989, "rewards/rejected": 0.011008389294147491, "step": 74 }, { "epoch": 0.08802026782482808, "grad_norm": 97.12896274616604, "learning_rate": 1.5199999999999998e-07, "logits/chosen": -1.1600117683410645, "logits/rejected": -1.1804448366165161, "logps/chosen": -104.76295471191406, "logps/rejected": -119.62078857421875, "loss": 0.7319, "rewards/accuracies": 0.53125, "rewards/chosen": 0.09297358989715576, "rewards/margins": -0.027700770646333694, "rewards/rejected": 0.12067436426877975, "step": 76 }, { "epoch": 0.09033659066232357, "grad_norm": 97.18348074718145, "learning_rate": 1.56e-07, "logits/chosen": -1.32283616065979, "logits/rejected": -1.3598227500915527, "logps/chosen": -106.35183715820312, "logps/rejected": -126.29917907714844, "loss": 0.6492, "rewards/accuracies": 0.59375, "rewards/chosen": -0.02818872779607773, "rewards/margins": 0.018800366669893265, "rewards/rejected": -0.046989068388938904, "step": 78 }, { "epoch": 0.09265291349981904, "grad_norm": 97.61734233459106, "learning_rate": 1.6e-07, "logits/chosen": -1.271921157836914, "logits/rejected": -1.2778291702270508, "logps/chosen": -131.4869384765625, "logps/rejected": -138.66790771484375, "loss": 0.674, "rewards/accuracies": 0.59375, "rewards/chosen": 0.12225940823554993, "rewards/margins": 0.13028502464294434, "rewards/rejected": -0.008025608956813812, "step": 80 }, { "epoch": 0.09496923633731451, "grad_norm": 107.33609007479366, "learning_rate": 1.6399999999999999e-07, "logits/chosen": -1.2688210010528564, "logits/rejected": -1.157952904701233, "logps/chosen": -139.7398223876953, "logps/rejected": -118.81407928466797, "loss": 0.6931, "rewards/accuracies": 0.59375, "rewards/chosen": 0.09918585419654846, "rewards/margins": 0.21009066700935364, "rewards/rejected": -0.11090480536222458, "step": 82 }, { "epoch": 0.09728555917481, "grad_norm": 110.01597856385645, "learning_rate": 1.68e-07, "logits/chosen": -1.2803689241409302, "logits/rejected": -1.2909519672393799, "logps/chosen": -132.66493225097656, "logps/rejected": -173.28465270996094, "loss": 0.6635, "rewards/accuracies": 0.5625, "rewards/chosen": 0.05728979408740997, "rewards/margins": 0.36615562438964844, "rewards/rejected": -0.30886584520339966, "step": 84 }, { "epoch": 0.09960188201230546, "grad_norm": 83.44332939168942, "learning_rate": 1.7199999999999998e-07, "logits/chosen": -1.3057663440704346, "logits/rejected": -1.385368824005127, "logps/chosen": -107.35197448730469, "logps/rejected": -131.16021728515625, "loss": 0.6259, "rewards/accuracies": 0.75, "rewards/chosen": 0.11738797277212143, "rewards/margins": 0.3890346884727478, "rewards/rejected": -0.2716467082500458, "step": 86 }, { "epoch": 0.10191820484980094, "grad_norm": 111.78643885213532, "learning_rate": 1.76e-07, "logits/chosen": -1.2848472595214844, "logits/rejected": -1.3411585092544556, "logps/chosen": -122.2959976196289, "logps/rejected": -137.3688507080078, "loss": 0.7167, "rewards/accuracies": 0.4375, "rewards/chosen": -0.042663730680942535, "rewards/margins": 0.04965965449810028, "rewards/rejected": -0.09232338517904282, "step": 88 }, { "epoch": 0.10423452768729642, "grad_norm": 98.91788911999603, "learning_rate": 1.8e-07, "logits/chosen": -1.3746676445007324, "logits/rejected": -1.4242061376571655, "logps/chosen": -165.28323364257812, "logps/rejected": -191.28463745117188, "loss": 0.6558, "rewards/accuracies": 0.75, "rewards/chosen": -0.03775983303785324, "rewards/margins": 0.41289234161376953, "rewards/rejected": -0.45065221190452576, "step": 90 }, { "epoch": 0.10655085052479189, "grad_norm": 111.36490143512665, "learning_rate": 1.84e-07, "logits/chosen": -1.3450393676757812, "logits/rejected": -1.2920035123825073, "logps/chosen": -113.93867492675781, "logps/rejected": -127.0622787475586, "loss": 0.652, "rewards/accuracies": 0.59375, "rewards/chosen": 0.016492784023284912, "rewards/margins": 0.23039251565933228, "rewards/rejected": -0.21389973163604736, "step": 92 }, { "epoch": 0.10886717336228736, "grad_norm": 86.77836554288824, "learning_rate": 1.88e-07, "logits/chosen": -1.2809118032455444, "logits/rejected": -1.2533433437347412, "logps/chosen": -126.3050308227539, "logps/rejected": -146.13954162597656, "loss": 0.67, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12605877220630646, "rewards/margins": 0.10366299748420715, "rewards/rejected": -0.2297217696905136, "step": 94 }, { "epoch": 0.11118349619978285, "grad_norm": 88.7771339602004, "learning_rate": 1.9199999999999997e-07, "logits/chosen": -1.3726493120193481, "logits/rejected": -1.3574461936950684, "logps/chosen": -166.51853942871094, "logps/rejected": -152.40924072265625, "loss": 0.6228, "rewards/accuracies": 0.625, "rewards/chosen": 0.10408200323581696, "rewards/margins": 0.2763758897781372, "rewards/rejected": -0.17229388654232025, "step": 96 }, { "epoch": 0.11349981903727832, "grad_norm": 108.88629440565376, "learning_rate": 1.9599999999999998e-07, "logits/chosen": -1.2083911895751953, "logits/rejected": -1.2246620655059814, "logps/chosen": -102.56442260742188, "logps/rejected": -119.50433349609375, "loss": 0.6727, "rewards/accuracies": 0.5625, "rewards/chosen": -0.12194587290287018, "rewards/margins": -0.01594824716448784, "rewards/rejected": -0.10599763691425323, "step": 98 }, { "epoch": 0.11581614187477379, "grad_norm": 107.34841983393876, "learning_rate": 2e-07, "logits/chosen": -1.301100254058838, "logits/rejected": -1.2448687553405762, "logps/chosen": -98.93006896972656, "logps/rejected": -104.18254089355469, "loss": 0.6665, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13594529032707214, "rewards/margins": 0.32611793279647827, "rewards/rejected": -0.19017262756824493, "step": 100 }, { "epoch": 0.11581614187477379, "eval_logits/chosen": -1.2619309425354004, "eval_logits/rejected": -1.26002836227417, "eval_logps/chosen": -136.7689208984375, "eval_logps/rejected": -136.49838256835938, "eval_loss": 0.698452353477478, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.11377277970314026, "eval_rewards/margins": 0.14070965349674225, "eval_rewards/rejected": -0.2544824182987213, "eval_runtime": 24.9769, "eval_samples_per_second": 4.004, "eval_steps_per_second": 1.001, "step": 100 }, { "epoch": 0.11813246471226928, "grad_norm": 116.33900974943187, "learning_rate": 1.9999925339977214e-07, "logits/chosen": -1.184662938117981, "logits/rejected": -1.2719396352767944, "logps/chosen": -97.52359008789062, "logps/rejected": -115.05503845214844, "loss": 0.7448, "rewards/accuracies": 0.53125, "rewards/chosen": 0.214466392993927, "rewards/margins": 0.24747245013713837, "rewards/rejected": -0.033006034791469574, "step": 102 }, { "epoch": 0.12044878754976475, "grad_norm": 104.21886206321497, "learning_rate": 1.9999701361023685e-07, "logits/chosen": -1.3039063215255737, "logits/rejected": -1.3020060062408447, "logps/chosen": -159.8837432861328, "logps/rejected": -170.56915283203125, "loss": 0.6355, "rewards/accuracies": 0.65625, "rewards/chosen": 0.22545436024665833, "rewards/margins": 0.24028439819812775, "rewards/rejected": -0.014830047264695168, "step": 104 }, { "epoch": 0.12276511038726022, "grad_norm": 87.7482533485662, "learning_rate": 1.9999328066483861e-07, "logits/chosen": -1.222218632698059, "logits/rejected": -1.2927945852279663, "logps/chosen": -139.34170532226562, "logps/rejected": -165.7588348388672, "loss": 0.6163, "rewards/accuracies": 0.59375, "rewards/chosen": 0.08087025582790375, "rewards/margins": 0.19403491914272308, "rewards/rejected": -0.11316468566656113, "step": 106 }, { "epoch": 0.1250814332247557, "grad_norm": 99.23120967868726, "learning_rate": 1.9998805461931787e-07, "logits/chosen": -1.4282017946243286, "logits/rejected": -1.4305387735366821, "logps/chosen": -143.5193634033203, "logps/rejected": -159.5055389404297, "loss": 0.6448, "rewards/accuracies": 0.65625, "rewards/chosen": 0.19475266337394714, "rewards/margins": 0.35647571086883545, "rewards/rejected": -0.1617230772972107, "step": 108 }, { "epoch": 0.12739775606225118, "grad_norm": 138.8867591897412, "learning_rate": 1.9998133555170987e-07, "logits/chosen": -1.251868486404419, "logits/rejected": -1.2644716501235962, "logps/chosen": -155.47824096679688, "logps/rejected": -147.9483642578125, "loss": 0.7241, "rewards/accuracies": 0.65625, "rewards/chosen": 0.009776605293154716, "rewards/margins": 0.11597751080989838, "rewards/rejected": -0.1062009185552597, "step": 110 }, { "epoch": 0.12971407889974665, "grad_norm": 83.82733566843218, "learning_rate": 1.9997312356234383e-07, "logits/chosen": -1.1328340768814087, "logits/rejected": -1.1860363483428955, "logps/chosen": -122.01078796386719, "logps/rejected": -129.853759765625, "loss": 0.6493, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04629912227392197, "rewards/margins": 0.14483632147312164, "rewards/rejected": -0.09853721410036087, "step": 112 }, { "epoch": 0.13203040173724212, "grad_norm": 93.13036576501773, "learning_rate": 1.9996341877384118e-07, "logits/chosen": -1.1620995998382568, "logits/rejected": -1.1975159645080566, "logps/chosen": -106.84819793701172, "logps/rejected": -118.56710815429688, "loss": 0.7035, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0475483313202858, "rewards/margins": 0.27000802755355835, "rewards/rejected": -0.22245970368385315, "step": 114 }, { "epoch": 0.1343467245747376, "grad_norm": 91.76740971647824, "learning_rate": 1.9995222133111386e-07, "logits/chosen": -1.41290283203125, "logits/rejected": -1.4713375568389893, "logps/chosen": -150.1622314453125, "logps/rejected": -171.03826904296875, "loss": 0.6605, "rewards/accuracies": 0.625, "rewards/chosen": -0.036708392202854156, "rewards/margins": 0.22243811190128326, "rewards/rejected": -0.259146511554718, "step": 116 }, { "epoch": 0.1366630474122331, "grad_norm": 92.6178794136077, "learning_rate": 1.9993953140136216e-07, "logits/chosen": -1.2917448282241821, "logits/rejected": -1.2992435693740845, "logps/chosen": -156.93075561523438, "logps/rejected": -169.90003967285156, "loss": 0.6092, "rewards/accuracies": 0.71875, "rewards/chosen": 0.2660582661628723, "rewards/margins": 0.3958339989185333, "rewards/rejected": -0.1297757476568222, "step": 118 }, { "epoch": 0.13897937024972856, "grad_norm": 120.21640303391953, "learning_rate": 1.9992534917407219e-07, "logits/chosen": -1.349656105041504, "logits/rejected": -1.3740586042404175, "logps/chosen": -105.40829467773438, "logps/rejected": -121.30719757080078, "loss": 0.6782, "rewards/accuracies": 0.6875, "rewards/chosen": 0.12624263763427734, "rewards/margins": 0.2478535771369934, "rewards/rejected": -0.12161093950271606, "step": 120 }, { "epoch": 0.14129569308722403, "grad_norm": 96.89881415216858, "learning_rate": 1.9990967486101294e-07, "logits/chosen": -1.2133190631866455, "logits/rejected": -1.2981417179107666, "logps/chosen": -143.4733123779297, "logps/rejected": -158.9942169189453, "loss": 0.6542, "rewards/accuracies": 0.78125, "rewards/chosen": 0.12114143371582031, "rewards/margins": 0.3096562922000885, "rewards/rejected": -0.18851490318775177, "step": 122 }, { "epoch": 0.1436120159247195, "grad_norm": 115.73216474991814, "learning_rate": 1.9989250869623339e-07, "logits/chosen": -1.1692413091659546, "logits/rejected": -1.1631613969802856, "logps/chosen": -75.05902099609375, "logps/rejected": -73.19275665283203, "loss": 0.6629, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004932025447487831, "rewards/margins": 0.01587551087141037, "rewards/rejected": -0.010943496599793434, "step": 124 }, { "epoch": 0.14592833876221498, "grad_norm": 91.12287746621037, "learning_rate": 1.9987385093605883e-07, "logits/chosen": -1.2355098724365234, "logits/rejected": -1.3346201181411743, "logps/chosen": -128.04537963867188, "logps/rejected": -157.95831298828125, "loss": 0.6661, "rewards/accuracies": 0.5625, "rewards/chosen": 0.16255205869674683, "rewards/margins": 0.1615760326385498, "rewards/rejected": 0.0009760260581970215, "step": 126 }, { "epoch": 0.14824466159971045, "grad_norm": 75.57776368919416, "learning_rate": 1.9985370185908693e-07, "logits/chosen": -1.324599266052246, "logits/rejected": -1.4256994724273682, "logps/chosen": -160.19496154785156, "logps/rejected": -195.67481994628906, "loss": 0.6052, "rewards/accuracies": 0.625, "rewards/chosen": 0.2864247262477875, "rewards/margins": 0.38650280237197876, "rewards/rejected": -0.10007809102535248, "step": 128 }, { "epoch": 0.15056098443720595, "grad_norm": 99.17567816074047, "learning_rate": 1.9983206176618386e-07, "logits/chosen": -1.3276948928833008, "logits/rejected": -1.355118751525879, "logps/chosen": -150.64071655273438, "logps/rejected": -160.59710693359375, "loss": 0.6456, "rewards/accuracies": 0.59375, "rewards/chosen": 0.24799679219722748, "rewards/margins": 0.17287284135818481, "rewards/rejected": 0.07512396574020386, "step": 130 }, { "epoch": 0.15287730727470142, "grad_norm": 96.98870449195998, "learning_rate": 1.9980893098047952e-07, "logits/chosen": -1.3167519569396973, "logits/rejected": -1.3964886665344238, "logps/chosen": -180.10609436035156, "logps/rejected": -202.38543701171875, "loss": 0.6797, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005836776457726955, "rewards/margins": 0.22576376795768738, "rewards/rejected": -0.2316005378961563, "step": 132 }, { "epoch": 0.1551936301121969, "grad_norm": 117.99608582102925, "learning_rate": 1.9978430984736302e-07, "logits/chosen": -1.2505745887756348, "logits/rejected": -1.2896987199783325, "logps/chosen": -136.62095642089844, "logps/rejected": -174.75283813476562, "loss": 0.6933, "rewards/accuracies": 0.5625, "rewards/chosen": -0.007682096213102341, "rewards/margins": 0.22923235595226288, "rewards/rejected": -0.2369144707918167, "step": 134 }, { "epoch": 0.15750995294969236, "grad_norm": 96.12339888540123, "learning_rate": 1.9975819873447716e-07, "logits/chosen": -1.3039547204971313, "logits/rejected": -1.2686259746551514, "logps/chosen": -148.68177795410156, "logps/rejected": -152.08419799804688, "loss": 0.7376, "rewards/accuracies": 0.4375, "rewards/chosen": 0.060760702937841415, "rewards/margins": 0.03407803922891617, "rewards/rejected": 0.026682645082473755, "step": 136 }, { "epoch": 0.15982627578718783, "grad_norm": 89.28907762268612, "learning_rate": 1.9973059803171318e-07, "logits/chosen": -1.289531946182251, "logits/rejected": -1.3106164932250977, "logps/chosen": -132.0419464111328, "logps/rejected": -141.20553588867188, "loss": 0.6417, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11375564336776733, "rewards/margins": 0.15943853557109833, "rewards/rejected": -0.045682892203330994, "step": 138 }, { "epoch": 0.1621425986246833, "grad_norm": 91.84482179377031, "learning_rate": 1.9970150815120492e-07, "logits/chosen": -1.192561388015747, "logits/rejected": -1.2516599893569946, "logps/chosen": -144.32687377929688, "logps/rejected": -173.77713012695312, "loss": 0.6664, "rewards/accuracies": 0.5625, "rewards/chosen": 0.15235400199890137, "rewards/margins": 0.528164267539978, "rewards/rejected": -0.3758102059364319, "step": 140 }, { "epoch": 0.1644589214621788, "grad_norm": 106.57145249868728, "learning_rate": 1.9967092952732263e-07, "logits/chosen": -1.2669620513916016, "logits/rejected": -1.2892065048217773, "logps/chosen": -158.38601684570312, "logps/rejected": -170.94969177246094, "loss": 0.6949, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014247164130210876, "rewards/margins": 0.1061759814620018, "rewards/rejected": -0.12042315304279327, "step": 142 }, { "epoch": 0.16677524429967427, "grad_norm": 84.38620972657165, "learning_rate": 1.9963886261666644e-07, "logits/chosen": -1.3349426984786987, "logits/rejected": -1.3626400232315063, "logps/chosen": -182.36489868164062, "logps/rejected": -181.57772827148438, "loss": 0.6661, "rewards/accuracies": 0.5, "rewards/chosen": 0.010849647223949432, "rewards/margins": -0.0017997026443481445, "rewards/rejected": 0.012649361044168472, "step": 144 }, { "epoch": 0.16909156713716975, "grad_norm": 98.62106551824567, "learning_rate": 1.996053078980596e-07, "logits/chosen": -1.2585192918777466, "logits/rejected": -1.3876447677612305, "logps/chosen": -197.3888702392578, "logps/rejected": -256.7153625488281, "loss": 0.6226, "rewards/accuracies": 0.59375, "rewards/chosen": 0.11644049733877182, "rewards/margins": 0.36041852831840515, "rewards/rejected": -0.24397803843021393, "step": 146 }, { "epoch": 0.17140788997466522, "grad_norm": 98.22080984411147, "learning_rate": 1.9957026587254133e-07, "logits/chosen": -1.337060809135437, "logits/rejected": -1.409785270690918, "logps/chosen": -132.1688995361328, "logps/rejected": -153.75076293945312, "loss": 0.6727, "rewards/accuracies": 0.625, "rewards/chosen": 0.1054624617099762, "rewards/margins": 0.2318607121706009, "rewards/rejected": -0.1263982504606247, "step": 148 }, { "epoch": 0.1737242128121607, "grad_norm": 97.21935367376963, "learning_rate": 1.9953373706335933e-07, "logits/chosen": -1.1929289102554321, "logits/rejected": -1.2236409187316895, "logps/chosen": -107.02670288085938, "logps/rejected": -127.95613098144531, "loss": 0.6247, "rewards/accuracies": 0.59375, "rewards/chosen": 0.2545817792415619, "rewards/margins": 0.5042745471000671, "rewards/rejected": -0.24969279766082764, "step": 150 }, { "epoch": 0.17604053564965616, "grad_norm": 94.64988218894314, "learning_rate": 1.994957220159619e-07, "logits/chosen": -1.3698749542236328, "logits/rejected": -1.3630839586257935, "logps/chosen": -139.28404235839844, "logps/rejected": -144.27447509765625, "loss": 0.6708, "rewards/accuracies": 0.46875, "rewards/chosen": 0.09452275186777115, "rewards/margins": -0.054528601467609406, "rewards/rejected": 0.14905135333538055, "step": 152 }, { "epoch": 0.17835685848715166, "grad_norm": 84.21031983415891, "learning_rate": 1.9945622129798997e-07, "logits/chosen": -1.1571345329284668, "logits/rejected": -1.1760966777801514, "logps/chosen": -116.92776489257812, "logps/rejected": -130.6823272705078, "loss": 0.7077, "rewards/accuracies": 0.65625, "rewards/chosen": 0.014008231461048126, "rewards/margins": 0.1772165149450302, "rewards/rejected": -0.16320832073688507, "step": 154 }, { "epoch": 0.18067318132464713, "grad_norm": 92.78782165706316, "learning_rate": 1.994152354992684e-07, "logits/chosen": -1.2113721370697021, "logits/rejected": -1.1836888790130615, "logps/chosen": -152.64590454101562, "logps/rejected": -165.62060546875, "loss": 0.6243, "rewards/accuracies": 0.625, "rewards/chosen": 0.08571603149175644, "rewards/margins": 0.3834956884384155, "rewards/rejected": -0.2977796792984009, "step": 156 }, { "epoch": 0.1829895041621426, "grad_norm": 87.13678534194861, "learning_rate": 1.993727652317973e-07, "logits/chosen": -1.3516209125518799, "logits/rejected": -1.3507136106491089, "logps/chosen": -131.21653747558594, "logps/rejected": -137.61451721191406, "loss": 0.663, "rewards/accuracies": 0.625, "rewards/chosen": 0.1327284872531891, "rewards/margins": 0.0872359424829483, "rewards/rejected": 0.04549254849553108, "step": 158 }, { "epoch": 0.18530582699963807, "grad_norm": 95.58101049566258, "learning_rate": 1.9932881112974295e-07, "logits/chosen": -1.33234441280365, "logits/rejected": -1.3648202419281006, "logps/chosen": -145.8714141845703, "logps/rejected": -169.70263671875, "loss": 0.6228, "rewards/accuracies": 0.78125, "rewards/chosen": 0.2145387828350067, "rewards/margins": 0.47029000520706177, "rewards/rejected": -0.25575125217437744, "step": 160 }, { "epoch": 0.18762214983713354, "grad_norm": 102.72583148528975, "learning_rate": 1.9928337384942815e-07, "logits/chosen": -1.3217285871505737, "logits/rejected": -1.3136992454528809, "logps/chosen": -136.3543701171875, "logps/rejected": -162.57989501953125, "loss": 0.6413, "rewards/accuracies": 0.65625, "rewards/chosen": 0.10585685819387436, "rewards/margins": 0.4493997395038605, "rewards/rejected": -0.3435429036617279, "step": 162 }, { "epoch": 0.18993847267462902, "grad_norm": 80.21906160407148, "learning_rate": 1.992364540693226e-07, "logits/chosen": -1.1656920909881592, "logits/rejected": -1.251837134361267, "logps/chosen": -107.77313995361328, "logps/rejected": -126.16749572753906, "loss": 0.6541, "rewards/accuracies": 0.5625, "rewards/chosen": 0.11644032597541809, "rewards/margins": 0.2288074493408203, "rewards/rejected": -0.11236711591482162, "step": 164 }, { "epoch": 0.19225479551212452, "grad_norm": 86.51541074826133, "learning_rate": 1.991880524900327e-07, "logits/chosen": -1.3202519416809082, "logits/rejected": -1.3556690216064453, "logps/chosen": -161.82077026367188, "logps/rejected": -177.78990173339844, "loss": 0.6568, "rewards/accuracies": 0.6875, "rewards/chosen": 0.14771340787410736, "rewards/margins": 0.33536669611930847, "rewards/rejected": -0.1876532882452011, "step": 166 }, { "epoch": 0.19457111834962, "grad_norm": 94.05305295088043, "learning_rate": 1.99138169834291e-07, "logits/chosen": -1.3505268096923828, "logits/rejected": -1.3973050117492676, "logps/chosen": -139.77182006835938, "logps/rejected": -162.59844970703125, "loss": 0.6877, "rewards/accuracies": 0.5625, "rewards/chosen": 0.07011064141988754, "rewards/margins": 0.491350919008255, "rewards/rejected": -0.42124032974243164, "step": 168 }, { "epoch": 0.19688744118711546, "grad_norm": 97.64767459159383, "learning_rate": 1.9908680684694557e-07, "logits/chosen": -1.3445894718170166, "logits/rejected": -1.290609359741211, "logps/chosen": -141.98887634277344, "logps/rejected": -149.97586059570312, "loss": 0.6267, "rewards/accuracies": 0.71875, "rewards/chosen": 0.06746640801429749, "rewards/margins": 0.2343478798866272, "rewards/rejected": -0.1668814867734909, "step": 170 }, { "epoch": 0.19920376402461093, "grad_norm": 83.46398544914094, "learning_rate": 1.990339642949488e-07, "logits/chosen": -1.2715935707092285, "logits/rejected": -1.287178874015808, "logps/chosen": -148.58975219726562, "logps/rejected": -165.84759521484375, "loss": 0.63, "rewards/accuracies": 0.65625, "rewards/chosen": 0.021569374948740005, "rewards/margins": 0.302487850189209, "rewards/rejected": -0.2809184491634369, "step": 172 }, { "epoch": 0.2015200868621064, "grad_norm": 76.3322328554944, "learning_rate": 1.9897964296734585e-07, "logits/chosen": -1.3700447082519531, "logits/rejected": -1.342909336090088, "logps/chosen": -120.27708435058594, "logps/rejected": -138.94029235839844, "loss": 0.5928, "rewards/accuracies": 0.625, "rewards/chosen": 0.13506034016609192, "rewards/margins": 0.2544079124927521, "rewards/rejected": -0.11934758722782135, "step": 174 }, { "epoch": 0.20383640969960187, "grad_norm": 107.54391320136774, "learning_rate": 1.9892384367526306e-07, "logits/chosen": -1.295619249343872, "logits/rejected": -1.310873031616211, "logps/chosen": -121.80717468261719, "logps/rejected": -130.56863403320312, "loss": 0.6713, "rewards/accuracies": 0.71875, "rewards/chosen": 0.04329588636755943, "rewards/margins": 0.15788349509239197, "rewards/rejected": -0.11458761245012283, "step": 176 }, { "epoch": 0.20615273253709737, "grad_norm": 119.93284403188072, "learning_rate": 1.9886656725189573e-07, "logits/chosen": -1.279773473739624, "logits/rejected": -1.248420000076294, "logps/chosen": -140.36367797851562, "logps/rejected": -171.85006713867188, "loss": 0.6378, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0794132649898529, "rewards/margins": 0.44784438610076904, "rewards/rejected": -0.36843112111091614, "step": 178 }, { "epoch": 0.20846905537459284, "grad_norm": 79.14950979441363, "learning_rate": 1.9880781455249567e-07, "logits/chosen": -1.2967725992202759, "logits/rejected": -1.25602126121521, "logps/chosen": -144.56265258789062, "logps/rejected": -162.73004150390625, "loss": 0.6249, "rewards/accuracies": 0.71875, "rewards/chosen": -0.049035411328077316, "rewards/margins": 0.5332698822021484, "rewards/rejected": -0.5823052525520325, "step": 180 }, { "epoch": 0.21078537821208831, "grad_norm": 98.83687345993943, "learning_rate": 1.9874758645435846e-07, "logits/chosen": -1.2088567018508911, "logits/rejected": -1.2226208448410034, "logps/chosen": -123.06584930419922, "logps/rejected": -139.6479034423828, "loss": 0.7016, "rewards/accuracies": 0.65625, "rewards/chosen": 0.08502492308616638, "rewards/margins": 0.19190920889377594, "rewards/rejected": -0.10688426345586777, "step": 182 }, { "epoch": 0.21310170104958379, "grad_norm": 113.97507708078155, "learning_rate": 1.986858838568103e-07, "logits/chosen": -1.3104774951934814, "logits/rejected": -1.4111891984939575, "logps/chosen": -181.88356018066406, "logps/rejected": -208.72584533691406, "loss": 0.705, "rewards/accuracies": 0.65625, "rewards/chosen": -0.08081494271755219, "rewards/margins": 0.3886911869049072, "rewards/rejected": -0.469506174325943, "step": 184 }, { "epoch": 0.21541802388707926, "grad_norm": 77.45086288721679, "learning_rate": 1.986227076811947e-07, "logits/chosen": -1.3617669343948364, "logits/rejected": -1.4249626398086548, "logps/chosen": -129.08343505859375, "logps/rejected": -152.81179809570312, "loss": 0.6411, "rewards/accuracies": 0.625, "rewards/chosen": 0.05278458073735237, "rewards/margins": 0.2645794153213501, "rewards/rejected": -0.21179485321044922, "step": 186 }, { "epoch": 0.21773434672457473, "grad_norm": 84.35008555939693, "learning_rate": 1.985580588708586e-07, "logits/chosen": -1.3356997966766357, "logits/rejected": -1.3520371913909912, "logps/chosen": -116.34514617919922, "logps/rejected": -129.33343505859375, "loss": 0.5923, "rewards/accuracies": 0.75, "rewards/chosen": 0.2643883526325226, "rewards/margins": 0.6749240159988403, "rewards/rejected": -0.41053569316864014, "step": 188 }, { "epoch": 0.2200506695620702, "grad_norm": 123.38101336984641, "learning_rate": 1.984919383911383e-07, "logits/chosen": -1.308461308479309, "logits/rejected": -1.304238200187683, "logps/chosen": -158.46170043945312, "logps/rejected": -180.0099639892578, "loss": 0.6699, "rewards/accuracies": 0.625, "rewards/chosen": 0.2162826955318451, "rewards/margins": 0.2433900237083435, "rewards/rejected": -0.02710733562707901, "step": 190 }, { "epoch": 0.2223669923995657, "grad_norm": 86.07588724640635, "learning_rate": 1.9842434722934515e-07, "logits/chosen": -1.4127757549285889, "logits/rejected": -1.4460113048553467, "logps/chosen": -157.5387420654297, "logps/rejected": -172.41824340820312, "loss": 0.6669, "rewards/accuracies": 0.59375, "rewards/chosen": 0.1006392389535904, "rewards/margins": 0.23149970173835754, "rewards/rejected": -0.13086043298244476, "step": 192 }, { "epoch": 0.22468331523706117, "grad_norm": 94.64802234991522, "learning_rate": 1.9835528639475064e-07, "logits/chosen": -1.388432264328003, "logits/rejected": -1.4818886518478394, "logps/chosen": -141.1114959716797, "logps/rejected": -157.92218017578125, "loss": 0.697, "rewards/accuracies": 0.625, "rewards/chosen": 0.01169365644454956, "rewards/margins": 0.08859987556934357, "rewards/rejected": -0.076906219124794, "step": 194 }, { "epoch": 0.22699963807455664, "grad_norm": 89.06679411091595, "learning_rate": 1.9828475691857144e-07, "logits/chosen": -1.3492029905319214, "logits/rejected": -1.3424463272094727, "logps/chosen": -187.03414916992188, "logps/rejected": -190.73678588867188, "loss": 0.6388, "rewards/accuracies": 0.59375, "rewards/chosen": 0.22039715945720673, "rewards/margins": 0.18239110708236694, "rewards/rejected": 0.038006074726581573, "step": 196 }, { "epoch": 0.2293159609120521, "grad_norm": 95.75837422835144, "learning_rate": 1.982127598539541e-07, "logits/chosen": -1.1775927543640137, "logits/rejected": -1.1424531936645508, "logps/chosen": -148.27951049804688, "logps/rejected": -156.52732849121094, "loss": 0.6159, "rewards/accuracies": 0.53125, "rewards/chosen": 0.1567731648683548, "rewards/margins": 0.08035247027873993, "rewards/rejected": 0.07642071694135666, "step": 198 }, { "epoch": 0.23163228374954759, "grad_norm": 93.08435428016394, "learning_rate": 1.9813929627595906e-07, "logits/chosen": -1.4000098705291748, "logits/rejected": -1.3821650743484497, "logps/chosen": -139.89630126953125, "logps/rejected": -148.32470703125, "loss": 0.6393, "rewards/accuracies": 0.625, "rewards/chosen": 0.09745461493730545, "rewards/margins": 0.24615491926670074, "rewards/rejected": -0.14870032668113708, "step": 200 }, { "epoch": 0.23163228374954759, "eval_logits/chosen": -1.304500937461853, "eval_logits/rejected": -1.3002585172653198, "eval_logps/chosen": -136.9430694580078, "eval_logps/rejected": -136.71730041503906, "eval_loss": 0.6824392080307007, "eval_rewards/accuracies": 0.5600000023841858, "eval_rewards/chosen": -0.13118763267993927, "eval_rewards/margins": 0.14518636465072632, "eval_rewards/rejected": -0.2763740122318268, "eval_runtime": 26.1287, "eval_samples_per_second": 3.827, "eval_steps_per_second": 0.957, "step": 200 }, { "epoch": 0.23394860658704306, "grad_norm": 72.45361347245692, "learning_rate": 1.9806436728154483e-07, "logits/chosen": -1.335165023803711, "logits/rejected": -1.3661694526672363, "logps/chosen": -138.936767578125, "logps/rejected": -158.40025329589844, "loss": 0.6013, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007253367453813553, "rewards/margins": 0.15234613418579102, "rewards/rejected": -0.15959949791431427, "step": 202 }, { "epoch": 0.23626492942453856, "grad_norm": 90.8540277710046, "learning_rate": 1.9798797398955145e-07, "logits/chosen": -1.2384086847305298, "logits/rejected": -1.3494983911514282, "logps/chosen": -128.23599243164062, "logps/rejected": -177.1882781982422, "loss": 0.564, "rewards/accuracies": 0.8125, "rewards/chosen": 0.33455580472946167, "rewards/margins": 0.7392863035202026, "rewards/rejected": -0.40473055839538574, "step": 204 }, { "epoch": 0.23858125226203403, "grad_norm": 74.43794613270396, "learning_rate": 1.9791011754068395e-07, "logits/chosen": -1.3185287714004517, "logits/rejected": -1.3400077819824219, "logps/chosen": -131.19667053222656, "logps/rejected": -163.21923828125, "loss": 0.6032, "rewards/accuracies": 0.65625, "rewards/chosen": 0.1290275603532791, "rewards/margins": 0.47639700770378113, "rewards/rejected": -0.3473694622516632, "step": 206 }, { "epoch": 0.2408975750995295, "grad_norm": 95.09010015175441, "learning_rate": 1.9783079909749514e-07, "logits/chosen": -1.1546052694320679, "logits/rejected": -1.2192610502243042, "logps/chosen": -166.31753540039062, "logps/rejected": -186.45753479003906, "loss": 0.6578, "rewards/accuracies": 0.625, "rewards/chosen": 0.3212764859199524, "rewards/margins": 0.19196587800979614, "rewards/rejected": 0.12931060791015625, "step": 208 }, { "epoch": 0.24321389793702497, "grad_norm": 85.95636909904059, "learning_rate": 1.9775001984436842e-07, "logits/chosen": -1.2965327501296997, "logits/rejected": -1.3256494998931885, "logps/chosen": -150.2991943359375, "logps/rejected": -156.32376098632812, "loss": 0.6469, "rewards/accuracies": 0.375, "rewards/chosen": -0.29619836807250977, "rewards/margins": 0.0699533224105835, "rewards/rejected": -0.36615169048309326, "step": 210 }, { "epoch": 0.24553022077452044, "grad_norm": 100.06649868470993, "learning_rate": 1.9766778098749993e-07, "logits/chosen": -1.216491937637329, "logits/rejected": -1.2272950410842896, "logps/chosen": -164.16220092773438, "logps/rejected": -181.47265625, "loss": 0.6707, "rewards/accuracies": 0.625, "rewards/chosen": -0.06646190583705902, "rewards/margins": 0.46730464696884155, "rewards/rejected": -0.5337665677070618, "step": 212 }, { "epoch": 0.2478465436120159, "grad_norm": 96.52482939931645, "learning_rate": 1.975840837548807e-07, "logits/chosen": -1.3946311473846436, "logits/rejected": -1.3414866924285889, "logps/chosen": -152.69595336914062, "logps/rejected": -164.10858154296875, "loss": 0.7014, "rewards/accuracies": 0.65625, "rewards/chosen": 0.022681551054120064, "rewards/margins": 0.17960719764232635, "rewards/rejected": -0.15692564845085144, "step": 214 }, { "epoch": 0.2501628664495114, "grad_norm": 153.51800892842274, "learning_rate": 1.974989293962781e-07, "logits/chosen": -1.2091304063796997, "logits/rejected": -1.2250198125839233, "logps/chosen": -181.1391143798828, "logps/rejected": -222.74087524414062, "loss": 0.76, "rewards/accuracies": 0.625, "rewards/chosen": -0.0878646969795227, "rewards/margins": 0.9455296993255615, "rewards/rejected": -1.033394455909729, "step": 216 }, { "epoch": 0.25247918928700686, "grad_norm": 99.38458587796177, "learning_rate": 1.974123191832175e-07, "logits/chosen": -1.2188912630081177, "logits/rejected": -1.2454049587249756, "logps/chosen": -114.72016143798828, "logps/rejected": -151.92970275878906, "loss": 0.7037, "rewards/accuracies": 0.46875, "rewards/chosen": -0.28741875290870667, "rewards/margins": 0.1605263203382492, "rewards/rejected": -0.44794508814811707, "step": 218 }, { "epoch": 0.25479551212450235, "grad_norm": 81.97849529400557, "learning_rate": 1.9732425440896293e-07, "logits/chosen": -1.264250636100769, "logits/rejected": -1.3256012201309204, "logps/chosen": -124.54591369628906, "logps/rejected": -148.69161987304688, "loss": 0.5741, "rewards/accuracies": 0.75, "rewards/chosen": 0.18301630020141602, "rewards/margins": 0.42959779500961304, "rewards/rejected": -0.24658147990703583, "step": 220 }, { "epoch": 0.25711183496199785, "grad_norm": 145.57197289231644, "learning_rate": 1.9723473638849804e-07, "logits/chosen": -1.282865285873413, "logits/rejected": -1.3407156467437744, "logps/chosen": -160.74911499023438, "logps/rejected": -179.92872619628906, "loss": 0.6797, "rewards/accuracies": 0.59375, "rewards/chosen": -0.25827834010124207, "rewards/margins": 0.23611454665660858, "rewards/rejected": -0.49439290165901184, "step": 222 }, { "epoch": 0.2594281577994933, "grad_norm": 92.12330941145905, "learning_rate": 1.9714376645850633e-07, "logits/chosen": -1.1877822875976562, "logits/rejected": -1.2320842742919922, "logps/chosen": -112.88442993164062, "logps/rejected": -153.49356079101562, "loss": 0.6879, "rewards/accuracies": 0.59375, "rewards/chosen": -0.06163054332137108, "rewards/margins": 0.5120011568069458, "rewards/rejected": -0.5736316442489624, "step": 224 }, { "epoch": 0.2617444806369888, "grad_norm": 89.27914911232024, "learning_rate": 1.9705134597735113e-07, "logits/chosen": -1.3040626049041748, "logits/rejected": -1.3116812705993652, "logps/chosen": -138.1647491455078, "logps/rejected": -164.8112030029297, "loss": 0.6066, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5537402629852295, "rewards/margins": 0.5435131788253784, "rewards/rejected": -1.097253441810608, "step": 226 }, { "epoch": 0.26406080347448424, "grad_norm": 100.44445362175034, "learning_rate": 1.9695747632505558e-07, "logits/chosen": -1.2323942184448242, "logits/rejected": -1.3942488431930542, "logps/chosen": -95.50066375732422, "logps/rejected": -116.43473815917969, "loss": 0.6697, "rewards/accuracies": 0.625, "rewards/chosen": 0.22127968072891235, "rewards/margins": 0.21899569034576416, "rewards/rejected": 0.0022839903831481934, "step": 228 }, { "epoch": 0.26637712631197974, "grad_norm": 121.25117391544367, "learning_rate": 1.9686215890328168e-07, "logits/chosen": -1.3321830034255981, "logits/rejected": -1.2888097763061523, "logps/chosen": -139.03431701660156, "logps/rejected": -178.35134887695312, "loss": 0.6215, "rewards/accuracies": 0.75, "rewards/chosen": 0.2928890287876129, "rewards/margins": 0.5932635068893433, "rewards/rejected": -0.30037450790405273, "step": 230 }, { "epoch": 0.2686934491494752, "grad_norm": 92.88847876797445, "learning_rate": 1.9676539513530965e-07, "logits/chosen": -1.2525643110275269, "logits/rejected": -1.2690104246139526, "logps/chosen": -137.59510803222656, "logps/rejected": -170.5804901123047, "loss": 0.6265, "rewards/accuracies": 0.75, "rewards/chosen": -0.003497052937746048, "rewards/margins": 0.2578098177909851, "rewards/rejected": -0.2613069415092468, "step": 232 }, { "epoch": 0.2710097719869707, "grad_norm": 109.64954295867085, "learning_rate": 1.966671864660165e-07, "logits/chosen": -1.3433884382247925, "logits/rejected": -1.332856297492981, "logps/chosen": -98.57205200195312, "logps/rejected": -97.49153137207031, "loss": 0.6447, "rewards/accuracies": 0.6875, "rewards/chosen": -0.011620203964412212, "rewards/margins": 0.2672192454338074, "rewards/rejected": -0.27883943915367126, "step": 234 }, { "epoch": 0.2733260948244662, "grad_norm": 88.30404740296814, "learning_rate": 1.9656753436185456e-07, "logits/chosen": -1.4166165590286255, "logits/rejected": -1.3988808393478394, "logps/chosen": -120.94856262207031, "logps/rejected": -126.8148422241211, "loss": 0.5872, "rewards/accuracies": 0.8125, "rewards/chosen": 0.20259736478328705, "rewards/margins": 0.5475842356681824, "rewards/rejected": -0.3449868857860565, "step": 236 }, { "epoch": 0.2756424176619616, "grad_norm": 86.00138406914863, "learning_rate": 1.9646644031082948e-07, "logits/chosen": -1.26936674118042, "logits/rejected": -1.3119958639144897, "logps/chosen": -141.89259338378906, "logps/rejected": -152.75332641601562, "loss": 0.6581, "rewards/accuracies": 0.625, "rewards/chosen": 0.03933039680123329, "rewards/margins": 0.42467713356018066, "rewards/rejected": -0.38534674048423767, "step": 238 }, { "epoch": 0.2779587404994571, "grad_norm": 85.11515238510546, "learning_rate": 1.9636390582247804e-07, "logits/chosen": -1.2737447023391724, "logits/rejected": -1.2945623397827148, "logps/chosen": -119.07524871826172, "logps/rejected": -129.40042114257812, "loss": 0.6561, "rewards/accuracies": 0.5, "rewards/chosen": 0.21082651615142822, "rewards/margins": 0.13657376170158386, "rewards/rejected": 0.07425275444984436, "step": 240 }, { "epoch": 0.28027506333695257, "grad_norm": 78.24054806837644, "learning_rate": 1.9625993242784577e-07, "logits/chosen": -1.3759459257125854, "logits/rejected": -1.4495054483413696, "logps/chosen": -109.1790771484375, "logps/rejected": -141.45123291015625, "loss": 0.5681, "rewards/accuracies": 0.625, "rewards/chosen": 0.19224990904331207, "rewards/margins": 0.4157065451145172, "rewards/rejected": -0.22345668077468872, "step": 242 }, { "epoch": 0.28259138617444807, "grad_norm": 100.77507144115899, "learning_rate": 1.9615452167946382e-07, "logits/chosen": -1.2527633905410767, "logits/rejected": -1.3112610578536987, "logps/chosen": -138.2284393310547, "logps/rejected": -145.6739501953125, "loss": 0.6735, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12001015990972519, "rewards/margins": 0.14908871054649353, "rewards/rejected": -0.2690988779067993, "step": 244 }, { "epoch": 0.28490770901194357, "grad_norm": 81.77193945900667, "learning_rate": 1.9604767515132598e-07, "logits/chosen": -1.30977463722229, "logits/rejected": -1.308231234550476, "logps/chosen": -146.16175842285156, "logps/rejected": -150.26780700683594, "loss": 0.6512, "rewards/accuracies": 0.65625, "rewards/chosen": -0.02212803065776825, "rewards/margins": 0.2704678177833557, "rewards/rejected": -0.29259583353996277, "step": 246 }, { "epoch": 0.287224031849439, "grad_norm": 94.70823896676559, "learning_rate": 1.9593939443886513e-07, "logits/chosen": -1.2869986295700073, "logits/rejected": -1.3198009729385376, "logps/chosen": -205.1080322265625, "logps/rejected": -221.5395050048828, "loss": 0.6069, "rewards/accuracies": 0.625, "rewards/chosen": 0.03687068819999695, "rewards/margins": 0.5602297782897949, "rewards/rejected": -0.5233591794967651, "step": 248 }, { "epoch": 0.2895403546869345, "grad_norm": 78.53341652955227, "learning_rate": 1.9582968115892928e-07, "logits/chosen": -1.5314466953277588, "logits/rejected": -1.4816820621490479, "logps/chosen": -175.5377655029297, "logps/rejected": -167.43699645996094, "loss": 0.6442, "rewards/accuracies": 0.5625, "rewards/chosen": 0.053328827023506165, "rewards/margins": 0.07659827172756195, "rewards/rejected": -0.02326946146786213, "step": 250 }, { "epoch": 0.29185667752442995, "grad_norm": 79.61524235861766, "learning_rate": 1.9571853694975768e-07, "logits/chosen": -1.2131693363189697, "logits/rejected": -1.2827026844024658, "logps/chosen": -135.439453125, "logps/rejected": -152.61849975585938, "loss": 0.6405, "rewards/accuracies": 0.65625, "rewards/chosen": 0.15244589745998383, "rewards/margins": 0.6494119763374329, "rewards/rejected": -0.496966153383255, "step": 252 }, { "epoch": 0.29417300036192545, "grad_norm": 98.13148651308288, "learning_rate": 1.956059634709562e-07, "logits/chosen": -1.1604809761047363, "logits/rejected": -1.2288950681686401, "logps/chosen": -131.82022094726562, "logps/rejected": -161.86351013183594, "loss": 0.6057, "rewards/accuracies": 0.75, "rewards/chosen": 0.1956767737865448, "rewards/margins": 0.9915366172790527, "rewards/rejected": -0.7958598136901855, "step": 254 }, { "epoch": 0.2964893231994209, "grad_norm": 103.85718892196564, "learning_rate": 1.9549196240347248e-07, "logits/chosen": -1.2092281579971313, "logits/rejected": -1.2428326606750488, "logps/chosen": -155.42657470703125, "logps/rejected": -153.75314331054688, "loss": 0.634, "rewards/accuracies": 0.71875, "rewards/chosen": -0.04123011976480484, "rewards/margins": 0.5150858759880066, "rewards/rejected": -0.5563160181045532, "step": 256 }, { "epoch": 0.2988056460369164, "grad_norm": 86.63305573669103, "learning_rate": 1.9537653544957097e-07, "logits/chosen": -1.4105985164642334, "logits/rejected": -1.434030532836914, "logps/chosen": -185.7542266845703, "logps/rejected": -201.953369140625, "loss": 0.607, "rewards/accuracies": 0.59375, "rewards/chosen": 0.03058760240674019, "rewards/margins": 0.36953964829444885, "rewards/rejected": -0.33895203471183777, "step": 258 }, { "epoch": 0.3011219688744119, "grad_norm": 76.86118592942691, "learning_rate": 1.9525968433280754e-07, "logits/chosen": -1.3709828853607178, "logits/rejected": -1.4235422611236572, "logps/chosen": -116.02484130859375, "logps/rejected": -162.0040283203125, "loss": 0.6561, "rewards/accuracies": 0.78125, "rewards/chosen": 0.22844532132148743, "rewards/margins": 0.5770739912986755, "rewards/rejected": -0.3486286997795105, "step": 260 }, { "epoch": 0.30343829171190734, "grad_norm": 87.47994543044294, "learning_rate": 1.9514141079800358e-07, "logits/chosen": -1.3765437602996826, "logits/rejected": -1.4620999097824097, "logps/chosen": -178.11399841308594, "logps/rejected": -184.82977294921875, "loss": 0.6388, "rewards/accuracies": 0.59375, "rewards/chosen": 0.3185967206954956, "rewards/margins": 0.35556116700172424, "rewards/rejected": -0.036964427679777145, "step": 262 }, { "epoch": 0.30575461454940284, "grad_norm": 93.0963118432758, "learning_rate": 1.9502171661121997e-07, "logits/chosen": -1.3460544347763062, "logits/rejected": -1.3444372415542603, "logps/chosen": -129.8871307373047, "logps/rejected": -139.0457305908203, "loss": 0.633, "rewards/accuracies": 0.5, "rewards/chosen": 0.24807024002075195, "rewards/margins": 0.164279505610466, "rewards/rejected": 0.08379074931144714, "step": 264 }, { "epoch": 0.3080709373868983, "grad_norm": 89.98079729490527, "learning_rate": 1.9490060355973096e-07, "logits/chosen": -1.3349117040634155, "logits/rejected": -1.4269710779190063, "logps/chosen": -117.39810180664062, "logps/rejected": -144.81390380859375, "loss": 0.6071, "rewards/accuracies": 0.625, "rewards/chosen": 0.13228380680084229, "rewards/margins": 0.36814936995506287, "rewards/rejected": -0.2358655333518982, "step": 266 }, { "epoch": 0.3103872602243938, "grad_norm": 79.79141686146392, "learning_rate": 1.9477807345199713e-07, "logits/chosen": -1.2187737226486206, "logits/rejected": -1.255754828453064, "logps/chosen": -129.10220336914062, "logps/rejected": -155.4088592529297, "loss": 0.5498, "rewards/accuracies": 0.75, "rewards/chosen": 0.04840267822146416, "rewards/margins": 0.8025398850440979, "rewards/rejected": -0.7541371583938599, "step": 268 }, { "epoch": 0.3127035830618892, "grad_norm": 76.99480771293982, "learning_rate": 1.946541281176386e-07, "logits/chosen": -1.3978129625320435, "logits/rejected": -1.3492777347564697, "logps/chosen": -94.11738586425781, "logps/rejected": -111.17606353759766, "loss": 0.6439, "rewards/accuracies": 0.6875, "rewards/chosen": 0.10735100507736206, "rewards/margins": 0.33690255880355835, "rewards/rejected": -0.22955158352851868, "step": 270 }, { "epoch": 0.3150199058993847, "grad_norm": 123.83701643097643, "learning_rate": 1.9452876940740767e-07, "logits/chosen": -1.3182752132415771, "logits/rejected": -1.3577101230621338, "logps/chosen": -179.61712646484375, "logps/rejected": -175.739501953125, "loss": 0.658, "rewards/accuracies": 0.65625, "rewards/chosen": -0.8353556394577026, "rewards/margins": -0.027862735092639923, "rewards/rejected": -0.8074928522109985, "step": 272 }, { "epoch": 0.3173362287368802, "grad_norm": 85.72321946906887, "learning_rate": 1.9440199919316122e-07, "logits/chosen": -1.288657546043396, "logits/rejected": -1.3015202283859253, "logps/chosen": -119.4478759765625, "logps/rejected": -126.77153015136719, "loss": 0.5747, "rewards/accuracies": 0.59375, "rewards/chosen": -0.10668228566646576, "rewards/margins": 0.2372274398803711, "rewards/rejected": -0.34390971064567566, "step": 274 }, { "epoch": 0.31965255157437567, "grad_norm": 74.20876048665262, "learning_rate": 1.9427381936783265e-07, "logits/chosen": -1.2866897583007812, "logits/rejected": -1.2775256633758545, "logps/chosen": -128.80926513671875, "logps/rejected": -149.09042358398438, "loss": 0.5875, "rewards/accuracies": 0.8125, "rewards/chosen": 0.045619502663612366, "rewards/margins": 0.7617495059967041, "rewards/rejected": -0.7161301374435425, "step": 276 }, { "epoch": 0.32196887441187116, "grad_norm": 92.4643464121328, "learning_rate": 1.9414423184540364e-07, "logits/chosen": -1.2458128929138184, "logits/rejected": -1.3468252420425415, "logps/chosen": -185.1983642578125, "logps/rejected": -208.1055908203125, "loss": 0.6219, "rewards/accuracies": 0.625, "rewards/chosen": -0.6145004034042358, "rewards/margins": 0.43242332339286804, "rewards/rejected": -1.0469236373901367, "step": 278 }, { "epoch": 0.3242851972493666, "grad_norm": 78.52373912784961, "learning_rate": 1.940132385608757e-07, "logits/chosen": -1.2931345701217651, "logits/rejected": -1.285240888595581, "logps/chosen": -120.32514953613281, "logps/rejected": -127.14945983886719, "loss": 0.6194, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13594146072864532, "rewards/margins": 0.45918428897857666, "rewards/rejected": -0.5951257348060608, "step": 280 }, { "epoch": 0.3266015200868621, "grad_norm": 87.99833578536212, "learning_rate": 1.9388084147024119e-07, "logits/chosen": -1.289980411529541, "logits/rejected": -1.3120776414871216, "logps/chosen": -143.28274536132812, "logps/rejected": -184.68968200683594, "loss": 0.5612, "rewards/accuracies": 0.625, "rewards/chosen": -0.36499708890914917, "rewards/margins": 0.8015878200531006, "rewards/rejected": -1.166584849357605, "step": 282 }, { "epoch": 0.3289178429243576, "grad_norm": 130.88630180608516, "learning_rate": 1.93747042550454e-07, "logits/chosen": -1.1992597579956055, "logits/rejected": -1.20537531375885, "logps/chosen": -164.58963012695312, "logps/rejected": -188.33392333984375, "loss": 0.6706, "rewards/accuracies": 0.65625, "rewards/chosen": -0.032479241490364075, "rewards/margins": 0.6490625143051147, "rewards/rejected": -0.6815417408943176, "step": 284 }, { "epoch": 0.33123416576185305, "grad_norm": 129.38381304632557, "learning_rate": 1.9361184379940027e-07, "logits/chosen": -1.3481284379959106, "logits/rejected": -1.3769184350967407, "logps/chosen": -144.51824951171875, "logps/rejected": -184.82598876953125, "loss": 0.5956, "rewards/accuracies": 0.71875, "rewards/chosen": -0.20756953954696655, "rewards/margins": 1.1878336668014526, "rewards/rejected": -1.3954031467437744, "step": 286 }, { "epoch": 0.33355048859934855, "grad_norm": 75.58880846521522, "learning_rate": 1.9347524723586834e-07, "logits/chosen": -1.3142046928405762, "logits/rejected": -1.3535311222076416, "logps/chosen": -127.45948791503906, "logps/rejected": -134.2197723388672, "loss": 0.5643, "rewards/accuracies": 0.53125, "rewards/chosen": -0.029636431485414505, "rewards/margins": 0.3624846637248993, "rewards/rejected": -0.3921211361885071, "step": 288 }, { "epoch": 0.335866811436844, "grad_norm": 73.31791152641335, "learning_rate": 1.9333725489951874e-07, "logits/chosen": -1.279150128364563, "logits/rejected": -1.2785069942474365, "logps/chosen": -137.74392700195312, "logps/rejected": -150.1664581298828, "loss": 0.562, "rewards/accuracies": 0.65625, "rewards/chosen": -0.09417904168367386, "rewards/margins": 0.4170314073562622, "rewards/rejected": -0.5112104415893555, "step": 290 }, { "epoch": 0.3381831342743395, "grad_norm": 92.52553029666018, "learning_rate": 1.9319786885085363e-07, "logits/chosen": -1.2694755792617798, "logits/rejected": -1.2954623699188232, "logps/chosen": -131.68209838867188, "logps/rejected": -147.16407775878906, "loss": 0.5707, "rewards/accuracies": 0.625, "rewards/chosen": -0.23095721006393433, "rewards/margins": 0.6836221814155579, "rewards/rejected": -0.9145793318748474, "step": 292 }, { "epoch": 0.34049945711183494, "grad_norm": 117.1109903878386, "learning_rate": 1.9305709117118614e-07, "logits/chosen": -1.3775343894958496, "logits/rejected": -1.40194571018219, "logps/chosen": -152.88351440429688, "logps/rejected": -152.07113647460938, "loss": 0.6534, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25070011615753174, "rewards/margins": 0.443280965089798, "rewards/rejected": -0.6939811110496521, "step": 294 }, { "epoch": 0.34281577994933043, "grad_norm": 90.63184013182904, "learning_rate": 1.929149239626092e-07, "logits/chosen": -1.3810930252075195, "logits/rejected": -1.375144600868225, "logps/chosen": -138.50286865234375, "logps/rejected": -150.77099609375, "loss": 0.6208, "rewards/accuracies": 0.78125, "rewards/chosen": -0.08024093508720398, "rewards/margins": 0.330517053604126, "rewards/rejected": -0.41075798869132996, "step": 296 }, { "epoch": 0.34513210278682593, "grad_norm": 151.73571113362462, "learning_rate": 1.9277136934796427e-07, "logits/chosen": -1.3765751123428345, "logits/rejected": -1.3763781785964966, "logps/chosen": -133.6553497314453, "logps/rejected": -155.61392211914062, "loss": 0.6849, "rewards/accuracies": 0.59375, "rewards/chosen": -0.21619975566864014, "rewards/margins": 0.11746194958686829, "rewards/rejected": -0.3336617350578308, "step": 298 }, { "epoch": 0.3474484256243214, "grad_norm": 102.44748886372176, "learning_rate": 1.926264294708095e-07, "logits/chosen": -1.2531358003616333, "logits/rejected": -1.280750036239624, "logps/chosen": -142.09063720703125, "logps/rejected": -168.24539184570312, "loss": 0.5871, "rewards/accuracies": 0.65625, "rewards/chosen": -0.10979723930358887, "rewards/margins": 0.8283678293228149, "rewards/rejected": -0.9381651282310486, "step": 300 }, { "epoch": 0.3474484256243214, "eval_logits/chosen": -1.3154690265655518, "eval_logits/rejected": -1.3106950521469116, "eval_logps/chosen": -138.28590393066406, "eval_logps/rejected": -137.84115600585938, "eval_loss": 0.6834442615509033, "eval_rewards/accuracies": 0.6000000238418579, "eval_rewards/chosen": -0.2654740810394287, "eval_rewards/margins": 0.12328676134347916, "eval_rewards/rejected": -0.38876083493232727, "eval_runtime": 24.6332, "eval_samples_per_second": 4.06, "eval_steps_per_second": 1.015, "step": 300 } ], "logging_steps": 2, "max_steps": 1726, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 300, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }