{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 4176, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.8741913641216428, "learning_rate": 1.1961722488038278e-08, "logits/chosen": -2.8505566120147705, "logits/rejected": -2.908921003341675, "logps/chosen": -429.770751953125, "logps/rejected": -264.9197998046875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "grad_norm": 1.8239208035299774, "learning_rate": 1.1961722488038278e-07, "logits/chosen": -2.737058401107788, "logits/rejected": -2.7253358364105225, "logps/chosen": -308.5826721191406, "logps/rejected": -256.5137634277344, "loss": 0.6931, "rewards/accuracies": 0.375, "rewards/chosen": -1.5979426279955078e-06, "rewards/margins": 3.70156740245875e-05, "rewards/margins_max": 0.0033187735825777054, "rewards/margins_min": -0.0024840692058205605, "rewards/margins_std": 0.002600799547508359, "rewards/rejected": -3.8613616197835654e-05, "step": 10 }, { "epoch": 0.0, "grad_norm": 1.670082129212093, "learning_rate": 2.3923444976076555e-07, "logits/chosen": -2.745870590209961, "logits/rejected": -2.7259268760681152, "logps/chosen": -240.11474609375, "logps/rejected": -258.02337646484375, "loss": 0.6933, "rewards/accuracies": 0.4375, "rewards/chosen": -0.00019060683553107083, "rewards/margins": -0.0003666620177682489, "rewards/margins_max": 0.002748889848589897, "rewards/margins_min": -0.003381781280040741, "rewards/margins_std": 0.002722191857174039, "rewards/rejected": 0.0001760551822371781, "step": 20 }, { "epoch": 0.01, "grad_norm": 2.108264998382737, "learning_rate": 3.5885167464114835e-07, "logits/chosen": -2.8813157081604004, "logits/rejected": -2.849266290664673, "logps/chosen": -340.6379699707031, "logps/rejected": -264.9958801269531, "loss": 0.6932, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00023880643129814416, "rewards/margins": 0.0004284608585294336, "rewards/margins_max": 0.003322690026834607, "rewards/margins_min": -0.002509571146219969, "rewards/margins_std": 0.002639041980728507, "rewards/rejected": -0.0001896543544717133, "step": 30 }, { "epoch": 0.01, "grad_norm": 2.2952230312706314, "learning_rate": 4.784688995215311e-07, "logits/chosen": -2.797840118408203, "logits/rejected": -2.7668681144714355, "logps/chosen": -264.33819580078125, "logps/rejected": -238.1956024169922, "loss": 0.6929, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00012148594396421686, "rewards/margins": 0.00041075778426602483, "rewards/margins_max": 0.0039396812207996845, "rewards/margins_min": -0.002931196242570877, "rewards/margins_std": 0.0030039362609386444, "rewards/rejected": -0.00028927181847393513, "step": 40 }, { "epoch": 0.01, "grad_norm": 1.7814739153480956, "learning_rate": 5.98086124401914e-07, "logits/chosen": -2.8726160526275635, "logits/rejected": -2.856346607208252, "logps/chosen": -328.1170959472656, "logps/rejected": -322.1115417480469, "loss": 0.6926, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 1.3822788787365425e-05, "rewards/margins": 0.0002600129519123584, "rewards/margins_max": 0.004036836326122284, "rewards/margins_min": -0.003525248495861888, "rewards/margins_std": 0.0033416752703487873, "rewards/rejected": -0.0002461901749484241, "step": 50 }, { "epoch": 0.01, "grad_norm": 1.6605664273844842, "learning_rate": 7.177033492822967e-07, "logits/chosen": -2.8413214683532715, "logits/rejected": -2.7684195041656494, "logps/chosen": -306.6330261230469, "logps/rejected": -258.9924011230469, "loss": 0.6927, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0009258159552700818, "rewards/margins": 0.0008747612009756267, "rewards/margins_max": 0.005972309038043022, "rewards/margins_min": -0.004338194150477648, "rewards/margins_std": 0.004698978736996651, "rewards/rejected": 5.105470700073056e-05, "step": 60 }, { "epoch": 0.02, "grad_norm": 3.111510650395858, "learning_rate": 8.373205741626796e-07, "logits/chosen": -2.7511401176452637, "logits/rejected": -2.7465248107910156, "logps/chosen": -288.4776916503906, "logps/rejected": -253.2085418701172, "loss": 0.6919, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0020400371868163347, "rewards/margins": 0.002399927470833063, "rewards/margins_max": 0.00809518713504076, "rewards/margins_min": -0.0018847755854949355, "rewards/margins_std": 0.004446543287485838, "rewards/rejected": -0.0003598902258090675, "step": 70 }, { "epoch": 0.02, "grad_norm": 2.5592202387818874, "learning_rate": 9.569377990430622e-07, "logits/chosen": -2.704948902130127, "logits/rejected": -2.736175775527954, "logps/chosen": -233.33645629882812, "logps/rejected": -252.5889892578125, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.003396004904061556, "rewards/margins": 0.0032015512697398663, "rewards/margins_max": 0.011906049214303493, "rewards/margins_min": -0.003259404096752405, "rewards/margins_std": 0.006829865276813507, "rewards/rejected": 0.00019445360521785915, "step": 80 }, { "epoch": 0.02, "grad_norm": 1.969695212512355, "learning_rate": 1.076555023923445e-06, "logits/chosen": -2.8220410346984863, "logits/rejected": -2.791522979736328, "logps/chosen": -282.53839111328125, "logps/rejected": -248.43917846679688, "loss": 0.692, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005017200019210577, "rewards/margins": 0.0037220902740955353, "rewards/margins_max": 0.012134216725826263, "rewards/margins_min": -0.003787073539569974, "rewards/margins_std": 0.007255645003169775, "rewards/rejected": 0.0012951097451150417, "step": 90 }, { "epoch": 0.02, "grad_norm": 1.6195707589614852, "learning_rate": 1.196172248803828e-06, "logits/chosen": -2.804699659347534, "logits/rejected": -2.7972705364227295, "logps/chosen": -300.19781494140625, "logps/rejected": -310.37847900390625, "loss": 0.6903, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0076782433316111565, "rewards/margins": 0.004169294610619545, "rewards/margins_max": 0.01601630076766014, "rewards/margins_min": -0.0066954344511032104, "rewards/margins_std": 0.010066035203635693, "rewards/rejected": 0.003508948953822255, "step": 100 }, { "epoch": 0.02, "eval_logits/chosen": -2.8015034198760986, "eval_logits/rejected": -2.766679525375366, "eval_logps/chosen": -283.4918212890625, "eval_logps/rejected": -265.434814453125, "eval_loss": 0.6905032992362976, "eval_rewards/accuracies": 0.6635000109672546, "eval_rewards/chosen": 0.009635117836296558, "eval_rewards/margins": 0.005450535099953413, "eval_rewards/margins_max": 0.02791033871471882, "eval_rewards/margins_min": -0.013513891026377678, "eval_rewards/margins_std": 0.01380773913115263, "eval_rewards/rejected": 0.004184580873697996, "eval_runtime": 859.0947, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 100 }, { "epoch": 0.03, "grad_norm": 1.7836041203321424, "learning_rate": 1.3157894736842106e-06, "logits/chosen": -2.8121228218078613, "logits/rejected": -2.7774009704589844, "logps/chosen": -273.3714294433594, "logps/rejected": -254.922119140625, "loss": 0.6893, "rewards/accuracies": 0.6875, "rewards/chosen": 0.010535641573369503, "rewards/margins": 0.00703816395252943, "rewards/margins_max": 0.02310960367321968, "rewards/margins_min": -0.006729286164045334, "rewards/margins_std": 0.013220730237662792, "rewards/rejected": 0.0034974771551787853, "step": 110 }, { "epoch": 0.03, "grad_norm": 1.6465391102445581, "learning_rate": 1.4354066985645934e-06, "logits/chosen": -2.7969765663146973, "logits/rejected": -2.728602409362793, "logps/chosen": -267.9224548339844, "logps/rejected": -220.8407745361328, "loss": 0.6886, "rewards/accuracies": 0.625, "rewards/chosen": 0.017527183517813683, "rewards/margins": 0.008378724567592144, "rewards/margins_max": 0.03322680667042732, "rewards/margins_min": -0.015282759442925453, "rewards/margins_std": 0.021884554997086525, "rewards/rejected": 0.00914845708757639, "step": 120 }, { "epoch": 0.03, "grad_norm": 2.1274088501819577, "learning_rate": 1.5550239234449763e-06, "logits/chosen": -2.848649024963379, "logits/rejected": -2.8002028465270996, "logps/chosen": -316.5743408203125, "logps/rejected": -283.77984619140625, "loss": 0.6866, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.023616686463356018, "rewards/margins": 0.012971502728760242, "rewards/margins_max": 0.041890598833560944, "rewards/margins_min": -0.010750947520136833, "rewards/margins_std": 0.024407926946878433, "rewards/rejected": 0.0106451865285635, "step": 130 }, { "epoch": 0.03, "grad_norm": 2.186306218972465, "learning_rate": 1.6746411483253591e-06, "logits/chosen": -2.822844982147217, "logits/rejected": -2.810873031616211, "logps/chosen": -285.76202392578125, "logps/rejected": -252.19827270507812, "loss": 0.6851, "rewards/accuracies": 0.625, "rewards/chosen": 0.028137648478150368, "rewards/margins": 0.016734454780817032, "rewards/margins_max": 0.04901372641324997, "rewards/margins_min": -0.012300237081944942, "rewards/margins_std": 0.02730562351644039, "rewards/rejected": 0.011403195559978485, "step": 140 }, { "epoch": 0.04, "grad_norm": 1.9743905781043494, "learning_rate": 1.7942583732057418e-06, "logits/chosen": -2.893601894378662, "logits/rejected": -2.8198986053466797, "logps/chosen": -319.5303955078125, "logps/rejected": -309.4114685058594, "loss": 0.6857, "rewards/accuracies": 0.625, "rewards/chosen": 0.03438293933868408, "rewards/margins": 0.014150666072964668, "rewards/margins_max": 0.055888839066028595, "rewards/margins_min": -0.026385510340332985, "rewards/margins_std": 0.035706646740436554, "rewards/rejected": 0.020232271403074265, "step": 150 }, { "epoch": 0.04, "grad_norm": 2.1110216283516974, "learning_rate": 1.9138755980861244e-06, "logits/chosen": -2.823319911956787, "logits/rejected": -2.836071729660034, "logps/chosen": -253.6246337890625, "logps/rejected": -246.7947235107422, "loss": 0.6822, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03823622316122055, "rewards/margins": 0.017017127946019173, "rewards/margins_max": 0.06454560905694962, "rewards/margins_min": -0.02736133709549904, "rewards/margins_std": 0.04022496938705444, "rewards/rejected": 0.02121909335255623, "step": 160 }, { "epoch": 0.04, "grad_norm": 2.4463925642400772, "learning_rate": 2.0334928229665075e-06, "logits/chosen": -2.764888048171997, "logits/rejected": -2.7301621437072754, "logps/chosen": -293.571044921875, "logps/rejected": -236.1142120361328, "loss": 0.6768, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.04422404617071152, "rewards/margins": 0.02842998132109642, "rewards/margins_max": 0.10024015605449677, "rewards/margins_min": -0.024079352617263794, "rewards/margins_std": 0.056076955050230026, "rewards/rejected": 0.0157940611243248, "step": 170 }, { "epoch": 0.04, "grad_norm": 2.4728952687041255, "learning_rate": 2.15311004784689e-06, "logits/chosen": -2.8003315925598145, "logits/rejected": -2.778505325317383, "logps/chosen": -301.18707275390625, "logps/rejected": -295.223388671875, "loss": 0.6729, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.044958751648664474, "rewards/margins": 0.03759391978383064, "rewards/margins_max": 0.09980084002017975, "rewards/margins_min": -0.03028533235192299, "rewards/margins_std": 0.05645842105150223, "rewards/rejected": 0.007364829070866108, "step": 180 }, { "epoch": 0.05, "grad_norm": 1.5998281625025024, "learning_rate": 2.2727272727272728e-06, "logits/chosen": -2.8096413612365723, "logits/rejected": -2.793257713317871, "logps/chosen": -220.34481811523438, "logps/rejected": -185.27304077148438, "loss": 0.6765, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.026322778314352036, "rewards/margins": 0.03641284257173538, "rewards/margins_max": 0.11540315300226212, "rewards/margins_min": -0.03765136003494263, "rewards/margins_std": 0.0687890350818634, "rewards/rejected": -0.010090066120028496, "step": 190 }, { "epoch": 0.05, "grad_norm": 4.55564553603548, "learning_rate": 2.392344497607656e-06, "logits/chosen": -2.766078233718872, "logits/rejected": -2.728900909423828, "logps/chosen": -259.026611328125, "logps/rejected": -226.94384765625, "loss": 0.6668, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.03736649826169014, "rewards/margins": 0.04638832062482834, "rewards/margins_max": 0.15462318062782288, "rewards/margins_min": -0.0437658354640007, "rewards/margins_std": 0.0871105045080185, "rewards/rejected": -0.009021828882396221, "step": 200 }, { "epoch": 0.05, "eval_logits/chosen": -2.767601251602173, "eval_logits/rejected": -2.7342774868011475, "eval_logps/chosen": -281.96649169921875, "eval_logps/rejected": -268.1767883300781, "eval_loss": 0.6714157462120056, "eval_rewards/accuracies": 0.6644999980926514, "eval_rewards/chosen": 0.024888034909963608, "eval_rewards/margins": 0.048123184591531754, "eval_rewards/margins_max": 0.2298809289932251, "eval_rewards/margins_min": -0.1105237752199173, "eval_rewards/margins_std": 0.11300744116306305, "eval_rewards/rejected": -0.023235153406858444, "eval_runtime": 858.6199, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 200 }, { "epoch": 0.05, "grad_norm": 2.1187508642144257, "learning_rate": 2.5119617224880385e-06, "logits/chosen": -2.7964301109313965, "logits/rejected": -2.7517850399017334, "logps/chosen": -284.44378662109375, "logps/rejected": -252.84963989257812, "loss": 0.6748, "rewards/accuracies": 0.625, "rewards/chosen": 0.012092428281903267, "rewards/margins": 0.026425976306200027, "rewards/margins_max": 0.14213475584983826, "rewards/margins_min": -0.09400559961795807, "rewards/margins_std": 0.10348506271839142, "rewards/rejected": -0.014333548955619335, "step": 210 }, { "epoch": 0.05, "grad_norm": 1.8360164544193054, "learning_rate": 2.631578947368421e-06, "logits/chosen": -2.805187702178955, "logits/rejected": -2.7611403465270996, "logps/chosen": -255.305908203125, "logps/rejected": -240.7138671875, "loss": 0.6603, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.020616654306650162, "rewards/margins": 0.05939360335469246, "rewards/margins_max": 0.2004515677690506, "rewards/margins_min": -0.07490409910678864, "rewards/margins_std": 0.12225624173879623, "rewards/rejected": -0.038776952773332596, "step": 220 }, { "epoch": 0.06, "grad_norm": 2.1513994364284605, "learning_rate": 2.751196172248804e-06, "logits/chosen": -2.7571868896484375, "logits/rejected": -2.741738796234131, "logps/chosen": -275.4943542480469, "logps/rejected": -260.92816162109375, "loss": 0.6511, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.006370508577674627, "rewards/margins": 0.08649794012308121, "rewards/margins_max": 0.2535097599029541, "rewards/margins_min": -0.06593972444534302, "rewards/margins_std": 0.14108982682228088, "rewards/rejected": -0.08012743294239044, "step": 230 }, { "epoch": 0.06, "grad_norm": 2.701152721455728, "learning_rate": 2.870813397129187e-06, "logits/chosen": -2.7947802543640137, "logits/rejected": -2.744377851486206, "logps/chosen": -258.8454895019531, "logps/rejected": -243.67041015625, "loss": 0.6566, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.029876727610826492, "rewards/margins": 0.05030660703778267, "rewards/margins_max": 0.25291213393211365, "rewards/margins_min": -0.11614030599594116, "rewards/margins_std": 0.16259029507637024, "rewards/rejected": -0.08018333464860916, "step": 240 }, { "epoch": 0.06, "grad_norm": 2.2731338770150016, "learning_rate": 2.99043062200957e-06, "logits/chosen": -2.6753697395324707, "logits/rejected": -2.659595251083374, "logps/chosen": -288.3710632324219, "logps/rejected": -305.43194580078125, "loss": 0.6592, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06379599869251251, "rewards/margins": 0.08909501135349274, "rewards/margins_max": 0.3169664442539215, "rewards/margins_min": -0.11071326583623886, "rewards/margins_std": 0.19236549735069275, "rewards/rejected": -0.15289101004600525, "step": 250 }, { "epoch": 0.06, "grad_norm": 2.8992384140758123, "learning_rate": 3.1100478468899525e-06, "logits/chosen": -2.7028839588165283, "logits/rejected": -2.7511019706726074, "logps/chosen": -259.0841979980469, "logps/rejected": -290.89019775390625, "loss": 0.6361, "rewards/accuracies": 0.6875, "rewards/chosen": -0.01358124427497387, "rewards/margins": 0.1459258645772934, "rewards/margins_max": 0.40666690468788147, "rewards/margins_min": -0.10942129790782928, "rewards/margins_std": 0.22585058212280273, "rewards/rejected": -0.1595071256160736, "step": 260 }, { "epoch": 0.06, "grad_norm": 2.601928593534663, "learning_rate": 3.2296650717703356e-06, "logits/chosen": -2.8080337047576904, "logits/rejected": -2.7446141242980957, "logps/chosen": -326.8064880371094, "logps/rejected": -248.781982421875, "loss": 0.6484, "rewards/accuracies": 0.625, "rewards/chosen": -0.030105959624052048, "rewards/margins": 0.10492241382598877, "rewards/margins_max": 0.4032979905605316, "rewards/margins_min": -0.17702355980873108, "rewards/margins_std": 0.26016703248023987, "rewards/rejected": -0.13502837717533112, "step": 270 }, { "epoch": 0.07, "grad_norm": 2.896070145911363, "learning_rate": 3.3492822966507182e-06, "logits/chosen": -2.731680393218994, "logits/rejected": -2.754075765609741, "logps/chosen": -252.1563262939453, "logps/rejected": -253.5634765625, "loss": 0.6537, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.017906440421938896, "rewards/margins": 0.09158265590667725, "rewards/margins_max": 0.33544182777404785, "rewards/margins_min": -0.14413416385650635, "rewards/margins_std": 0.2152756154537201, "rewards/rejected": -0.0736762136220932, "step": 280 }, { "epoch": 0.07, "grad_norm": 3.747631342178668, "learning_rate": 3.4688995215311005e-06, "logits/chosen": -2.6725401878356934, "logits/rejected": -2.6484413146972656, "logps/chosen": -271.95281982421875, "logps/rejected": -243.95010375976562, "loss": 0.6297, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.06201690435409546, "rewards/margins": 0.15815535187721252, "rewards/margins_max": 0.48090046644210815, "rewards/margins_min": -0.16772928833961487, "rewards/margins_std": 0.29076889157295227, "rewards/rejected": -0.22017225623130798, "step": 290 }, { "epoch": 0.07, "grad_norm": 4.078402338061256, "learning_rate": 3.5885167464114835e-06, "logits/chosen": -2.7627511024475098, "logits/rejected": -2.755988597869873, "logps/chosen": -263.7777404785156, "logps/rejected": -266.8316955566406, "loss": 0.6136, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.1248350590467453, "rewards/margins": 0.14023873209953308, "rewards/margins_max": 0.43113142251968384, "rewards/margins_min": -0.1156766414642334, "rewards/margins_std": 0.24138310551643372, "rewards/rejected": -0.26507383584976196, "step": 300 }, { "epoch": 0.07, "eval_logits/chosen": -2.7086386680603027, "eval_logits/rejected": -2.677701234817505, "eval_logps/chosen": -311.68255615234375, "eval_logps/rejected": -307.8616943359375, "eval_loss": 0.6387784481048584, "eval_rewards/accuracies": 0.6694999933242798, "eval_rewards/chosen": -0.27227237820625305, "eval_rewards/margins": 0.1478119045495987, "eval_rewards/margins_max": 0.6955816745758057, "eval_rewards/margins_min": -0.3145456314086914, "eval_rewards/margins_std": 0.3387521505355835, "eval_rewards/rejected": -0.42008423805236816, "eval_runtime": 858.5655, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 300 }, { "epoch": 0.07, "grad_norm": 2.817843553233008, "learning_rate": 3.708133971291866e-06, "logits/chosen": -2.7599844932556152, "logits/rejected": -2.7003397941589355, "logps/chosen": -285.94659423828125, "logps/rejected": -241.04476928710938, "loss": 0.6316, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2921694815158844, "rewards/margins": 0.13367998600006104, "rewards/margins_max": 0.45650315284729004, "rewards/margins_min": -0.1348828673362732, "rewards/margins_std": 0.2632921040058136, "rewards/rejected": -0.4258494973182678, "step": 310 }, { "epoch": 0.08, "grad_norm": 3.99968909008973, "learning_rate": 3.827751196172249e-06, "logits/chosen": -2.7639968395233154, "logits/rejected": -2.7521986961364746, "logps/chosen": -292.2435607910156, "logps/rejected": -384.36334228515625, "loss": 0.6239, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.16064348816871643, "rewards/margins": 0.14868482947349548, "rewards/margins_max": 0.5704448223114014, "rewards/margins_min": -0.221909761428833, "rewards/margins_std": 0.3567379117012024, "rewards/rejected": -0.3093283474445343, "step": 320 }, { "epoch": 0.08, "grad_norm": 3.9659416294775256, "learning_rate": 3.947368421052632e-06, "logits/chosen": -2.7694015502929688, "logits/rejected": -2.724294900894165, "logps/chosen": -342.5742492675781, "logps/rejected": -293.7865295410156, "loss": 0.6274, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.09227137267589569, "rewards/margins": 0.21184547245502472, "rewards/margins_max": 0.6625819206237793, "rewards/margins_min": -0.22224211692810059, "rewards/margins_std": 0.3937598764896393, "rewards/rejected": -0.3041168749332428, "step": 330 }, { "epoch": 0.08, "grad_norm": 3.6990897167223236, "learning_rate": 4.066985645933015e-06, "logits/chosen": -2.749021291732788, "logits/rejected": -2.7309110164642334, "logps/chosen": -347.3499755859375, "logps/rejected": -315.2572937011719, "loss": 0.6161, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.23952047526836395, "rewards/margins": 0.26224932074546814, "rewards/margins_max": 0.6530129313468933, "rewards/margins_min": -0.14701256155967712, "rewards/margins_std": 0.3552583158016205, "rewards/rejected": -0.5017697811126709, "step": 340 }, { "epoch": 0.08, "grad_norm": 6.5847641882821595, "learning_rate": 4.186602870813398e-06, "logits/chosen": -2.6577677726745605, "logits/rejected": -2.6210625171661377, "logps/chosen": -282.750244140625, "logps/rejected": -261.5845031738281, "loss": 0.5924, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.17961296439170837, "rewards/margins": 0.28644639253616333, "rewards/margins_max": 0.9119589924812317, "rewards/margins_min": -0.2684462368488312, "rewards/margins_std": 0.5482449531555176, "rewards/rejected": -0.4660593569278717, "step": 350 }, { "epoch": 0.09, "grad_norm": 4.3007284745002154, "learning_rate": 4.30622009569378e-06, "logits/chosen": -2.7099499702453613, "logits/rejected": -2.6916935443878174, "logps/chosen": -315.02691650390625, "logps/rejected": -302.62677001953125, "loss": 0.5878, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3279743790626526, "rewards/margins": 0.24689266085624695, "rewards/margins_max": 0.7203132510185242, "rewards/margins_min": -0.2630789577960968, "rewards/margins_std": 0.44619321823120117, "rewards/rejected": -0.5748671293258667, "step": 360 }, { "epoch": 0.09, "grad_norm": 4.929704524039543, "learning_rate": 4.425837320574163e-06, "logits/chosen": -2.7238924503326416, "logits/rejected": -2.6731104850769043, "logps/chosen": -322.6479797363281, "logps/rejected": -278.42822265625, "loss": 0.6296, "rewards/accuracies": 0.6875, "rewards/chosen": -0.25143131613731384, "rewards/margins": 0.22132281959056854, "rewards/margins_max": 0.771989643573761, "rewards/margins_min": -0.33166414499282837, "rewards/margins_std": 0.49805164337158203, "rewards/rejected": -0.4727540910243988, "step": 370 }, { "epoch": 0.09, "grad_norm": 4.565553244484469, "learning_rate": 4.5454545454545455e-06, "logits/chosen": -2.6898770332336426, "logits/rejected": -2.6422266960144043, "logps/chosen": -300.3191223144531, "logps/rejected": -268.66827392578125, "loss": 0.6147, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.19597536325454712, "rewards/margins": 0.24524882435798645, "rewards/margins_max": 0.6597117185592651, "rewards/margins_min": -0.22325678169727325, "rewards/margins_std": 0.3912925124168396, "rewards/rejected": -0.44122418761253357, "step": 380 }, { "epoch": 0.09, "grad_norm": 5.3496282715757895, "learning_rate": 4.665071770334928e-06, "logits/chosen": -2.576572895050049, "logits/rejected": -2.5512681007385254, "logps/chosen": -351.8443603515625, "logps/rejected": -335.647216796875, "loss": 0.5953, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4410029947757721, "rewards/margins": 0.3319907486438751, "rewards/margins_max": 0.8627702593803406, "rewards/margins_min": -0.22415117919445038, "rewards/margins_std": 0.4868497848510742, "rewards/rejected": -0.7729936838150024, "step": 390 }, { "epoch": 0.1, "grad_norm": 6.028764123490772, "learning_rate": 4.784688995215312e-06, "logits/chosen": -2.515878200531006, "logits/rejected": -2.5452802181243896, "logps/chosen": -305.6355895996094, "logps/rejected": -320.40570068359375, "loss": 0.6224, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.35952678322792053, "rewards/margins": 0.2461273968219757, "rewards/margins_max": 0.8966833353042603, "rewards/margins_min": -0.40194329619407654, "rewards/margins_std": 0.5672916769981384, "rewards/rejected": -0.6056541800498962, "step": 400 }, { "epoch": 0.1, "eval_logits/chosen": -2.5540878772735596, "eval_logits/rejected": -2.521780252456665, "eval_logps/chosen": -328.5355529785156, "eval_logps/rejected": -338.51251220703125, "eval_loss": 0.6072334051132202, "eval_rewards/accuracies": 0.6825000047683716, "eval_rewards/chosen": -0.44080233573913574, "eval_rewards/margins": 0.28578999638557434, "eval_rewards/margins_max": 1.2192760705947876, "eval_rewards/margins_min": -0.5525813102722168, "eval_rewards/margins_std": 0.5950839519500732, "eval_rewards/rejected": -0.7265923023223877, "eval_runtime": 858.6606, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 400 }, { "epoch": 0.1, "grad_norm": 4.24483759880568, "learning_rate": 4.904306220095694e-06, "logits/chosen": -2.55665922164917, "logits/rejected": -2.4827256202697754, "logps/chosen": -371.78399658203125, "logps/rejected": -339.4240417480469, "loss": 0.615, "rewards/accuracies": 0.625, "rewards/chosen": -0.4924688935279846, "rewards/margins": 0.3220434784889221, "rewards/margins_max": 1.0101779699325562, "rewards/margins_min": -0.3708079159259796, "rewards/margins_std": 0.6248683929443359, "rewards/rejected": -0.8145123720169067, "step": 410 }, { "epoch": 0.1, "grad_norm": 3.4943782071006932, "learning_rate": 4.999996505732917e-06, "logits/chosen": -2.571901798248291, "logits/rejected": -2.542872667312622, "logps/chosen": -340.65631103515625, "logps/rejected": -359.6463928222656, "loss": 0.5832, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.42032432556152344, "rewards/margins": 0.2428252249956131, "rewards/margins_max": 0.756274402141571, "rewards/margins_min": -0.26175224781036377, "rewards/margins_std": 0.4570883810520172, "rewards/rejected": -0.6631495952606201, "step": 420 }, { "epoch": 0.1, "grad_norm": 4.816489411538745, "learning_rate": 4.999874207410649e-06, "logits/chosen": -2.4666030406951904, "logits/rejected": -2.488049030303955, "logps/chosen": -282.1983642578125, "logps/rejected": -317.11602783203125, "loss": 0.5896, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2820836901664734, "rewards/margins": 0.2688491940498352, "rewards/margins_max": 0.9526284337043762, "rewards/margins_min": -0.3855525851249695, "rewards/margins_std": 0.5899484157562256, "rewards/rejected": -0.5509328842163086, "step": 430 }, { "epoch": 0.11, "grad_norm": 4.468388256023348, "learning_rate": 4.9995772055020396e-06, "logits/chosen": -2.4622530937194824, "logits/rejected": -2.451822519302368, "logps/chosen": -263.818359375, "logps/rejected": -269.6212463378906, "loss": 0.6007, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.2275359183549881, "rewards/margins": 0.2502995729446411, "rewards/margins_max": 1.0142154693603516, "rewards/margins_min": -0.42901545763015747, "rewards/margins_std": 0.6374719738960266, "rewards/rejected": -0.477835476398468, "step": 440 }, { "epoch": 0.11, "grad_norm": 4.094734364744832, "learning_rate": 4.9991055207630545e-06, "logits/chosen": -2.538731098175049, "logits/rejected": -2.476346254348755, "logps/chosen": -329.1538391113281, "logps/rejected": -334.0916442871094, "loss": 0.5838, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.42114001512527466, "rewards/margins": 0.3482816517353058, "rewards/margins_max": 1.0589752197265625, "rewards/margins_min": -0.475161612033844, "rewards/margins_std": 0.683747410774231, "rewards/rejected": -0.7694215774536133, "step": 450 }, { "epoch": 0.11, "grad_norm": 4.98948861538599, "learning_rate": 4.998459186157357e-06, "logits/chosen": -2.575547218322754, "logits/rejected": -2.524850368499756, "logps/chosen": -341.12689208984375, "logps/rejected": -344.8680725097656, "loss": 0.5998, "rewards/accuracies": 0.75, "rewards/chosen": -0.4932452142238617, "rewards/margins": 0.28584393858909607, "rewards/margins_max": 0.9423806071281433, "rewards/margins_min": -0.36224740743637085, "rewards/margins_std": 0.5786666870117188, "rewards/rejected": -0.7790891528129578, "step": 460 }, { "epoch": 0.11, "grad_norm": 5.999470051567738, "learning_rate": 4.997638246854011e-06, "logits/chosen": -2.635892868041992, "logits/rejected": -2.5901548862457275, "logps/chosen": -318.29876708984375, "logps/rejected": -339.3644714355469, "loss": 0.5998, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3319573998451233, "rewards/margins": 0.3469027578830719, "rewards/margins_max": 0.9992606043815613, "rewards/margins_min": -0.3367360234260559, "rewards/margins_std": 0.6007035970687866, "rewards/rejected": -0.6788601875305176, "step": 470 }, { "epoch": 0.11, "grad_norm": 5.96744351293568, "learning_rate": 4.996642760224319e-06, "logits/chosen": -2.31598162651062, "logits/rejected": -2.305828809738159, "logps/chosen": -362.50732421875, "logps/rejected": -382.7879333496094, "loss": 0.5647, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.7479854822158813, "rewards/margins": 0.348560094833374, "rewards/margins_max": 1.2798352241516113, "rewards/margins_min": -0.5504150390625, "rewards/margins_std": 0.7969033122062683, "rewards/rejected": -1.0965455770492554, "step": 480 }, { "epoch": 0.12, "grad_norm": 5.797458999705947, "learning_rate": 4.995472795837813e-06, "logits/chosen": -2.3233327865600586, "logits/rejected": -2.2009968757629395, "logps/chosen": -332.31195068359375, "logps/rejected": -347.9017028808594, "loss": 0.5613, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7816909551620483, "rewards/margins": 0.43898725509643555, "rewards/margins_max": 1.4394053220748901, "rewards/margins_min": -0.550244927406311, "rewards/margins_std": 0.8791268467903137, "rewards/rejected": -1.2206782102584839, "step": 490 }, { "epoch": 0.12, "grad_norm": 5.907907727636047, "learning_rate": 4.994128435457401e-06, "logits/chosen": -2.2398037910461426, "logits/rejected": -2.2091004848480225, "logps/chosen": -397.2497863769531, "logps/rejected": -397.298095703125, "loss": 0.5913, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.8463393449783325, "rewards/margins": 0.4377099871635437, "rewards/margins_max": 1.1393119096755981, "rewards/margins_min": -0.45718902349472046, "rewards/margins_std": 0.7185834646224976, "rewards/rejected": -1.2840492725372314, "step": 500 }, { "epoch": 0.12, "eval_logits/chosen": -2.2226407527923584, "eval_logits/rejected": -2.179306745529175, "eval_logps/chosen": -347.44720458984375, "eval_logps/rejected": -375.13555908203125, "eval_loss": 0.5700246691703796, "eval_rewards/accuracies": 0.6974999904632568, "eval_rewards/chosen": -0.62991863489151, "eval_rewards/margins": 0.462904155254364, "eval_rewards/margins_max": 1.7718589305877686, "eval_rewards/margins_min": -0.6553950309753418, "eval_rewards/margins_std": 0.8141295909881592, "eval_rewards/rejected": -1.0928229093551636, "eval_runtime": 858.6738, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 500 }, { "epoch": 0.12, "grad_norm": 6.813698479459899, "learning_rate": 4.992609773033639e-06, "logits/chosen": -2.305377721786499, "logits/rejected": -2.225247383117676, "logps/chosen": -378.26885986328125, "logps/rejected": -414.1914978027344, "loss": 0.5213, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6292124390602112, "rewards/margins": 0.5893786549568176, "rewards/margins_max": 1.5151879787445068, "rewards/margins_min": -0.3044474720954895, "rewards/margins_std": 0.8068717122077942, "rewards/rejected": -1.2185910940170288, "step": 510 }, { "epoch": 0.12, "grad_norm": 9.580645330749556, "learning_rate": 4.990916914698176e-06, "logits/chosen": -2.310624361038208, "logits/rejected": -2.336876630783081, "logps/chosen": -332.9380187988281, "logps/rejected": -388.8421325683594, "loss": 0.5794, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.6097368001937866, "rewards/margins": 0.44825831055641174, "rewards/margins_max": 1.253227949142456, "rewards/margins_min": -0.3076356053352356, "rewards/margins_std": 0.7074476480484009, "rewards/rejected": -1.057995080947876, "step": 520 }, { "epoch": 0.13, "grad_norm": 7.015220443513785, "learning_rate": 4.989049978756336e-06, "logits/chosen": -2.2326388359069824, "logits/rejected": -2.178192138671875, "logps/chosen": -339.14312744140625, "logps/rejected": -366.4873962402344, "loss": 0.53, "rewards/accuracies": 0.75, "rewards/chosen": -0.7539184093475342, "rewards/margins": 0.664406418800354, "rewards/margins_max": 1.7410306930541992, "rewards/margins_min": -0.4311515688896179, "rewards/margins_std": 0.9934325218200684, "rewards/rejected": -1.4183248281478882, "step": 530 }, { "epoch": 0.13, "grad_norm": 9.688987568499563, "learning_rate": 4.987009095678843e-06, "logits/chosen": -2.171072483062744, "logits/rejected": -2.058274030685425, "logps/chosen": -442.4353942871094, "logps/rejected": -436.27752685546875, "loss": 0.4784, "rewards/accuracies": 0.75, "rewards/chosen": -1.024185299873352, "rewards/margins": 0.7616758346557617, "rewards/margins_max": 1.8031961917877197, "rewards/margins_min": -0.3390945494174957, "rewards/margins_std": 0.9676419496536255, "rewards/rejected": -1.7858610153198242, "step": 540 }, { "epoch": 0.13, "grad_norm": 10.370208001253292, "learning_rate": 4.984794408092712e-06, "logits/chosen": -2.0105438232421875, "logits/rejected": -2.002901077270508, "logps/chosen": -361.4600830078125, "logps/rejected": -438.6343688964844, "loss": 0.5466, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3025832176208496, "rewards/margins": 0.6625264883041382, "rewards/margins_max": 2.053928852081299, "rewards/margins_min": -0.6988664269447327, "rewards/margins_std": 1.2294689416885376, "rewards/rejected": -1.9651100635528564, "step": 550 }, { "epoch": 0.13, "grad_norm": 7.555449170254978, "learning_rate": 4.982406070771277e-06, "logits/chosen": -2.1128172874450684, "logits/rejected": -2.0671145915985107, "logps/chosen": -344.1737060546875, "logps/rejected": -389.8128662109375, "loss": 0.5623, "rewards/accuracies": 0.75, "rewards/chosen": -0.8078567385673523, "rewards/margins": 0.6244903802871704, "rewards/margins_max": 1.681189775466919, "rewards/margins_min": -0.37242117524147034, "rewards/margins_std": 0.8899843096733093, "rewards/rejected": -1.432347059249878, "step": 560 }, { "epoch": 0.14, "grad_norm": 10.346097097876639, "learning_rate": 4.979844250623374e-06, "logits/chosen": -2.087663412094116, "logits/rejected": -2.0414271354675293, "logps/chosen": -371.2855529785156, "logps/rejected": -474.8472595214844, "loss": 0.5343, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0855228900909424, "rewards/margins": 0.8279083967208862, "rewards/margins_max": 2.3168115615844727, "rewards/margins_min": -0.3946274220943451, "rewards/margins_std": 1.1978816986083984, "rewards/rejected": -1.9134315252304077, "step": 570 }, { "epoch": 0.14, "grad_norm": 9.171112600739848, "learning_rate": 4.977109126681678e-06, "logits/chosen": -2.149807929992676, "logits/rejected": -2.0845491886138916, "logps/chosen": -465.80963134765625, "logps/rejected": -463.05438232421875, "loss": 0.6012, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2731866836547852, "rewards/margins": 0.5484720468521118, "rewards/margins_max": 1.956009864807129, "rewards/margins_min": -0.9187488555908203, "rewards/margins_std": 1.2566072940826416, "rewards/rejected": -1.8216584920883179, "step": 580 }, { "epoch": 0.14, "grad_norm": 11.144653105713058, "learning_rate": 4.974200890090192e-06, "logits/chosen": -2.0983707904815674, "logits/rejected": -2.082648754119873, "logps/chosen": -362.5354919433594, "logps/rejected": -439.17303466796875, "loss": 0.5209, "rewards/accuracies": 0.75, "rewards/chosen": -1.1539649963378906, "rewards/margins": 0.8208593130111694, "rewards/margins_max": 2.0563766956329346, "rewards/margins_min": -0.2603703439235687, "rewards/margins_std": 1.0450429916381836, "rewards/rejected": -1.9748245477676392, "step": 590 }, { "epoch": 0.14, "grad_norm": 10.559877690849602, "learning_rate": 4.971119744090887e-06, "logits/chosen": -2.116525173187256, "logits/rejected": -2.046874761581421, "logps/chosen": -391.19940185546875, "logps/rejected": -421.81201171875, "loss": 0.5721, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.2597490549087524, "rewards/margins": 0.530002236366272, "rewards/margins_max": 1.7267780303955078, "rewards/margins_min": -0.5272101163864136, "rewards/margins_std": 1.0259678363800049, "rewards/rejected": -1.789751410484314, "step": 600 }, { "epoch": 0.14, "eval_logits/chosen": -2.1035804748535156, "eval_logits/rejected": -2.054929733276367, "eval_logps/chosen": -395.269775390625, "eval_logps/rejected": -439.3786315917969, "eval_loss": 0.559457540512085, "eval_rewards/accuracies": 0.7145000100135803, "eval_rewards/chosen": -1.1081442832946777, "eval_rewards/margins": 0.6271089911460876, "eval_rewards/margins_max": 2.2933690547943115, "eval_rewards/margins_min": -0.86275714635849, "eval_rewards/margins_std": 1.0596861839294434, "eval_rewards/rejected": -1.7352535724639893, "eval_runtime": 859.0201, "eval_samples_per_second": 4.656, "eval_steps_per_second": 0.291, "step": 600 }, { "epoch": 0.15, "grad_norm": 6.877521399322908, "learning_rate": 4.9678659040095e-06, "logits/chosen": -2.2251172065734863, "logits/rejected": -2.1756205558776855, "logps/chosen": -455.6241149902344, "logps/rejected": -431.32080078125, "loss": 0.5319, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.0841054916381836, "rewards/margins": 0.5707446932792664, "rewards/margins_max": 1.6331932544708252, "rewards/margins_min": -0.3124406933784485, "rewards/margins_std": 0.8899873495101929, "rewards/rejected": -1.6548502445220947, "step": 610 }, { "epoch": 0.15, "grad_norm": 7.8330910286729445, "learning_rate": 4.964439597240486e-06, "logits/chosen": -2.2091095447540283, "logits/rejected": -2.1549906730651855, "logps/chosen": -473.2152404785156, "logps/rejected": -461.6878967285156, "loss": 0.5158, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8578544855117798, "rewards/margins": 0.8130484819412231, "rewards/margins_max": 2.012216567993164, "rewards/margins_min": -0.4897755980491638, "rewards/margins_std": 1.1231465339660645, "rewards/rejected": -1.6709026098251343, "step": 620 }, { "epoch": 0.15, "grad_norm": 10.139338760033747, "learning_rate": 4.960841063231125e-06, "logits/chosen": -2.1521284580230713, "logits/rejected": -2.1005215644836426, "logps/chosen": -502.27703857421875, "logps/rejected": -508.73748779296875, "loss": 0.5195, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.2980985641479492, "rewards/margins": 0.9067466855049133, "rewards/margins_max": 2.169539451599121, "rewards/margins_min": -0.3805146813392639, "rewards/margins_std": 1.126906156539917, "rewards/rejected": -2.2048451900482178, "step": 630 }, { "epoch": 0.15, "grad_norm": 9.951104598337352, "learning_rate": 4.9570705534647904e-06, "logits/chosen": -2.138817310333252, "logits/rejected": -2.0212392807006836, "logps/chosen": -462.64385986328125, "logps/rejected": -468.68994140625, "loss": 0.4968, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3697404861450195, "rewards/margins": 0.8326905965805054, "rewards/margins_max": 2.0178627967834473, "rewards/margins_min": -0.4745658338069916, "rewards/margins_std": 1.1617228984832764, "rewards/rejected": -2.2024314403533936, "step": 640 }, { "epoch": 0.16, "grad_norm": 6.428164926422553, "learning_rate": 4.9531283314433705e-06, "logits/chosen": -2.0984914302825928, "logits/rejected": -1.9827169179916382, "logps/chosen": -427.40863037109375, "logps/rejected": -446.12738037109375, "loss": 0.5836, "rewards/accuracies": 0.625, "rewards/chosen": -1.3525571823120117, "rewards/margins": 0.5879122018814087, "rewards/margins_max": 2.079125165939331, "rewards/margins_min": -0.5389895439147949, "rewards/margins_std": 1.1874258518218994, "rewards/rejected": -1.9404691457748413, "step": 650 }, { "epoch": 0.16, "grad_norm": 8.40431707345644, "learning_rate": 4.949014672668859e-06, "logits/chosen": -1.962658166885376, "logits/rejected": -1.9261329174041748, "logps/chosen": -425.53607177734375, "logps/rejected": -472.46551513671875, "loss": 0.5679, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.6466686725616455, "rewards/margins": 0.5940350294113159, "rewards/margins_max": 1.5654256343841553, "rewards/margins_min": -0.4329650402069092, "rewards/margins_std": 0.8989126086235046, "rewards/rejected": -2.240703582763672, "step": 660 }, { "epoch": 0.16, "grad_norm": 4.569250677935333, "learning_rate": 4.944729864624098e-06, "logits/chosen": -2.178826332092285, "logits/rejected": -2.061540126800537, "logps/chosen": -414.2237854003906, "logps/rejected": -419.55987548828125, "loss": 0.5215, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.8106099963188171, "rewards/margins": 0.7354928851127625, "rewards/margins_max": 1.8997234106063843, "rewards/margins_min": -0.24074482917785645, "rewards/margins_std": 0.9754828214645386, "rewards/rejected": -1.5461028814315796, "step": 670 }, { "epoch": 0.16, "grad_norm": 5.597159768958543, "learning_rate": 4.940274206752688e-06, "logits/chosen": -1.977489709854126, "logits/rejected": -1.9529914855957031, "logps/chosen": -404.0625, "logps/rejected": -381.04803466796875, "loss": 0.5524, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.7249784469604492, "rewards/margins": 0.46373146772384644, "rewards/margins_max": 1.6322214603424072, "rewards/margins_min": -0.5240648984909058, "rewards/margins_std": 0.9609307050704956, "rewards/rejected": -1.1887098550796509, "step": 680 }, { "epoch": 0.17, "grad_norm": 6.205654445320539, "learning_rate": 4.9356480104380584e-06, "logits/chosen": -1.938403844833374, "logits/rejected": -1.9134266376495361, "logps/chosen": -343.5639343261719, "logps/rejected": -407.2478942871094, "loss": 0.5667, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8251941800117493, "rewards/margins": 0.5975659489631653, "rewards/margins_max": 1.6354154348373413, "rewards/margins_min": -0.6076459884643555, "rewards/margins_std": 1.0054713487625122, "rewards/rejected": -1.422760248184204, "step": 690 }, { "epoch": 0.17, "grad_norm": 6.087763736346074, "learning_rate": 4.930851598981714e-06, "logits/chosen": -1.8766651153564453, "logits/rejected": -1.7684656381607056, "logps/chosen": -431.89532470703125, "logps/rejected": -483.7215270996094, "loss": 0.4888, "rewards/accuracies": 0.75, "rewards/chosen": -1.5035260915756226, "rewards/margins": 0.871872067451477, "rewards/margins_max": 2.1253461837768555, "rewards/margins_min": -0.29909127950668335, "rewards/margins_std": 1.0856178998947144, "rewards/rejected": -2.3753981590270996, "step": 700 }, { "epoch": 0.17, "eval_logits/chosen": -1.8361899852752686, "eval_logits/rejected": -1.7782431840896606, "eval_logps/chosen": -429.0588684082031, "eval_logps/rejected": -480.1023864746094, "eval_loss": 0.5545676350593567, "eval_rewards/accuracies": 0.7085000276565552, "eval_rewards/chosen": -1.4460352659225464, "eval_rewards/margins": 0.6964560747146606, "eval_rewards/margins_max": 2.587271213531494, "eval_rewards/margins_min": -0.9395976066589355, "eval_rewards/margins_std": 1.181050181388855, "eval_rewards/rejected": -2.142491102218628, "eval_runtime": 858.4095, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 700 }, { "epoch": 0.17, "grad_norm": 13.862007725387796, "learning_rate": 4.9258853075806325e-06, "logits/chosen": -1.7580397129058838, "logits/rejected": -1.678428053855896, "logps/chosen": -410.447998046875, "logps/rejected": -459.6873474121094, "loss": 0.5611, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.1808440685272217, "rewards/margins": 0.9788630604743958, "rewards/margins_max": 2.1238019466400146, "rewards/margins_min": -0.15827909111976624, "rewards/margins_std": 1.0226771831512451, "rewards/rejected": -2.159707546234131, "step": 710 }, { "epoch": 0.17, "grad_norm": 14.14174881152406, "learning_rate": 4.920749483303846e-06, "logits/chosen": -1.7164571285247803, "logits/rejected": -1.7320142984390259, "logps/chosen": -380.50714111328125, "logps/rejected": -452.42559814453125, "loss": 0.5171, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.0525697469711304, "rewards/margins": 0.7296100854873657, "rewards/margins_max": 2.144036293029785, "rewards/margins_min": -0.32073545455932617, "rewards/margins_std": 1.145054817199707, "rewards/rejected": -1.782179832458496, "step": 720 }, { "epoch": 0.17, "grad_norm": 12.07935353652469, "learning_rate": 4.915444485068182e-06, "logits/chosen": -2.01137638092041, "logits/rejected": -1.8990983963012695, "logps/chosen": -429.77972412109375, "logps/rejected": -464.586181640625, "loss": 0.5458, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.062665581703186, "rewards/margins": 0.7109260559082031, "rewards/margins_max": 2.0169901847839355, "rewards/margins_min": -0.31690531969070435, "rewards/margins_std": 1.0466145277023315, "rewards/rejected": -1.77359139919281, "step": 730 }, { "epoch": 0.18, "grad_norm": 6.636896069309391, "learning_rate": 4.909970683613181e-06, "logits/chosen": -1.9733998775482178, "logits/rejected": -1.90420401096344, "logps/chosen": -380.3243713378906, "logps/rejected": -472.1915588378906, "loss": 0.4873, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.188886046409607, "rewards/margins": 0.9580060839653015, "rewards/margins_max": 2.52644681930542, "rewards/margins_min": -0.46653881669044495, "rewards/margins_std": 1.3390769958496094, "rewards/rejected": -2.1468920707702637, "step": 740 }, { "epoch": 0.18, "grad_norm": 5.913328420012682, "learning_rate": 4.904328461475189e-06, "logits/chosen": -1.961631178855896, "logits/rejected": -1.9126102924346924, "logps/chosen": -393.39349365234375, "logps/rejected": -474.724365234375, "loss": 0.4747, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0788356065750122, "rewards/margins": 0.9036644697189331, "rewards/margins_max": 2.223555564880371, "rewards/margins_min": -0.32012784481048584, "rewards/margins_std": 1.1188615560531616, "rewards/rejected": -1.9825000762939453, "step": 750 }, { "epoch": 0.18, "grad_norm": 15.530682620578299, "learning_rate": 4.898518212960625e-06, "logits/chosen": -1.8144603967666626, "logits/rejected": -1.8225147724151611, "logps/chosen": -434.203857421875, "logps/rejected": -514.5244750976562, "loss": 0.5174, "rewards/accuracies": 0.6875, "rewards/chosen": -1.5818592309951782, "rewards/margins": 0.7245903015136719, "rewards/margins_max": 2.2298460006713867, "rewards/margins_min": -0.790389895439148, "rewards/margins_std": 1.3366594314575195, "rewards/rejected": -2.3064494132995605, "step": 760 }, { "epoch": 0.18, "grad_norm": 12.653439358051653, "learning_rate": 4.89254034411842e-06, "logits/chosen": -1.8841794729232788, "logits/rejected": -1.7809652090072632, "logps/chosen": -384.07843017578125, "logps/rejected": -467.78546142578125, "loss": 0.5256, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.1333519220352173, "rewards/margins": 0.8361465334892273, "rewards/margins_max": 2.484133243560791, "rewards/margins_min": -0.35427993535995483, "rewards/margins_std": 1.2884294986724854, "rewards/rejected": -1.9694983959197998, "step": 770 }, { "epoch": 0.19, "grad_norm": 7.639927744498773, "learning_rate": 4.886395272711646e-06, "logits/chosen": -1.9852294921875, "logits/rejected": -1.872495412826538, "logps/chosen": -409.69842529296875, "logps/rejected": -411.39227294921875, "loss": 0.5703, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.9921337366104126, "rewards/margins": 0.7185724973678589, "rewards/margins_max": 2.1194543838500977, "rewards/margins_min": -0.6048930287361145, "rewards/margins_std": 1.2133382558822632, "rewards/rejected": -1.710706114768982, "step": 780 }, { "epoch": 0.19, "grad_norm": 12.728507683028441, "learning_rate": 4.880083428188315e-06, "logits/chosen": -1.886370062828064, "logits/rejected": -1.821099042892456, "logps/chosen": -449.9500427246094, "logps/rejected": -478.67071533203125, "loss": 0.4896, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4232423305511475, "rewards/margins": 0.824362576007843, "rewards/margins_max": 2.0511419773101807, "rewards/margins_min": -0.3897000253200531, "rewards/margins_std": 1.1043411493301392, "rewards/rejected": -2.247605085372925, "step": 790 }, { "epoch": 0.19, "grad_norm": 8.275173736315809, "learning_rate": 4.873605251651373e-06, "logits/chosen": -1.8849067687988281, "logits/rejected": -1.758582353591919, "logps/chosen": -469.39691162109375, "logps/rejected": -495.90216064453125, "loss": 0.4774, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.6179670095443726, "rewards/margins": 0.8333891034126282, "rewards/margins_max": 2.280285358428955, "rewards/margins_min": -0.4704156816005707, "rewards/margins_std": 1.2159448862075806, "rewards/rejected": -2.4513559341430664, "step": 800 }, { "epoch": 0.19, "eval_logits/chosen": -1.964450478553772, "eval_logits/rejected": -1.9074342250823975, "eval_logps/chosen": -405.5573425292969, "eval_logps/rejected": -463.86456298828125, "eval_loss": 0.5257607698440552, "eval_rewards/accuracies": 0.7269999980926514, "eval_rewards/chosen": -1.2110201120376587, "eval_rewards/margins": 0.7690924406051636, "eval_rewards/margins_max": 2.5888662338256836, "eval_rewards/margins_min": -0.8329167366027832, "eval_rewards/margins_std": 1.1591479778289795, "eval_rewards/rejected": -1.9801126718521118, "eval_runtime": 858.5014, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 800 }, { "epoch": 0.19, "grad_norm": 5.911315683220568, "learning_rate": 4.86696119582787e-06, "logits/chosen": -2.0180859565734863, "logits/rejected": -2.00112247467041, "logps/chosen": -352.87811279296875, "logps/rejected": -426.2118225097656, "loss": 0.5468, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.0864734649658203, "rewards/margins": 0.7764807343482971, "rewards/margins_max": 1.971764326095581, "rewards/margins_min": -0.297002911567688, "rewards/margins_std": 0.9995386004447937, "rewards/rejected": -1.8629541397094727, "step": 810 }, { "epoch": 0.2, "grad_norm": 10.609396573523721, "learning_rate": 4.860151725037318e-06, "logits/chosen": -2.0635104179382324, "logits/rejected": -2.029193162918091, "logps/chosen": -375.3559265136719, "logps/rejected": -441.69110107421875, "loss": 0.4611, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8636133074760437, "rewards/margins": 0.946708083152771, "rewards/margins_max": 2.2019896507263184, "rewards/margins_min": -0.3230433464050293, "rewards/margins_std": 1.1126000881195068, "rewards/rejected": -1.810321569442749, "step": 820 }, { "epoch": 0.2, "grad_norm": 6.181654201209421, "learning_rate": 4.853177315159254e-06, "logits/chosen": -2.1762642860412598, "logits/rejected": -2.082057476043701, "logps/chosen": -460.008056640625, "logps/rejected": -471.9153747558594, "loss": 0.5431, "rewards/accuracies": 0.75, "rewards/chosen": -1.1086938381195068, "rewards/margins": 0.8144097328186035, "rewards/margins_max": 2.010969400405884, "rewards/margins_min": -0.41428709030151367, "rewards/margins_std": 1.0813654661178589, "rewards/rejected": -1.9231033325195312, "step": 830 }, { "epoch": 0.2, "grad_norm": 8.709681891710234, "learning_rate": 4.846038453599967e-06, "logits/chosen": -2.025879383087158, "logits/rejected": -1.9088315963745117, "logps/chosen": -407.0829772949219, "logps/rejected": -453.6011657714844, "loss": 0.51, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.0397647619247437, "rewards/margins": 0.8760482668876648, "rewards/margins_max": 2.0746395587921143, "rewards/margins_min": -0.20674224197864532, "rewards/margins_std": 1.0134307146072388, "rewards/rejected": -1.9158128499984741, "step": 840 }, { "epoch": 0.2, "grad_norm": 12.001375755284528, "learning_rate": 4.8387356392584485e-06, "logits/chosen": -1.9453413486480713, "logits/rejected": -1.9143909215927124, "logps/chosen": -432.88311767578125, "logps/rejected": -512.09765625, "loss": 0.507, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.727304220199585, "rewards/margins": 0.6467604637145996, "rewards/margins_max": 1.921547532081604, "rewards/margins_min": -0.6729074716567993, "rewards/margins_std": 1.1749274730682373, "rewards/rejected": -2.3740646839141846, "step": 850 }, { "epoch": 0.21, "grad_norm": 6.117327809094898, "learning_rate": 4.831269382491519e-06, "logits/chosen": -1.8847763538360596, "logits/rejected": -1.9050480127334595, "logps/chosen": -393.3268127441406, "logps/rejected": -476.64910888671875, "loss": 0.5546, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.301879644393921, "rewards/margins": 0.6637540459632874, "rewards/margins_max": 2.3034515380859375, "rewards/margins_min": -0.8304089307785034, "rewards/margins_std": 1.4028583765029907, "rewards/rejected": -1.9656336307525635, "step": 860 }, { "epoch": 0.21, "grad_norm": 6.78401177065206, "learning_rate": 4.8236402050781675e-06, "logits/chosen": -1.9095573425292969, "logits/rejected": -1.8866355419158936, "logps/chosen": -350.4537048339844, "logps/rejected": -421.22210693359375, "loss": 0.5278, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.101622223854065, "rewards/margins": 0.653002142906189, "rewards/margins_max": 1.8255516290664673, "rewards/margins_min": -0.48366695642471313, "rewards/margins_std": 1.0432379245758057, "rewards/rejected": -1.7546241283416748, "step": 870 }, { "epoch": 0.21, "grad_norm": 9.5226351468047, "learning_rate": 4.815848640183082e-06, "logits/chosen": -1.7604023218154907, "logits/rejected": -1.694998025894165, "logps/chosen": -463.4356384277344, "logps/rejected": -507.48931884765625, "loss": 0.5034, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.325208306312561, "rewards/margins": 0.8629659414291382, "rewards/margins_max": 2.2164673805236816, "rewards/margins_min": -0.37350529432296753, "rewards/margins_std": 1.1830224990844727, "rewards/rejected": -2.1881744861602783, "step": 880 }, { "epoch": 0.21, "grad_norm": 6.379065619660268, "learning_rate": 4.807895232319394e-06, "logits/chosen": -1.715869665145874, "logits/rejected": -1.6390396356582642, "logps/chosen": -422.7889709472656, "logps/rejected": -409.7266540527344, "loss": 0.567, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.324000597000122, "rewards/margins": 0.6693439483642578, "rewards/margins_max": 1.9024097919464111, "rewards/margins_min": -0.5498406291007996, "rewards/margins_std": 1.081364393234253, "rewards/rejected": -1.9933445453643799, "step": 890 }, { "epoch": 0.22, "grad_norm": 7.621210125729219, "learning_rate": 4.799780537310622e-06, "logits/chosen": -1.7096240520477295, "logits/rejected": -1.6411975622177124, "logps/chosen": -422.93353271484375, "logps/rejected": -464.10107421875, "loss": 0.521, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.0209131240844727, "rewards/margins": 0.8536723852157593, "rewards/margins_max": 2.308784008026123, "rewards/margins_min": -0.36778873205184937, "rewards/margins_std": 1.1979105472564697, "rewards/rejected": -1.874585509300232, "step": 900 }, { "epoch": 0.22, "eval_logits/chosen": -1.5999146699905396, "eval_logits/rejected": -1.539036750793457, "eval_logps/chosen": -424.8804626464844, "eval_logps/rejected": -486.9129638671875, "eval_loss": 0.5286461710929871, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -1.4042513370513916, "eval_rewards/margins": 0.8063454627990723, "eval_rewards/margins_max": 2.802962064743042, "eval_rewards/margins_min": -0.8889963626861572, "eval_rewards/margins_std": 1.2405749559402466, "eval_rewards/rejected": -2.2105965614318848, "eval_runtime": 858.7423, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 900 }, { "epoch": 0.22, "grad_norm": 8.743018716767011, "learning_rate": 4.7915051222518275e-06, "logits/chosen": -1.5554327964782715, "logits/rejected": -1.442995548248291, "logps/chosen": -414.176513671875, "logps/rejected": -479.6756286621094, "loss": 0.5237, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6969096660614014, "rewards/margins": 0.9398115873336792, "rewards/margins_max": 2.536273956298828, "rewards/margins_min": -0.40416544675827026, "rewards/margins_std": 1.3401415348052979, "rewards/rejected": -2.636721134185791, "step": 910 }, { "epoch": 0.22, "grad_norm": 12.187894374628842, "learning_rate": 4.783069565469986e-06, "logits/chosen": -1.4274368286132812, "logits/rejected": -1.3976366519927979, "logps/chosen": -498.45465087890625, "logps/rejected": -574.7377319335938, "loss": 0.4941, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.152784824371338, "rewards/margins": 0.8610376119613647, "rewards/margins_max": 2.317478656768799, "rewards/margins_min": -0.430935800075531, "rewards/margins_std": 1.2372599840164185, "rewards/rejected": -3.013822317123413, "step": 920 }, { "epoch": 0.22, "grad_norm": 16.594547231025448, "learning_rate": 4.7744744564835705e-06, "logits/chosen": -1.5127589702606201, "logits/rejected": -1.453115701675415, "logps/chosen": -432.2581481933594, "logps/rejected": -446.10003662109375, "loss": 0.5796, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.68351149559021, "rewards/margins": 0.5954945087432861, "rewards/margins_max": 2.063502311706543, "rewards/margins_min": -0.6298249363899231, "rewards/margins_std": 1.2130063772201538, "rewards/rejected": -2.279005765914917, "step": 930 }, { "epoch": 0.23, "grad_norm": 8.032357581093292, "learning_rate": 4.76572039596135e-06, "logits/chosen": -1.7373130321502686, "logits/rejected": -1.7226680517196655, "logps/chosen": -376.7931213378906, "logps/rejected": -452.8414001464844, "loss": 0.5027, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.9889128804206848, "rewards/margins": 0.8536629676818848, "rewards/margins_max": 1.92659592628479, "rewards/margins_min": -0.18499860167503357, "rewards/margins_std": 0.9458996057510376, "rewards/rejected": -1.8425757884979248, "step": 940 }, { "epoch": 0.23, "grad_norm": 9.451295584215682, "learning_rate": 4.756807995680415e-06, "logits/chosen": -1.8367137908935547, "logits/rejected": -1.7846128940582275, "logps/chosen": -429.75823974609375, "logps/rejected": -476.84136962890625, "loss": 0.5287, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0797141790390015, "rewards/margins": 0.7286165952682495, "rewards/margins_max": 2.2649261951446533, "rewards/margins_min": -0.588452935218811, "rewards/margins_std": 1.2799144983291626, "rewards/rejected": -1.8083308935165405, "step": 950 }, { "epoch": 0.23, "grad_norm": 8.241147927627502, "learning_rate": 4.747737878483421e-06, "logits/chosen": -1.786500334739685, "logits/rejected": -1.679176926612854, "logps/chosen": -460.496337890625, "logps/rejected": -487.70941162109375, "loss": 0.4781, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.569098711013794, "rewards/margins": 0.8622958064079285, "rewards/margins_max": 2.1880669593811035, "rewards/margins_min": -0.36647701263427734, "rewards/margins_std": 1.1327682733535767, "rewards/rejected": -2.431394338607788, "step": 960 }, { "epoch": 0.23, "grad_norm": 8.131059939650395, "learning_rate": 4.738510678235064e-06, "logits/chosen": -1.7273290157318115, "logits/rejected": -1.6268259286880493, "logps/chosen": -511.3541564941406, "logps/rejected": -593.7515869140625, "loss": 0.4288, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9094486236572266, "rewards/margins": 1.0982086658477783, "rewards/margins_max": 2.5192136764526367, "rewards/margins_min": -0.3144475817680359, "rewards/margins_std": 1.2649296522140503, "rewards/rejected": -3.007657051086426, "step": 970 }, { "epoch": 0.23, "grad_norm": 6.1516077188695535, "learning_rate": 4.729127039777782e-06, "logits/chosen": -1.6758918762207031, "logits/rejected": -1.6378368139266968, "logps/chosen": -377.09405517578125, "logps/rejected": -446.9493103027344, "loss": 0.5169, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.342261552810669, "rewards/margins": 0.8917317390441895, "rewards/margins_max": 2.52836537361145, "rewards/margins_min": -0.4327452778816223, "rewards/margins_std": 1.325300693511963, "rewards/rejected": -2.2339932918548584, "step": 980 }, { "epoch": 0.24, "grad_norm": 11.833450388441367, "learning_rate": 4.719587618886685e-06, "logits/chosen": -1.949138879776001, "logits/rejected": -1.8769677877426147, "logps/chosen": -388.64703369140625, "logps/rejected": -448.1983947753906, "loss": 0.5646, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.7850435376167297, "rewards/margins": 0.7252563238143921, "rewards/margins_max": 2.1855807304382324, "rewards/margins_min": -0.3862724304199219, "rewards/margins_std": 1.1582493782043457, "rewards/rejected": -1.5103000402450562, "step": 990 }, { "epoch": 0.24, "grad_norm": 9.46841641584421, "learning_rate": 4.7098930822237375e-06, "logits/chosen": -1.944679617881775, "logits/rejected": -1.8805227279663086, "logps/chosen": -385.4551696777344, "logps/rejected": -450.0174255371094, "loss": 0.4871, "rewards/accuracies": 0.75, "rewards/chosen": -0.8702098727226257, "rewards/margins": 0.8217867016792297, "rewards/margins_max": 2.2163033485412598, "rewards/margins_min": -0.5195831656455994, "rewards/margins_std": 1.214090347290039, "rewards/rejected": -1.6919963359832764, "step": 1000 }, { "epoch": 0.24, "eval_logits/chosen": -1.8273050785064697, "eval_logits/rejected": -1.7795426845550537, "eval_logps/chosen": -390.624267578125, "eval_logps/rejected": -455.0901794433594, "eval_loss": 0.5354195237159729, "eval_rewards/accuracies": 0.7250000238418579, "eval_rewards/chosen": -1.0616894960403442, "eval_rewards/margins": 0.830679714679718, "eval_rewards/margins_max": 2.9995877742767334, "eval_rewards/margins_min": -0.8982844352722168, "eval_rewards/margins_std": 1.3137260675430298, "eval_rewards/rejected": -1.892369031906128, "eval_runtime": 858.6462, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1000 }, { "epoch": 0.24, "grad_norm": 15.223453617332122, "learning_rate": 4.7000441072911565e-06, "logits/chosen": -1.7332909107208252, "logits/rejected": -1.6941293478012085, "logps/chosen": -366.7320861816406, "logps/rejected": -489.7293395996094, "loss": 0.5098, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.1304348707199097, "rewards/margins": 1.0136330127716064, "rewards/margins_max": 2.548954486846924, "rewards/margins_min": -0.31413760781288147, "rewards/margins_std": 1.3152711391448975, "rewards/rejected": -2.1440680027008057, "step": 1010 }, { "epoch": 0.24, "grad_norm": 11.58012539652394, "learning_rate": 4.690041382384071e-06, "logits/chosen": -1.5610673427581787, "logits/rejected": -1.5815423727035522, "logps/chosen": -363.79364013671875, "logps/rejected": -473.7079162597656, "loss": 0.491, "rewards/accuracies": 0.75, "rewards/chosen": -1.3129503726959229, "rewards/margins": 1.0989265441894531, "rewards/margins_max": 2.4932937622070312, "rewards/margins_min": -0.24908895790576935, "rewards/margins_std": 1.2376689910888672, "rewards/rejected": -2.411877155303955, "step": 1020 }, { "epoch": 0.25, "grad_norm": 13.460756129074111, "learning_rate": 4.679885606542423e-06, "logits/chosen": -1.6546895503997803, "logits/rejected": -1.6395809650421143, "logps/chosen": -423.47845458984375, "logps/rejected": -511.65045166015625, "loss": 0.5069, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.668134093284607, "rewards/margins": 0.9138701558113098, "rewards/margins_max": 2.1827869415283203, "rewards/margins_min": -0.40344005823135376, "rewards/margins_std": 1.2095836400985718, "rewards/rejected": -2.5820040702819824, "step": 1030 }, { "epoch": 0.25, "grad_norm": 7.900254701781921, "learning_rate": 4.6695774895021086e-06, "logits/chosen": -1.7738854885101318, "logits/rejected": -1.687738060951233, "logps/chosen": -421.70916748046875, "logps/rejected": -524.0137939453125, "loss": 0.4198, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4014557600021362, "rewards/margins": 1.203538179397583, "rewards/margins_max": 2.7088866233825684, "rewards/margins_min": -0.14117351174354553, "rewards/margins_std": 1.2487276792526245, "rewards/rejected": -2.6049938201904297, "step": 1040 }, { "epoch": 0.25, "grad_norm": 8.44110256568501, "learning_rate": 4.65911775164538e-06, "logits/chosen": -1.5867350101470947, "logits/rejected": -1.595773458480835, "logps/chosen": -430.5469665527344, "logps/rejected": -496.0381774902344, "loss": 0.4626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.784481406211853, "rewards/margins": 0.7556269764900208, "rewards/margins_max": 2.0112814903259277, "rewards/margins_min": -0.4683353006839752, "rewards/margins_std": 1.1141173839569092, "rewards/rejected": -2.5401084423065186, "step": 1050 }, { "epoch": 0.25, "grad_norm": 13.461806346198086, "learning_rate": 4.648507123950504e-06, "logits/chosen": -1.7120803594589233, "logits/rejected": -1.6997833251953125, "logps/chosen": -464.76483154296875, "logps/rejected": -546.62255859375, "loss": 0.4857, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.7149121761322021, "rewards/margins": 1.1192967891693115, "rewards/margins_max": 2.713026762008667, "rewards/margins_min": -0.522661566734314, "rewards/margins_std": 1.4455353021621704, "rewards/rejected": -2.8342089653015137, "step": 1060 }, { "epoch": 0.26, "grad_norm": 7.18522913345116, "learning_rate": 4.6377463479406785e-06, "logits/chosen": -1.667578935623169, "logits/rejected": -1.5842195749282837, "logps/chosen": -488.5780334472656, "logps/rejected": -586.6904296875, "loss": 0.4322, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.940279245376587, "rewards/margins": 1.4047088623046875, "rewards/margins_max": 3.1882922649383545, "rewards/margins_min": -0.08867797255516052, "rewards/margins_std": 1.4802536964416504, "rewards/rejected": -3.3449883460998535, "step": 1070 }, { "epoch": 0.26, "grad_norm": 12.284178095995703, "learning_rate": 4.626836175632204e-06, "logits/chosen": -1.6814979314804077, "logits/rejected": -1.5941377878189087, "logps/chosen": -513.368896484375, "logps/rejected": -568.8827514648438, "loss": 0.5145, "rewards/accuracies": 0.75, "rewards/chosen": -1.938881516456604, "rewards/margins": 1.0428458452224731, "rewards/margins_max": 2.7509207725524902, "rewards/margins_min": -0.4510120749473572, "rewards/margins_std": 1.42970871925354, "rewards/rejected": -2.981727123260498, "step": 1080 }, { "epoch": 0.26, "grad_norm": 11.044906505570651, "learning_rate": 4.61577736948194e-06, "logits/chosen": -1.6950985193252563, "logits/rejected": -1.6746526956558228, "logps/chosen": -438.19903564453125, "logps/rejected": -605.68115234375, "loss": 0.4891, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6305272579193115, "rewards/margins": 1.140063762664795, "rewards/margins_max": 3.2122528553009033, "rewards/margins_min": -0.49448299407958984, "rewards/margins_std": 1.6591581106185913, "rewards/rejected": -2.7705912590026855, "step": 1090 }, { "epoch": 0.26, "grad_norm": 5.070566414939316, "learning_rate": 4.60457070233401e-06, "logits/chosen": -1.5394463539123535, "logits/rejected": -1.532361388206482, "logps/chosen": -373.4762268066406, "logps/rejected": -451.9698791503906, "loss": 0.5574, "rewards/accuracies": 0.8125, "rewards/chosen": -1.193956732749939, "rewards/margins": 1.0151557922363281, "rewards/margins_max": 2.4098846912384033, "rewards/margins_min": -0.3218299448490143, "rewards/margins_std": 1.229115605354309, "rewards/rejected": -2.2091124057769775, "step": 1100 }, { "epoch": 0.26, "eval_logits/chosen": -1.6950691938400269, "eval_logits/rejected": -1.6402984857559204, "eval_logps/chosen": -410.0580749511719, "eval_logps/rejected": -471.418212890625, "eval_loss": 0.537865161895752, "eval_rewards/accuracies": 0.7204999923706055, "eval_rewards/chosen": -1.2560268640518188, "eval_rewards/margins": 0.799622654914856, "eval_rewards/margins_max": 3.0462958812713623, "eval_rewards/margins_min": -0.8878714442253113, "eval_rewards/margins_std": 1.308493733406067, "eval_rewards/rejected": -2.055649757385254, "eval_runtime": 858.2604, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 1100 }, { "epoch": 0.27, "grad_norm": 5.878477649970371, "learning_rate": 4.5932169573657996e-06, "logits/chosen": -1.7375917434692383, "logits/rejected": -1.7133474349975586, "logps/chosen": -447.3216857910156, "logps/rejected": -542.3463134765625, "loss": 0.4786, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.2669851779937744, "rewards/margins": 1.0347027778625488, "rewards/margins_max": 2.900076389312744, "rewards/margins_min": -0.2708736062049866, "rewards/margins_std": 1.436565637588501, "rewards/rejected": -2.3016881942749023, "step": 1110 }, { "epoch": 0.27, "grad_norm": 13.26102262532328, "learning_rate": 4.5817169280332165e-06, "logits/chosen": -1.6094753742218018, "logits/rejected": -1.5909000635147095, "logps/chosen": -460.13397216796875, "logps/rejected": -512.9142456054688, "loss": 0.5759, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.7980543375015259, "rewards/margins": 0.49042320251464844, "rewards/margins_max": 2.4922499656677246, "rewards/margins_min": -1.2866321802139282, "rewards/margins_std": 1.698192834854126, "rewards/rejected": -2.2884774208068848, "step": 1120 }, { "epoch": 0.27, "grad_norm": 13.652029681652136, "learning_rate": 4.570071418015247e-06, "logits/chosen": -1.445876955986023, "logits/rejected": -1.3862323760986328, "logps/chosen": -393.8975524902344, "logps/rejected": -491.19769287109375, "loss": 0.5079, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6997969150543213, "rewards/margins": 0.9820052981376648, "rewards/margins_max": 2.592151403427124, "rewards/margins_min": -0.36706727743148804, "rewards/margins_std": 1.2942492961883545, "rewards/rejected": -2.6818020343780518, "step": 1130 }, { "epoch": 0.27, "grad_norm": 7.711277112661167, "learning_rate": 4.55828124115779e-06, "logits/chosen": -1.5958524942398071, "logits/rejected": -1.5559076070785522, "logps/chosen": -458.9775390625, "logps/rejected": -513.0651245117188, "loss": 0.5132, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.795784592628479, "rewards/margins": 0.8057465553283691, "rewards/margins_max": 1.974776029586792, "rewards/margins_min": -0.2285630702972412, "rewards/margins_std": 0.976087212562561, "rewards/rejected": -2.6015312671661377, "step": 1140 }, { "epoch": 0.28, "grad_norm": 11.393886439374379, "learning_rate": 4.5463472214167725e-06, "logits/chosen": -1.6987746953964233, "logits/rejected": -1.6240345239639282, "logps/chosen": -427.27178955078125, "logps/rejected": -530.4534912109375, "loss": 0.513, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6322129964828491, "rewards/margins": 1.2085663080215454, "rewards/margins_max": 3.200082302093506, "rewards/margins_min": -0.19630907475948334, "rewards/margins_std": 1.5108492374420166, "rewards/rejected": -2.8407790660858154, "step": 1150 }, { "epoch": 0.28, "grad_norm": 5.538153526715708, "learning_rate": 4.534270192800582e-06, "logits/chosen": -1.720248818397522, "logits/rejected": -1.6586148738861084, "logps/chosen": -414.6717224121094, "logps/rejected": -533.3748779296875, "loss": 0.4412, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4977530241012573, "rewards/margins": 1.2875648736953735, "rewards/margins_max": 3.0353758335113525, "rewards/margins_min": -0.038429148495197296, "rewards/margins_std": 1.3536832332611084, "rewards/rejected": -2.78531813621521, "step": 1160 }, { "epoch": 0.28, "grad_norm": 8.966400157650192, "learning_rate": 4.522050999311769e-06, "logits/chosen": -1.779441237449646, "logits/rejected": -1.6635583639144897, "logps/chosen": -511.641845703125, "logps/rejected": -585.0018310546875, "loss": 0.4721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1606664657592773, "rewards/margins": 1.099378228187561, "rewards/margins_max": 2.8274059295654297, "rewards/margins_min": -0.6614146828651428, "rewards/margins_std": 1.5609182119369507, "rewards/rejected": -3.260044813156128, "step": 1170 }, { "epoch": 0.28, "grad_norm": 8.209710897914965, "learning_rate": 4.5096904948880715e-06, "logits/chosen": -1.6777623891830444, "logits/rejected": -1.5609970092773438, "logps/chosen": -533.2882080078125, "logps/rejected": -576.4873046875, "loss": 0.5208, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.9399293661117554, "rewards/margins": 0.9982385635375977, "rewards/margins_max": 2.5487465858459473, "rewards/margins_min": -0.4006895124912262, "rewards/margins_std": 1.3348191976547241, "rewards/rejected": -2.9381680488586426, "step": 1180 }, { "epoch": 0.28, "grad_norm": 10.742162997131146, "learning_rate": 4.4971895433427356e-06, "logits/chosen": -1.650472640991211, "logits/rejected": -1.6278070211410522, "logps/chosen": -398.94183349609375, "logps/rejected": -461.6776428222656, "loss": 0.5207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.708836317062378, "rewards/margins": 0.7180274724960327, "rewards/margins_max": 2.3449575901031494, "rewards/margins_min": -0.8658682107925415, "rewards/margins_std": 1.4726488590240479, "rewards/rejected": -2.426863670349121, "step": 1190 }, { "epoch": 0.29, "grad_norm": 8.755441336147634, "learning_rate": 4.484549018304146e-06, "logits/chosen": -1.798423409461975, "logits/rejected": -1.787937879562378, "logps/chosen": -424.2244567871094, "logps/rejected": -503.37518310546875, "loss": 0.5017, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.186379075050354, "rewards/margins": 0.8830278515815735, "rewards/margins_max": 2.1508946418762207, "rewards/margins_min": -0.2212848663330078, "rewards/margins_std": 1.0707820653915405, "rewards/rejected": -2.0694069862365723, "step": 1200 }, { "epoch": 0.29, "eval_logits/chosen": -1.757015347480774, "eval_logits/rejected": -1.7024595737457275, "eval_logps/chosen": -417.6535339355469, "eval_logps/rejected": -483.0893859863281, "eval_loss": 0.5261484980583191, "eval_rewards/accuracies": 0.7294999957084656, "eval_rewards/chosen": -1.3319820165634155, "eval_rewards/margins": 0.8403791785240173, "eval_rewards/margins_max": 2.9984922409057617, "eval_rewards/margins_min": -0.8951181769371033, "eval_rewards/margins_std": 1.3030787706375122, "eval_rewards/rejected": -2.172361135482788, "eval_runtime": 858.6072, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 1200 }, { "epoch": 0.29, "grad_norm": 5.296261011958069, "learning_rate": 4.471769803154774e-06, "logits/chosen": -1.8137985467910767, "logits/rejected": -1.7073570489883423, "logps/chosen": -446.5977478027344, "logps/rejected": -499.57989501953125, "loss": 0.461, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3939201831817627, "rewards/margins": 0.9900428056716919, "rewards/margins_max": 2.472843885421753, "rewards/margins_min": -0.2807120084762573, "rewards/margins_std": 1.2429964542388916, "rewards/rejected": -2.383962869644165, "step": 1210 }, { "epoch": 0.29, "grad_norm": 6.446463557255213, "learning_rate": 4.458852790969446e-06, "logits/chosen": -1.7360875606536865, "logits/rejected": -1.677473783493042, "logps/chosen": -423.9715270996094, "logps/rejected": -521.8782958984375, "loss": 0.4487, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5238450765609741, "rewards/margins": 1.0729334354400635, "rewards/margins_max": 2.3686881065368652, "rewards/margins_min": -0.2810816168785095, "rewards/margins_std": 1.179468035697937, "rewards/rejected": -2.596778392791748, "step": 1220 }, { "epoch": 0.29, "grad_norm": 6.795821465583821, "learning_rate": 4.445798884452921e-06, "logits/chosen": -1.6006437540054321, "logits/rejected": -1.5507539510726929, "logps/chosen": -415.8416442871094, "logps/rejected": -528.8954467773438, "loss": 0.4862, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.656233787536621, "rewards/margins": 0.8889392018318176, "rewards/margins_max": 2.552154302597046, "rewards/margins_min": -0.5644077062606812, "rewards/margins_std": 1.3861337900161743, "rewards/rejected": -2.545172929763794, "step": 1230 }, { "epoch": 0.3, "grad_norm": 9.90531305641601, "learning_rate": 4.432608995876819e-06, "logits/chosen": -1.6049391031265259, "logits/rejected": -1.4755823612213135, "logps/chosen": -426.5703125, "logps/rejected": -479.6333923339844, "loss": 0.5694, "rewards/accuracies": 0.75, "rewards/chosen": -1.583181619644165, "rewards/margins": 0.7669872045516968, "rewards/margins_max": 2.291544198989868, "rewards/margins_min": -0.852423369884491, "rewards/margins_std": 1.3917973041534424, "rewards/rejected": -2.3501687049865723, "step": 1240 }, { "epoch": 0.3, "grad_norm": 4.410263073744997, "learning_rate": 4.419284047015854e-06, "logits/chosen": -1.7579128742218018, "logits/rejected": -1.7100893259048462, "logps/chosen": -430.9188537597656, "logps/rejected": -459.86328125, "loss": 0.5037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.401460886001587, "rewards/margins": 0.8494556546211243, "rewards/margins_max": 2.1088314056396484, "rewards/margins_min": -0.24145181477069855, "rewards/margins_std": 1.081814169883728, "rewards/rejected": -2.2509164810180664, "step": 1250 }, { "epoch": 0.3, "grad_norm": 9.110431485556946, "learning_rate": 4.405824969083424e-06, "logits/chosen": -1.7449874877929688, "logits/rejected": -1.7175092697143555, "logps/chosen": -400.9250793457031, "logps/rejected": -480.7320861816406, "loss": 0.4702, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5042998790740967, "rewards/margins": 0.9453195333480835, "rewards/margins_max": 2.3987512588500977, "rewards/margins_min": -0.3086710572242737, "rewards/margins_std": 1.195438027381897, "rewards/rejected": -2.4496192932128906, "step": 1260 }, { "epoch": 0.3, "grad_norm": 15.662250518638556, "learning_rate": 4.3922327026665305e-06, "logits/chosen": -1.7813165187835693, "logits/rejected": -1.6941430568695068, "logps/chosen": -442.2898864746094, "logps/rejected": -544.7201538085938, "loss": 0.4673, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5316277742385864, "rewards/margins": 1.0849241018295288, "rewards/margins_max": 2.522190570831299, "rewards/margins_min": -0.5502014756202698, "rewards/margins_std": 1.3649193048477173, "rewards/rejected": -2.616551637649536, "step": 1270 }, { "epoch": 0.31, "grad_norm": 6.2751401726010085, "learning_rate": 4.378508197660045e-06, "logits/chosen": -1.819186806678772, "logits/rejected": -1.7108433246612549, "logps/chosen": -543.03955078125, "logps/rejected": -619.8226318359375, "loss": 0.4497, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.3380777835845947, "rewards/margins": 1.1211988925933838, "rewards/margins_max": 2.8612101078033447, "rewards/margins_min": -0.5747385025024414, "rewards/margins_std": 1.5363645553588867, "rewards/rejected": -3.4592766761779785, "step": 1280 }, { "epoch": 0.31, "grad_norm": 14.386965871486062, "learning_rate": 4.364652413200326e-06, "logits/chosen": -1.822385549545288, "logits/rejected": -1.705594778060913, "logps/chosen": -556.4990234375, "logps/rejected": -613.0399780273438, "loss": 0.4794, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.452467203140259, "rewards/margins": 1.1300464868545532, "rewards/margins_max": 3.0443949699401855, "rewards/margins_min": -0.5965273976325989, "rewards/margins_std": 1.6306991577148438, "rewards/rejected": -3.5825133323669434, "step": 1290 }, { "epoch": 0.31, "grad_norm": 11.1522613148223, "learning_rate": 4.350666317598191e-06, "logits/chosen": -1.7963718175888062, "logits/rejected": -1.7680256366729736, "logps/chosen": -451.4610290527344, "logps/rejected": -548.41064453125, "loss": 0.4478, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6689643859863281, "rewards/margins": 1.191236138343811, "rewards/margins_max": 2.5363879203796387, "rewards/margins_min": -0.16707056760787964, "rewards/margins_std": 1.2129935026168823, "rewards/rejected": -2.8602004051208496, "step": 1300 }, { "epoch": 0.31, "eval_logits/chosen": -1.777889609336853, "eval_logits/rejected": -1.7243633270263672, "eval_logps/chosen": -456.99102783203125, "eval_logps/rejected": -530.8425903320312, "eval_loss": 0.5277400016784668, "eval_rewards/accuracies": 0.7229999899864197, "eval_rewards/chosen": -1.7253568172454834, "eval_rewards/margins": 0.9245365858078003, "eval_rewards/margins_max": 3.2834300994873047, "eval_rewards/margins_min": -1.0237069129943848, "eval_rewards/margins_std": 1.4393820762634277, "eval_rewards/rejected": -2.6498935222625732, "eval_runtime": 858.7243, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1300 }, { "epoch": 0.31, "grad_norm": 11.797279846116142, "learning_rate": 4.336550888271245e-06, "logits/chosen": -1.806113839149475, "logits/rejected": -1.769254446029663, "logps/chosen": -457.311279296875, "logps/rejected": -503.7276916503906, "loss": 0.4782, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3983782529830933, "rewards/margins": 0.979507327079773, "rewards/margins_max": 2.3090672492980957, "rewards/margins_min": -0.3501203656196594, "rewards/margins_std": 1.217260718345642, "rewards/rejected": -2.3778858184814453, "step": 1310 }, { "epoch": 0.32, "grad_norm": 8.26368087663478, "learning_rate": 4.322307111675573e-06, "logits/chosen": -1.677952527999878, "logits/rejected": -1.6386768817901611, "logps/chosen": -426.4773864746094, "logps/rejected": -476.13519287109375, "loss": 0.4992, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5766832828521729, "rewards/margins": 0.8616814613342285, "rewards/margins_max": 2.152453899383545, "rewards/margins_min": -0.39197486639022827, "rewards/margins_std": 1.1289595365524292, "rewards/rejected": -2.4383647441864014, "step": 1320 }, { "epoch": 0.32, "grad_norm": 8.71931570535377, "learning_rate": 4.307935983236806e-06, "logits/chosen": -1.7720377445220947, "logits/rejected": -1.6484928131103516, "logps/chosen": -426.92822265625, "logps/rejected": -470.3456115722656, "loss": 0.4318, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3381671905517578, "rewards/margins": 1.123642086982727, "rewards/margins_max": 2.822497844696045, "rewards/margins_min": -0.6034245491027832, "rewards/margins_std": 1.5203471183776855, "rewards/rejected": -2.4618093967437744, "step": 1330 }, { "epoch": 0.32, "grad_norm": 8.303755900828666, "learning_rate": 4.293438507280547e-06, "logits/chosen": -1.6181720495224, "logits/rejected": -1.5430936813354492, "logps/chosen": -452.4730529785156, "logps/rejected": -503.6160583496094, "loss": 0.4575, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8646223545074463, "rewards/margins": 0.9076078534126282, "rewards/margins_max": 2.6683859825134277, "rewards/margins_min": -0.5101832747459412, "rewards/margins_std": 1.421314001083374, "rewards/rejected": -2.7722301483154297, "step": 1340 }, { "epoch": 0.32, "grad_norm": 13.850587555579208, "learning_rate": 4.278815696962195e-06, "logits/chosen": -1.7354412078857422, "logits/rejected": -1.6814638376235962, "logps/chosen": -495.46759033203125, "logps/rejected": -597.3360595703125, "loss": 0.4657, "rewards/accuracies": 0.8125, "rewards/chosen": -2.012983560562134, "rewards/margins": 1.1172600984573364, "rewards/margins_max": 2.5484392642974854, "rewards/margins_min": -0.4459272027015686, "rewards/margins_std": 1.3544065952301025, "rewards/rejected": -3.1302435398101807, "step": 1350 }, { "epoch": 0.33, "grad_norm": 6.084571538179386, "learning_rate": 4.26406857419613e-06, "logits/chosen": -1.7259047031402588, "logits/rejected": -1.665785789489746, "logps/chosen": -467.90631103515625, "logps/rejected": -525.4000854492188, "loss": 0.477, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7396020889282227, "rewards/margins": 1.063480019569397, "rewards/margins_max": 2.3408942222595215, "rewards/margins_min": -0.23185572028160095, "rewards/margins_std": 1.1535142660140991, "rewards/rejected": -2.80308198928833, "step": 1360 }, { "epoch": 0.33, "grad_norm": 4.630469310201164, "learning_rate": 4.249198169584302e-06, "logits/chosen": -1.8905932903289795, "logits/rejected": -1.9151420593261719, "logps/chosen": -379.1492004394531, "logps/rejected": -507.84515380859375, "loss": 0.5471, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2711544036865234, "rewards/margins": 0.9363080859184265, "rewards/margins_max": 2.8914642333984375, "rewards/margins_min": -0.6726094484329224, "rewards/margins_std": 1.6178247928619385, "rewards/rejected": -2.2074623107910156, "step": 1370 }, { "epoch": 0.33, "grad_norm": 4.940525528294287, "learning_rate": 4.23420552234421e-06, "logits/chosen": -1.925498366355896, "logits/rejected": -1.9448429346084595, "logps/chosen": -379.3556823730469, "logps/rejected": -444.2100524902344, "loss": 0.5262, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.9969999194145203, "rewards/margins": 0.8427821397781372, "rewards/margins_max": 2.0701560974121094, "rewards/margins_min": -0.2984394133090973, "rewards/margins_std": 1.056652545928955, "rewards/rejected": -1.8397821187973022, "step": 1380 }, { "epoch": 0.33, "grad_norm": 6.8054125954627045, "learning_rate": 4.219091680236269e-06, "logits/chosen": -1.8516387939453125, "logits/rejected": -1.783252477645874, "logps/chosen": -365.5513610839844, "logps/rejected": -466.2015686035156, "loss": 0.5016, "rewards/accuracies": 0.8125, "rewards/chosen": -1.1488829851150513, "rewards/margins": 0.9253040552139282, "rewards/margins_max": 2.3794498443603516, "rewards/margins_min": -0.3067162334918976, "rewards/margins_std": 1.2158586978912354, "rewards/rejected": -2.0741868019104004, "step": 1390 }, { "epoch": 0.34, "grad_norm": 9.822676708625199, "learning_rate": 4.2038576994905935e-06, "logits/chosen": -1.719948410987854, "logits/rejected": -1.6025689840316772, "logps/chosen": -399.25482177734375, "logps/rejected": -452.61767578125, "loss": 0.4919, "rewards/accuracies": 0.75, "rewards/chosen": -1.2643520832061768, "rewards/margins": 1.0281774997711182, "rewards/margins_max": 2.6281700134277344, "rewards/margins_min": -0.4636843204498291, "rewards/margins_std": 1.3809515237808228, "rewards/rejected": -2.292529582977295, "step": 1400 }, { "epoch": 0.34, "eval_logits/chosen": -1.6223078966140747, "eval_logits/rejected": -1.5533266067504883, "eval_logps/chosen": -401.8750915527344, "eval_logps/rejected": -470.1157531738281, "eval_loss": 0.5188964605331421, "eval_rewards/accuracies": 0.7365000247955322, "eval_rewards/chosen": -1.174198031425476, "eval_rewards/margins": 0.8684269189834595, "eval_rewards/margins_max": 3.033737897872925, "eval_rewards/margins_min": -0.9052166938781738, "eval_rewards/margins_std": 1.3302130699157715, "eval_rewards/rejected": -2.0426247119903564, "eval_runtime": 858.7616, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1400 }, { "epoch": 0.34, "grad_norm": 9.152546312579153, "learning_rate": 4.1885046447331825e-06, "logits/chosen": -1.563952088356018, "logits/rejected": -1.5418660640716553, "logps/chosen": -425.502197265625, "logps/rejected": -490.28204345703125, "loss": 0.498, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.126869797706604, "rewards/margins": 1.054262638092041, "rewards/margins_max": 2.518481731414795, "rewards/margins_min": -0.2695619463920593, "rewards/margins_std": 1.263863444328308, "rewards/rejected": -2.1811325550079346, "step": 1410 }, { "epoch": 0.34, "grad_norm": 11.57135495493782, "learning_rate": 4.173033588911512e-06, "logits/chosen": -1.6623455286026, "logits/rejected": -1.6167595386505127, "logps/chosen": -476.3836975097656, "logps/rejected": -565.4288940429688, "loss": 0.4843, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6048084497451782, "rewards/margins": 1.045436978340149, "rewards/margins_max": 2.54534649848938, "rewards/margins_min": -0.2801569402217865, "rewards/margins_std": 1.2515109777450562, "rewards/rejected": -2.650245189666748, "step": 1420 }, { "epoch": 0.34, "grad_norm": 8.505868568809612, "learning_rate": 4.15744561321956e-06, "logits/chosen": -1.477988362312317, "logits/rejected": -1.4771525859832764, "logps/chosen": -468.6195373535156, "logps/rejected": -541.9591674804688, "loss": 0.4711, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9870831966400146, "rewards/margins": 0.736418604850769, "rewards/margins_max": 2.268310070037842, "rewards/margins_min": -0.7055023908615112, "rewards/margins_std": 1.3281176090240479, "rewards/rejected": -2.7235019207000732, "step": 1430 }, { "epoch": 0.34, "grad_norm": 12.6539301500438, "learning_rate": 4.141741807022243e-06, "logits/chosen": -1.6856629848480225, "logits/rejected": -1.5737398862838745, "logps/chosen": -465.35345458984375, "logps/rejected": -490.42388916015625, "loss": 0.5485, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6128590106964111, "rewards/margins": 0.846202552318573, "rewards/margins_max": 2.158088207244873, "rewards/margins_min": -0.5375596880912781, "rewards/margins_std": 1.1870211362838745, "rewards/rejected": -2.4590611457824707, "step": 1440 }, { "epoch": 0.35, "grad_norm": 6.206071821499754, "learning_rate": 4.125923267779287e-06, "logits/chosen": -1.7120176553726196, "logits/rejected": -1.6781444549560547, "logps/chosen": -378.5970153808594, "logps/rejected": -435.4115295410156, "loss": 0.513, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.1816582679748535, "rewards/margins": 0.8213415145874023, "rewards/margins_max": 2.2684974670410156, "rewards/margins_min": -0.32325881719589233, "rewards/margins_std": 1.1732237339019775, "rewards/rejected": -2.002999782562256, "step": 1450 }, { "epoch": 0.35, "grad_norm": 10.400483971118657, "learning_rate": 4.1099911009685304e-06, "logits/chosen": -1.6616166830062866, "logits/rejected": -1.635434865951538, "logps/chosen": -446.8877868652344, "logps/rejected": -467.00897216796875, "loss": 0.5185, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.4116919040679932, "rewards/margins": 0.6576821208000183, "rewards/margins_max": 2.1079986095428467, "rewards/margins_min": -0.8050550222396851, "rewards/margins_std": 1.2839460372924805, "rewards/rejected": -2.069373607635498, "step": 1460 }, { "epoch": 0.35, "grad_norm": 10.568269677416893, "learning_rate": 4.093946420008669e-06, "logits/chosen": -1.7710071802139282, "logits/rejected": -1.7250518798828125, "logps/chosen": -404.94683837890625, "logps/rejected": -479.19012451171875, "loss": 0.43, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3460806608200073, "rewards/margins": 1.0333633422851562, "rewards/margins_max": 2.294477939605713, "rewards/margins_min": -0.19916394352912903, "rewards/margins_std": 1.099045991897583, "rewards/rejected": -2.379444122314453, "step": 1470 }, { "epoch": 0.35, "grad_norm": 6.623044859093619, "learning_rate": 4.077790346181445e-06, "logits/chosen": -1.7482569217681885, "logits/rejected": -1.696237564086914, "logps/chosen": -483.416748046875, "logps/rejected": -523.6901245117188, "loss": 0.5202, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8600118160247803, "rewards/margins": 0.8885312080383301, "rewards/margins_max": 2.270716428756714, "rewards/margins_min": -0.4487033784389496, "rewards/margins_std": 1.225481629371643, "rewards/rejected": -2.7485432624816895, "step": 1480 }, { "epoch": 0.36, "grad_norm": 7.771351596707515, "learning_rate": 4.061524008553286e-06, "logits/chosen": -1.6644798517227173, "logits/rejected": -1.6116235256195068, "logps/chosen": -422.78912353515625, "logps/rejected": -502.4689025878906, "loss": 0.489, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.643424391746521, "rewards/margins": 1.056188702583313, "rewards/margins_max": 2.3678672313690186, "rewards/margins_min": -0.5369956493377686, "rewards/margins_std": 1.2547194957733154, "rewards/rejected": -2.699613094329834, "step": 1490 }, { "epoch": 0.36, "grad_norm": 10.956237785998432, "learning_rate": 4.045148543896396e-06, "logits/chosen": -1.8173494338989258, "logits/rejected": -1.7467037439346313, "logps/chosen": -434.1742248535156, "logps/rejected": -478.10760498046875, "loss": 0.4792, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.4843699932098389, "rewards/margins": 0.7845624089241028, "rewards/margins_max": 2.235358715057373, "rewards/margins_min": -0.5615049600601196, "rewards/margins_std": 1.2299386262893677, "rewards/rejected": -2.268932342529297, "step": 1500 }, { "epoch": 0.36, "eval_logits/chosen": -1.759560227394104, "eval_logits/rejected": -1.6972252130508423, "eval_logps/chosen": -423.9220275878906, "eval_logps/rejected": -498.955322265625, "eval_loss": 0.5204965472221375, "eval_rewards/accuracies": 0.734000027179718, "eval_rewards/chosen": -1.3946670293807983, "eval_rewards/margins": 0.9363530874252319, "eval_rewards/margins_max": 3.126479148864746, "eval_rewards/margins_min": -0.9863033890724182, "eval_rewards/margins_std": 1.3912577629089355, "eval_rewards/rejected": -2.3310203552246094, "eval_runtime": 858.9466, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 1500 }, { "epoch": 0.36, "grad_norm": 11.760689276392748, "learning_rate": 4.028665096609323e-06, "logits/chosen": -1.7991682291030884, "logits/rejected": -1.7784717082977295, "logps/chosen": -457.58807373046875, "logps/rejected": -508.92376708984375, "loss": 0.5169, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.5132601261138916, "rewards/margins": 0.6609223484992981, "rewards/margins_max": 2.068040370941162, "rewards/margins_min": -0.6545791625976562, "rewards/margins_std": 1.2270761728286743, "rewards/rejected": -2.174182653427124, "step": 1510 }, { "epoch": 0.36, "grad_norm": 17.329840283678685, "learning_rate": 4.0120748186369705e-06, "logits/chosen": -1.8941253423690796, "logits/rejected": -1.7933152914047241, "logps/chosen": -479.86053466796875, "logps/rejected": -545.8125, "loss": 0.4741, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3254212141036987, "rewards/margins": 1.2438846826553345, "rewards/margins_max": 2.868272304534912, "rewards/margins_min": -0.3525157570838928, "rewards/margins_std": 1.4334163665771484, "rewards/rejected": -2.569305896759033, "step": 1520 }, { "epoch": 0.37, "grad_norm": 9.423620666434093, "learning_rate": 3.9953788693901e-06, "logits/chosen": -1.7452366352081299, "logits/rejected": -1.6856008768081665, "logps/chosen": -488.2145080566406, "logps/rejected": -536.3307495117188, "loss": 0.5467, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.7847986221313477, "rewards/margins": 0.8065935969352722, "rewards/margins_max": 2.5380892753601074, "rewards/margins_min": -1.0267566442489624, "rewards/margins_std": 1.5732282400131226, "rewards/rejected": -2.5913922786712646, "step": 1530 }, { "epoch": 0.37, "grad_norm": 9.694357530653019, "learning_rate": 3.978578415664306e-06, "logits/chosen": -1.726875901222229, "logits/rejected": -1.6862808465957642, "logps/chosen": -393.5936279296875, "logps/rejected": -467.02618408203125, "loss": 0.4363, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3473997116088867, "rewards/margins": 1.144679069519043, "rewards/margins_max": 2.7860026359558105, "rewards/margins_min": -0.4872284531593323, "rewards/margins_std": 1.4978790283203125, "rewards/rejected": -2.492079019546509, "step": 1540 }, { "epoch": 0.37, "grad_norm": 7.7265504040445006, "learning_rate": 3.961674631558474e-06, "logits/chosen": -1.8501567840576172, "logits/rejected": -1.686899185180664, "logps/chosen": -505.4443359375, "logps/rejected": -533.4059448242188, "loss": 0.4421, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8153709173202515, "rewards/margins": 1.1461175680160522, "rewards/margins_max": 2.323209047317505, "rewards/margins_min": -0.13645365834236145, "rewards/margins_std": 1.104641318321228, "rewards/rejected": -2.9614882469177246, "step": 1550 }, { "epoch": 0.37, "grad_norm": 9.196483968496466, "learning_rate": 3.944668698392724e-06, "logits/chosen": -1.7843472957611084, "logits/rejected": -1.7490875720977783, "logps/chosen": -442.6293029785156, "logps/rejected": -575.9721069335938, "loss": 0.4483, "rewards/accuracies": 0.8125, "rewards/chosen": -1.847508430480957, "rewards/margins": 1.1995658874511719, "rewards/margins_max": 2.7570912837982178, "rewards/margins_min": -0.5502170324325562, "rewards/margins_std": 1.4573684930801392, "rewards/rejected": -3.047074317932129, "step": 1560 }, { "epoch": 0.38, "grad_norm": 9.223139761917906, "learning_rate": 3.9275618046258636e-06, "logits/chosen": -1.9244571924209595, "logits/rejected": -1.8895126581192017, "logps/chosen": -453.9353942871094, "logps/rejected": -518.6531372070312, "loss": 0.5482, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3890243768692017, "rewards/margins": 0.7506848573684692, "rewards/margins_max": 2.4498183727264404, "rewards/margins_min": -0.7794243693351746, "rewards/margins_std": 1.3999955654144287, "rewards/rejected": -2.139709234237671, "step": 1570 }, { "epoch": 0.38, "grad_norm": 6.239050161500916, "learning_rate": 3.910355145772324e-06, "logits/chosen": -1.963033676147461, "logits/rejected": -1.927878975868225, "logps/chosen": -394.3022766113281, "logps/rejected": -501.08807373046875, "loss": 0.4666, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.1230194568634033, "rewards/margins": 1.2083815336227417, "rewards/margins_max": 2.775930166244507, "rewards/margins_min": -0.13165006041526794, "rewards/margins_std": 1.2905536890029907, "rewards/rejected": -2.3314008712768555, "step": 1580 }, { "epoch": 0.38, "grad_norm": 5.901376479623042, "learning_rate": 3.893049924318614e-06, "logits/chosen": -1.9346328973770142, "logits/rejected": -1.9038560390472412, "logps/chosen": -406.7640380859375, "logps/rejected": -497.94476318359375, "loss": 0.408, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.3095252513885498, "rewards/margins": 0.9361977577209473, "rewards/margins_max": 2.2547264099121094, "rewards/margins_min": -0.38140109181404114, "rewards/margins_std": 1.171440839767456, "rewards/rejected": -2.245723009109497, "step": 1590 }, { "epoch": 0.38, "grad_norm": 12.638881190557314, "learning_rate": 3.875647349639287e-06, "logits/chosen": -1.9173972606658936, "logits/rejected": -1.8337103128433228, "logps/chosen": -477.15252685546875, "logps/rejected": -497.06072998046875, "loss": 0.4952, "rewards/accuracies": 0.75, "rewards/chosen": -1.8901792764663696, "rewards/margins": 0.9839689135551453, "rewards/margins_max": 2.6063270568847656, "rewards/margins_min": -0.3907359540462494, "rewards/margins_std": 1.3758822679519653, "rewards/rejected": -2.8741488456726074, "step": 1600 }, { "epoch": 0.38, "eval_logits/chosen": -1.8827004432678223, "eval_logits/rejected": -1.8292875289916992, "eval_logps/chosen": -468.42816162109375, "eval_logps/rejected": -547.6121215820312, "eval_loss": 0.5315753817558289, "eval_rewards/accuracies": 0.7289999723434448, "eval_rewards/chosen": -1.8397282361984253, "eval_rewards/margins": 0.9778605103492737, "eval_rewards/margins_max": 3.267470359802246, "eval_rewards/margins_min": -1.099694013595581, "eval_rewards/margins_std": 1.4769123792648315, "eval_rewards/rejected": -2.8175888061523438, "eval_runtime": 858.6769, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1600 }, { "epoch": 0.39, "grad_norm": 6.5706779750276665, "learning_rate": 3.8581486379124185e-06, "logits/chosen": -1.9930328130722046, "logits/rejected": -1.9730329513549805, "logps/chosen": -472.4261169433594, "logps/rejected": -523.6406860351562, "loss": 0.5127, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5972095727920532, "rewards/margins": 1.0377509593963623, "rewards/margins_max": 2.2254226207733154, "rewards/margins_min": -0.008472278714179993, "rewards/margins_std": 1.0063682794570923, "rewards/rejected": -2.634960651397705, "step": 1610 }, { "epoch": 0.39, "grad_norm": 7.723355421293458, "learning_rate": 3.8405550120346225e-06, "logits/chosen": -1.9173921346664429, "logits/rejected": -1.8364118337631226, "logps/chosen": -385.5010070800781, "logps/rejected": -458.40203857421875, "loss": 0.4575, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.330391526222229, "rewards/margins": 0.8410288691520691, "rewards/margins_max": 2.045682907104492, "rewards/margins_min": -0.4266037046909332, "rewards/margins_std": 1.1414289474487305, "rewards/rejected": -2.1714203357696533, "step": 1620 }, { "epoch": 0.39, "grad_norm": 10.009828315352468, "learning_rate": 3.822867701535579e-06, "logits/chosen": -1.9029552936553955, "logits/rejected": -1.8646186590194702, "logps/chosen": -453.0238342285156, "logps/rejected": -516.23974609375, "loss": 0.4531, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8182964324951172, "rewards/margins": 0.9936798214912415, "rewards/margins_max": 2.659271001815796, "rewards/margins_min": -0.5178359150886536, "rewards/margins_std": 1.415496587753296, "rewards/rejected": -2.811976432800293, "step": 1630 }, { "epoch": 0.39, "grad_norm": 5.695821857045502, "learning_rate": 3.805087942492113e-06, "logits/chosen": -1.881407380104065, "logits/rejected": -1.8206695318222046, "logps/chosen": -426.0442810058594, "logps/rejected": -517.4369506835938, "loss": 0.4672, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.4813072681427002, "rewards/margins": 1.1026338338851929, "rewards/margins_max": 2.5537235736846924, "rewards/margins_min": -0.16469474136829376, "rewards/margins_std": 1.2573797702789307, "rewards/rejected": -2.5839409828186035, "step": 1640 }, { "epoch": 0.4, "grad_norm": 12.174615083911535, "learning_rate": 3.7872169774418145e-06, "logits/chosen": -1.9373493194580078, "logits/rejected": -1.884070634841919, "logps/chosen": -394.07696533203125, "logps/rejected": -494.38250732421875, "loss": 0.4721, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.3425207138061523, "rewards/margins": 0.9057194590568542, "rewards/margins_max": 2.490558385848999, "rewards/margins_min": -0.5099620223045349, "rewards/margins_std": 1.3244012594223022, "rewards/rejected": -2.2482399940490723, "step": 1650 }, { "epoch": 0.4, "grad_norm": 7.175826436473134, "learning_rate": 3.769256055296198e-06, "logits/chosen": -1.8649260997772217, "logits/rejected": -1.797133207321167, "logps/chosen": -400.4412841796875, "logps/rejected": -509.9593200683594, "loss": 0.5121, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4928348064422607, "rewards/margins": 0.9923629760742188, "rewards/margins_max": 2.5470902919769287, "rewards/margins_min": -0.3953782618045807, "rewards/margins_std": 1.2963649034500122, "rewards/rejected": -2.4851975440979004, "step": 1660 }, { "epoch": 0.4, "grad_norm": 11.27192078219667, "learning_rate": 3.751206431253428e-06, "logits/chosen": -1.8113701343536377, "logits/rejected": -1.7061907052993774, "logps/chosen": -479.1839904785156, "logps/rejected": -596.3455200195312, "loss": 0.3825, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6503818035125732, "rewards/margins": 1.3517310619354248, "rewards/margins_max": 3.1908390522003174, "rewards/margins_min": -0.2883940637111664, "rewards/margins_std": 1.5308843851089478, "rewards/rejected": -3.002112865447998, "step": 1670 }, { "epoch": 0.4, "grad_norm": 11.466082104117316, "learning_rate": 3.7330693667105945e-06, "logits/chosen": -1.8724464178085327, "logits/rejected": -1.7574745416641235, "logps/chosen": -503.6329650878906, "logps/rejected": -525.1217651367188, "loss": 0.4943, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.8428876399993896, "rewards/margins": 0.9343498349189758, "rewards/margins_max": 2.5058255195617676, "rewards/margins_min": -0.5264105796813965, "rewards/margins_std": 1.4000141620635986, "rewards/rejected": -2.7772374153137207, "step": 1680 }, { "epoch": 0.4, "grad_norm": 8.285608375511345, "learning_rate": 3.714846129175563e-06, "logits/chosen": -1.7399994134902954, "logits/rejected": -1.690792441368103, "logps/chosen": -477.0479431152344, "logps/rejected": -609.5325317382812, "loss": 0.4716, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.053905963897705, "rewards/margins": 1.204507827758789, "rewards/margins_max": 2.5007870197296143, "rewards/margins_min": -0.1460908204317093, "rewards/margins_std": 1.1603546142578125, "rewards/rejected": -3.258413791656494, "step": 1690 }, { "epoch": 0.41, "grad_norm": 6.7347808503177475, "learning_rate": 3.696537992178395e-06, "logits/chosen": -1.722338318824768, "logits/rejected": -1.671705961227417, "logps/chosen": -499.70269775390625, "logps/rejected": -574.0323486328125, "loss": 0.5084, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0683319568634033, "rewards/margins": 1.0070024728775024, "rewards/margins_max": 2.4840290546417236, "rewards/margins_min": -0.4136873185634613, "rewards/margins_std": 1.3120759725570679, "rewards/rejected": -3.075334310531616, "step": 1700 }, { "epoch": 0.41, "eval_logits/chosen": -1.611209750175476, "eval_logits/rejected": -1.547292709350586, "eval_logps/chosen": -527.818115234375, "eval_logps/rejected": -610.689208984375, "eval_loss": 0.5285207629203796, "eval_rewards/accuracies": 0.7294999957084656, "eval_rewards/chosen": -2.4336276054382324, "eval_rewards/margins": 1.0147311687469482, "eval_rewards/margins_max": 3.404564380645752, "eval_rewards/margins_min": -1.1112028360366821, "eval_rewards/margins_std": 1.519903302192688, "eval_rewards/rejected": -3.4483590126037598, "eval_runtime": 858.7084, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1700 }, { "epoch": 0.41, "grad_norm": 9.791736965062366, "learning_rate": 3.678146235182346e-06, "logits/chosen": -1.6277202367782593, "logits/rejected": -1.6258785724639893, "logps/chosen": -514.1776733398438, "logps/rejected": -663.5776977539062, "loss": 0.5231, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2802860736846924, "rewards/margins": 1.1582790613174438, "rewards/margins_max": 2.977980852127075, "rewards/margins_min": -0.5797690153121948, "rewards/margins_std": 1.5753198862075806, "rewards/rejected": -3.4385650157928467, "step": 1710 }, { "epoch": 0.41, "grad_norm": 10.363810928675418, "learning_rate": 3.6596721434944514e-06, "logits/chosen": -1.7636334896087646, "logits/rejected": -1.6912567615509033, "logps/chosen": -477.99713134765625, "logps/rejected": -552.8597412109375, "loss": 0.5257, "rewards/accuracies": 0.75, "rewards/chosen": -1.9413082599639893, "rewards/margins": 0.8001707792282104, "rewards/margins_max": 2.4251954555511475, "rewards/margins_min": -0.47381410002708435, "rewards/margins_std": 1.308423638343811, "rewards/rejected": -2.7414791584014893, "step": 1720 }, { "epoch": 0.41, "grad_norm": 10.725764790189679, "learning_rate": 3.641117008175703e-06, "logits/chosen": -1.7561897039413452, "logits/rejected": -1.710611343383789, "logps/chosen": -469.9329528808594, "logps/rejected": -530.6676025390625, "loss": 0.5057, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7756751775741577, "rewards/margins": 0.9502261877059937, "rewards/margins_max": 2.501286268234253, "rewards/margins_min": -0.4884006381034851, "rewards/margins_std": 1.3476407527923584, "rewards/rejected": -2.7259013652801514, "step": 1730 }, { "epoch": 0.42, "grad_norm": 6.364635943888872, "learning_rate": 3.6224821259508215e-06, "logits/chosen": -1.8247932195663452, "logits/rejected": -1.8065004348754883, "logps/chosen": -499.60906982421875, "logps/rejected": -552.136474609375, "loss": 0.5095, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8926136493682861, "rewards/margins": 0.8261919021606445, "rewards/margins_max": 2.187981128692627, "rewards/margins_min": -0.43869131803512573, "rewards/margins_std": 1.1800951957702637, "rewards/rejected": -2.7188055515289307, "step": 1740 }, { "epoch": 0.42, "grad_norm": 10.526259148093972, "learning_rate": 3.6037687991176373e-06, "logits/chosen": -1.753335952758789, "logits/rejected": -1.7148799896240234, "logps/chosen": -448.77618408203125, "logps/rejected": -515.3814697265625, "loss": 0.5203, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.6204173564910889, "rewards/margins": 0.9504104852676392, "rewards/margins_max": 2.511536121368408, "rewards/margins_min": -0.5098812580108643, "rewards/margins_std": 1.3177130222320557, "rewards/rejected": -2.5708279609680176, "step": 1750 }, { "epoch": 0.42, "grad_norm": 6.461626784101778, "learning_rate": 3.5849783354560784e-06, "logits/chosen": -1.731987714767456, "logits/rejected": -1.762780785560608, "logps/chosen": -416.75885009765625, "logps/rejected": -525.3709106445312, "loss": 0.4928, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4295904636383057, "rewards/margins": 0.9024019241333008, "rewards/margins_max": 2.575984239578247, "rewards/margins_min": -0.5146855115890503, "rewards/margins_std": 1.3834826946258545, "rewards/rejected": -2.3319923877716064, "step": 1760 }, { "epoch": 0.42, "grad_norm": 7.868839094340852, "learning_rate": 3.566112048136776e-06, "logits/chosen": -1.8803784847259521, "logits/rejected": -1.8323936462402344, "logps/chosen": -467.7734375, "logps/rejected": -507.15802001953125, "loss": 0.5494, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.449544906616211, "rewards/margins": 0.8128727078437805, "rewards/margins_max": 2.301952600479126, "rewards/margins_min": -0.5917859077453613, "rewards/margins_std": 1.2947670221328735, "rewards/rejected": -2.2624175548553467, "step": 1770 }, { "epoch": 0.43, "grad_norm": 8.967684905180029, "learning_rate": 3.5471712556292927e-06, "logits/chosen": -1.6441967487335205, "logits/rejected": -1.5477622747421265, "logps/chosen": -411.14862060546875, "logps/rejected": -454.6239318847656, "loss": 0.4764, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4076836109161377, "rewards/margins": 0.9544715881347656, "rewards/margins_max": 2.0982437133789062, "rewards/margins_min": -0.1824246644973755, "rewards/margins_std": 1.0298194885253906, "rewards/rejected": -2.3621551990509033, "step": 1780 }, { "epoch": 0.43, "grad_norm": 8.149173867884027, "learning_rate": 3.528157281609984e-06, "logits/chosen": -1.5611772537231445, "logits/rejected": -1.5485522747039795, "logps/chosen": -425.988525390625, "logps/rejected": -491.7542419433594, "loss": 0.5533, "rewards/accuracies": 0.75, "rewards/chosen": -2.126368761062622, "rewards/margins": 0.8271281123161316, "rewards/margins_max": 2.19313907623291, "rewards/margins_min": -0.5331050157546997, "rewards/margins_std": 1.2239429950714111, "rewards/rejected": -2.9534969329833984, "step": 1790 }, { "epoch": 0.43, "grad_norm": 8.240574394131446, "learning_rate": 3.509071454869492e-06, "logits/chosen": -1.474047064781189, "logits/rejected": -1.4426792860031128, "logps/chosen": -510.008056640625, "logps/rejected": -554.3078002929688, "loss": 0.4676, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7847309112548828, "rewards/margins": 0.8594381213188171, "rewards/margins_max": 2.2890233993530273, "rewards/margins_min": -0.41944026947021484, "rewards/margins_std": 1.2061141729354858, "rewards/rejected": -2.6441688537597656, "step": 1800 }, { "epoch": 0.43, "eval_logits/chosen": -1.5679718255996704, "eval_logits/rejected": -1.504467487335205, "eval_logps/chosen": -468.0517883300781, "eval_logps/rejected": -536.2839965820312, "eval_loss": 0.5161844491958618, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -1.8359644412994385, "eval_rewards/margins": 0.8683434128761292, "eval_rewards/margins_max": 2.896904230117798, "eval_rewards/margins_min": -0.9280064105987549, "eval_rewards/margins_std": 1.2953416109085083, "eval_rewards/rejected": -2.7043075561523438, "eval_runtime": 858.787, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1800 }, { "epoch": 0.43, "grad_norm": 8.96474885911109, "learning_rate": 3.4899151092198824e-06, "logits/chosen": -1.4691169261932373, "logits/rejected": -1.384355902671814, "logps/chosen": -458.85943603515625, "logps/rejected": -523.5655517578125, "loss": 0.4813, "rewards/accuracies": 0.75, "rewards/chosen": -2.1314473152160645, "rewards/margins": 0.8887671232223511, "rewards/margins_max": 2.085184097290039, "rewards/margins_min": -0.3498167097568512, "rewards/margins_std": 1.0913379192352295, "rewards/rejected": -3.020214557647705, "step": 1810 }, { "epoch": 0.44, "grad_norm": 7.144935959354797, "learning_rate": 3.4706895834014298e-06, "logits/chosen": -1.614410400390625, "logits/rejected": -1.5604350566864014, "logps/chosen": -475.09344482421875, "logps/rejected": -561.6032104492188, "loss": 0.4676, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8195034265518188, "rewards/margins": 0.9619612693786621, "rewards/margins_max": 2.435882806777954, "rewards/margins_min": -0.19750697910785675, "rewards/margins_std": 1.1946425437927246, "rewards/rejected": -2.7814643383026123, "step": 1820 }, { "epoch": 0.44, "grad_norm": 8.797017233530422, "learning_rate": 3.4513962209890647e-06, "logits/chosen": -1.6155641078948975, "logits/rejected": -1.4920276403427124, "logps/chosen": -431.6558532714844, "logps/rejected": -495.56585693359375, "loss": 0.4403, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.5048675537109375, "rewards/margins": 1.019412875175476, "rewards/margins_max": 2.4716341495513916, "rewards/margins_min": -0.38423025608062744, "rewards/margins_std": 1.2679500579833984, "rewards/rejected": -2.524280071258545, "step": 1830 }, { "epoch": 0.44, "grad_norm": 10.221828545255185, "learning_rate": 3.4320363702984706e-06, "logits/chosen": -1.5454407930374146, "logits/rejected": -1.4364299774169922, "logps/chosen": -554.786376953125, "logps/rejected": -588.4811401367188, "loss": 0.5262, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.176755666732788, "rewards/margins": 0.8521841168403625, "rewards/margins_max": 2.481351375579834, "rewards/margins_min": -0.6456018090248108, "rewards/margins_std": 1.397610068321228, "rewards/rejected": -3.028939723968506, "step": 1840 }, { "epoch": 0.44, "grad_norm": 8.181556932536253, "learning_rate": 3.4126113842918646e-06, "logits/chosen": -1.4747753143310547, "logits/rejected": -1.40007483959198, "logps/chosen": -485.03448486328125, "logps/rejected": -556.177001953125, "loss": 0.4527, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0176684856414795, "rewards/margins": 1.044903039932251, "rewards/margins_max": 2.4598517417907715, "rewards/margins_min": -0.1600593775510788, "rewards/margins_std": 1.1840791702270508, "rewards/rejected": -3.0625715255737305, "step": 1850 }, { "epoch": 0.45, "grad_norm": 19.139759227868602, "learning_rate": 3.39312262048344e-06, "logits/chosen": -1.466143012046814, "logits/rejected": -1.4342010021209717, "logps/chosen": -576.35693359375, "logps/rejected": -677.9557495117188, "loss": 0.5258, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.498251438140869, "rewards/margins": 1.2752158641815186, "rewards/margins_max": 2.705380916595459, "rewards/margins_min": -0.1201479434967041, "rewards/margins_std": 1.2554144859313965, "rewards/rejected": -3.7734673023223877, "step": 1860 }, { "epoch": 0.45, "grad_norm": 7.78166816537657, "learning_rate": 3.3735714408445002e-06, "logits/chosen": -1.292950987815857, "logits/rejected": -1.2966060638427734, "logps/chosen": -458.279052734375, "logps/rejected": -561.9948120117188, "loss": 0.5545, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -2.074530601501465, "rewards/margins": 0.9663205146789551, "rewards/margins_max": 2.9810922145843506, "rewards/margins_min": -0.9398995637893677, "rewards/margins_std": 1.7453854084014893, "rewards/rejected": -3.04085111618042, "step": 1870 }, { "epoch": 0.45, "grad_norm": 8.335397032473308, "learning_rate": 3.353959211708275e-06, "logits/chosen": -1.3705906867980957, "logits/rejected": -1.2849690914154053, "logps/chosen": -462.18768310546875, "logps/rejected": -551.9871826171875, "loss": 0.4894, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8099162578582764, "rewards/margins": 0.9578437805175781, "rewards/margins_max": 2.466196298599243, "rewards/margins_min": -0.3479187488555908, "rewards/margins_std": 1.2808024883270264, "rewards/rejected": -2.7677600383758545, "step": 1880 }, { "epoch": 0.45, "grad_norm": 10.682866034402208, "learning_rate": 3.334287303674435e-06, "logits/chosen": -1.5480725765228271, "logits/rejected": -1.4992225170135498, "logps/chosen": -434.86614990234375, "logps/rejected": -543.0660400390625, "loss": 0.4954, "rewards/accuracies": 0.75, "rewards/chosen": -1.355137825012207, "rewards/margins": 1.0746350288391113, "rewards/margins_max": 2.55387282371521, "rewards/margins_min": -0.18300321698188782, "rewards/margins_std": 1.2363240718841553, "rewards/rejected": -2.4297728538513184, "step": 1890 }, { "epoch": 0.45, "grad_norm": 9.231543608731009, "learning_rate": 3.3145570915133074e-06, "logits/chosen": -1.4549788236618042, "logits/rejected": -1.3503834009170532, "logps/chosen": -400.1648254394531, "logps/rejected": -487.46612548828125, "loss": 0.4588, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3556721210479736, "rewards/margins": 0.9232860803604126, "rewards/margins_max": 2.2315433025360107, "rewards/margins_min": -0.4330921173095703, "rewards/margins_std": 1.222943663597107, "rewards/rejected": -2.2789580821990967, "step": 1900 }, { "epoch": 0.45, "eval_logits/chosen": -1.3854775428771973, "eval_logits/rejected": -1.3109267950057983, "eval_logps/chosen": -437.90777587890625, "eval_logps/rejected": -511.9908447265625, "eval_loss": 0.5073282718658447, "eval_rewards/accuracies": 0.7434999942779541, "eval_rewards/chosen": -1.5345247983932495, "eval_rewards/margins": 0.926851212978363, "eval_rewards/margins_max": 3.0226550102233887, "eval_rewards/margins_min": -0.9140751957893372, "eval_rewards/margins_std": 1.3340535163879395, "eval_rewards/rejected": -2.4613757133483887, "eval_runtime": 858.7895, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 1900 }, { "epoch": 0.46, "grad_norm": 9.069360701164776, "learning_rate": 3.2947699540698026e-06, "logits/chosen": -1.405133843421936, "logits/rejected": -1.3000431060791016, "logps/chosen": -431.3268127441406, "logps/rejected": -511.42572021484375, "loss": 0.4977, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4626184701919556, "rewards/margins": 1.0502574443817139, "rewards/margins_max": 2.346513032913208, "rewards/margins_min": -0.5906550884246826, "rewards/margins_std": 1.3104816675186157, "rewards/rejected": -2.51287579536438, "step": 1910 }, { "epoch": 0.46, "grad_norm": 7.120034612507216, "learning_rate": 3.2749272741670487e-06, "logits/chosen": -1.447333574295044, "logits/rejected": -1.4090019464492798, "logps/chosen": -411.537841796875, "logps/rejected": -493.18182373046875, "loss": 0.4962, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4495478868484497, "rewards/margins": 0.9764796495437622, "rewards/margins_max": 2.3097217082977295, "rewards/margins_min": -0.5519954562187195, "rewards/margins_std": 1.2779642343521118, "rewards/rejected": -2.426027536392212, "step": 1920 }, { "epoch": 0.46, "grad_norm": 9.217588396758167, "learning_rate": 3.2550304385097582e-06, "logits/chosen": -1.5685369968414307, "logits/rejected": -1.5027443170547485, "logps/chosen": -401.8031005859375, "logps/rejected": -446.0201110839844, "loss": 0.5101, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2456748485565186, "rewards/margins": 0.7952879667282104, "rewards/margins_max": 2.0392043590545654, "rewards/margins_min": -0.5871817469596863, "rewards/margins_std": 1.1961803436279297, "rewards/rejected": -2.0409629344940186, "step": 1930 }, { "epoch": 0.46, "grad_norm": 7.7254431736311355, "learning_rate": 3.2350808375873144e-06, "logits/chosen": -1.536704659461975, "logits/rejected": -1.5022917985916138, "logps/chosen": -356.07830810546875, "logps/rejected": -479.149658203125, "loss": 0.5034, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.3855254650115967, "rewards/margins": 0.8391044735908508, "rewards/margins_max": 2.184088706970215, "rewards/margins_min": -0.31070464849472046, "rewards/margins_std": 1.1125370264053345, "rewards/rejected": -2.2246298789978027, "step": 1940 }, { "epoch": 0.47, "grad_norm": 11.880118140458453, "learning_rate": 3.215079865576599e-06, "logits/chosen": -1.438852071762085, "logits/rejected": -1.476911187171936, "logps/chosen": -455.3157653808594, "logps/rejected": -531.848876953125, "loss": 0.4905, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7348988056182861, "rewards/margins": 0.8303203582763672, "rewards/margins_max": 2.2750494480133057, "rewards/margins_min": -0.6095766425132751, "rewards/margins_std": 1.2953624725341797, "rewards/rejected": -2.5652191638946533, "step": 1950 }, { "epoch": 0.47, "grad_norm": 9.667309005267201, "learning_rate": 3.1950289202445602e-06, "logits/chosen": -1.3323369026184082, "logits/rejected": -1.313685417175293, "logps/chosen": -479.53704833984375, "logps/rejected": -565.7108154296875, "loss": 0.4902, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0214624404907227, "rewards/margins": 0.951602578163147, "rewards/margins_max": 2.3593294620513916, "rewards/margins_min": -0.6272075772285461, "rewards/margins_std": 1.3286540508270264, "rewards/rejected": -2.973065137863159, "step": 1960 }, { "epoch": 0.47, "grad_norm": 9.769862361941906, "learning_rate": 3.1749294028505282e-06, "logits/chosen": -1.5002721548080444, "logits/rejected": -1.3375390768051147, "logps/chosen": -509.45086669921875, "logps/rejected": -583.6255493164062, "loss": 0.4933, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.3004138469696045, "rewards/margins": 0.9270550608634949, "rewards/margins_max": 2.275993585586548, "rewards/margins_min": -0.5272558927536011, "rewards/margins_std": 1.2511769533157349, "rewards/rejected": -3.227468967437744, "step": 1970 }, { "epoch": 0.47, "grad_norm": 9.311342308231112, "learning_rate": 3.1547827180482902e-06, "logits/chosen": -1.3826067447662354, "logits/rejected": -1.3060035705566406, "logps/chosen": -516.3895263671875, "logps/rejected": -611.2321166992188, "loss": 0.4627, "rewards/accuracies": 0.8125, "rewards/chosen": -2.130356788635254, "rewards/margins": 1.1578549146652222, "rewards/margins_max": 2.601581335067749, "rewards/margins_min": -0.4201889932155609, "rewards/margins_std": 1.3453563451766968, "rewards/rejected": -3.2882113456726074, "step": 1980 }, { "epoch": 0.48, "grad_norm": 9.209924022514425, "learning_rate": 3.1345902737879263e-06, "logits/chosen": -1.2021044492721558, "logits/rejected": -1.1528347730636597, "logps/chosen": -483.568359375, "logps/rejected": -593.017578125, "loss": 0.4903, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.2106411457061768, "rewards/margins": 1.1864206790924072, "rewards/margins_max": 2.963520050048828, "rewards/margins_min": -0.31762558221817017, "rewards/margins_std": 1.4709179401397705, "rewards/rejected": -3.397061586380005, "step": 1990 }, { "epoch": 0.48, "grad_norm": 4.373820554606291, "learning_rate": 3.114353481217411e-06, "logits/chosen": -1.4573493003845215, "logits/rejected": -1.347898006439209, "logps/chosen": -499.12646484375, "logps/rejected": -564.9286499023438, "loss": 0.4826, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.7903711795806885, "rewards/margins": 1.3154914379119873, "rewards/margins_max": 3.262065887451172, "rewards/margins_min": -0.5833441615104675, "rewards/margins_std": 1.7034196853637695, "rewards/rejected": -3.105862855911255, "step": 2000 }, { "epoch": 0.48, "eval_logits/chosen": -1.3956191539764404, "eval_logits/rejected": -1.320770263671875, "eval_logps/chosen": -447.22412109375, "eval_logps/rejected": -526.3553466796875, "eval_loss": 0.5103888511657715, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -1.6276882886886597, "eval_rewards/margins": 0.9773324728012085, "eval_rewards/margins_max": 3.259507417678833, "eval_rewards/margins_min": -0.9829249978065491, "eval_rewards/margins_std": 1.4282149076461792, "eval_rewards/rejected": -2.605020761489868, "eval_runtime": 859.4061, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 2000 }, { "epoch": 0.48, "grad_norm": 8.912849631279812, "learning_rate": 3.0940737545840017e-06, "logits/chosen": -1.351596713066101, "logits/rejected": -1.257891297340393, "logps/chosen": -440.3953552246094, "logps/rejected": -552.6337890625, "loss": 0.4303, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5836859941482544, "rewards/margins": 1.459411859512329, "rewards/margins_max": 3.0631070137023926, "rewards/margins_min": -0.15214934945106506, "rewards/margins_std": 1.4768000841140747, "rewards/rejected": -3.043097734451294, "step": 2010 }, { "epoch": 0.48, "grad_norm": 6.7366993161177415, "learning_rate": 3.0737525111353977e-06, "logits/chosen": -1.4598480463027954, "logits/rejected": -1.3969615697860718, "logps/chosen": -493.72869873046875, "logps/rejected": -560.8389892578125, "loss": 0.4932, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.030212879180908, "rewards/margins": 1.0158048868179321, "rewards/margins_max": 2.6848368644714355, "rewards/margins_min": -0.7448078393936157, "rewards/margins_std": 1.5434592962265015, "rewards/rejected": -3.0460174083709717, "step": 2020 }, { "epoch": 0.49, "grad_norm": 6.262373615748477, "learning_rate": 3.0533911710207025e-06, "logits/chosen": -1.4170711040496826, "logits/rejected": -1.3265941143035889, "logps/chosen": -483.5945739746094, "logps/rejected": -557.3770751953125, "loss": 0.5179, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8149263858795166, "rewards/margins": 1.0303953886032104, "rewards/margins_max": 2.6037116050720215, "rewards/margins_min": -0.6987926363945007, "rewards/margins_std": 1.462594985961914, "rewards/rejected": -2.8453216552734375, "step": 2030 }, { "epoch": 0.49, "grad_norm": 10.634576408727638, "learning_rate": 3.03299115719117e-06, "logits/chosen": -1.2689852714538574, "logits/rejected": -1.2693557739257812, "logps/chosen": -497.0118103027344, "logps/rejected": -583.0817260742188, "loss": 0.5328, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.43460750579834, "rewards/margins": 0.7952107191085815, "rewards/margins_max": 2.146256685256958, "rewards/margins_min": -0.7708228826522827, "rewards/margins_std": 1.3146307468414307, "rewards/rejected": -3.229818344116211, "step": 2040 }, { "epoch": 0.49, "grad_norm": 7.30996947233868, "learning_rate": 3.0125538953007656e-06, "logits/chosen": -1.2573572397232056, "logits/rejected": -1.2236084938049316, "logps/chosen": -519.5087280273438, "logps/rejected": -611.66943359375, "loss": 0.5091, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.5142078399658203, "rewards/margins": 1.0554949045181274, "rewards/margins_max": 2.699704170227051, "rewards/margins_min": -0.3915478587150574, "rewards/margins_std": 1.3530670404434204, "rewards/rejected": -3.5697033405303955, "step": 2050 }, { "epoch": 0.49, "grad_norm": 7.134936180551099, "learning_rate": 2.992080813606534e-06, "logits/chosen": -1.5767369270324707, "logits/rejected": -1.4981580972671509, "logps/chosen": -432.3941955566406, "logps/rejected": -516.8560791015625, "loss": 0.561, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.529122233390808, "rewards/margins": 0.9361482858657837, "rewards/margins_max": 2.7146384716033936, "rewards/margins_min": -0.4942001700401306, "rewards/margins_std": 1.4506335258483887, "rewards/rejected": -2.465270757675171, "step": 2060 }, { "epoch": 0.5, "grad_norm": 8.562669292920397, "learning_rate": 2.971573342868786e-06, "logits/chosen": -1.6939668655395508, "logits/rejected": -1.5772335529327393, "logps/chosen": -381.19268798828125, "logps/rejected": -468.251953125, "loss": 0.4753, "rewards/accuracies": 0.75, "rewards/chosen": -1.2449021339416504, "rewards/margins": 1.1208477020263672, "rewards/margins_max": 2.7645204067230225, "rewards/margins_min": -0.30286768078804016, "rewards/margins_std": 1.3859376907348633, "rewards/rejected": -2.3657495975494385, "step": 2070 }, { "epoch": 0.5, "grad_norm": 6.820238949793137, "learning_rate": 2.951032916251106e-06, "logits/chosen": -1.5906240940093994, "logits/rejected": -1.5890486240386963, "logps/chosen": -427.60504150390625, "logps/rejected": -464.0176696777344, "loss": 0.5694, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.247363805770874, "rewards/margins": 0.7459095120429993, "rewards/margins_max": 2.2692272663116455, "rewards/margins_min": -0.6600006222724915, "rewards/margins_std": 1.3008311986923218, "rewards/rejected": -1.993273377418518, "step": 2080 }, { "epoch": 0.5, "grad_norm": 7.446090993779655, "learning_rate": 2.9304609692202022e-06, "logits/chosen": -1.6725155115127563, "logits/rejected": -1.5678236484527588, "logps/chosen": -427.3331604003906, "logps/rejected": -532.4674682617188, "loss": 0.5051, "rewards/accuracies": 0.75, "rewards/chosen": -1.7062759399414062, "rewards/margins": 0.8098711967468262, "rewards/margins_max": 2.160189151763916, "rewards/margins_min": -0.6248751878738403, "rewards/margins_std": 1.2679208517074585, "rewards/rejected": -2.5161468982696533, "step": 2090 }, { "epoch": 0.5, "grad_norm": 4.368509982914677, "learning_rate": 2.909858939445584e-06, "logits/chosen": -1.5824897289276123, "logits/rejected": -1.51686692237854, "logps/chosen": -440.3798828125, "logps/rejected": -496.1526794433594, "loss": 0.4925, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7849003076553345, "rewards/margins": 0.745665431022644, "rewards/margins_max": 1.984789252281189, "rewards/margins_min": -0.488166481256485, "rewards/margins_std": 1.1289540529251099, "rewards/rejected": -2.5305657386779785, "step": 2100 }, { "epoch": 0.5, "eval_logits/chosen": -1.5931274890899658, "eval_logits/rejected": -1.5277045965194702, "eval_logps/chosen": -445.2355651855469, "eval_logps/rejected": -518.4150390625, "eval_loss": 0.5078689455986023, "eval_rewards/accuracies": 0.7354999780654907, "eval_rewards/chosen": -1.607802152633667, "eval_rewards/margins": 0.9178158044815063, "eval_rewards/margins_max": 2.9879229068756104, "eval_rewards/margins_min": -0.9517531991004944, "eval_rewards/margins_std": 1.3324427604675293, "eval_rewards/rejected": -2.525617837905884, "eval_runtime": 858.7125, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2100 }, { "epoch": 0.51, "grad_norm": 8.233232972721316, "learning_rate": 2.8892282666990894e-06, "logits/chosen": -1.6269729137420654, "logits/rejected": -1.505124807357788, "logps/chosen": -400.94525146484375, "logps/rejected": -500.55694580078125, "loss": 0.4489, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.2662280797958374, "rewards/margins": 1.454037070274353, "rewards/margins_max": 3.056608200073242, "rewards/margins_min": -0.026442576199769974, "rewards/margins_std": 1.371537446975708, "rewards/rejected": -2.7202649116516113, "step": 2110 }, { "epoch": 0.51, "grad_norm": 6.551790927331911, "learning_rate": 2.8685703927542724e-06, "logits/chosen": -1.6917814016342163, "logits/rejected": -1.650728464126587, "logps/chosen": -472.22772216796875, "logps/rejected": -574.0148315429688, "loss": 0.5143, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.4886733293533325, "rewards/margins": 1.0912854671478271, "rewards/margins_max": 2.818134307861328, "rewards/margins_min": -0.5472265481948853, "rewards/margins_std": 1.476159691810608, "rewards/rejected": -2.57995867729187, "step": 2120 }, { "epoch": 0.51, "grad_norm": 7.466110720827431, "learning_rate": 2.84788676128564e-06, "logits/chosen": -1.5878061056137085, "logits/rejected": -1.470871090888977, "logps/chosen": -431.2444763183594, "logps/rejected": -497.02618408203125, "loss": 0.4916, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.409977912902832, "rewards/margins": 1.1446112394332886, "rewards/margins_max": 3.0400679111480713, "rewards/margins_min": -0.6352660059928894, "rewards/margins_std": 1.6419284343719482, "rewards/rejected": -2.554589033126831, "step": 2130 }, { "epoch": 0.51, "grad_norm": 9.833435661516258, "learning_rate": 2.8271788177677625e-06, "logits/chosen": -1.3607370853424072, "logits/rejected": -1.305716872215271, "logps/chosen": -495.0362243652344, "logps/rejected": -531.7301635742188, "loss": 0.5599, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9423983097076416, "rewards/margins": 0.839081883430481, "rewards/margins_max": 2.4581165313720703, "rewards/margins_min": -0.7212070226669312, "rewards/margins_std": 1.4980823993682861, "rewards/rejected": -2.781480312347412, "step": 2140 }, { "epoch": 0.51, "grad_norm": 4.981305268290964, "learning_rate": 2.8064480093742568e-06, "logits/chosen": -1.3585203886032104, "logits/rejected": -1.3365938663482666, "logps/chosen": -462.134033203125, "logps/rejected": -544.3034057617188, "loss": 0.4837, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.126803398132324, "rewards/margins": 0.9021514654159546, "rewards/margins_max": 2.3392693996429443, "rewards/margins_min": -0.49673447012901306, "rewards/margins_std": 1.2942036390304565, "rewards/rejected": -3.0289549827575684, "step": 2150 }, { "epoch": 0.52, "grad_norm": 7.941959600126348, "learning_rate": 2.78569578487665e-06, "logits/chosen": -1.3454647064208984, "logits/rejected": -1.1997286081314087, "logps/chosen": -498.728515625, "logps/rejected": -586.7105712890625, "loss": 0.5246, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.29012393951416, "rewards/margins": 1.0181314945220947, "rewards/margins_max": 2.9197440147399902, "rewards/margins_min": -0.5021864175796509, "rewards/margins_std": 1.522291898727417, "rewards/rejected": -3.308255434036255, "step": 2160 }, { "epoch": 0.52, "grad_norm": 6.069507756643778, "learning_rate": 2.7649235945431343e-06, "logits/chosen": -1.4169657230377197, "logits/rejected": -1.3416153192520142, "logps/chosen": -479.25872802734375, "logps/rejected": -619.8359985351562, "loss": 0.4488, "rewards/accuracies": 0.8125, "rewards/chosen": -1.854587197303772, "rewards/margins": 1.0979933738708496, "rewards/margins_max": 2.4382119178771973, "rewards/margins_min": -0.16580168902873993, "rewards/margins_std": 1.1742658615112305, "rewards/rejected": -2.952580213546753, "step": 2170 }, { "epoch": 0.52, "grad_norm": 5.5684816580531376, "learning_rate": 2.7441328900372104e-06, "logits/chosen": -1.5215349197387695, "logits/rejected": -1.4582687616348267, "logps/chosen": -466.25213623046875, "logps/rejected": -549.2777099609375, "loss": 0.4515, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.8330154418945312, "rewards/margins": 1.0216562747955322, "rewards/margins_max": 2.6479744911193848, "rewards/margins_min": -0.49261340498924255, "rewards/margins_std": 1.4038164615631104, "rewards/rejected": -2.8546714782714844, "step": 2180 }, { "epoch": 0.52, "grad_norm": 6.752654789799719, "learning_rate": 2.723325124316244e-06, "logits/chosen": -1.5991014242172241, "logits/rejected": -1.5353456735610962, "logps/chosen": -453.159423828125, "logps/rejected": -541.349853515625, "loss": 0.4456, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.3850345611572266, "rewards/margins": 1.1308423280715942, "rewards/margins_max": 2.5857348442077637, "rewards/margins_min": -0.2589626610279083, "rewards/margins_std": 1.2938416004180908, "rewards/rejected": -2.5158767700195312, "step": 2190 }, { "epoch": 0.53, "grad_norm": 8.345803454595272, "learning_rate": 2.7025017515299212e-06, "logits/chosen": -1.5189369916915894, "logits/rejected": -1.4542373418807983, "logps/chosen": -440.6048889160156, "logps/rejected": -473.508544921875, "loss": 0.546, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.675101637840271, "rewards/margins": 0.6560320854187012, "rewards/margins_max": 2.195218324661255, "rewards/margins_min": -0.7555332183837891, "rewards/margins_std": 1.287394404411316, "rewards/rejected": -2.3311336040496826, "step": 2200 }, { "epoch": 0.53, "eval_logits/chosen": -1.4937621355056763, "eval_logits/rejected": -1.424726963043213, "eval_logps/chosen": -455.42156982421875, "eval_logps/rejected": -534.668701171875, "eval_loss": 0.5099875330924988, "eval_rewards/accuracies": 0.7369999885559082, "eval_rewards/chosen": -1.709661841392517, "eval_rewards/margins": 0.9784924983978271, "eval_rewards/margins_max": 3.1491787433624268, "eval_rewards/margins_min": -1.0010526180267334, "eval_rewards/margins_std": 1.4117072820663452, "eval_rewards/rejected": -2.6881542205810547, "eval_runtime": 859.4742, "eval_samples_per_second": 4.654, "eval_steps_per_second": 0.291, "step": 2200 }, { "epoch": 0.53, "grad_norm": 10.298341962081505, "learning_rate": 2.6816642269186277e-06, "logits/chosen": -1.4257646799087524, "logits/rejected": -1.3905993700027466, "logps/chosen": -472.2066345214844, "logps/rejected": -537.9432373046875, "loss": 0.487, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8823211193084717, "rewards/margins": 0.9094759821891785, "rewards/margins_max": 2.337937831878662, "rewards/margins_min": -0.5735450983047485, "rewards/margins_std": 1.3096787929534912, "rewards/rejected": -2.791797161102295, "step": 2210 }, { "epoch": 0.53, "grad_norm": 7.580733583729247, "learning_rate": 2.6608140067117484e-06, "logits/chosen": -1.3724651336669922, "logits/rejected": -1.4244968891143799, "logps/chosen": -462.14031982421875, "logps/rejected": -577.19775390625, "loss": 0.4905, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.0610499382019043, "rewards/margins": 0.820233941078186, "rewards/margins_max": 2.245171308517456, "rewards/margins_min": -0.5233384966850281, "rewards/margins_std": 1.2698733806610107, "rewards/rejected": -2.881283760070801, "step": 2220 }, { "epoch": 0.53, "grad_norm": 5.6578845618167, "learning_rate": 2.6399525480258993e-06, "logits/chosen": -1.5305548906326294, "logits/rejected": -1.3694111108779907, "logps/chosen": -461.5701599121094, "logps/rejected": -510.45123291015625, "loss": 0.5134, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.61298406124115, "rewards/margins": 1.0377482175827026, "rewards/margins_max": 3.010249614715576, "rewards/margins_min": -0.5587365627288818, "rewards/margins_std": 1.6186710596084595, "rewards/rejected": -2.6507325172424316, "step": 2230 }, { "epoch": 0.54, "grad_norm": 15.996480554694484, "learning_rate": 2.6190813087630975e-06, "logits/chosen": -1.5510215759277344, "logits/rejected": -1.5052480697631836, "logps/chosen": -444.14288330078125, "logps/rejected": -504.14178466796875, "loss": 0.4887, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4947961568832397, "rewards/margins": 0.9640704989433289, "rewards/margins_max": 2.346989870071411, "rewards/margins_min": -0.4340631067752838, "rewards/margins_std": 1.25941002368927, "rewards/rejected": -2.458866596221924, "step": 2240 }, { "epoch": 0.54, "grad_norm": 9.475721958807414, "learning_rate": 2.5982017475088754e-06, "logits/chosen": -1.5905696153640747, "logits/rejected": -1.59246027469635, "logps/chosen": -457.93408203125, "logps/rejected": -535.5284423828125, "loss": 0.4699, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.4241809844970703, "rewards/margins": 0.9767951965332031, "rewards/margins_max": 2.300793409347534, "rewards/margins_min": -0.3483489453792572, "rewards/margins_std": 1.1980713605880737, "rewards/rejected": -2.4009761810302734, "step": 2250 }, { "epoch": 0.54, "grad_norm": 7.248570005205257, "learning_rate": 2.577315323430346e-06, "logits/chosen": -1.5093209743499756, "logits/rejected": -1.4536101818084717, "logps/chosen": -480.13653564453125, "logps/rejected": -582.1636962890625, "loss": 0.4745, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.9030964374542236, "rewards/margins": 1.1840343475341797, "rewards/margins_max": 2.510838031768799, "rewards/margins_min": -0.3299921154975891, "rewards/margins_std": 1.2596131563186646, "rewards/rejected": -3.087130546569824, "step": 2260 }, { "epoch": 0.54, "grad_norm": 11.502037862918032, "learning_rate": 2.5564234961742316e-06, "logits/chosen": -1.595126986503601, "logits/rejected": -1.523802399635315, "logps/chosen": -519.5052490234375, "logps/rejected": -578.7010498046875, "loss": 0.5315, "rewards/accuracies": 0.75, "rewards/chosen": -1.8327480554580688, "rewards/margins": 1.1002259254455566, "rewards/margins_max": 2.925300359725952, "rewards/margins_min": -0.5309810042381287, "rewards/margins_std": 1.5419347286224365, "rewards/rejected": -2.932974100112915, "step": 2270 }, { "epoch": 0.55, "grad_norm": 6.217147544009582, "learning_rate": 2.5355277257648557e-06, "logits/chosen": -1.6701971292495728, "logits/rejected": -1.585320234298706, "logps/chosen": -437.57720947265625, "logps/rejected": -504.685546875, "loss": 0.4862, "rewards/accuracies": 0.75, "rewards/chosen": -1.6871299743652344, "rewards/margins": 0.8367882966995239, "rewards/margins_max": 2.318037509918213, "rewards/margins_min": -0.5204218029975891, "rewards/margins_std": 1.3011711835861206, "rewards/rejected": -2.5239181518554688, "step": 2280 }, { "epoch": 0.55, "grad_norm": 8.781861633503457, "learning_rate": 2.5146294725021082e-06, "logits/chosen": -1.6889711618423462, "logits/rejected": -1.6827964782714844, "logps/chosen": -463.13201904296875, "logps/rejected": -524.9637451171875, "loss": 0.4436, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.2657660245895386, "rewards/margins": 1.1767778396606445, "rewards/margins_max": 2.671440839767456, "rewards/margins_min": -0.17516405880451202, "rewards/margins_std": 1.266649842262268, "rewards/rejected": -2.4425437450408936, "step": 2290 }, { "epoch": 0.55, "grad_norm": 9.734515291675656, "learning_rate": 2.493730196859392e-06, "logits/chosen": -1.6907333135604858, "logits/rejected": -1.6500955820083618, "logps/chosen": -389.38641357421875, "logps/rejected": -476.61077880859375, "loss": 0.4958, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3276891708374023, "rewards/margins": 1.0072036981582642, "rewards/margins_max": 2.4188737869262695, "rewards/margins_min": -0.11966164410114288, "rewards/margins_std": 1.1348726749420166, "rewards/rejected": -2.334892749786377, "step": 2300 }, { "epoch": 0.55, "eval_logits/chosen": -1.7328070402145386, "eval_logits/rejected": -1.6757779121398926, "eval_logps/chosen": -432.6925048828125, "eval_logps/rejected": -505.2043151855469, "eval_loss": 0.5047218799591064, "eval_rewards/accuracies": 0.7384999990463257, "eval_rewards/chosen": -1.4823716878890991, "eval_rewards/margins": 0.9111387133598328, "eval_rewards/margins_max": 2.9983861446380615, "eval_rewards/margins_min": -0.8453922867774963, "eval_rewards/margins_std": 1.2950828075408936, "eval_rewards/rejected": -2.393510341644287, "eval_runtime": 858.3926, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 2300 }, { "epoch": 0.55, "grad_norm": 9.515888780467463, "learning_rate": 2.47283135938156e-06, "logits/chosen": -1.7710479497909546, "logits/rejected": -1.7229636907577515, "logps/chosen": -420.6233825683594, "logps/rejected": -493.7510681152344, "loss": 0.448, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5302379131317139, "rewards/margins": 1.109464406967163, "rewards/margins_max": 2.5636467933654785, "rewards/margins_min": -0.19980910420417786, "rewards/margins_std": 1.212843656539917, "rewards/rejected": -2.639702558517456, "step": 2310 }, { "epoch": 0.56, "grad_norm": 6.805771050533889, "learning_rate": 2.451934420582846e-06, "logits/chosen": -1.6977558135986328, "logits/rejected": -1.6382665634155273, "logps/chosen": -461.22381591796875, "logps/rejected": -544.3499145507812, "loss": 0.4974, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.911831259727478, "rewards/margins": 1.0310547351837158, "rewards/margins_max": 2.8909430503845215, "rewards/margins_min": -0.4502525329589844, "rewards/margins_std": 1.5359809398651123, "rewards/rejected": -2.9428858757019043, "step": 2320 }, { "epoch": 0.56, "grad_norm": 11.681430811577153, "learning_rate": 2.4310408408447904e-06, "logits/chosen": -1.5689303874969482, "logits/rejected": -1.4718858003616333, "logps/chosen": -409.4576416015625, "logps/rejected": -487.1415100097656, "loss": 0.5504, "rewards/accuracies": 0.75, "rewards/chosen": -1.6568679809570312, "rewards/margins": 0.9590173959732056, "rewards/margins_max": 2.5301778316497803, "rewards/margins_min": -0.5030272603034973, "rewards/margins_std": 1.3432531356811523, "rewards/rejected": -2.6158852577209473, "step": 2330 }, { "epoch": 0.56, "grad_norm": 7.362163428497115, "learning_rate": 2.4101520803141904e-06, "logits/chosen": -1.7077395915985107, "logits/rejected": -1.6148802042007446, "logps/chosen": -437.78265380859375, "logps/rejected": -502.65118408203125, "loss": 0.4922, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.43501877784729, "rewards/margins": 0.8356463313102722, "rewards/margins_max": 2.3644096851348877, "rewards/margins_min": -0.5209301710128784, "rewards/margins_std": 1.3067666292190552, "rewards/rejected": -2.270665168762207, "step": 2340 }, { "epoch": 0.56, "grad_norm": 5.33987946906488, "learning_rate": 2.3892695988010486e-06, "logits/chosen": -1.6722971200942993, "logits/rejected": -1.5398986339569092, "logps/chosen": -410.517822265625, "logps/rejected": -461.22412109375, "loss": 0.4375, "rewards/accuracies": 0.75, "rewards/chosen": -1.2639796733856201, "rewards/margins": 1.1010314226150513, "rewards/margins_max": 2.4134328365325928, "rewards/margins_min": -0.39789700508117676, "rewards/margins_std": 1.2852272987365723, "rewards/rejected": -2.365011215209961, "step": 2350 }, { "epoch": 0.57, "grad_norm": 11.237788714984667, "learning_rate": 2.3683948556765626e-06, "logits/chosen": -1.5942354202270508, "logits/rejected": -1.5177490711212158, "logps/chosen": -415.64794921875, "logps/rejected": -490.9806213378906, "loss": 0.5399, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.590652585029602, "rewards/margins": 1.0099694728851318, "rewards/margins_max": 2.7013745307922363, "rewards/margins_min": -0.4725824296474457, "rewards/margins_std": 1.445387363433838, "rewards/rejected": -2.6006219387054443, "step": 2360 }, { "epoch": 0.57, "grad_norm": 9.764070579414364, "learning_rate": 2.3475293097711306e-06, "logits/chosen": -1.552592396736145, "logits/rejected": -1.4822497367858887, "logps/chosen": -433.27587890625, "logps/rejected": -524.9519653320312, "loss": 0.5305, "rewards/accuracies": 0.75, "rewards/chosen": -1.5414097309112549, "rewards/margins": 0.9306495785713196, "rewards/margins_max": 2.711658000946045, "rewards/margins_min": -0.6717098951339722, "rewards/margins_std": 1.5256764888763428, "rewards/rejected": -2.472059488296509, "step": 2370 }, { "epoch": 0.57, "grad_norm": 5.42189610721756, "learning_rate": 2.3266744192724055e-06, "logits/chosen": -1.5940955877304077, "logits/rejected": -1.537226915359497, "logps/chosen": -458.8291015625, "logps/rejected": -528.36865234375, "loss": 0.4893, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.6696497201919556, "rewards/margins": 0.947510838508606, "rewards/margins_max": 2.48576283454895, "rewards/margins_min": -0.6630622744560242, "rewards/margins_std": 1.4288580417633057, "rewards/rejected": -2.6171603202819824, "step": 2380 }, { "epoch": 0.57, "grad_norm": 6.4090450981268825, "learning_rate": 2.3058316416233865e-06, "logits/chosen": -1.6004807949066162, "logits/rejected": -1.553714632987976, "logps/chosen": -449.9927673339844, "logps/rejected": -502.8304138183594, "loss": 0.4881, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.611804723739624, "rewards/margins": 0.8119745254516602, "rewards/margins_max": 2.386160373687744, "rewards/margins_min": -0.7855569124221802, "rewards/margins_std": 1.407949686050415, "rewards/rejected": -2.423779249191284, "step": 2390 }, { "epoch": 0.57, "grad_norm": 10.322497521342642, "learning_rate": 2.285002433420566e-06, "logits/chosen": -1.517950177192688, "logits/rejected": -1.4765161275863647, "logps/chosen": -453.1875, "logps/rejected": -550.1472778320312, "loss": 0.4757, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8422619104385376, "rewards/margins": 1.1389998197555542, "rewards/margins_max": 3.0864944458007812, "rewards/margins_min": -0.42541661858558655, "rewards/margins_std": 1.5969220399856567, "rewards/rejected": -2.981261730194092, "step": 2400 }, { "epoch": 0.57, "eval_logits/chosen": -1.5347298383712769, "eval_logits/rejected": -1.46698796749115, "eval_logps/chosen": -451.443603515625, "eval_logps/rejected": -528.8899536132812, "eval_loss": 0.5021093487739563, "eval_rewards/accuracies": 0.7379999756813049, "eval_rewards/chosen": -1.6698826551437378, "eval_rewards/margins": 0.960483729839325, "eval_rewards/margins_max": 3.1589808464050293, "eval_rewards/margins_min": -0.8924158811569214, "eval_rewards/margins_std": 1.365614414215088, "eval_rewards/rejected": -2.630366563796997, "eval_runtime": 858.8112, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2400 }, { "epoch": 0.58, "grad_norm": 10.451303092348102, "learning_rate": 2.2641882503121386e-06, "logits/chosen": -1.5329070091247559, "logits/rejected": -1.4015053510665894, "logps/chosen": -452.67816162109375, "logps/rejected": -502.8497009277344, "loss": 0.4265, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.630070686340332, "rewards/margins": 1.1837068796157837, "rewards/margins_max": 2.721210241317749, "rewards/margins_min": -0.14316949248313904, "rewards/margins_std": 1.2844146490097046, "rewards/rejected": -2.8137776851654053, "step": 2410 }, { "epoch": 0.58, "grad_norm": 6.571477943140168, "learning_rate": 2.243390546896268e-06, "logits/chosen": -1.5893810987472534, "logits/rejected": -1.5259112119674683, "logps/chosen": -465.62213134765625, "logps/rejected": -523.9273681640625, "loss": 0.4667, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6419111490249634, "rewards/margins": 1.0950944423675537, "rewards/margins_max": 2.4420688152313232, "rewards/margins_min": -0.21850213408470154, "rewards/margins_std": 1.2208373546600342, "rewards/rejected": -2.7370057106018066, "step": 2420 }, { "epoch": 0.58, "grad_norm": 10.08170698517187, "learning_rate": 2.2226107766194392e-06, "logits/chosen": -1.5206003189086914, "logits/rejected": -1.4219610691070557, "logps/chosen": -497.80389404296875, "logps/rejected": -551.6619873046875, "loss": 0.462, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.0135703086853027, "rewards/margins": 1.0748012065887451, "rewards/margins_max": 2.6845531463623047, "rewards/margins_min": -0.5345717072486877, "rewards/margins_std": 1.4747865200042725, "rewards/rejected": -3.088371515274048, "step": 2430 }, { "epoch": 0.58, "grad_norm": 6.685030472314355, "learning_rate": 2.2018503916748775e-06, "logits/chosen": -1.498291254043579, "logits/rejected": -1.3792328834533691, "logps/chosen": -519.913330078125, "logps/rejected": -557.4743041992188, "loss": 0.4755, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.151853322982788, "rewards/margins": 0.9892845153808594, "rewards/margins_max": 2.2692980766296387, "rewards/margins_min": -0.5079289078712463, "rewards/margins_std": 1.270462989807129, "rewards/rejected": -3.1411375999450684, "step": 2440 }, { "epoch": 0.59, "grad_norm": 17.122433658069756, "learning_rate": 2.181110842901066e-06, "logits/chosen": -1.539520025253296, "logits/rejected": -1.383545994758606, "logps/chosen": -495.7974548339844, "logps/rejected": -555.2947387695312, "loss": 0.4944, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2047414779663086, "rewards/margins": 0.9906905293464661, "rewards/margins_max": 2.7851786613464355, "rewards/margins_min": -0.4489772915840149, "rewards/margins_std": 1.4740935564041138, "rewards/rejected": -3.19543194770813, "step": 2450 }, { "epoch": 0.59, "grad_norm": 8.580609182503949, "learning_rate": 2.160393579680353e-06, "logits/chosen": -1.4576019048690796, "logits/rejected": -1.4369218349456787, "logps/chosen": -473.08154296875, "logps/rejected": -626.2312622070312, "loss": 0.4195, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0928921699523926, "rewards/margins": 1.649112343788147, "rewards/margins_max": 3.92876935005188, "rewards/margins_min": -0.25991302728652954, "rewards/margins_std": 1.8757679462432861, "rewards/rejected": -3.742004871368408, "step": 2460 }, { "epoch": 0.59, "grad_norm": 6.9128058202050875, "learning_rate": 2.139700049837664e-06, "logits/chosen": -1.624568223953247, "logits/rejected": -1.4699199199676514, "logps/chosen": -461.64398193359375, "logps/rejected": -560.53564453125, "loss": 0.4234, "rewards/accuracies": 0.75, "rewards/chosen": -1.8971761465072632, "rewards/margins": 1.1934576034545898, "rewards/margins_max": 3.282578706741333, "rewards/margins_min": -0.5413662791252136, "rewards/margins_std": 1.6816641092300415, "rewards/rejected": -3.0906338691711426, "step": 2470 }, { "epoch": 0.59, "grad_norm": 9.559451346514962, "learning_rate": 2.1190316995393146e-06, "logits/chosen": -1.6060011386871338, "logits/rejected": -1.4749659299850464, "logps/chosen": -463.02880859375, "logps/rejected": -529.7693481445312, "loss": 0.4496, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -2.0343003273010254, "rewards/margins": 1.1056135892868042, "rewards/margins_max": 2.5998406410217285, "rewards/margins_min": -0.3666464686393738, "rewards/margins_std": 1.3378335237503052, "rewards/rejected": -3.1399142742156982, "step": 2480 }, { "epoch": 0.6, "grad_norm": 9.020896388678535, "learning_rate": 2.0983899731919534e-06, "logits/chosen": -1.7245687246322632, "logits/rejected": -1.673567533493042, "logps/chosen": -460.3617248535156, "logps/rejected": -564.45263671875, "loss": 0.4557, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6444097757339478, "rewards/margins": 1.1680079698562622, "rewards/margins_max": 2.8182384967803955, "rewards/margins_min": -0.3254225552082062, "rewards/margins_std": 1.419060468673706, "rewards/rejected": -2.812417507171631, "step": 2490 }, { "epoch": 0.6, "grad_norm": 7.442230002798273, "learning_rate": 2.077776313341612e-06, "logits/chosen": -1.669634222984314, "logits/rejected": -1.5472640991210938, "logps/chosen": -431.654296875, "logps/rejected": -494.5664978027344, "loss": 0.4539, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.6709175109863281, "rewards/margins": 1.1873729228973389, "rewards/margins_max": 2.611132860183716, "rewards/margins_min": 0.01896936632692814, "rewards/margins_std": 1.182826280593872, "rewards/rejected": -2.858290195465088, "step": 2500 }, { "epoch": 0.6, "eval_logits/chosen": -1.6230649948120117, "eval_logits/rejected": -1.5550963878631592, "eval_logps/chosen": -458.697021484375, "eval_logps/rejected": -544.7555541992188, "eval_loss": 0.5025209188461304, "eval_rewards/accuracies": 0.7400000095367432, "eval_rewards/chosen": -1.7424167394638062, "eval_rewards/margins": 1.0466059446334839, "eval_rewards/margins_max": 3.431600570678711, "eval_rewards/margins_min": -1.003440022468567, "eval_rewards/margins_std": 1.5000827312469482, "eval_rewards/rejected": -2.78902268409729, "eval_runtime": 858.3316, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 2500 }, { "epoch": 0.6, "grad_norm": 10.100147027182482, "learning_rate": 2.0571921605728983e-06, "logits/chosen": -1.6438572406768799, "logits/rejected": -1.492493748664856, "logps/chosen": -437.442626953125, "logps/rejected": -565.9816284179688, "loss": 0.4586, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6739299297332764, "rewards/margins": 1.2464573383331299, "rewards/margins_max": 3.027808427810669, "rewards/margins_min": -0.3091750741004944, "rewards/margins_std": 1.465628981590271, "rewards/rejected": -2.9203872680664062, "step": 2510 }, { "epoch": 0.6, "grad_norm": 9.245904376625319, "learning_rate": 2.0366389534083184e-06, "logits/chosen": -1.589672327041626, "logits/rejected": -1.5032165050506592, "logps/chosen": -478.7520446777344, "logps/rejected": -560.1414794921875, "loss": 0.4923, "rewards/accuracies": 0.75, "rewards/chosen": -1.9261289834976196, "rewards/margins": 1.2117140293121338, "rewards/margins_max": 3.2072558403015137, "rewards/margins_min": -0.40109068155288696, "rewards/margins_std": 1.5912171602249146, "rewards/rejected": -3.137842893600464, "step": 2520 }, { "epoch": 0.61, "grad_norm": 6.7266413277538515, "learning_rate": 2.0161181282077473e-06, "logits/chosen": -1.5167386531829834, "logits/rejected": -1.496211290359497, "logps/chosen": -453.845947265625, "logps/rejected": -567.504638671875, "loss": 0.4548, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -2.1155858039855957, "rewards/margins": 1.1841260194778442, "rewards/margins_max": 3.0232720375061035, "rewards/margins_min": -0.2574582099914551, "rewards/margins_std": 1.4592278003692627, "rewards/rejected": -3.2997124195098877, "step": 2530 }, { "epoch": 0.61, "grad_norm": 6.991490232342838, "learning_rate": 1.9956311190680467e-06, "logits/chosen": -1.513106107711792, "logits/rejected": -1.491588830947876, "logps/chosen": -441.78271484375, "logps/rejected": -581.3522338867188, "loss": 0.5095, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8922529220581055, "rewards/margins": 1.2860217094421387, "rewards/margins_max": 3.363598585128784, "rewards/margins_min": -0.38733160495758057, "rewards/margins_std": 1.7006635665893555, "rewards/rejected": -3.178274631500244, "step": 2540 }, { "epoch": 0.61, "grad_norm": 5.991526978853561, "learning_rate": 1.9751793577228458e-06, "logits/chosen": -1.5937877893447876, "logits/rejected": -1.5765856504440308, "logps/chosen": -477.9012756347656, "logps/rejected": -564.12158203125, "loss": 0.4735, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.774701714515686, "rewards/margins": 1.1578586101531982, "rewards/margins_max": 2.723811388015747, "rewards/margins_min": -0.13567550480365753, "rewards/margins_std": 1.292783260345459, "rewards/rejected": -2.932560682296753, "step": 2550 }, { "epoch": 0.61, "grad_norm": 6.353001955776313, "learning_rate": 1.9547642734424826e-06, "logits/chosen": -1.4566872119903564, "logits/rejected": -1.462127447128296, "logps/chosen": -449.5235900878906, "logps/rejected": -576.8457641601562, "loss": 0.4553, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6022322177886963, "rewards/margins": 1.4534753561019897, "rewards/margins_max": 3.9526607990264893, "rewards/margins_min": -0.3992669880390167, "rewards/margins_std": 1.9969440698623657, "rewards/rejected": -3.0557074546813965, "step": 2560 }, { "epoch": 0.62, "grad_norm": 10.669705037667176, "learning_rate": 1.93438729293412e-06, "logits/chosen": -1.548723816871643, "logits/rejected": -1.4915927648544312, "logps/chosen": -449.71600341796875, "logps/rejected": -549.2357177734375, "loss": 0.4603, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.57132887840271, "rewards/margins": 1.1639609336853027, "rewards/margins_max": 3.138021230697632, "rewards/margins_min": -0.3436947762966156, "rewards/margins_std": 1.5656259059906006, "rewards/rejected": -2.7352898120880127, "step": 2570 }, { "epoch": 0.62, "grad_norm": 7.253583877294134, "learning_rate": 1.914049840242042e-06, "logits/chosen": -1.5208799839019775, "logits/rejected": -1.4518271684646606, "logps/chosen": -486.6087341308594, "logps/rejected": -611.173583984375, "loss": 0.4637, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8173754215240479, "rewards/margins": 1.4125771522521973, "rewards/margins_max": 3.496774673461914, "rewards/margins_min": -0.3764607012271881, "rewards/margins_std": 1.7513564825057983, "rewards/rejected": -3.229952335357666, "step": 2580 }, { "epoch": 0.62, "grad_norm": 7.649436441245351, "learning_rate": 1.893753336648131e-06, "logits/chosen": -1.387618064880371, "logits/rejected": -1.3994258642196655, "logps/chosen": -475.49603271484375, "logps/rejected": -597.6386108398438, "loss": 0.5623, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -2.152209520339966, "rewards/margins": 1.1594469547271729, "rewards/margins_max": 3.144350528717041, "rewards/margins_min": -0.6203373670578003, "rewards/margins_std": 1.6667858362197876, "rewards/rejected": -3.3116562366485596, "step": 2590 }, { "epoch": 0.62, "grad_norm": 7.489460724387215, "learning_rate": 1.8734992005725466e-06, "logits/chosen": -1.364940881729126, "logits/rejected": -1.3583667278289795, "logps/chosen": -491.06805419921875, "logps/rejected": -601.6492919921875, "loss": 0.4612, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.972088098526001, "rewards/margins": 1.2795677185058594, "rewards/margins_max": 3.2342934608459473, "rewards/margins_min": -0.40469950437545776, "rewards/margins_std": 1.6071163415908813, "rewards/rejected": -3.2516555786132812, "step": 2600 }, { "epoch": 0.62, "eval_logits/chosen": -1.5622367858886719, "eval_logits/rejected": -1.4927018880844116, "eval_logps/chosen": -459.4844055175781, "eval_logps/rejected": -547.0906982421875, "eval_loss": 0.4991469085216522, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -1.7502907514572144, "eval_rewards/margins": 1.0620832443237305, "eval_rewards/margins_max": 3.47208833694458, "eval_rewards/margins_min": -0.9694882035255432, "eval_rewards/margins_std": 1.5040746927261353, "eval_rewards/rejected": -2.8123738765716553, "eval_runtime": 858.3544, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 2600 }, { "epoch": 0.62, "grad_norm": 7.094045221176537, "learning_rate": 1.8532888474745942e-06, "logits/chosen": -1.6516609191894531, "logits/rejected": -1.5637575387954712, "logps/chosen": -486.46258544921875, "logps/rejected": -554.5183715820312, "loss": 0.4473, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.787824034690857, "rewards/margins": 1.0513440370559692, "rewards/margins_max": 2.7819669246673584, "rewards/margins_min": -0.47929859161376953, "rewards/margins_std": 1.4460865259170532, "rewards/rejected": -2.839168071746826, "step": 2610 }, { "epoch": 0.63, "grad_norm": 7.807628119617894, "learning_rate": 1.8331236897538067e-06, "logits/chosen": -1.588700532913208, "logits/rejected": -1.560967206954956, "logps/chosen": -454.8846740722656, "logps/rejected": -550.0186157226562, "loss": 0.51, "rewards/accuracies": 0.75, "rewards/chosen": -1.7678813934326172, "rewards/margins": 1.013344168663025, "rewards/margins_max": 2.955592393875122, "rewards/margins_min": -0.6843963861465454, "rewards/margins_std": 1.608072280883789, "rewards/rejected": -2.7812259197235107, "step": 2620 }, { "epoch": 0.63, "grad_norm": 6.397922369373612, "learning_rate": 1.813005136651245e-06, "logits/chosen": -1.6216070652008057, "logits/rejected": -1.4607044458389282, "logps/chosen": -466.19122314453125, "logps/rejected": -584.9078369140625, "loss": 0.465, "rewards/accuracies": 0.75, "rewards/chosen": -1.722050428390503, "rewards/margins": 1.1701828241348267, "rewards/margins_max": 3.015313148498535, "rewards/margins_min": -0.30299392342567444, "rewards/margins_std": 1.4862773418426514, "rewards/rejected": -2.892233371734619, "step": 2630 }, { "epoch": 0.63, "grad_norm": 10.641082440148976, "learning_rate": 1.7929345941510033e-06, "logits/chosen": -1.5517534017562866, "logits/rejected": -1.505121111869812, "logps/chosen": -505.3042907714844, "logps/rejected": -573.451416015625, "loss": 0.5207, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -2.387955904006958, "rewards/margins": 0.9375236630439758, "rewards/margins_max": 2.6147899627685547, "rewards/margins_min": -0.6164341568946838, "rewards/margins_std": 1.448771595954895, "rewards/rejected": -3.325479507446289, "step": 2640 }, { "epoch": 0.63, "grad_norm": 5.575597195649518, "learning_rate": 1.7729134648819607e-06, "logits/chosen": -1.3920881748199463, "logits/rejected": -1.3064746856689453, "logps/chosen": -471.138427734375, "logps/rejected": -569.7762451171875, "loss": 0.4743, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.2370474338531494, "rewards/margins": 1.1462113857269287, "rewards/margins_max": 3.2237114906311035, "rewards/margins_min": -0.6230897903442383, "rewards/margins_std": 1.693587064743042, "rewards/rejected": -3.383258819580078, "step": 2650 }, { "epoch": 0.64, "grad_norm": 10.344222781160944, "learning_rate": 1.7529431480197535e-06, "logits/chosen": -1.4968098402023315, "logits/rejected": -1.3700649738311768, "logps/chosen": -478.87322998046875, "logps/rejected": -576.4004516601562, "loss": 0.4663, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -2.0489962100982666, "rewards/margins": 1.1615712642669678, "rewards/margins_max": 2.915788173675537, "rewards/margins_min": -0.542585551738739, "rewards/margins_std": 1.4739781618118286, "rewards/rejected": -3.2105674743652344, "step": 2660 }, { "epoch": 0.64, "grad_norm": 9.058745349637018, "learning_rate": 1.7330250391889964e-06, "logits/chosen": -1.5811752080917358, "logits/rejected": -1.4057408571243286, "logps/chosen": -462.890625, "logps/rejected": -519.0880126953125, "loss": 0.4325, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8253253698349, "rewards/margins": 1.271043300628662, "rewards/margins_max": 2.5177042484283447, "rewards/margins_min": -0.029818082228302956, "rewards/margins_std": 1.1593960523605347, "rewards/rejected": -3.0963687896728516, "step": 2670 }, { "epoch": 0.64, "grad_norm": 11.319961363386126, "learning_rate": 1.7131605303657473e-06, "logits/chosen": -1.6442034244537354, "logits/rejected": -1.584252119064331, "logps/chosen": -490.168212890625, "logps/rejected": -553.1834716796875, "loss": 0.5053, "rewards/accuracies": 0.6875, "rewards/chosen": -1.98139226436615, "rewards/margins": 0.735024094581604, "rewards/margins_max": 2.279207706451416, "rewards/margins_min": -0.6796191334724426, "rewards/margins_std": 1.3292591571807861, "rewards/rejected": -2.716416358947754, "step": 2680 }, { "epoch": 0.64, "grad_norm": 11.03358568746165, "learning_rate": 1.693351009780231e-06, "logits/chosen": -1.522397756576538, "logits/rejected": -1.4206411838531494, "logps/chosen": -475.608642578125, "logps/rejected": -556.1173095703125, "loss": 0.545, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.894669532775879, "rewards/margins": 1.2010729312896729, "rewards/margins_max": 3.348712921142578, "rewards/margins_min": -0.7458361387252808, "rewards/margins_std": 1.8151811361312866, "rewards/rejected": -3.0957422256469727, "step": 2690 }, { "epoch": 0.65, "grad_norm": 8.029822552954306, "learning_rate": 1.6735978618198217e-06, "logits/chosen": -1.6624904870986938, "logits/rejected": -1.6340761184692383, "logps/chosen": -408.81597900390625, "logps/rejected": -529.294677734375, "loss": 0.5267, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.58699631690979, "rewards/margins": 0.8897632360458374, "rewards/margins_max": 2.805419683456421, "rewards/margins_min": -0.672760009765625, "rewards/margins_std": 1.577953815460205, "rewards/rejected": -2.476759433746338, "step": 2700 }, { "epoch": 0.65, "eval_logits/chosen": -1.616060495376587, "eval_logits/rejected": -1.547584891319275, "eval_logps/chosen": -444.3344421386719, "eval_logps/rejected": -524.54541015625, "eval_loss": 0.4989367127418518, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -1.5987910032272339, "eval_rewards/margins": 0.9881307482719421, "eval_rewards/margins_max": 3.2210049629211426, "eval_rewards/margins_min": -0.940081775188446, "eval_rewards/margins_std": 1.4114196300506592, "eval_rewards/rejected": -2.5869216918945312, "eval_runtime": 858.6728, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 2700 }, { "epoch": 0.65, "grad_norm": 8.45888358394986, "learning_rate": 1.6539024669322957e-06, "logits/chosen": -1.6182868480682373, "logits/rejected": -1.5641367435455322, "logps/chosen": -442.4955139160156, "logps/rejected": -505.029052734375, "loss": 0.4903, "rewards/accuracies": 0.75, "rewards/chosen": -1.5408172607421875, "rewards/margins": 1.0318057537078857, "rewards/margins_max": 2.6445024013519287, "rewards/margins_min": -0.5217481851577759, "rewards/margins_std": 1.3896628618240356, "rewards/rejected": -2.5726232528686523, "step": 2710 }, { "epoch": 0.65, "grad_norm": 10.691905758791686, "learning_rate": 1.6342662015293586e-06, "logits/chosen": -1.5570952892303467, "logits/rejected": -1.483463168144226, "logps/chosen": -485.99761962890625, "logps/rejected": -539.72021484375, "loss": 0.5289, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.807952880859375, "rewards/margins": 0.9752623438835144, "rewards/margins_max": 2.3032968044281006, "rewards/margins_min": -0.1790485382080078, "rewards/margins_std": 1.1256717443466187, "rewards/rejected": -2.783215045928955, "step": 2720 }, { "epoch": 0.65, "grad_norm": 11.097580283420916, "learning_rate": 1.6146904378904537e-06, "logits/chosen": -1.7505366802215576, "logits/rejected": -1.6678968667984009, "logps/chosen": -519.6309814453125, "logps/rejected": -583.5548095703125, "loss": 0.5402, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.851824402809143, "rewards/margins": 0.8206771016120911, "rewards/margins_max": 2.1993632316589355, "rewards/margins_min": -0.792681872844696, "rewards/margins_std": 1.3641926050186157, "rewards/rejected": -2.6725013256073, "step": 2730 }, { "epoch": 0.66, "grad_norm": 8.10669700784192, "learning_rate": 1.5951765440668638e-06, "logits/chosen": -1.6676275730133057, "logits/rejected": -1.5545639991760254, "logps/chosen": -462.1187438964844, "logps/rejected": -506.47802734375, "loss": 0.4352, "rewards/accuracies": 0.75, "rewards/chosen": -1.534622073173523, "rewards/margins": 1.1413172483444214, "rewards/margins_max": 2.672224760055542, "rewards/margins_min": -0.2933486998081207, "rewards/margins_std": 1.300026297569275, "rewards/rejected": -2.6759395599365234, "step": 2740 }, { "epoch": 0.66, "grad_norm": 9.536508779151006, "learning_rate": 1.5757258837861001e-06, "logits/chosen": -1.590012550354004, "logits/rejected": -1.5067466497421265, "logps/chosen": -455.28448486328125, "logps/rejected": -542.62646484375, "loss": 0.4475, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.769948959350586, "rewards/margins": 1.2897696495056152, "rewards/margins_max": 2.9668567180633545, "rewards/margins_min": -0.004758751485496759, "rewards/margins_std": 1.3211925029754639, "rewards/rejected": -3.059718608856201, "step": 2750 }, { "epoch": 0.66, "grad_norm": 7.123985352130034, "learning_rate": 1.5563398163566035e-06, "logits/chosen": -1.5851211547851562, "logits/rejected": -1.5841548442840576, "logps/chosen": -432.78973388671875, "logps/rejected": -567.9972534179688, "loss": 0.4379, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7309300899505615, "rewards/margins": 1.2033436298370361, "rewards/margins_max": 3.1999754905700684, "rewards/margins_min": -0.28025731444358826, "rewards/margins_std": 1.574230432510376, "rewards/rejected": -2.9342737197875977, "step": 2760 }, { "epoch": 0.66, "grad_norm": 11.118477270031144, "learning_rate": 1.5370196965727441e-06, "logits/chosen": -1.5144541263580322, "logits/rejected": -1.4531757831573486, "logps/chosen": -469.0777282714844, "logps/rejected": -545.5760498046875, "loss": 0.4884, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.9975650310516357, "rewards/margins": 0.8732191920280457, "rewards/margins_max": 2.437157392501831, "rewards/margins_min": -0.6706037521362305, "rewards/margins_std": 1.38334059715271, "rewards/rejected": -2.870784282684326, "step": 2770 }, { "epoch": 0.67, "grad_norm": 5.2405335592937465, "learning_rate": 1.5177668746201457e-06, "logits/chosen": -1.54389226436615, "logits/rejected": -1.5514787435531616, "logps/chosen": -413.199462890625, "logps/rejected": -538.0057983398438, "loss": 0.489, "rewards/accuracies": 0.75, "rewards/chosen": -1.6714541912078857, "rewards/margins": 1.1747454404830933, "rewards/margins_max": 2.8171987533569336, "rewards/margins_min": -0.4911231994628906, "rewards/margins_std": 1.5027049779891968, "rewards/rejected": -2.8461995124816895, "step": 2780 }, { "epoch": 0.67, "grad_norm": 6.755045382716433, "learning_rate": 1.4985826959813257e-06, "logits/chosen": -1.6939125061035156, "logits/rejected": -1.5935533046722412, "logps/chosen": -512.621826171875, "logps/rejected": -584.4033813476562, "loss": 0.476, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.954210877418518, "rewards/margins": 0.9447482228279114, "rewards/margins_max": 2.5510458946228027, "rewards/margins_min": -0.5085705518722534, "rewards/margins_std": 1.3709584474563599, "rewards/rejected": -2.898958921432495, "step": 2790 }, { "epoch": 0.67, "grad_norm": 5.09636148547422, "learning_rate": 1.4794685013416676e-06, "logits/chosen": -1.604547142982483, "logits/rejected": -1.5176610946655273, "logps/chosen": -450.4493103027344, "logps/rejected": -519.4747314453125, "loss": 0.4999, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.4736088514328003, "rewards/margins": 1.0175435543060303, "rewards/margins_max": 2.346195697784424, "rewards/margins_min": -0.12045016139745712, "rewards/margins_std": 1.1006370782852173, "rewards/rejected": -2.491152286529541, "step": 2800 }, { "epoch": 0.67, "eval_logits/chosen": -1.5935293436050415, "eval_logits/rejected": -1.5259904861450195, "eval_logps/chosen": -444.4690246582031, "eval_logps/rejected": -525.3958129882812, "eval_loss": 0.4973601996898651, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -1.6001369953155518, "eval_rewards/margins": 0.9952890872955322, "eval_rewards/margins_max": 3.227216958999634, "eval_rewards/margins_min": -0.8963525891304016, "eval_rewards/margins_std": 1.397277593612671, "eval_rewards/rejected": -2.595426082611084, "eval_runtime": 858.8733, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 2800 }, { "epoch": 0.67, "grad_norm": 9.286459337463723, "learning_rate": 1.460425626495725e-06, "logits/chosen": -1.586920976638794, "logits/rejected": -1.52964448928833, "logps/chosen": -413.439697265625, "logps/rejected": -500.71551513671875, "loss": 0.5014, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6407222747802734, "rewards/margins": 1.0434004068374634, "rewards/margins_max": 2.5435268878936768, "rewards/margins_min": -0.5732196569442749, "rewards/margins_std": 1.4040426015853882, "rewards/rejected": -2.6841228008270264, "step": 2810 }, { "epoch": 0.68, "grad_norm": 5.687634812318806, "learning_rate": 1.4414554022538737e-06, "logits/chosen": -1.6608855724334717, "logits/rejected": -1.538491129875183, "logps/chosen": -452.3095703125, "logps/rejected": -519.2757568359375, "loss": 0.4354, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6371246576309204, "rewards/margins": 1.1805613040924072, "rewards/margins_max": 2.540097713470459, "rewards/margins_min": -0.027507686987519264, "rewards/margins_std": 1.1434564590454102, "rewards/rejected": -2.8176863193511963, "step": 2820 }, { "epoch": 0.68, "grad_norm": 7.945864617923413, "learning_rate": 1.4225591543493028e-06, "logits/chosen": -1.4341697692871094, "logits/rejected": -1.4068892002105713, "logps/chosen": -422.993408203125, "logps/rejected": -575.92236328125, "loss": 0.4388, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7757136821746826, "rewards/margins": 1.1609383821487427, "rewards/margins_max": 2.6687464714050293, "rewards/margins_min": -0.3195762038230896, "rewards/margins_std": 1.3747609853744507, "rewards/rejected": -2.936652183532715, "step": 2830 }, { "epoch": 0.68, "grad_norm": 8.434924827652262, "learning_rate": 1.40373820334537e-06, "logits/chosen": -1.5997850894927979, "logits/rejected": -1.5765736103057861, "logps/chosen": -472.60430908203125, "logps/rejected": -581.9644165039062, "loss": 0.4531, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.8966352939605713, "rewards/margins": 1.1718589067459106, "rewards/margins_max": 2.7525086402893066, "rewards/margins_min": -0.3338681161403656, "rewards/margins_std": 1.3764301538467407, "rewards/rejected": -3.0684940814971924, "step": 2840 }, { "epoch": 0.68, "grad_norm": 10.211000262813997, "learning_rate": 1.384993864543314e-06, "logits/chosen": -1.6397355794906616, "logits/rejected": -1.5828436613082886, "logps/chosen": -461.42767333984375, "logps/rejected": -585.218017578125, "loss": 0.4332, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.781672477722168, "rewards/margins": 1.2030837535858154, "rewards/margins_max": 2.842581033706665, "rewards/margins_min": -0.37432295083999634, "rewards/margins_std": 1.43550443649292, "rewards/rejected": -2.9847559928894043, "step": 2850 }, { "epoch": 0.68, "grad_norm": 7.999647848007978, "learning_rate": 1.366327447890332e-06, "logits/chosen": -1.6448423862457275, "logits/rejected": -1.5426054000854492, "logps/chosen": -488.250244140625, "logps/rejected": -604.1597900390625, "loss": 0.4441, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8200136423110962, "rewards/margins": 1.3400704860687256, "rewards/margins_max": 3.0916287899017334, "rewards/margins_min": -0.24774429202079773, "rewards/margins_std": 1.4649914503097534, "rewards/rejected": -3.1600842475891113, "step": 2860 }, { "epoch": 0.69, "grad_norm": 4.195972582499311, "learning_rate": 1.3477402578880358e-06, "logits/chosen": -1.6599258184432983, "logits/rejected": -1.570160150527954, "logps/chosen": -504.46746826171875, "logps/rejected": -615.2977905273438, "loss": 0.4453, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.0757226943969727, "rewards/margins": 1.215172529220581, "rewards/margins_max": 2.9469292163848877, "rewards/margins_min": -0.24775537848472595, "rewards/margins_std": 1.434788703918457, "rewards/rejected": -3.2908949851989746, "step": 2870 }, { "epoch": 0.69, "grad_norm": 10.574804599120037, "learning_rate": 1.3292335935012856e-06, "logits/chosen": -1.6585735082626343, "logits/rejected": -1.5773788690567017, "logps/chosen": -524.6055908203125, "logps/rejected": -599.4426879882812, "loss": 0.4814, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.973162055015564, "rewards/margins": 1.288515567779541, "rewards/margins_max": 3.0959415435791016, "rewards/margins_min": -0.2980514168739319, "rewards/margins_std": 1.5396877527236938, "rewards/rejected": -3.2616775035858154, "step": 2880 }, { "epoch": 0.69, "grad_norm": 8.526267808180576, "learning_rate": 1.3108087480674166e-06, "logits/chosen": -1.6349773406982422, "logits/rejected": -1.6315793991088867, "logps/chosen": -527.72705078125, "logps/rejected": -616.4989013671875, "loss": 0.5073, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.1446034908294678, "rewards/margins": 1.0406761169433594, "rewards/margins_max": 2.6676039695739746, "rewards/margins_min": -0.47941136360168457, "rewards/margins_std": 1.421641230583191, "rewards/rejected": -3.185279369354248, "step": 2890 }, { "epoch": 0.69, "grad_norm": 5.141769328192468, "learning_rate": 1.2924670092058466e-06, "logits/chosen": -1.6630945205688477, "logits/rejected": -1.5793529748916626, "logps/chosen": -482.8494567871094, "logps/rejected": -524.4531860351562, "loss": 0.4589, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7761814594268799, "rewards/margins": 1.1712186336517334, "rewards/margins_max": 2.8722996711730957, "rewards/margins_min": -0.43779173493385315, "rewards/margins_std": 1.4756114482879639, "rewards/rejected": -2.947399854660034, "step": 2900 }, { "epoch": 0.69, "eval_logits/chosen": -1.615408182144165, "eval_logits/rejected": -1.5484470129013062, "eval_logps/chosen": -462.74639892578125, "eval_logps/rejected": -552.1007690429688, "eval_loss": 0.4977453052997589, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -1.782910943031311, "eval_rewards/margins": 1.079564094543457, "eval_rewards/margins_max": 3.5812063217163086, "eval_rewards/margins_min": -0.9487740993499756, "eval_rewards/margins_std": 1.5303826332092285, "eval_rewards/rejected": -2.8624749183654785, "eval_runtime": 858.6315, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 2900 }, { "epoch": 0.7, "grad_norm": 10.438711130377953, "learning_rate": 1.2742096587280967e-06, "logits/chosen": -1.6030820608139038, "logits/rejected": -1.510866403579712, "logps/chosen": -451.1293029785156, "logps/rejected": -519.8489990234375, "loss": 0.5079, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7694326639175415, "rewards/margins": 1.0720126628875732, "rewards/margins_max": 3.1856532096862793, "rewards/margins_min": -0.5586379170417786, "rewards/margins_std": 1.6931205987930298, "rewards/rejected": -2.841445207595825, "step": 2910 }, { "epoch": 0.7, "grad_norm": 11.077391176440367, "learning_rate": 1.2560379725482076e-06, "logits/chosen": -1.7409336566925049, "logits/rejected": -1.6093370914459229, "logps/chosen": -431.6946716308594, "logps/rejected": -508.25079345703125, "loss": 0.4324, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.388869047164917, "rewards/margins": 1.1599212884902954, "rewards/margins_max": 2.8614420890808105, "rewards/margins_min": -0.3841838538646698, "rewards/margins_std": 1.4353570938110352, "rewards/rejected": -2.548790454864502, "step": 2920 }, { "epoch": 0.7, "grad_norm": 8.31223061490778, "learning_rate": 1.2379532205935793e-06, "logits/chosen": -1.7721216678619385, "logits/rejected": -1.6664607524871826, "logps/chosen": -453.82647705078125, "logps/rejected": -534.6204833984375, "loss": 0.5053, "rewards/accuracies": 0.75, "rewards/chosen": -1.4804136753082275, "rewards/margins": 1.2547385692596436, "rewards/margins_max": 3.4649479389190674, "rewards/margins_min": -0.6966042518615723, "rewards/margins_std": 1.8686481714248657, "rewards/rejected": -2.735152244567871, "step": 2930 }, { "epoch": 0.7, "grad_norm": 7.3410296657031955, "learning_rate": 1.219956666716213e-06, "logits/chosen": -1.7742465734481812, "logits/rejected": -1.6420280933380127, "logps/chosen": -470.18560791015625, "logps/rejected": -543.0935668945312, "loss": 0.4292, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5196362733840942, "rewards/margins": 1.3284763097763062, "rewards/margins_max": 2.972414493560791, "rewards/margins_min": -0.5097461938858032, "rewards/margins_std": 1.522706151008606, "rewards/rejected": -2.8481128215789795, "step": 2940 }, { "epoch": 0.71, "grad_norm": 5.984865492199202, "learning_rate": 1.2020495686043926e-06, "logits/chosen": -1.6936085224151611, "logits/rejected": -1.6235971450805664, "logps/chosen": -466.8501892089844, "logps/rejected": -558.4576416015625, "loss": 0.4287, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6507959365844727, "rewards/margins": 1.3607865571975708, "rewards/margins_max": 3.0247764587402344, "rewards/margins_min": -0.17899832129478455, "rewards/margins_std": 1.45443594455719, "rewards/rejected": -3.011582136154175, "step": 2950 }, { "epoch": 0.71, "grad_norm": 7.042578764161184, "learning_rate": 1.1842331776947932e-06, "logits/chosen": -1.7281017303466797, "logits/rejected": -1.6456937789916992, "logps/chosen": -495.4107971191406, "logps/rejected": -518.2816162109375, "loss": 0.4912, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6347084045410156, "rewards/margins": 1.0368341207504272, "rewards/margins_max": 2.684016227722168, "rewards/margins_min": -0.4110879898071289, "rewards/margins_std": 1.3898698091506958, "rewards/rejected": -2.6715426445007324, "step": 2960 }, { "epoch": 0.71, "grad_norm": 8.568570623321925, "learning_rate": 1.1665087390850188e-06, "logits/chosen": -1.636214017868042, "logits/rejected": -1.5875431299209595, "logps/chosen": -409.6216735839844, "logps/rejected": -538.3866577148438, "loss": 0.5023, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9594217538833618, "rewards/margins": 0.9890398979187012, "rewards/margins_max": 2.798807144165039, "rewards/margins_min": -0.5868322253227234, "rewards/margins_std": 1.5370733737945557, "rewards/rejected": -2.9484617710113525, "step": 2970 }, { "epoch": 0.71, "grad_norm": 7.396282458212379, "learning_rate": 1.1488774914465919e-06, "logits/chosen": -1.6374868154525757, "logits/rejected": -1.596036672592163, "logps/chosen": -441.7151794433594, "logps/rejected": -586.535400390625, "loss": 0.4903, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.857410192489624, "rewards/margins": 1.359551191329956, "rewards/margins_max": 3.483840227127075, "rewards/margins_min": -0.4195144772529602, "rewards/margins_std": 1.7567079067230225, "rewards/rejected": -3.21696138381958, "step": 2980 }, { "epoch": 0.72, "grad_norm": 6.9006435370613035, "learning_rate": 1.1313406669383878e-06, "logits/chosen": -1.705688238143921, "logits/rejected": -1.598695993423462, "logps/chosen": -494.7142028808594, "logps/rejected": -557.2818603515625, "loss": 0.5475, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7172324657440186, "rewards/margins": 1.1084479093551636, "rewards/margins_max": 3.171581506729126, "rewards/margins_min": -0.878155529499054, "rewards/margins_std": 1.8477665185928345, "rewards/rejected": -2.8256804943084717, "step": 2990 }, { "epoch": 0.72, "grad_norm": 6.698405847637048, "learning_rate": 1.1138994911205285e-06, "logits/chosen": -1.6819279193878174, "logits/rejected": -1.5916599035263062, "logps/chosen": -439.94317626953125, "logps/rejected": -582.9528198242188, "loss": 0.4433, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.4526106119155884, "rewards/margins": 1.4489765167236328, "rewards/margins_max": 3.090031147003174, "rewards/margins_min": -0.1245226263999939, "rewards/margins_std": 1.4432562589645386, "rewards/rejected": -2.9015872478485107, "step": 3000 }, { "epoch": 0.72, "eval_logits/chosen": -1.6589401960372925, "eval_logits/rejected": -1.5921745300292969, "eval_logps/chosen": -462.6559753417969, "eval_logps/rejected": -554.1235961914062, "eval_loss": 0.49947747588157654, "eval_rewards/accuracies": 0.7394999861717224, "eval_rewards/chosen": -1.7820063829421997, "eval_rewards/margins": 1.1006966829299927, "eval_rewards/margins_max": 3.646759510040283, "eval_rewards/margins_min": -0.9944899678230286, "eval_rewards/margins_std": 1.5726896524429321, "eval_rewards/rejected": -2.8827030658721924, "eval_runtime": 858.373, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 3000 }, { "epoch": 0.72, "grad_norm": 5.627112945437297, "learning_rate": 1.0965551828687298e-06, "logits/chosen": -1.6769380569458008, "logits/rejected": -1.5857552289962769, "logps/chosen": -443.0960388183594, "logps/rejected": -591.6414794921875, "loss": 0.4465, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7624861001968384, "rewards/margins": 1.2883433103561401, "rewards/margins_max": 2.8570215702056885, "rewards/margins_min": -0.3133588135242462, "rewards/margins_std": 1.4152709245681763, "rewards/rejected": -3.0508296489715576, "step": 3010 }, { "epoch": 0.72, "grad_norm": 8.44393472081209, "learning_rate": 1.0793089542891231e-06, "logits/chosen": -1.599174976348877, "logits/rejected": -1.4982831478118896, "logps/chosen": -467.6996154785156, "logps/rejected": -542.078125, "loss": 0.4099, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7276941537857056, "rewards/margins": 1.3440990447998047, "rewards/margins_max": 2.9610838890075684, "rewards/margins_min": -0.025234121829271317, "rewards/margins_std": 1.3704931735992432, "rewards/rejected": -3.0717933177948, "step": 3020 }, { "epoch": 0.73, "grad_norm": 6.562826384647798, "learning_rate": 1.062162010633545e-06, "logits/chosen": -1.7041943073272705, "logits/rejected": -1.5975539684295654, "logps/chosen": -462.03369140625, "logps/rejected": -545.7349853515625, "loss": 0.4674, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -1.7101600170135498, "rewards/margins": 1.3789432048797607, "rewards/margins_max": 2.9714462757110596, "rewards/margins_min": -0.04658883064985275, "rewards/margins_std": 1.381899118423462, "rewards/rejected": -3.0891032218933105, "step": 3030 }, { "epoch": 0.73, "grad_norm": 7.130847338475872, "learning_rate": 1.0451155502153141e-06, "logits/chosen": -1.7324345111846924, "logits/rejected": -1.6601059436798096, "logps/chosen": -470.0752868652344, "logps/rejected": -501.4732360839844, "loss": 0.5877, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8416751623153687, "rewards/margins": 0.7419594526290894, "rewards/margins_max": 2.6581192016601562, "rewards/margins_min": -1.0817208290100098, "rewards/margins_std": 1.6870219707489014, "rewards/rejected": -2.5836341381073, "step": 3040 }, { "epoch": 0.73, "grad_norm": 4.642431042579677, "learning_rate": 1.028170764325479e-06, "logits/chosen": -1.7943503856658936, "logits/rejected": -1.7403968572616577, "logps/chosen": -482.03643798828125, "logps/rejected": -539.9556274414062, "loss": 0.5271, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.681033730506897, "rewards/margins": 0.9902159571647644, "rewards/margins_max": 3.030367374420166, "rewards/margins_min": -0.7307082414627075, "rewards/margins_std": 1.6854665279388428, "rewards/rejected": -2.6712498664855957, "step": 3050 }, { "epoch": 0.73, "grad_norm": 10.927268512467087, "learning_rate": 1.0113288371495708e-06, "logits/chosen": -1.7122856378555298, "logits/rejected": -1.7047529220581055, "logps/chosen": -474.58349609375, "logps/rejected": -521.8853149414062, "loss": 0.5224, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -1.6963857412338257, "rewards/margins": 0.8555315136909485, "rewards/margins_max": 2.699737071990967, "rewards/margins_min": -0.678202748298645, "rewards/margins_std": 1.5165655612945557, "rewards/rejected": -2.551917552947998, "step": 3060 }, { "epoch": 0.74, "grad_norm": 9.557387637177097, "learning_rate": 9.945909456848436e-07, "logits/chosen": -1.7446874380111694, "logits/rejected": -1.6789486408233643, "logps/chosen": -453.08258056640625, "logps/rejected": -476.5225524902344, "loss": 0.5017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.5125553607940674, "rewards/margins": 0.8446477651596069, "rewards/margins_max": 2.52988862991333, "rewards/margins_min": -0.6797794103622437, "rewards/margins_std": 1.455639123916626, "rewards/rejected": -2.3572030067443848, "step": 3070 }, { "epoch": 0.74, "grad_norm": 7.058486743447343, "learning_rate": 9.779582596580204e-07, "logits/chosen": -1.6608178615570068, "logits/rejected": -1.625554084777832, "logps/chosen": -398.79595947265625, "logps/rejected": -520.8382568359375, "loss": 0.4551, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.4478676319122314, "rewards/margins": 1.3355865478515625, "rewards/margins_max": 3.0741171836853027, "rewards/margins_min": -0.38496366143226624, "rewards/margins_std": 1.5238888263702393, "rewards/rejected": -2.783454179763794, "step": 3080 }, { "epoch": 0.74, "grad_norm": 7.274483479519273, "learning_rate": 9.6143194144355e-07, "logits/chosen": -1.8469903469085693, "logits/rejected": -1.7680364847183228, "logps/chosen": -421.186279296875, "logps/rejected": -473.44671630859375, "loss": 0.3955, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.3940449953079224, "rewards/margins": 1.0108753442764282, "rewards/margins_max": 2.6060848236083984, "rewards/margins_min": -0.31543099880218506, "rewards/margins_std": 1.296598196029663, "rewards/rejected": -2.4049201011657715, "step": 3090 }, { "epoch": 0.74, "grad_norm": 11.78340044303751, "learning_rate": 9.450131459823689e-07, "logits/chosen": -1.8017295598983765, "logits/rejected": -1.7521486282348633, "logps/chosen": -468.4088439941406, "logps/rejected": -488.5104064941406, "loss": 0.4908, "rewards/accuracies": 0.75, "rewards/chosen": -1.4093245267868042, "rewards/margins": 0.8025612831115723, "rewards/margins_max": 2.084317684173584, "rewards/margins_min": -0.42825132608413696, "rewards/margins_std": 1.1152690649032593, "rewards/rejected": -2.211885929107666, "step": 3100 }, { "epoch": 0.74, "eval_logits/chosen": -1.6807482242584229, "eval_logits/rejected": -1.6165412664413452, "eval_logps/chosen": -457.6893615722656, "eval_logps/rejected": -545.781005859375, "eval_loss": 0.4970407485961914, "eval_rewards/accuracies": 0.7415000200271606, "eval_rewards/chosen": -1.7323403358459473, "eval_rewards/margins": 1.0669374465942383, "eval_rewards/margins_max": 3.526780366897583, "eval_rewards/margins_min": -0.9553115367889404, "eval_rewards/margins_std": 1.5147596597671509, "eval_rewards/rejected": -2.7992777824401855, "eval_runtime": 858.5896, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 3100 }, { "epoch": 0.74, "grad_norm": 11.854799455984127, "learning_rate": 9.28703020701193e-07, "logits/chosen": -1.563434362411499, "logits/rejected": -1.541565179824829, "logps/chosen": -448.0342712402344, "logps/rejected": -561.870361328125, "loss": 0.5013, "rewards/accuracies": 0.75, "rewards/chosen": -1.712728500366211, "rewards/margins": 1.2351288795471191, "rewards/margins_max": 2.937047243118286, "rewards/margins_min": -0.44396305084228516, "rewards/margins_std": 1.55026113986969, "rewards/rejected": -2.947857141494751, "step": 3110 }, { "epoch": 0.75, "grad_norm": 13.553868513622607, "learning_rate": 9.125027054323257e-07, "logits/chosen": -1.649717092514038, "logits/rejected": -1.562253713607788, "logps/chosen": -497.48687744140625, "logps/rejected": -561.8211669921875, "loss": 0.4807, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7405760288238525, "rewards/margins": 1.3019063472747803, "rewards/margins_max": 3.358339309692383, "rewards/margins_min": -0.3791218400001526, "rewards/margins_std": 1.6496785879135132, "rewards/rejected": -3.0424821376800537, "step": 3120 }, { "epoch": 0.75, "grad_norm": 14.197166611508152, "learning_rate": 8.964133323340083e-07, "logits/chosen": -1.5067158937454224, "logits/rejected": -1.3952124118804932, "logps/chosen": -421.15869140625, "logps/rejected": -503.23052978515625, "loss": 0.4927, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8762842416763306, "rewards/margins": 1.0793484449386597, "rewards/margins_max": 2.984920024871826, "rewards/margins_min": -0.6503228545188904, "rewards/margins_std": 1.623496413230896, "rewards/rejected": -2.9556326866149902, "step": 3130 }, { "epoch": 0.75, "grad_norm": 4.513696384074523, "learning_rate": 8.804360258112862e-07, "logits/chosen": -1.8032138347625732, "logits/rejected": -1.705063819885254, "logps/chosen": -451.15118408203125, "logps/rejected": -508.64129638671875, "loss": 0.4327, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6817461252212524, "rewards/margins": 1.025824785232544, "rewards/margins_max": 2.5554795265197754, "rewards/margins_min": -0.38372859358787537, "rewards/margins_std": 1.3719170093536377, "rewards/rejected": -2.707570791244507, "step": 3140 }, { "epoch": 0.75, "grad_norm": 6.193160425297152, "learning_rate": 8.645719024374446e-07, "logits/chosen": -1.7389684915542603, "logits/rejected": -1.6230958700180054, "logps/chosen": -474.1650390625, "logps/rejected": -540.5087280273438, "loss": 0.5038, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.623030424118042, "rewards/margins": 1.073654055595398, "rewards/margins_max": 2.717353105545044, "rewards/margins_min": -0.7271397709846497, "rewards/margins_std": 1.549439787864685, "rewards/rejected": -2.6966843605041504, "step": 3150 }, { "epoch": 0.76, "grad_norm": 8.50820526813767, "learning_rate": 8.488220708759668e-07, "logits/chosen": -1.7873504161834717, "logits/rejected": -1.6921641826629639, "logps/chosen": -516.7462768554688, "logps/rejected": -572.9259033203125, "loss": 0.4474, "rewards/accuracies": 0.75, "rewards/chosen": -1.999103307723999, "rewards/margins": 1.1502858400344849, "rewards/margins_max": 2.661588668823242, "rewards/margins_min": -0.6546133756637573, "rewards/margins_std": 1.4813575744628906, "rewards/rejected": -3.1493890285491943, "step": 3160 }, { "epoch": 0.76, "grad_norm": 9.168338230198552, "learning_rate": 8.331876318030585e-07, "logits/chosen": -1.7162977457046509, "logits/rejected": -1.6486173868179321, "logps/chosen": -456.36126708984375, "logps/rejected": -541.7515869140625, "loss": 0.5029, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7549670934677124, "rewards/margins": 1.1238515377044678, "rewards/margins_max": 2.697988748550415, "rewards/margins_min": -0.22190897166728973, "rewards/margins_std": 1.323172688484192, "rewards/rejected": -2.878818988800049, "step": 3170 }, { "epoch": 0.76, "grad_norm": 7.322586797354139, "learning_rate": 8.17669677830727e-07, "logits/chosen": -1.7330348491668701, "logits/rejected": -1.6514427661895752, "logps/chosen": -468.08306884765625, "logps/rejected": -561.5731201171875, "loss": 0.4444, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.696531891822815, "rewards/margins": 1.1889557838439941, "rewards/margins_max": 2.742466449737549, "rewards/margins_min": -0.49170589447021484, "rewards/margins_std": 1.4486651420593262, "rewards/rejected": -2.8854873180389404, "step": 3180 }, { "epoch": 0.76, "grad_norm": 14.489089017654749, "learning_rate": 8.022692934304238e-07, "logits/chosen": -1.7671037912368774, "logits/rejected": -1.6336908340454102, "logps/chosen": -455.9390563964844, "logps/rejected": -528.6905517578125, "loss": 0.4878, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.5576273202896118, "rewards/margins": 1.1882274150848389, "rewards/margins_max": 2.925454616546631, "rewards/margins_min": -0.3569898307323456, "rewards/margins_std": 1.4550426006317139, "rewards/rejected": -2.745854377746582, "step": 3190 }, { "epoch": 0.77, "grad_norm": 9.631336850932627, "learning_rate": 7.86987554857259e-07, "logits/chosen": -1.7733577489852905, "logits/rejected": -1.740085244178772, "logps/chosen": -391.9376525878906, "logps/rejected": -482.2456970214844, "loss": 0.4325, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.3694288730621338, "rewards/margins": 1.1222963333129883, "rewards/margins_max": 2.686767339706421, "rewards/margins_min": -0.44451904296875, "rewards/margins_std": 1.386244535446167, "rewards/rejected": -2.491725206375122, "step": 3200 }, { "epoch": 0.77, "eval_logits/chosen": -1.7599533796310425, "eval_logits/rejected": -1.6999260187149048, "eval_logps/chosen": -424.0385437011719, "eval_logps/rejected": -506.6104431152344, "eval_loss": 0.4972234070301056, "eval_rewards/accuracies": 0.75, "eval_rewards/chosen": -1.3958321809768677, "eval_rewards/margins": 1.0117398500442505, "eval_rewards/margins_max": 3.34749174118042, "eval_rewards/margins_min": -0.9044501185417175, "eval_rewards/margins_std": 1.4383246898651123, "eval_rewards/rejected": -2.407572031021118, "eval_runtime": 858.2922, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 3200 }, { "epoch": 0.77, "grad_norm": 6.253188273706924, "learning_rate": 7.718255300747818e-07, "logits/chosen": -1.7191455364227295, "logits/rejected": -1.6749334335327148, "logps/chosen": -405.9119567871094, "logps/rejected": -538.2217407226562, "loss": 0.4476, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.2882969379425049, "rewards/margins": 1.3046976327896118, "rewards/margins_max": 3.0062499046325684, "rewards/margins_min": -0.28342074155807495, "rewards/margins_std": 1.471156358718872, "rewards/rejected": -2.592994451522827, "step": 3210 }, { "epoch": 0.77, "grad_norm": 17.371777571850824, "learning_rate": 7.567842786803503e-07, "logits/chosen": -1.6931642293930054, "logits/rejected": -1.5891650915145874, "logps/chosen": -436.12322998046875, "logps/rejected": -535.6715698242188, "loss": 0.4393, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -1.5193614959716797, "rewards/margins": 1.150536060333252, "rewards/margins_max": 2.8979475498199463, "rewards/margins_min": -0.5388752222061157, "rewards/margins_std": 1.5520358085632324, "rewards/rejected": -2.6698975563049316, "step": 3220 }, { "epoch": 0.77, "grad_norm": 6.91320229906749, "learning_rate": 7.418648518310797e-07, "logits/chosen": -1.7672685384750366, "logits/rejected": -1.6705958843231201, "logps/chosen": -417.9222106933594, "logps/rejected": -457.97552490234375, "loss": 0.4604, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.3160896301269531, "rewards/margins": 1.0938535928726196, "rewards/margins_max": 2.7365756034851074, "rewards/margins_min": -0.3321813642978668, "rewards/margins_std": 1.3784321546554565, "rewards/rejected": -2.409943103790283, "step": 3230 }, { "epoch": 0.78, "grad_norm": 6.756699804529957, "learning_rate": 7.270682921703853e-07, "logits/chosen": -1.7060285806655884, "logits/rejected": -1.6961209774017334, "logps/chosen": -446.6856994628906, "logps/rejected": -537.2447509765625, "loss": 0.4496, "rewards/accuracies": 0.8125, "rewards/chosen": -1.5772435665130615, "rewards/margins": 1.0725020170211792, "rewards/margins_max": 2.7619194984436035, "rewards/margins_min": -0.6677699685096741, "rewards/margins_std": 1.5119699239730835, "rewards/rejected": -2.6497457027435303, "step": 3240 }, { "epoch": 0.78, "grad_norm": 5.398605675292898, "learning_rate": 7.123956337551116e-07, "logits/chosen": -1.680284857749939, "logits/rejected": -1.6974903345108032, "logps/chosen": -427.00921630859375, "logps/rejected": -478.3519592285156, "loss": 0.5381, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.4999085664749146, "rewards/margins": 0.9045706987380981, "rewards/margins_max": 2.535834789276123, "rewards/margins_min": -0.7897529006004333, "rewards/margins_std": 1.4983514547348022, "rewards/rejected": -2.4044792652130127, "step": 3250 }, { "epoch": 0.78, "grad_norm": 14.315628176332599, "learning_rate": 6.978479019832726e-07, "logits/chosen": -1.653009057044983, "logits/rejected": -1.5728065967559814, "logps/chosen": -475.44647216796875, "logps/rejected": -540.3555908203125, "loss": 0.4753, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.5316696166992188, "rewards/margins": 1.2920219898223877, "rewards/margins_max": 3.180108070373535, "rewards/margins_min": -0.43005838990211487, "rewards/margins_std": 1.6344013214111328, "rewards/rejected": -2.8236916065216064, "step": 3260 }, { "epoch": 0.78, "grad_norm": 8.915574716987104, "learning_rate": 6.834261135223891e-07, "logits/chosen": -1.666823148727417, "logits/rejected": -1.6157718896865845, "logps/chosen": -449.7420349121094, "logps/rejected": -507.77618408203125, "loss": 0.4356, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.5863635540008545, "rewards/margins": 0.9456008076667786, "rewards/margins_max": 2.299577474594116, "rewards/margins_min": -0.3729218542575836, "rewards/margins_std": 1.2145780324935913, "rewards/rejected": -2.5319645404815674, "step": 3270 }, { "epoch": 0.79, "grad_norm": 12.604888306863954, "learning_rate": 6.691312762384397e-07, "logits/chosen": -1.7692794799804688, "logits/rejected": -1.676082968711853, "logps/chosen": -395.3002014160156, "logps/rejected": -466.67254638671875, "loss": 0.467, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.3118970394134521, "rewards/margins": 1.1058380603790283, "rewards/margins_max": 2.493607759475708, "rewards/margins_min": -0.1631411612033844, "rewards/margins_std": 1.2165981531143188, "rewards/rejected": -2.4177350997924805, "step": 3280 }, { "epoch": 0.79, "grad_norm": 11.34273236546858, "learning_rate": 6.549643891254282e-07, "logits/chosen": -1.6171706914901733, "logits/rejected": -1.5927175283432007, "logps/chosen": -442.04681396484375, "logps/rejected": -554.6174926757812, "loss": 0.4605, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.817350149154663, "rewards/margins": 1.0553823709487915, "rewards/margins_max": 2.650606632232666, "rewards/margins_min": -0.3532094955444336, "rewards/margins_std": 1.3306806087493896, "rewards/rejected": -2.872732400894165, "step": 3290 }, { "epoch": 0.79, "grad_norm": 11.060563947154845, "learning_rate": 6.409264422355643e-07, "logits/chosen": -1.7066056728363037, "logits/rejected": -1.6793180704116821, "logps/chosen": -469.4725646972656, "logps/rejected": -584.8623657226562, "loss": 0.4645, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.6153961420059204, "rewards/margins": 1.3554171323776245, "rewards/margins_max": 3.341646671295166, "rewards/margins_min": -0.3783667981624603, "rewards/margins_std": 1.6680500507354736, "rewards/rejected": -2.970813512802124, "step": 3300 }, { "epoch": 0.79, "eval_logits/chosen": -1.650549292564392, "eval_logits/rejected": -1.5845167636871338, "eval_logps/chosen": -456.6324462890625, "eval_logps/rejected": -546.2211303710938, "eval_loss": 0.49698585271835327, "eval_rewards/accuracies": 0.7484999895095825, "eval_rewards/chosen": -1.7217708826065063, "eval_rewards/margins": 1.081907868385315, "eval_rewards/margins_max": 3.5294883251190186, "eval_rewards/margins_min": -0.9806877374649048, "eval_rewards/margins_std": 1.52902090549469, "eval_rewards/rejected": -2.8036789894104004, "eval_runtime": 858.5937, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 3300 }, { "epoch": 0.79, "grad_norm": 13.265245987113575, "learning_rate": 6.270184166100782e-07, "logits/chosen": -1.6866852045059204, "logits/rejected": -1.607731819152832, "logps/chosen": -403.41339111328125, "logps/rejected": -572.7368774414062, "loss": 0.4474, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5276778936386108, "rewards/margins": 1.1753528118133545, "rewards/margins_max": 3.3483657836914062, "rewards/margins_min": -0.46091729402542114, "rewards/margins_std": 1.708674669265747, "rewards/rejected": -2.703030586242676, "step": 3310 }, { "epoch": 0.8, "grad_norm": 6.035917541486708, "learning_rate": 6.132412842106573e-07, "logits/chosen": -1.697725534439087, "logits/rejected": -1.6164137125015259, "logps/chosen": -477.47357177734375, "logps/rejected": -576.3294067382812, "loss": 0.5207, "rewards/accuracies": 0.75, "rewards/chosen": -1.735945701599121, "rewards/margins": 1.064760446548462, "rewards/margins_max": 2.688159465789795, "rewards/margins_min": -0.5810497999191284, "rewards/margins_std": 1.4618176221847534, "rewards/rejected": -2.800706148147583, "step": 3320 }, { "epoch": 0.8, "grad_norm": 6.443783258450331, "learning_rate": 5.995960078515256e-07, "logits/chosen": -1.6920620203018188, "logits/rejected": -1.5482776165008545, "logps/chosen": -453.76678466796875, "logps/rejected": -529.1069946289062, "loss": 0.5146, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.678052544593811, "rewards/margins": 1.0520880222320557, "rewards/margins_max": 2.777189016342163, "rewards/margins_min": -0.46846669912338257, "rewards/margins_std": 1.4209071397781372, "rewards/rejected": -2.7301406860351562, "step": 3330 }, { "epoch": 0.8, "grad_norm": 4.609287687864791, "learning_rate": 5.860835411321494e-07, "logits/chosen": -1.6179773807525635, "logits/rejected": -1.5531318187713623, "logps/chosen": -474.0083923339844, "logps/rejected": -561.9898071289062, "loss": 0.4853, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7620006799697876, "rewards/margins": 1.0599331855773926, "rewards/margins_max": 2.9547011852264404, "rewards/margins_min": -0.6196690201759338, "rewards/margins_std": 1.6309099197387695, "rewards/rejected": -2.821934223175049, "step": 3340 }, { "epoch": 0.8, "grad_norm": 9.420310831631342, "learning_rate": 5.727048283706046e-07, "logits/chosen": -1.7121317386627197, "logits/rejected": -1.6220109462738037, "logps/chosen": -472.6026306152344, "logps/rejected": -524.837646484375, "loss": 0.442, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.704514741897583, "rewards/margins": 1.1313450336456299, "rewards/margins_max": 2.440160036087036, "rewards/margins_min": -0.11581452190876007, "rewards/margins_std": 1.124191164970398, "rewards/rejected": -2.835860013961792, "step": 3350 }, { "epoch": 0.8, "grad_norm": 6.816860098608736, "learning_rate": 5.594608045375743e-07, "logits/chosen": -1.551119089126587, "logits/rejected": -1.502192497253418, "logps/chosen": -442.8492736816406, "logps/rejected": -522.5615844726562, "loss": 0.4435, "rewards/accuracies": 0.75, "rewards/chosen": -1.4069032669067383, "rewards/margins": 1.1684728860855103, "rewards/margins_max": 2.7896196842193604, "rewards/margins_min": -0.37664252519607544, "rewards/margins_std": 1.4177650213241577, "rewards/rejected": -2.575376272201538, "step": 3360 }, { "epoch": 0.81, "grad_norm": 9.840914460156098, "learning_rate": 5.463523951910171e-07, "logits/chosen": -1.578587293624878, "logits/rejected": -1.6227643489837646, "logps/chosen": -448.31134033203125, "logps/rejected": -571.0477294921875, "loss": 0.5147, "rewards/accuracies": 0.8125, "rewards/chosen": -1.7114700078964233, "rewards/margins": 1.0203540325164795, "rewards/margins_max": 2.720900297164917, "rewards/margins_min": -0.7232763767242432, "rewards/margins_std": 1.5306042432785034, "rewards/rejected": -2.7318239212036133, "step": 3370 }, { "epoch": 0.81, "grad_norm": 7.505769163766368, "learning_rate": 5.333805164114744e-07, "logits/chosen": -1.5661756992340088, "logits/rejected": -1.4561227560043335, "logps/chosen": -490.28265380859375, "logps/rejected": -594.8649291992188, "loss": 0.4355, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.7822291851043701, "rewards/margins": 1.3433473110198975, "rewards/margins_max": 2.9801688194274902, "rewards/margins_min": -0.19646409153938293, "rewards/margins_std": 1.421388030052185, "rewards/rejected": -3.1255760192871094, "step": 3380 }, { "epoch": 0.81, "grad_norm": 13.090089709715455, "learning_rate": 5.205460747380589e-07, "logits/chosen": -1.6630405187606812, "logits/rejected": -1.6118190288543701, "logps/chosen": -445.25982666015625, "logps/rejected": -532.396240234375, "loss": 0.4646, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.78061044216156, "rewards/margins": 1.0901373624801636, "rewards/margins_max": 2.672598361968994, "rewards/margins_min": -0.24360835552215576, "rewards/margins_std": 1.300402045249939, "rewards/rejected": -2.8707478046417236, "step": 3390 }, { "epoch": 0.81, "grad_norm": 8.119275702884655, "learning_rate": 5.07849967105098e-07, "logits/chosen": -1.5982238054275513, "logits/rejected": -1.5523476600646973, "logps/chosen": -555.5810546875, "logps/rejected": -632.5535278320312, "loss": 0.4612, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -2.011742353439331, "rewards/margins": 1.0433838367462158, "rewards/margins_max": 2.658341407775879, "rewards/margins_min": -0.8179477453231812, "rewards/margins_std": 1.5378905534744263, "rewards/rejected": -3.055126190185547, "step": 3400 }, { "epoch": 0.81, "eval_logits/chosen": -1.5677586793899536, "eval_logits/rejected": -1.498044729232788, "eval_logps/chosen": -472.3240661621094, "eval_logps/rejected": -565.0458984375, "eval_loss": 0.4980122148990631, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -1.8786872625350952, "eval_rewards/margins": 1.113239049911499, "eval_rewards/margins_max": 3.663968324661255, "eval_rewards/margins_min": -1.0012972354888916, "eval_rewards/margins_std": 1.577628254890442, "eval_rewards/rejected": -2.9919259548187256, "eval_runtime": 858.9046, "eval_samples_per_second": 4.657, "eval_steps_per_second": 0.291, "step": 3400 }, { "epoch": 0.82, "grad_norm": 7.889688991513702, "learning_rate": 4.952930807794503e-07, "logits/chosen": -1.5934724807739258, "logits/rejected": -1.5488868951797485, "logps/chosen": -450.05584716796875, "logps/rejected": -570.4329223632812, "loss": 0.48, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8906238079071045, "rewards/margins": 1.0919045209884644, "rewards/margins_max": 2.9547343254089355, "rewards/margins_min": -0.5420499444007874, "rewards/margins_std": 1.58566153049469, "rewards/rejected": -2.9825284481048584, "step": 3410 }, { "epoch": 0.82, "grad_norm": 4.722133610710559, "learning_rate": 4.828762932985009e-07, "logits/chosen": -1.6002585887908936, "logits/rejected": -1.4672209024429321, "logps/chosen": -498.3033142089844, "logps/rejected": -595.5597534179688, "loss": 0.4066, "rewards/accuracies": 0.875, "rewards/chosen": -1.998109221458435, "rewards/margins": 1.490399718284607, "rewards/margins_max": 3.4044978618621826, "rewards/margins_min": -0.23177340626716614, "rewards/margins_std": 1.6151764392852783, "rewards/rejected": -3.488508939743042, "step": 3420 }, { "epoch": 0.82, "grad_norm": 11.51770779163274, "learning_rate": 4.7060047240883285e-07, "logits/chosen": -1.5348848104476929, "logits/rejected": -1.4140112400054932, "logps/chosen": -520.8869018554688, "logps/rejected": -607.1759033203125, "loss": 0.5668, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.2268495559692383, "rewards/margins": 0.9183933138847351, "rewards/margins_max": 2.911619186401367, "rewards/margins_min": -0.7192481160163879, "rewards/margins_std": 1.6191437244415283, "rewards/rejected": -3.145242691040039, "step": 3430 }, { "epoch": 0.82, "grad_norm": 10.479706908925108, "learning_rate": 4.5846647600558817e-07, "logits/chosen": -1.5732555389404297, "logits/rejected": -1.51780104637146, "logps/chosen": -420.404052734375, "logps/rejected": -515.4987182617188, "loss": 0.4422, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.81316339969635, "rewards/margins": 1.178539752960205, "rewards/margins_max": 2.7211461067199707, "rewards/margins_min": -0.4756883978843689, "rewards/margins_std": 1.4252105951309204, "rewards/rejected": -2.9917032718658447, "step": 3440 }, { "epoch": 0.83, "grad_norm": 7.881857656336233, "learning_rate": 4.464751520725094e-07, "logits/chosen": -1.7074670791625977, "logits/rejected": -1.6189085245132446, "logps/chosen": -495.5066833496094, "logps/rejected": -566.2218627929688, "loss": 0.4839, "rewards/accuracies": 0.75, "rewards/chosen": -1.9538028240203857, "rewards/margins": 1.0302854776382446, "rewards/margins_max": 2.534728765487671, "rewards/margins_min": -0.3840886950492859, "rewards/margins_std": 1.336159348487854, "rewards/rejected": -2.98408842086792, "step": 3450 }, { "epoch": 0.83, "grad_norm": 7.585243012590868, "learning_rate": 4.3462733862268124e-07, "logits/chosen": -1.5768866539001465, "logits/rejected": -1.555135726928711, "logps/chosen": -497.5037536621094, "logps/rejected": -571.2515258789062, "loss": 0.5288, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9346675872802734, "rewards/margins": 1.0581872463226318, "rewards/margins_max": 2.6524899005889893, "rewards/margins_min": -0.4539794921875, "rewards/margins_std": 1.4103864431381226, "rewards/rejected": -2.9928548336029053, "step": 3460 }, { "epoch": 0.83, "grad_norm": 7.118866539219756, "learning_rate": 4.229238636399649e-07, "logits/chosen": -1.6214975118637085, "logits/rejected": -1.5408142805099487, "logps/chosen": -513.720703125, "logps/rejected": -585.4041748046875, "loss": 0.5025, "rewards/accuracies": 0.6875, "rewards/chosen": -2.1007537841796875, "rewards/margins": 1.0723035335540771, "rewards/margins_max": 2.867311954498291, "rewards/margins_min": -0.6651767492294312, "rewards/margins_std": 1.6020774841308594, "rewards/rejected": -3.1730570793151855, "step": 3470 }, { "epoch": 0.83, "grad_norm": 12.157780018993368, "learning_rate": 4.113655450211368e-07, "logits/chosen": -1.547278642654419, "logits/rejected": -1.5706777572631836, "logps/chosen": -512.8460693359375, "logps/rejected": -630.23779296875, "loss": 0.4838, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -2.4057161808013916, "rewards/margins": 0.9649950265884399, "rewards/margins_max": 3.0158610343933105, "rewards/margins_min": -0.6118464469909668, "rewards/margins_std": 1.6102758646011353, "rewards/rejected": -3.370711088180542, "step": 3480 }, { "epoch": 0.84, "grad_norm": 7.7501939318603705, "learning_rate": 3.999531905187257e-07, "logits/chosen": -1.6263949871063232, "logits/rejected": -1.5438145399093628, "logps/chosen": -497.8841857910156, "logps/rejected": -609.7381591796875, "loss": 0.4455, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9176429510116577, "rewards/margins": 1.3525667190551758, "rewards/margins_max": 2.888054609298706, "rewards/margins_min": -0.34442782402038574, "rewards/margins_std": 1.4610884189605713, "rewards/rejected": -3.270209789276123, "step": 3490 }, { "epoch": 0.84, "grad_norm": 9.216378220453054, "learning_rate": 3.886875976845661e-07, "logits/chosen": -1.7654060125350952, "logits/rejected": -1.686044692993164, "logps/chosen": -509.27508544921875, "logps/rejected": -588.5357666015625, "loss": 0.4023, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.9301483631134033, "rewards/margins": 1.2077137231826782, "rewards/margins_max": 2.7348074913024902, "rewards/margins_min": -0.27688318490982056, "rewards/margins_std": 1.327042818069458, "rewards/rejected": -3.1378626823425293, "step": 3500 }, { "epoch": 0.84, "eval_logits/chosen": -1.5625070333480835, "eval_logits/rejected": -1.492349624633789, "eval_logps/chosen": -490.86077880859375, "eval_logps/rejected": -585.3400268554688, "eval_loss": 0.4987245500087738, "eval_rewards/accuracies": 0.7409999966621399, "eval_rewards/chosen": -2.0640547275543213, "eval_rewards/margins": 1.130812168121338, "eval_rewards/margins_max": 3.7330985069274902, "eval_rewards/margins_min": -1.0134087800979614, "eval_rewards/margins_std": 1.6034061908721924, "eval_rewards/rejected": -3.19486665725708, "eval_runtime": 858.7578, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 3500 }, { "epoch": 0.84, "grad_norm": 10.706715937873593, "learning_rate": 3.7756955381406084e-07, "logits/chosen": -1.5635570287704468, "logits/rejected": -1.4497935771942139, "logps/chosen": -464.245849609375, "logps/rejected": -547.8213500976562, "loss": 0.5222, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.114765167236328, "rewards/margins": 1.2438435554504395, "rewards/margins_max": 3.47216796875, "rewards/margins_min": -0.7468990087509155, "rewards/margins_std": 1.8321501016616821, "rewards/rejected": -3.3586087226867676, "step": 3510 }, { "epoch": 0.84, "grad_norm": 9.452349771285999, "learning_rate": 3.6659983589115934e-07, "logits/chosen": -1.5787897109985352, "logits/rejected": -1.4548214673995972, "logps/chosen": -454.0419921875, "logps/rejected": -580.2047729492188, "loss": 0.4932, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0241169929504395, "rewards/margins": 1.2483015060424805, "rewards/margins_max": 3.2758662700653076, "rewards/margins_min": -0.3563303053379059, "rewards/margins_std": 1.6317192316055298, "rewards/rejected": -3.27241849899292, "step": 3520 }, { "epoch": 0.85, "grad_norm": 8.359051906723446, "learning_rate": 3.5577921053406213e-07, "logits/chosen": -1.7231124639511108, "logits/rejected": -1.6327896118164062, "logps/chosen": -473.8720703125, "logps/rejected": -566.1983032226562, "loss": 0.4717, "rewards/accuracies": 0.75, "rewards/chosen": -1.9246323108673096, "rewards/margins": 1.1547865867614746, "rewards/margins_max": 3.191124439239502, "rewards/margins_min": -0.6173357367515564, "rewards/margins_std": 1.6585915088653564, "rewards/rejected": -3.079418659210205, "step": 3530 }, { "epoch": 0.85, "grad_norm": 11.948146047388548, "learning_rate": 3.451084339416397e-07, "logits/chosen": -1.5407123565673828, "logits/rejected": -1.5107948780059814, "logps/chosen": -477.5166931152344, "logps/rejected": -639.1512451171875, "loss": 0.5105, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.8434597253799438, "rewards/margins": 1.6794121265411377, "rewards/margins_max": 3.466728925704956, "rewards/margins_min": -0.0010077148908749223, "rewards/margins_std": 1.5939804315567017, "rewards/rejected": -3.522871732711792, "step": 3540 }, { "epoch": 0.85, "grad_norm": 8.85474200604695, "learning_rate": 3.3458825184059183e-07, "logits/chosen": -1.5605331659317017, "logits/rejected": -1.579959511756897, "logps/chosen": -414.41241455078125, "logps/rejected": -562.7239990234375, "loss": 0.4689, "rewards/accuracies": 0.875, "rewards/chosen": -1.786630630493164, "rewards/margins": 1.3613005876541138, "rewards/margins_max": 3.182543992996216, "rewards/margins_min": -0.10906486213207245, "rewards/margins_std": 1.4642200469970703, "rewards/rejected": -3.147930860519409, "step": 3550 }, { "epoch": 0.85, "grad_norm": 13.755109912539947, "learning_rate": 3.242193994333279e-07, "logits/chosen": -1.5289247035980225, "logits/rejected": -1.4496179819107056, "logps/chosen": -489.3772888183594, "logps/rejected": -548.5111694335938, "loss": 0.5472, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -2.333698272705078, "rewards/margins": 0.8207733035087585, "rewards/margins_max": 2.586226224899292, "rewards/margins_min": -0.9457889795303345, "rewards/margins_std": 1.5833828449249268, "rewards/rejected": -3.1544718742370605, "step": 3560 }, { "epoch": 0.85, "grad_norm": 4.8555422076854855, "learning_rate": 3.1400260134659105e-07, "logits/chosen": -1.5545639991760254, "logits/rejected": -1.616835355758667, "logps/chosen": -471.56427001953125, "logps/rejected": -574.0294799804688, "loss": 0.4345, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7827537059783936, "rewards/margins": 1.061098337173462, "rewards/margins_max": 2.390902042388916, "rewards/margins_min": -0.28154340386390686, "rewards/margins_std": 1.2370855808258057, "rewards/rejected": -2.8438525199890137, "step": 3570 }, { "epoch": 0.86, "grad_norm": 11.990988844818533, "learning_rate": 3.0393857158081214e-07, "logits/chosen": -1.5989338159561157, "logits/rejected": -1.4652072191238403, "logps/chosen": -433.7322692871094, "logps/rejected": -521.4385986328125, "loss": 0.4284, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.7932720184326172, "rewards/margins": 1.3295361995697021, "rewards/margins_max": 3.2473080158233643, "rewards/margins_min": -0.2499588280916214, "rewards/margins_std": 1.5861493349075317, "rewards/rejected": -3.1228089332580566, "step": 3580 }, { "epoch": 0.86, "grad_norm": 5.993209174839379, "learning_rate": 2.940280134602194e-07, "logits/chosen": -1.6760985851287842, "logits/rejected": -1.5036050081253052, "logps/chosen": -536.2041015625, "logps/rejected": -601.5537719726562, "loss": 0.476, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -2.0729622840881348, "rewards/margins": 1.2368301153182983, "rewards/margins_max": 3.1068215370178223, "rewards/margins_min": -0.7157915234565735, "rewards/margins_std": 1.7304604053497314, "rewards/rejected": -3.3097922801971436, "step": 3590 }, { "epoch": 0.86, "grad_norm": 18.244762766564083, "learning_rate": 2.8427161958368007e-07, "logits/chosen": -1.5737125873565674, "logits/rejected": -1.4657363891601562, "logps/chosen": -472.1244201660156, "logps/rejected": -570.1707763671875, "loss": 0.4564, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.7630841732025146, "rewards/margins": 1.4402949810028076, "rewards/margins_max": 2.9757933616638184, "rewards/margins_min": -0.11270264536142349, "rewards/margins_std": 1.389777421951294, "rewards/rejected": -3.2033791542053223, "step": 3600 }, { "epoch": 0.86, "eval_logits/chosen": -1.595453143119812, "eval_logits/rejected": -1.5267976522445679, "eval_logps/chosen": -473.3586730957031, "eval_logps/rejected": -564.1885375976562, "eval_loss": 0.4952239394187927, "eval_rewards/accuracies": 0.7444999814033508, "eval_rewards/chosen": -1.8890334367752075, "eval_rewards/margins": 1.0943188667297363, "eval_rewards/margins_max": 3.5912556648254395, "eval_rewards/margins_min": -0.9689545631408691, "eval_rewards/margins_std": 1.5434987545013428, "eval_rewards/rejected": -2.9833521842956543, "eval_runtime": 858.5704, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 3600 }, { "epoch": 0.86, "grad_norm": 5.769227760509782, "learning_rate": 2.746700717763018e-07, "logits/chosen": -1.685681939125061, "logits/rejected": -1.6469557285308838, "logps/chosen": -516.54052734375, "logps/rejected": -625.3527221679688, "loss": 0.4431, "rewards/accuracies": 0.8125, "rewards/chosen": -1.9687515497207642, "rewards/margins": 1.1721570491790771, "rewards/margins_max": 2.89247989654541, "rewards/margins_min": -0.5372272729873657, "rewards/margins_std": 1.5280717611312866, "rewards/rejected": -3.1409084796905518, "step": 3610 }, { "epoch": 0.87, "grad_norm": 10.50364694385211, "learning_rate": 2.6522404104178195e-07, "logits/chosen": -1.6576299667358398, "logits/rejected": -1.5381555557250977, "logps/chosen": -499.1231384277344, "logps/rejected": -535.855712890625, "loss": 0.5049, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.9912456274032593, "rewards/margins": 0.8773018717765808, "rewards/margins_max": 2.3392295837402344, "rewards/margins_min": -0.6671187281608582, "rewards/margins_std": 1.3481765985488892, "rewards/rejected": -2.8685474395751953, "step": 3620 }, { "epoch": 0.87, "grad_norm": 12.744088391894875, "learning_rate": 2.559341875155144e-07, "logits/chosen": -1.686824083328247, "logits/rejected": -1.5752757787704468, "logps/chosen": -518.01904296875, "logps/rejected": -568.8172607421875, "loss": 0.3939, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8957996368408203, "rewards/margins": 1.2966911792755127, "rewards/margins_max": 2.677760124206543, "rewards/margins_min": -0.12015140056610107, "rewards/margins_std": 1.239686131477356, "rewards/rejected": -3.192491054534912, "step": 3630 }, { "epoch": 0.87, "grad_norm": 5.75817356504437, "learning_rate": 2.4680116041845835e-07, "logits/chosen": -1.584856390953064, "logits/rejected": -1.5600625276565552, "logps/chosen": -458.5347595214844, "logps/rejected": -579.8786010742188, "loss": 0.421, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.802154541015625, "rewards/margins": 1.328449010848999, "rewards/margins_max": 3.085721492767334, "rewards/margins_min": -0.179740771651268, "rewards/margins_std": 1.4748588800430298, "rewards/rejected": -3.130603551864624, "step": 3640 }, { "epoch": 0.87, "grad_norm": 5.629724510594239, "learning_rate": 2.378255980117636e-07, "logits/chosen": -1.6194804906845093, "logits/rejected": -1.5626702308654785, "logps/chosen": -488.17669677734375, "logps/rejected": -619.7413940429688, "loss": 0.4344, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8197624683380127, "rewards/margins": 1.3161327838897705, "rewards/margins_max": 2.9797418117523193, "rewards/margins_min": -0.1867613047361374, "rewards/margins_std": 1.428399682044983, "rewards/rejected": -3.135895013809204, "step": 3650 }, { "epoch": 0.88, "grad_norm": 5.28591748808056, "learning_rate": 2.2900812755216883e-07, "logits/chosen": -1.496659755706787, "logits/rejected": -1.479811668395996, "logps/chosen": -471.2086486816406, "logps/rejected": -543.2459716796875, "loss": 0.5246, "rewards/accuracies": 0.6875, "rewards/chosen": -2.201799154281616, "rewards/margins": 0.8525907397270203, "rewards/margins_max": 2.6183009147644043, "rewards/margins_min": -0.5536326169967651, "rewards/margins_std": 1.4253017902374268, "rewards/rejected": -3.0543899536132812, "step": 3660 }, { "epoch": 0.88, "grad_norm": 6.470142748995791, "learning_rate": 2.203493652481639e-07, "logits/chosen": -1.5702474117279053, "logits/rejected": -1.588547945022583, "logps/chosen": -492.46429443359375, "logps/rejected": -610.8482666015625, "loss": 0.5308, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.929785966873169, "rewards/margins": 0.9548202753067017, "rewards/margins_max": 3.0041441917419434, "rewards/margins_min": -0.8008396029472351, "rewards/margins_std": 1.7058013677597046, "rewards/rejected": -2.884606122970581, "step": 3670 }, { "epoch": 0.88, "grad_norm": 8.73507594808698, "learning_rate": 2.1184991621692852e-07, "logits/chosen": -1.6459605693817139, "logits/rejected": -1.5309419631958008, "logps/chosen": -550.017333984375, "logps/rejected": -595.8959350585938, "loss": 0.4813, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8764235973358154, "rewards/margins": 1.3332127332687378, "rewards/margins_max": 2.8086400032043457, "rewards/margins_min": -0.1475955992937088, "rewards/margins_std": 1.339147686958313, "rewards/rejected": -3.2096359729766846, "step": 3680 }, { "epoch": 0.88, "grad_norm": 10.896204828429298, "learning_rate": 2.035103744420408e-07, "logits/chosen": -1.7131307125091553, "logits/rejected": -1.604412317276001, "logps/chosen": -548.2545776367188, "logps/rejected": -604.6376953125, "loss": 0.5001, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.94809091091156, "rewards/margins": 1.172905683517456, "rewards/margins_max": 3.539712905883789, "rewards/margins_min": -0.5458081960678101, "rewards/margins_std": 1.8031011819839478, "rewards/rejected": -3.1209967136383057, "step": 3690 }, { "epoch": 0.89, "grad_norm": 8.115342254472486, "learning_rate": 1.9533132273196892e-07, "logits/chosen": -1.5727499723434448, "logits/rejected": -1.4796936511993408, "logps/chosen": -476.2752990722656, "logps/rejected": -572.4835815429688, "loss": 0.4337, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.6234632730484009, "rewards/margins": 1.432694673538208, "rewards/margins_max": 3.4879558086395264, "rewards/margins_min": -0.5784531831741333, "rewards/margins_std": 1.7987644672393799, "rewards/rejected": -3.0561575889587402, "step": 3700 }, { "epoch": 0.89, "eval_logits/chosen": -1.6174299716949463, "eval_logits/rejected": -1.5500813722610474, "eval_logps/chosen": -463.4457092285156, "eval_logps/rejected": -553.7645874023438, "eval_loss": 0.49475446343421936, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -1.789903998374939, "eval_rewards/margins": 1.0892086029052734, "eval_rewards/margins_max": 3.5650243759155273, "eval_rewards/margins_min": -0.9670560359954834, "eval_rewards/margins_std": 1.5348496437072754, "eval_rewards/rejected": -2.879112720489502, "eval_runtime": 858.3469, "eval_samples_per_second": 4.66, "eval_steps_per_second": 0.291, "step": 3700 }, { "epoch": 0.89, "grad_norm": 10.345705890421096, "learning_rate": 1.873133326793397e-07, "logits/chosen": -1.638527274131775, "logits/rejected": -1.5753681659698486, "logps/chosen": -458.96087646484375, "logps/rejected": -542.0418701171875, "loss": 0.5058, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.7274707555770874, "rewards/margins": 1.00150465965271, "rewards/margins_max": 2.298344135284424, "rewards/margins_min": -0.3573911488056183, "rewards/margins_std": 1.198754906654358, "rewards/rejected": -2.728975772857666, "step": 3710 }, { "epoch": 0.89, "grad_norm": 4.451858715245556, "learning_rate": 1.794569646209948e-07, "logits/chosen": -1.5639150142669678, "logits/rejected": -1.4909937381744385, "logps/chosen": -492.00604248046875, "logps/rejected": -525.77392578125, "loss": 0.556, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.912846326828003, "rewards/margins": 0.7676688432693481, "rewards/margins_max": 2.4897422790527344, "rewards/margins_min": -0.8197135925292969, "rewards/margins_std": 1.4892566204071045, "rewards/rejected": -2.6805148124694824, "step": 3720 }, { "epoch": 0.89, "grad_norm": 12.562315982211006, "learning_rate": 1.717627675988315e-07, "logits/chosen": -1.6141542196273804, "logits/rejected": -1.5782029628753662, "logps/chosen": -471.81854248046875, "logps/rejected": -541.2411499023438, "loss": 0.4798, "rewards/accuracies": 0.75, "rewards/chosen": -1.7173553705215454, "rewards/margins": 1.1299710273742676, "rewards/margins_max": 2.905287504196167, "rewards/margins_min": -0.3574836850166321, "rewards/margins_std": 1.4663981199264526, "rewards/rejected": -2.8473262786865234, "step": 3730 }, { "epoch": 0.9, "grad_norm": 7.071849474690181, "learning_rate": 1.642312793214293e-07, "logits/chosen": -1.5640629529953003, "logits/rejected": -1.4748176336288452, "logps/chosen": -431.48345947265625, "logps/rejected": -581.3865356445312, "loss": 0.4427, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.6791887283325195, "rewards/margins": 1.340438723564148, "rewards/margins_max": 3.3646247386932373, "rewards/margins_min": -0.27356553077697754, "rewards/margins_std": 1.654958963394165, "rewards/rejected": -3.019627571105957, "step": 3740 }, { "epoch": 0.9, "grad_norm": 7.99582181541749, "learning_rate": 1.5686302612647891e-07, "logits/chosen": -1.6436965465545654, "logits/rejected": -1.5594273805618286, "logps/chosen": -465.6665954589844, "logps/rejected": -509.5194396972656, "loss": 0.5389, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.9255895614624023, "rewards/margins": 0.9076477289199829, "rewards/margins_max": 2.54972505569458, "rewards/margins_min": -0.5376116037368774, "rewards/margins_std": 1.3574801683425903, "rewards/rejected": -2.8332371711730957, "step": 3750 }, { "epoch": 0.9, "grad_norm": 5.282662106945439, "learning_rate": 1.4965852294399203e-07, "logits/chosen": -1.580666422843933, "logits/rejected": -1.5030317306518555, "logps/chosen": -388.2557373046875, "logps/rejected": -545.5868530273438, "loss": 0.415, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.532097578048706, "rewards/margins": 1.4141669273376465, "rewards/margins_max": 3.018325090408325, "rewards/margins_min": -0.08981673419475555, "rewards/margins_std": 1.4211039543151855, "rewards/rejected": -2.9462647438049316, "step": 3760 }, { "epoch": 0.9, "grad_norm": 6.315098293865397, "learning_rate": 1.4261827326032123e-07, "logits/chosen": -1.685455083847046, "logits/rejected": -1.594001293182373, "logps/chosen": -478.80194091796875, "logps/rejected": -560.7512817382812, "loss": 0.4722, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8066685199737549, "rewards/margins": 1.1716893911361694, "rewards/margins_max": 2.9957664012908936, "rewards/margins_min": -0.5524840354919434, "rewards/margins_std": 1.573940396308899, "rewards/rejected": -2.9783577919006348, "step": 3770 }, { "epoch": 0.91, "grad_norm": 9.399769118699846, "learning_rate": 1.3574276908296908e-07, "logits/chosen": -1.550453782081604, "logits/rejected": -1.4576085805892944, "logps/chosen": -412.32958984375, "logps/rejected": -544.8856201171875, "loss": 0.4337, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8018356561660767, "rewards/margins": 1.3136392831802368, "rewards/margins_max": 2.730268955230713, "rewards/margins_min": -0.064711794257164, "rewards/margins_std": 1.2573935985565186, "rewards/rejected": -3.1154751777648926, "step": 3780 }, { "epoch": 0.91, "grad_norm": 10.681113314660836, "learning_rate": 1.290324909062085e-07, "logits/chosen": -1.7514564990997314, "logits/rejected": -1.6136010885238647, "logps/chosen": -505.0232849121094, "logps/rejected": -564.8137817382812, "loss": 0.4831, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6792089939117432, "rewards/margins": 1.144392490386963, "rewards/margins_max": 2.667393445968628, "rewards/margins_min": -0.476982444524765, "rewards/margins_std": 1.4074592590332031, "rewards/rejected": -2.823601722717285, "step": 3790 }, { "epoch": 0.91, "grad_norm": 11.804657770065145, "learning_rate": 1.2248790767750013e-07, "logits/chosen": -1.546578049659729, "logits/rejected": -1.5517480373382568, "logps/chosen": -406.1092834472656, "logps/rejected": -539.877197265625, "loss": 0.4687, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8440258502960205, "rewards/margins": 1.0885521173477173, "rewards/margins_max": 2.5944013595581055, "rewards/margins_min": -0.3807124197483063, "rewards/margins_std": 1.3260362148284912, "rewards/rejected": -2.9325778484344482, "step": 3800 }, { "epoch": 0.91, "eval_logits/chosen": -1.6238332986831665, "eval_logits/rejected": -1.557306170463562, "eval_logps/chosen": -464.16265869140625, "eval_logps/rejected": -554.9319458007812, "eval_loss": 0.4949202239513397, "eval_rewards/accuracies": 0.7475000023841858, "eval_rewards/chosen": -1.7970731258392334, "eval_rewards/margins": 1.0937137603759766, "eval_rewards/margins_max": 3.58449125289917, "eval_rewards/margins_min": -0.9701796174049377, "eval_rewards/margins_std": 1.5426721572875977, "eval_rewards/rejected": -2.890786647796631, "eval_runtime": 858.7373, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 3800 }, { "epoch": 0.91, "grad_norm": 10.856134647509826, "learning_rate": 1.1610947676472279e-07, "logits/chosen": -1.6303218603134155, "logits/rejected": -1.571274995803833, "logps/chosen": -476.108642578125, "logps/rejected": -567.3362426757812, "loss": 0.5047, "rewards/accuracies": 0.75, "rewards/chosen": -1.900127649307251, "rewards/margins": 1.1418402194976807, "rewards/margins_max": 3.1849331855773926, "rewards/margins_min": -0.4592459201812744, "rewards/margins_std": 1.624070167541504, "rewards/rejected": -3.0419678688049316, "step": 3810 }, { "epoch": 0.91, "grad_norm": 9.489856725803058, "learning_rate": 1.0989764392420694e-07, "logits/chosen": -1.6528451442718506, "logits/rejected": -1.5458862781524658, "logps/chosen": -488.4986267089844, "logps/rejected": -579.4044189453125, "loss": 0.4283, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7201915979385376, "rewards/margins": 1.1673822402954102, "rewards/margins_max": 3.2683162689208984, "rewards/margins_min": -0.5650268197059631, "rewards/margins_std": 1.6839526891708374, "rewards/rejected": -2.8875739574432373, "step": 3820 }, { "epoch": 0.92, "grad_norm": 4.684515040812927, "learning_rate": 1.0385284326958594e-07, "logits/chosen": -1.7365949153900146, "logits/rejected": -1.5946829319000244, "logps/chosen": -488.33197021484375, "logps/rejected": -554.3961791992188, "loss": 0.4623, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.6608030796051025, "rewards/margins": 1.1060221195220947, "rewards/margins_max": 2.7679672241210938, "rewards/margins_min": -0.3001128137111664, "rewards/margins_std": 1.347150206565857, "rewards/rejected": -2.7668251991271973, "step": 3830 }, { "epoch": 0.92, "grad_norm": 9.992888783175271, "learning_rate": 9.797549724145733e-08, "logits/chosen": -1.746307373046875, "logits/rejected": -1.6239124536514282, "logps/chosen": -520.9617919921875, "logps/rejected": -572.9327392578125, "loss": 0.467, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -1.8659394979476929, "rewards/margins": 1.1183037757873535, "rewards/margins_max": 3.1687064170837402, "rewards/margins_min": -0.4290328919887543, "rewards/margins_std": 1.6328144073486328, "rewards/rejected": -2.984243392944336, "step": 3840 }, { "epoch": 0.92, "grad_norm": 6.814149709317101, "learning_rate": 9.226601657785994e-08, "logits/chosen": -1.6548357009887695, "logits/rejected": -1.6614749431610107, "logps/chosen": -441.4059143066406, "logps/rejected": -591.0307006835938, "loss": 0.4868, "rewards/accuracies": 0.6875, "rewards/chosen": -1.784348726272583, "rewards/margins": 1.1438257694244385, "rewards/margins_max": 3.282926559448242, "rewards/margins_min": -0.5800758600234985, "rewards/margins_std": 1.7275254726409912, "rewards/rejected": -2.9281744956970215, "step": 3850 }, { "epoch": 0.92, "grad_norm": 9.635822107531265, "learning_rate": 8.672480028556973e-08, "logits/chosen": -1.4599096775054932, "logits/rejected": -1.4498183727264404, "logps/chosen": -446.60455322265625, "logps/rejected": -573.4720458984375, "loss": 0.4955, "rewards/accuracies": 0.8125, "rewards/chosen": -1.8060026168823242, "rewards/margins": 1.0216182470321655, "rewards/margins_max": 2.609518527984619, "rewards/margins_min": -0.44868722558021545, "rewards/margins_std": 1.38383948802948, "rewards/rejected": -2.8276209831237793, "step": 3860 }, { "epoch": 0.93, "grad_norm": 6.757780041919884, "learning_rate": 8.135223561221512e-08, "logits/chosen": -1.7357505559921265, "logits/rejected": -1.5997146368026733, "logps/chosen": -447.9969787597656, "logps/rejected": -545.6727905273438, "loss": 0.4922, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.6534388065338135, "rewards/margins": 1.1515710353851318, "rewards/margins_max": 2.689906597137451, "rewards/margins_min": -0.2146696150302887, "rewards/margins_std": 1.283358097076416, "rewards/rejected": -2.8050098419189453, "step": 3870 }, { "epoch": 0.93, "grad_norm": 7.502165163952124, "learning_rate": 7.614869801921527e-08, "logits/chosen": -1.6426407098770142, "logits/rejected": -1.5870745182037354, "logps/chosen": -456.63641357421875, "logps/rejected": -538.5047607421875, "loss": 0.4762, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8083254098892212, "rewards/margins": 0.9464817047119141, "rewards/margins_max": 2.507932186126709, "rewards/margins_min": -0.535599410533905, "rewards/margins_std": 1.3531973361968994, "rewards/rejected": -2.7548067569732666, "step": 3880 }, { "epoch": 0.93, "grad_norm": 18.113001146879967, "learning_rate": 7.111455115553944e-08, "logits/chosen": -1.6291415691375732, "logits/rejected": -1.5897624492645264, "logps/chosen": -426.6095275878906, "logps/rejected": -583.9949951171875, "loss": 0.4925, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.63641357421875, "rewards/margins": 1.2930591106414795, "rewards/margins_max": 3.2561588287353516, "rewards/margins_min": -0.5704279541969299, "rewards/margins_std": 1.7241647243499756, "rewards/rejected": -2.9294726848602295, "step": 3890 }, { "epoch": 0.93, "grad_norm": 8.288461445868494, "learning_rate": 6.625014683229431e-08, "logits/chosen": -1.6557000875473022, "logits/rejected": -1.5877348184585571, "logps/chosen": -476.70751953125, "logps/rejected": -528.3411865234375, "loss": 0.4624, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.8729069232940674, "rewards/margins": 1.0431404113769531, "rewards/margins_max": 2.7050397396087646, "rewards/margins_min": -0.6716644167900085, "rewards/margins_std": 1.514846920967102, "rewards/rejected": -2.9160473346710205, "step": 3900 }, { "epoch": 0.93, "eval_logits/chosen": -1.6287753582000732, "eval_logits/rejected": -1.5625290870666504, "eval_logps/chosen": -460.3305969238281, "eval_logps/rejected": -550.8040161132812, "eval_loss": 0.4946449100971222, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -1.758752703666687, "eval_rewards/margins": 1.0907542705535889, "eval_rewards/margins_max": 3.578942060470581, "eval_rewards/margins_min": -0.9632537961006165, "eval_rewards/margins_std": 1.5385786294937134, "eval_rewards/rejected": -2.8495070934295654, "eval_runtime": 858.2303, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 3900 }, { "epoch": 0.94, "grad_norm": 16.08380779824901, "learning_rate": 6.155582499813655e-08, "logits/chosen": -1.6376491785049438, "logits/rejected": -1.5388031005859375, "logps/chosen": -459.8504943847656, "logps/rejected": -551.7870483398438, "loss": 0.542, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7968746423721313, "rewards/margins": 0.9826971888542175, "rewards/margins_max": 2.5308897495269775, "rewards/margins_min": -0.28135213255882263, "rewards/margins_std": 1.230656623840332, "rewards/rejected": -2.779571771621704, "step": 3910 }, { "epoch": 0.94, "grad_norm": 5.6024725904904935, "learning_rate": 5.7031913715518416e-08, "logits/chosen": -1.7070891857147217, "logits/rejected": -1.4941651821136475, "logps/chosen": -546.1607055664062, "logps/rejected": -553.6309814453125, "loss": 0.4534, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7424392700195312, "rewards/margins": 1.1457065343856812, "rewards/margins_max": 2.5940098762512207, "rewards/margins_min": -0.4168698787689209, "rewards/margins_std": 1.3514846563339233, "rewards/rejected": -2.888145923614502, "step": 3920 }, { "epoch": 0.94, "grad_norm": 7.138156455828818, "learning_rate": 5.267872913775757e-08, "logits/chosen": -1.7731412649154663, "logits/rejected": -1.6890032291412354, "logps/chosen": -451.65704345703125, "logps/rejected": -510.294677734375, "loss": 0.4781, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.6432549953460693, "rewards/margins": 1.082852840423584, "rewards/margins_max": 2.8969006538391113, "rewards/margins_min": -0.5714850425720215, "rewards/margins_std": 1.557543396949768, "rewards/rejected": -2.7261078357696533, "step": 3930 }, { "epoch": 0.94, "grad_norm": 25.60345059027994, "learning_rate": 4.849657548694375e-08, "logits/chosen": -1.7023370265960693, "logits/rejected": -1.5279595851898193, "logps/chosen": -524.5924682617188, "logps/rejected": -572.289306640625, "loss": 0.4321, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.8240480422973633, "rewards/margins": 1.2655783891677856, "rewards/margins_max": 2.688533067703247, "rewards/margins_min": -0.3931960165500641, "rewards/margins_std": 1.4061410427093506, "rewards/rejected": -3.0896263122558594, "step": 3940 }, { "epoch": 0.95, "grad_norm": 9.28738847668153, "learning_rate": 4.4485745032680775e-08, "logits/chosen": -1.5178053379058838, "logits/rejected": -1.4593212604522705, "logps/chosen": -450.4862365722656, "logps/rejected": -587.2463989257812, "loss": 0.4492, "rewards/accuracies": 0.75, "rewards/chosen": -1.8849847316741943, "rewards/margins": 1.383073091506958, "rewards/margins_max": 3.138340950012207, "rewards/margins_min": -0.4815470576286316, "rewards/margins_std": 1.668229103088379, "rewards/rejected": -3.2680580615997314, "step": 3950 }, { "epoch": 0.95, "grad_norm": 14.957740254806989, "learning_rate": 4.0646518071657815e-08, "logits/chosen": -1.5812273025512695, "logits/rejected": -1.4790027141571045, "logps/chosen": -447.2183532714844, "logps/rejected": -550.7565307617188, "loss": 0.4039, "rewards/accuracies": 0.8125, "rewards/chosen": -1.937771201133728, "rewards/margins": 1.4771236181259155, "rewards/margins_max": 3.0331015586853027, "rewards/margins_min": -0.13048198819160461, "rewards/margins_std": 1.42926025390625, "rewards/rejected": -3.4148945808410645, "step": 3960 }, { "epoch": 0.95, "grad_norm": 5.984022802442056, "learning_rate": 3.697916290806292e-08, "logits/chosen": -1.725451111793518, "logits/rejected": -1.598273515701294, "logps/chosen": -450.244140625, "logps/rejected": -505.60321044921875, "loss": 0.4344, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.5799996852874756, "rewards/margins": 1.0841538906097412, "rewards/margins_max": 2.7669124603271484, "rewards/margins_min": -0.19554653763771057, "rewards/margins_std": 1.3304919004440308, "rewards/rejected": -2.664153575897217, "step": 3970 }, { "epoch": 0.95, "grad_norm": 14.728531035539278, "learning_rate": 3.348393583483101e-08, "logits/chosen": -1.6615912914276123, "logits/rejected": -1.5487226247787476, "logps/chosen": -464.12176513671875, "logps/rejected": -585.9742431640625, "loss": 0.4111, "rewards/accuracies": 0.875, "rewards/chosen": -1.6889644861221313, "rewards/margins": 1.5552904605865479, "rewards/margins_max": 3.3712832927703857, "rewards/margins_min": 0.10417358577251434, "rewards/margins_std": 1.5120866298675537, "rewards/rejected": -3.2442545890808105, "step": 3980 }, { "epoch": 0.96, "grad_norm": 8.396051623312479, "learning_rate": 3.0161081115735456e-08, "logits/chosen": -1.7062709331512451, "logits/rejected": -1.6190780401229858, "logps/chosen": -486.57208251953125, "logps/rejected": -543.07177734375, "loss": 0.4661, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.7609214782714844, "rewards/margins": 0.8885887265205383, "rewards/margins_max": 2.6856846809387207, "rewards/margins_min": -0.5098963975906372, "rewards/margins_std": 1.4077028036117554, "rewards/rejected": -2.649510383605957, "step": 3990 }, { "epoch": 0.96, "grad_norm": 21.316200449689156, "learning_rate": 2.7010830968314805e-08, "logits/chosen": -1.6319202184677124, "logits/rejected": -1.5818144083023071, "logps/chosen": -429.73724365234375, "logps/rejected": -543.8162231445312, "loss": 0.4744, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -1.5965850353240967, "rewards/margins": 1.3478695154190063, "rewards/margins_max": 2.609862804412842, "rewards/margins_min": -0.08456478267908096, "rewards/margins_std": 1.199184536933899, "rewards/rejected": -2.9444546699523926, "step": 4000 }, { "epoch": 0.96, "eval_logits/chosen": -1.6239460706710815, "eval_logits/rejected": -1.5573369264602661, "eval_logps/chosen": -462.5721435546875, "eval_logps/rejected": -553.3814697265625, "eval_loss": 0.49475616216659546, "eval_rewards/accuracies": 0.746999979019165, "eval_rewards/chosen": -1.7811682224273682, "eval_rewards/margins": 1.0941139459609985, "eval_rewards/margins_max": 3.585127830505371, "eval_rewards/margins_min": -0.9684752821922302, "eval_rewards/margins_std": 1.5427662134170532, "eval_rewards/rejected": -2.8752822875976562, "eval_runtime": 858.7082, "eval_samples_per_second": 4.658, "eval_steps_per_second": 0.291, "step": 4000 }, { "epoch": 0.96, "grad_norm": 6.606659748575468, "learning_rate": 2.403340554764655e-08, "logits/chosen": -1.613621711730957, "logits/rejected": -1.554945945739746, "logps/chosen": -426.925537109375, "logps/rejected": -618.7645263671875, "loss": 0.4114, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.779359221458435, "rewards/margins": 1.4528939723968506, "rewards/margins_max": 3.380446672439575, "rewards/margins_min": -0.12249007076025009, "rewards/margins_std": 1.555430293083191, "rewards/rejected": -3.232253313064575, "step": 4010 }, { "epoch": 0.96, "grad_norm": 11.845343570645836, "learning_rate": 2.1229012930959193e-08, "logits/chosen": -1.6595861911773682, "logits/rejected": -1.562392234802246, "logps/chosen": -439.08294677734375, "logps/rejected": -543.6439208984375, "loss": 0.479, "rewards/accuracies": 0.75, "rewards/chosen": -1.4726041555404663, "rewards/margins": 1.266902208328247, "rewards/margins_max": 3.2616868019104004, "rewards/margins_min": -0.30579134821891785, "rewards/margins_std": 1.6205259561538696, "rewards/rejected": -2.739506483078003, "step": 4020 }, { "epoch": 0.97, "grad_norm": 11.216491301829794, "learning_rate": 1.8597849103094144e-08, "logits/chosen": -1.6443026065826416, "logits/rejected": -1.5746989250183105, "logps/chosen": -450.4159240722656, "logps/rejected": -564.2777099609375, "loss": 0.5141, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -1.63075852394104, "rewards/margins": 1.233082890510559, "rewards/margins_max": 2.968273639678955, "rewards/margins_min": -0.34106966853141785, "rewards/margins_std": 1.4606444835662842, "rewards/rejected": -2.8638415336608887, "step": 4030 }, { "epoch": 0.97, "grad_norm": 9.311471767112144, "learning_rate": 1.614009794280613e-08, "logits/chosen": -1.6348568201065063, "logits/rejected": -1.5376774072647095, "logps/chosen": -502.3479919433594, "logps/rejected": -593.45654296875, "loss": 0.4231, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -2.0759177207946777, "rewards/margins": 1.0674737691879272, "rewards/margins_max": 2.7976551055908203, "rewards/margins_min": -0.5501125454902649, "rewards/margins_std": 1.5319316387176514, "rewards/rejected": -3.1433911323547363, "step": 4040 }, { "epoch": 0.97, "grad_norm": 12.096105380694208, "learning_rate": 1.3855931209914297e-08, "logits/chosen": -1.6270204782485962, "logits/rejected": -1.6009836196899414, "logps/chosen": -482.4786071777344, "logps/rejected": -572.970947265625, "loss": 0.4956, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9904104471206665, "rewards/margins": 0.8864049911499023, "rewards/margins_max": 2.519026279449463, "rewards/margins_min": -0.6849830746650696, "rewards/margins_std": 1.4272019863128662, "rewards/rejected": -2.8768155574798584, "step": 4050 }, { "epoch": 0.97, "grad_norm": 6.121013667611909, "learning_rate": 1.1745508533298755e-08, "logits/chosen": -1.6496082544326782, "logits/rejected": -1.531686782836914, "logps/chosen": -471.07354736328125, "logps/rejected": -559.8525390625, "loss": 0.4232, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -1.8364298343658447, "rewards/margins": 1.3823045492172241, "rewards/margins_max": 2.8318071365356445, "rewards/margins_min": -0.06584613025188446, "rewards/margins_std": 1.3195993900299072, "rewards/rejected": -3.2187342643737793, "step": 4060 }, { "epoch": 0.97, "grad_norm": 8.117926232626123, "learning_rate": 9.808977399744512e-09, "logits/chosen": -1.569108009338379, "logits/rejected": -1.518903136253357, "logps/chosen": -453.6517028808594, "logps/rejected": -540.436279296875, "loss": 0.4865, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -1.7296392917633057, "rewards/margins": 1.1852201223373413, "rewards/margins_max": 2.792182683944702, "rewards/margins_min": -0.38893580436706543, "rewards/margins_std": 1.4746558666229248, "rewards/rejected": -2.9148592948913574, "step": 4070 }, { "epoch": 0.98, "grad_norm": 7.472991632731238, "learning_rate": 8.04647314363527e-09, "logits/chosen": -1.5730459690093994, "logits/rejected": -1.5244677066802979, "logps/chosen": -457.33929443359375, "logps/rejected": -552.5709228515625, "loss": 0.4988, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.723022699356079, "rewards/margins": 1.142809510231018, "rewards/margins_max": 2.987234115600586, "rewards/margins_min": -0.4525206685066223, "rewards/margins_std": 1.517905354499817, "rewards/rejected": -2.8658318519592285, "step": 4080 }, { "epoch": 0.98, "grad_norm": 5.797369840739203, "learning_rate": 6.4581189374943176e-09, "logits/chosen": -1.559385895729065, "logits/rejected": -1.5344467163085938, "logps/chosen": -503.1419372558594, "logps/rejected": -605.9370727539062, "loss": 0.4685, "rewards/accuracies": 0.75, "rewards/chosen": -1.830702543258667, "rewards/margins": 1.3102327585220337, "rewards/margins_max": 3.2229743003845215, "rewards/margins_min": -0.39028316736221313, "rewards/margins_std": 1.5903432369232178, "rewards/rejected": -3.140935182571411, "step": 4090 }, { "epoch": 0.98, "grad_norm": 13.18271177605381, "learning_rate": 5.04402578337726e-09, "logits/chosen": -1.6623458862304688, "logits/rejected": -1.583723783493042, "logps/chosen": -496.91290283203125, "logps/rejected": -611.0901489257812, "loss": 0.4294, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -1.9050073623657227, "rewards/margins": 1.4758739471435547, "rewards/margins_max": 3.018911600112915, "rewards/margins_min": 0.0300117377191782, "rewards/margins_std": 1.3294204473495483, "rewards/rejected": -3.3808815479278564, "step": 4100 }, { "epoch": 0.98, "eval_logits/chosen": -1.6196486949920654, "eval_logits/rejected": -1.552680253982544, "eval_logps/chosen": -463.0417785644531, "eval_logps/rejected": -553.8444213867188, "eval_loss": 0.49497443437576294, "eval_rewards/accuracies": 0.7480000257492065, "eval_rewards/chosen": -1.7858645915985107, "eval_rewards/margins": 1.0940468311309814, "eval_rewards/margins_max": 3.5862772464752197, "eval_rewards/margins_min": -0.970641016960144, "eval_rewards/margins_std": 1.5435758829116821, "eval_rewards/rejected": -2.8799116611480713, "eval_runtime": 858.6164, "eval_samples_per_second": 4.659, "eval_steps_per_second": 0.291, "step": 4100 }, { "epoch": 0.98, "grad_norm": 7.575916086330362, "learning_rate": 3.8042925051148815e-09, "logits/chosen": -1.576736569404602, "logits/rejected": -1.506089210510254, "logps/chosen": -492.5606384277344, "logps/rejected": -565.4273681640625, "loss": 0.4561, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.8778760433197021, "rewards/margins": 1.143216609954834, "rewards/margins_max": 2.8820621967315674, "rewards/margins_min": -0.5599583983421326, "rewards/margins_std": 1.5313533544540405, "rewards/rejected": -3.0210928916931152, "step": 4110 }, { "epoch": 0.99, "grad_norm": 6.31245588532698, "learning_rate": 2.7390057414064532e-09, "logits/chosen": -1.6171531677246094, "logits/rejected": -1.5825830698013306, "logps/chosen": -483.2496032714844, "logps/rejected": -573.5797119140625, "loss": 0.4136, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.7681468725204468, "rewards/margins": 1.304854393005371, "rewards/margins_max": 3.1026484966278076, "rewards/margins_min": -0.2878190875053406, "rewards/margins_std": 1.5061283111572266, "rewards/rejected": -3.073000907897949, "step": 4120 }, { "epoch": 0.99, "grad_norm": 13.551451230194349, "learning_rate": 1.848239939765406e-09, "logits/chosen": -1.688840627670288, "logits/rejected": -1.6429994106292725, "logps/chosen": -484.8333435058594, "logps/rejected": -593.3790893554688, "loss": 0.4716, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -1.8072439432144165, "rewards/margins": 1.1053558588027954, "rewards/margins_max": 2.7052226066589355, "rewards/margins_min": -0.23039250075817108, "rewards/margins_std": 1.3157598972320557, "rewards/rejected": -2.912600040435791, "step": 4130 }, { "epoch": 0.99, "grad_norm": 12.511111394484317, "learning_rate": 1.132057351315996e-09, "logits/chosen": -1.6320797204971313, "logits/rejected": -1.5492708683013916, "logps/chosen": -463.752685546875, "logps/rejected": -509.2848205566406, "loss": 0.4976, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.0202388763427734, "rewards/margins": 0.7255627512931824, "rewards/margins_max": 2.3149142265319824, "rewards/margins_min": -0.9404655694961548, "rewards/margins_std": 1.4641788005828857, "rewards/rejected": -2.7458014488220215, "step": 4140 }, { "epoch": 0.99, "grad_norm": 13.290080359497985, "learning_rate": 5.905080264431706e-10, "logits/chosen": -1.6289660930633545, "logits/rejected": -1.5693305730819702, "logps/chosen": -454.62005615234375, "logps/rejected": -550.0135498046875, "loss": 0.4699, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.7225040197372437, "rewards/margins": 1.2006828784942627, "rewards/margins_max": 2.8866634368896484, "rewards/margins_min": -0.33014729619026184, "rewards/margins_std": 1.4278733730316162, "rewards/rejected": -2.923186779022217, "step": 4150 }, { "epoch": 1.0, "grad_norm": 10.03710880491096, "learning_rate": 2.2362981129508966e-10, "logits/chosen": -1.639051079750061, "logits/rejected": -1.553789734840393, "logps/chosen": -471.66033935546875, "logps/rejected": -588.3956298828125, "loss": 0.4306, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.744632363319397, "rewards/margins": 1.3093732595443726, "rewards/margins_max": 3.391417980194092, "rewards/margins_min": -0.37286436557769775, "rewards/margins_std": 1.723730444908142, "rewards/rejected": -3.0540053844451904, "step": 4160 }, { "epoch": 1.0, "grad_norm": 12.219441816252543, "learning_rate": 3.144834513746364e-11, "logits/chosen": -1.6646182537078857, "logits/rejected": -1.6153373718261719, "logps/chosen": -485.84552001953125, "logps/rejected": -578.265380859375, "loss": 0.4481, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -1.639169454574585, "rewards/margins": 1.3950141668319702, "rewards/margins_max": 3.2100536823272705, "rewards/margins_min": -0.15085026621818542, "rewards/margins_std": 1.5013129711151123, "rewards/rejected": -3.0341832637786865, "step": 4170 }, { "epoch": 1.0, "step": 4176, "total_flos": 0.0, "train_loss": 0.5053737083043175, "train_runtime": 67897.0602, "train_samples_per_second": 0.984, "train_steps_per_second": 0.062 } ], "logging_steps": 10, "max_steps": 4176, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }