diff --git "a/last-checkpoint/trainer_state.json" "b/last-checkpoint/trainer_state.json" --- "a/last-checkpoint/trainer_state.json" +++ "b/last-checkpoint/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 0.6920914137408983, + "epoch": 1.3841828274817964, "eval_steps": 100, - "global_step": 600, + "global_step": 1200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -4603,6 +4603,4602 @@ "eval_samples_per_second": 2.701, "eval_steps_per_second": 0.675, "step": 600 + }, + { + "epoch": 0.6943983851200346, + "grad_norm": 60.2099543993624, + "learning_rate": 1.5683084510833155e-07, + "logits/chosen": -1.506928563117981, + "logits/rejected": -1.4527332782745361, + "logps/chosen": -136.79698181152344, + "logps/rejected": -171.7388916015625, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.40472638607025146, + "rewards/margins": 0.5502471923828125, + "rewards/rejected": -0.9549736380577087, + "step": 602 + }, + { + "epoch": 0.696705356499171, + "grad_norm": 61.94420954124892, + "learning_rate": 1.5651364144225918e-07, + "logits/chosen": -1.385040521621704, + "logits/rejected": -1.4811543226242065, + "logps/chosen": -156.2510986328125, + "logps/rejected": -228.10455322265625, + "loss": 0.5886, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.45847296714782715, + "rewards/margins": 0.805530309677124, + "rewards/rejected": -1.2640032768249512, + "step": 604 + }, + { + "epoch": 0.6990123278783072, + "grad_norm": 68.10520883639806, + "learning_rate": 1.5619560010768892e-07, + "logits/chosen": -1.429363489151001, + "logits/rejected": -1.474833369255066, + "logps/chosen": -105.5804672241211, + "logps/rejected": -149.79159545898438, + "loss": 0.5776, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.4616313576698303, + "rewards/margins": 0.11374694108963013, + "rewards/rejected": -0.5753782987594604, + "step": 606 + }, + { + "epoch": 0.7013192992574436, + "grad_norm": 80.98545740701343, + "learning_rate": 1.558767258187605e-07, + "logits/chosen": -1.3892110586166382, + "logits/rejected": -1.3783142566680908, + "logps/chosen": -192.46571350097656, + "logps/rejected": -301.35589599609375, + "loss": 0.5993, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7047985196113586, + "rewards/margins": 0.531038761138916, + "rewards/rejected": -1.2358373403549194, + "step": 608 + }, + { + "epoch": 0.7036262706365799, + "grad_norm": 67.59649388895859, + "learning_rate": 1.555570233019602e-07, + "logits/chosen": -1.528752326965332, + "logits/rejected": -1.4884089231491089, + "logps/chosen": -149.33953857421875, + "logps/rejected": -156.87937927246094, + "loss": 0.5549, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5484266877174377, + "rewards/margins": 0.2577958106994629, + "rewards/rejected": -0.8062225580215454, + "step": 610 + }, + { + "epoch": 0.7059332420157163, + "grad_norm": 70.41159723680883, + "learning_rate": 1.5523649729605057e-07, + "logits/chosen": -1.549803614616394, + "logits/rejected": -1.5478185415267944, + "logps/chosen": -202.39064025878906, + "logps/rejected": -191.8920135498047, + "loss": 0.5702, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7880931496620178, + "rewards/margins": 0.3749513328075409, + "rewards/rejected": -1.1630443334579468, + "step": 612 + }, + { + "epoch": 0.7082402133948525, + "grad_norm": 84.62171550148092, + "learning_rate": 1.5491515255200023e-07, + "logits/chosen": -1.3567522764205933, + "logits/rejected": -1.3661439418792725, + "logps/chosen": -194.17117309570312, + "logps/rejected": -233.16098022460938, + "loss": 0.5773, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7068712711334229, + "rewards/margins": 0.5711615085601807, + "rewards/rejected": -1.2780327796936035, + "step": 614 + }, + { + "epoch": 0.7105471847739889, + "grad_norm": 78.0462456849789, + "learning_rate": 1.5459299383291345e-07, + "logits/chosen": -1.4880882501602173, + "logits/rejected": -1.5348231792449951, + "logps/chosen": -165.6639404296875, + "logps/rejected": -203.5524139404297, + "loss": 0.5763, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4784421920776367, + "rewards/margins": 0.41431570053100586, + "rewards/rejected": -0.8927579522132874, + "step": 616 + }, + { + "epoch": 0.7128541561531252, + "grad_norm": 77.7841366725459, + "learning_rate": 1.5427002591395964e-07, + "logits/chosen": -1.4515254497528076, + "logits/rejected": -1.4519662857055664, + "logps/chosen": -172.7522430419922, + "logps/rejected": -327.67791748046875, + "loss": 0.6045, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.5820431709289551, + "rewards/margins": 0.3262157440185547, + "rewards/rejected": -0.908258855342865, + "step": 618 + }, + { + "epoch": 0.7151611275322616, + "grad_norm": 62.11750703500811, + "learning_rate": 1.539462535823025e-07, + "logits/chosen": -1.3129442930221558, + "logits/rejected": -1.4445109367370605, + "logps/chosen": -101.69780731201172, + "logps/rejected": -159.57347106933594, + "loss": 0.5584, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.33953019976615906, + "rewards/margins": 0.539714515209198, + "rewards/rejected": -0.8792447447776794, + "step": 620 + }, + { + "epoch": 0.7174680989113978, + "grad_norm": 64.90210429350962, + "learning_rate": 1.5362168163702897e-07, + "logits/chosen": -1.4593255519866943, + "logits/rejected": -1.497260570526123, + "logps/chosen": -174.7530975341797, + "logps/rejected": -190.82069396972656, + "loss": 0.5744, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.44477853178977966, + "rewards/margins": 0.10030045360326767, + "rewards/rejected": -0.5450789928436279, + "step": 622 + }, + { + "epoch": 0.7197750702905342, + "grad_norm": 62.61220782799182, + "learning_rate": 1.5329631488907834e-07, + "logits/chosen": -1.3799552917480469, + "logits/rejected": -1.3643782138824463, + "logps/chosen": -106.81502532958984, + "logps/rejected": -163.37252807617188, + "loss": 0.5539, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.45432302355766296, + "rewards/margins": 0.5782679319381714, + "rewards/rejected": -1.0325908660888672, + "step": 624 + }, + { + "epoch": 0.7220820416696705, + "grad_norm": 64.30209297911543, + "learning_rate": 1.529701581611707e-07, + "logits/chosen": -1.3249229192733765, + "logits/rejected": -1.312375783920288, + "logps/chosen": -188.57989501953125, + "logps/rejected": -259.47991943359375, + "loss": 0.5864, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6669080257415771, + "rewards/margins": 0.5166423916816711, + "rewards/rejected": -1.1835503578186035, + "step": 626 + }, + { + "epoch": 0.7243890130488069, + "grad_norm": 81.32626957899065, + "learning_rate": 1.5264321628773557e-07, + "logits/chosen": -1.5589840412139893, + "logits/rejected": -1.6068872213363647, + "logps/chosen": -152.81272888183594, + "logps/rejected": -160.92356872558594, + "loss": 0.5575, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5344883799552917, + "rewards/margins": 0.2917326092720032, + "rewards/rejected": -0.8262209296226501, + "step": 628 + }, + { + "epoch": 0.7266959844279431, + "grad_norm": 63.10819469515795, + "learning_rate": 1.5231549411484021e-07, + "logits/chosen": -1.3950941562652588, + "logits/rejected": -1.4998093843460083, + "logps/chosen": -182.9403533935547, + "logps/rejected": -533.6143798828125, + "loss": 0.528, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6201209425926208, + "rewards/margins": 0.9562212228775024, + "rewards/rejected": -1.5763421058654785, + "step": 630 + }, + { + "epoch": 0.7290029558070795, + "grad_norm": 74.17672200121388, + "learning_rate": 1.5198699650011783e-07, + "logits/chosen": -1.5746220350265503, + "logits/rejected": -1.5152575969696045, + "logps/chosen": -114.51097869873047, + "logps/rejected": -142.73123168945312, + "loss": 0.5682, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.5173571705818176, + "rewards/margins": 0.4040333032608032, + "rewards/rejected": -0.9213904738426208, + "step": 632 + }, + { + "epoch": 0.7313099271862158, + "grad_norm": 74.93724286416074, + "learning_rate": 1.5165772831269546e-07, + "logits/chosen": -1.4648115634918213, + "logits/rejected": -1.353775978088379, + "logps/chosen": -166.06561279296875, + "logps/rejected": -161.02513122558594, + "loss": 0.5914, + "rewards/accuracies": 0.53125, + "rewards/chosen": -0.6317393779754639, + "rewards/margins": 0.12096243351697922, + "rewards/rejected": -0.7527018189430237, + "step": 634 + }, + { + "epoch": 0.7336168985653522, + "grad_norm": 78.1768997076042, + "learning_rate": 1.5132769443312206e-07, + "logits/chosen": -1.3494369983673096, + "logits/rejected": -1.3753654956817627, + "logps/chosen": -170.8379669189453, + "logps/rejected": -236.137939453125, + "loss": 0.5601, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7240103483200073, + "rewards/margins": 0.7221677303314209, + "rewards/rejected": -1.4461781978607178, + "step": 636 + }, + { + "epoch": 0.7359238699444886, + "grad_norm": 72.68675984518603, + "learning_rate": 1.5099689975329582e-07, + "logits/chosen": -1.454272747039795, + "logits/rejected": -1.3425519466400146, + "logps/chosen": -159.56729125976562, + "logps/rejected": -176.78076171875, + "loss": 0.5931, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6215909719467163, + "rewards/margins": 0.44397681951522827, + "rewards/rejected": -1.0655678510665894, + "step": 638 + }, + { + "epoch": 0.7382308413236248, + "grad_norm": 103.38173736649145, + "learning_rate": 1.5066534917639194e-07, + "logits/chosen": -1.5096681118011475, + "logits/rejected": -1.4664478302001953, + "logps/chosen": -168.00894165039062, + "logps/rejected": -172.64730834960938, + "loss": 0.6387, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.6311260461807251, + "rewards/margins": 0.37282636761665344, + "rewards/rejected": -1.0039525032043457, + "step": 640 + }, + { + "epoch": 0.7405378127027612, + "grad_norm": 89.90439421362515, + "learning_rate": 1.5033304761678974e-07, + "logits/chosen": -1.4626061916351318, + "logits/rejected": -1.3341164588928223, + "logps/chosen": -205.5465087890625, + "logps/rejected": -253.4654541015625, + "loss": 0.694, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.898277997970581, + "rewards/margins": 0.6167319416999817, + "rewards/rejected": -1.5150099992752075, + "step": 642 + }, + { + "epoch": 0.7428447840818975, + "grad_norm": 80.94685802233828, + "learning_rate": 1.5e-07, + "logits/chosen": -1.5373523235321045, + "logits/rejected": -1.5548286437988281, + "logps/chosen": -202.79559326171875, + "logps/rejected": -259.5960998535156, + "loss": 0.5766, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6312007308006287, + "rewards/margins": 0.5061548948287964, + "rewards/rejected": -1.1373556852340698, + "step": 644 + }, + { + "epoch": 0.7451517554610338, + "grad_norm": 76.95960849605379, + "learning_rate": 1.4966621126259182e-07, + "logits/chosen": -1.3829816579818726, + "logits/rejected": -1.437648892402649, + "logps/chosen": -176.02261352539062, + "logps/rejected": -231.1138916015625, + "loss": 0.5749, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8245862126350403, + "rewards/margins": 0.5100637674331665, + "rewards/rejected": -1.3346500396728516, + "step": 646 + }, + { + "epoch": 0.7474587268401701, + "grad_norm": 86.16535127809355, + "learning_rate": 1.4933168635211954e-07, + "logits/chosen": -1.3192147016525269, + "logits/rejected": -1.2893555164337158, + "logps/chosen": -180.10354614257812, + "logps/rejected": -209.48648071289062, + "loss": 0.6253, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6453790664672852, + "rewards/margins": 0.4227368235588074, + "rewards/rejected": -1.0681159496307373, + "step": 648 + }, + { + "epoch": 0.7497656982193065, + "grad_norm": 76.07003275403453, + "learning_rate": 1.489964302270493e-07, + "logits/chosen": -1.3541210889816284, + "logits/rejected": -1.4539867639541626, + "logps/chosen": -152.72354125976562, + "logps/rejected": -201.84646606445312, + "loss": 0.5653, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6403982043266296, + "rewards/margins": 0.3868240416049957, + "rewards/rejected": -1.0272222757339478, + "step": 650 + }, + { + "epoch": 0.7520726695984428, + "grad_norm": 57.518835795007355, + "learning_rate": 1.4866044785668562e-07, + "logits/chosen": -1.496565341949463, + "logits/rejected": -1.5742026567459106, + "logps/chosen": -184.71319580078125, + "logps/rejected": -223.27279663085938, + "loss": 0.5689, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.731714129447937, + "rewards/margins": 0.3506891429424286, + "rewards/rejected": -1.082403302192688, + "step": 652 + }, + { + "epoch": 0.7543796409775791, + "grad_norm": 69.54916104252412, + "learning_rate": 1.483237442210978e-07, + "logits/chosen": -1.5298668146133423, + "logits/rejected": -1.4372026920318604, + "logps/chosen": -182.90591430664062, + "logps/rejected": -185.94142150878906, + "loss": 0.5684, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.652667224407196, + "rewards/margins": 0.2604321837425232, + "rewards/rejected": -0.9130994081497192, + "step": 654 + }, + { + "epoch": 0.7566866123567154, + "grad_norm": 86.35756436296293, + "learning_rate": 1.479863243110459e-07, + "logits/chosen": -1.5615227222442627, + "logits/rejected": -1.5152493715286255, + "logps/chosen": -135.16763305664062, + "logps/rejected": -151.04348754882812, + "loss": 0.5864, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4842577576637268, + "rewards/margins": 0.39588865637779236, + "rewards/rejected": -0.8801463842391968, + "step": 656 + }, + { + "epoch": 0.7589935837358518, + "grad_norm": 71.27708160399212, + "learning_rate": 1.4764819312790704e-07, + "logits/chosen": -1.3233333826065063, + "logits/rejected": -1.3788268566131592, + "logps/chosen": -167.7100067138672, + "logps/rejected": -222.81932067871094, + "loss": 0.5365, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.4721766710281372, + "rewards/margins": 0.8050169944763184, + "rewards/rejected": -1.277193546295166, + "step": 658 + }, + { + "epoch": 0.7613005551149881, + "grad_norm": 67.4040730071334, + "learning_rate": 1.4730935568360101e-07, + "logits/chosen": -1.4443995952606201, + "logits/rejected": -1.4764584302902222, + "logps/chosen": -132.14419555664062, + "logps/rejected": -231.72938537597656, + "loss": 0.574, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.47360759973526, + "rewards/margins": 0.66644686460495, + "rewards/rejected": -1.14005446434021, + "step": 660 + }, + { + "epoch": 0.7636075264941244, + "grad_norm": 72.31061103050068, + "learning_rate": 1.4696981700051613e-07, + "logits/chosen": -1.4308209419250488, + "logits/rejected": -1.5072565078735352, + "logps/chosen": -215.11119079589844, + "logps/rejected": -308.4154968261719, + "loss": 0.5539, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7280828952789307, + "rewards/margins": 0.589575469493866, + "rewards/rejected": -1.3176583051681519, + "step": 662 + }, + { + "epoch": 0.7659144978732607, + "grad_norm": 78.75618194817389, + "learning_rate": 1.4662958211143478e-07, + "logits/chosen": -1.4515475034713745, + "logits/rejected": -1.3340954780578613, + "logps/chosen": -177.06072998046875, + "logps/rejected": -191.52259826660156, + "loss": 0.546, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6530216336250305, + "rewards/margins": 0.680112361907959, + "rewards/rejected": -1.3331340551376343, + "step": 664 + }, + { + "epoch": 0.7682214692523971, + "grad_norm": 71.81422997168194, + "learning_rate": 1.4628865605945884e-07, + "logits/chosen": -1.4638550281524658, + "logits/rejected": -1.562100887298584, + "logps/chosen": -149.26190185546875, + "logps/rejected": -181.56045532226562, + "loss": 0.5891, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6122503876686096, + "rewards/margins": 0.3339022099971771, + "rewards/rejected": -0.9461526870727539, + "step": 666 + }, + { + "epoch": 0.7705284406315334, + "grad_norm": 64.13905687965561, + "learning_rate": 1.4594704389793476e-07, + "logits/chosen": -1.342291235923767, + "logits/rejected": -1.3401648998260498, + "logps/chosen": -132.9333038330078, + "logps/rejected": -151.05657958984375, + "loss": 0.5292, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.4934079647064209, + "rewards/margins": 0.5540266036987305, + "rewards/rejected": -1.0474345684051514, + "step": 668 + }, + { + "epoch": 0.7728354120106697, + "grad_norm": 64.25891918041506, + "learning_rate": 1.4560475069037895e-07, + "logits/chosen": -1.4274932146072388, + "logits/rejected": -1.320129156112671, + "logps/chosen": -131.24008178710938, + "logps/rejected": -158.29779052734375, + "loss": 0.559, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.47251197695732117, + "rewards/margins": 0.6539871692657471, + "rewards/rejected": -1.126499056816101, + "step": 670 + }, + { + "epoch": 0.7751423833898061, + "grad_norm": 79.74116137345376, + "learning_rate": 1.4526178151040238e-07, + "logits/chosen": -1.4236022233963013, + "logits/rejected": -1.4343180656433105, + "logps/chosen": -216.5073699951172, + "logps/rejected": -253.56356811523438, + "loss": 0.6319, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8843379616737366, + "rewards/margins": 0.40150371193885803, + "rewards/rejected": -1.285841703414917, + "step": 672 + }, + { + "epoch": 0.7774493547689424, + "grad_norm": 56.17449348305076, + "learning_rate": 1.449181414416357e-07, + "logits/chosen": -1.43699312210083, + "logits/rejected": -1.4178071022033691, + "logps/chosen": -157.00311279296875, + "logps/rejected": -156.90017700195312, + "loss": 0.5701, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6511672139167786, + "rewards/margins": 0.3164646625518799, + "rewards/rejected": -0.9676318168640137, + "step": 674 + }, + { + "epoch": 0.7797563261480788, + "grad_norm": 66.63614533861005, + "learning_rate": 1.4457383557765383e-07, + "logits/chosen": -1.4309780597686768, + "logits/rejected": -1.4221723079681396, + "logps/chosen": -151.88272094726562, + "logps/rejected": -181.7390594482422, + "loss": 0.5585, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7592535614967346, + "rewards/margins": 0.47323358058929443, + "rewards/rejected": -1.2324870824813843, + "step": 676 + }, + { + "epoch": 0.782063297527215, + "grad_norm": 72.3899064306705, + "learning_rate": 1.4422886902190013e-07, + "logits/chosen": -1.4335781335830688, + "logits/rejected": -1.3959101438522339, + "logps/chosen": -227.32432556152344, + "logps/rejected": -237.228515625, + "loss": 0.5979, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7621864676475525, + "rewards/margins": 0.6047258377075195, + "rewards/rejected": -1.3669122457504272, + "step": 678 + }, + { + "epoch": 0.7843702689063514, + "grad_norm": 68.22407643194211, + "learning_rate": 1.438832468876112e-07, + "logits/chosen": -1.5670726299285889, + "logits/rejected": -1.5206338167190552, + "logps/chosen": -162.11500549316406, + "logps/rejected": -189.17723083496094, + "loss": 0.5806, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.46744874119758606, + "rewards/margins": 0.37639617919921875, + "rewards/rejected": -0.843845009803772, + "step": 680 + }, + { + "epoch": 0.7866772402854877, + "grad_norm": 76.17187863595697, + "learning_rate": 1.435369742977408e-07, + "logits/chosen": -1.3989527225494385, + "logits/rejected": -1.331002116203308, + "logps/chosen": -150.4923553466797, + "logps/rejected": -152.4705810546875, + "loss": 0.5466, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5739311575889587, + "rewards/margins": 0.6111598610877991, + "rewards/rejected": -1.1850910186767578, + "step": 682 + }, + { + "epoch": 0.7889842116646241, + "grad_norm": 71.79810308029995, + "learning_rate": 1.4319005638488411e-07, + "logits/chosen": -1.4992091655731201, + "logits/rejected": -1.5468194484710693, + "logps/chosen": -139.28636169433594, + "logps/rejected": -164.89332580566406, + "loss": 0.6042, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5346908569335938, + "rewards/margins": 0.4574483633041382, + "rewards/rejected": -0.9921392202377319, + "step": 684 + }, + { + "epoch": 0.7912911830437603, + "grad_norm": 84.61839383932663, + "learning_rate": 1.4284249829120144e-07, + "logits/chosen": -1.4420665502548218, + "logits/rejected": -1.4561961889266968, + "logps/chosen": -163.67910766601562, + "logps/rejected": -212.11285400390625, + "loss": 0.5338, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6722000241279602, + "rewards/margins": 0.5183134078979492, + "rewards/rejected": -1.1905133724212646, + "step": 686 + }, + { + "epoch": 0.7935981544228967, + "grad_norm": 69.04050689616894, + "learning_rate": 1.4249430516834219e-07, + "logits/chosen": -1.427943468093872, + "logits/rejected": -1.2925009727478027, + "logps/chosen": -170.96783447265625, + "logps/rejected": -154.1614227294922, + "loss": 0.6101, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6926352977752686, + "rewards/margins": 0.3386973738670349, + "rewards/rejected": -1.0313327312469482, + "step": 688 + }, + { + "epoch": 0.795905125802033, + "grad_norm": 79.30403180179479, + "learning_rate": 1.4214548217736842e-07, + "logits/chosen": -1.42530357837677, + "logits/rejected": -1.4162826538085938, + "logps/chosen": -159.53546142578125, + "logps/rejected": -169.24526977539062, + "loss": 0.5704, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.5039569139480591, + "rewards/margins": 0.28192925453186035, + "rewards/rejected": -0.7858862280845642, + "step": 690 + }, + { + "epoch": 0.7982120971811694, + "grad_norm": 73.96748010582954, + "learning_rate": 1.4179603448867835e-07, + "logits/chosen": -1.3817723989486694, + "logits/rejected": -1.493046760559082, + "logps/chosen": -159.07061767578125, + "logps/rejected": -224.4259033203125, + "loss": 0.5923, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6248303651809692, + "rewards/margins": 0.5017886161804199, + "rewards/rejected": -1.1266189813613892, + "step": 692 + }, + { + "epoch": 0.8005190685603056, + "grad_norm": 84.16059294161194, + "learning_rate": 1.414459672819297e-07, + "logits/chosen": -1.3444278240203857, + "logits/rejected": -1.3752013444900513, + "logps/chosen": -148.96214294433594, + "logps/rejected": -246.97567749023438, + "loss": 0.6059, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.723760187625885, + "rewards/margins": 0.6408222913742065, + "rewards/rejected": -1.3645824193954468, + "step": 694 + }, + { + "epoch": 0.802826039939442, + "grad_norm": 81.81384074979675, + "learning_rate": 1.41095285745963e-07, + "logits/chosen": -1.3597787618637085, + "logits/rejected": -1.422480821609497, + "logps/chosen": -184.29083251953125, + "logps/rejected": -324.6047668457031, + "loss": 0.57, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7926025390625, + "rewards/margins": 0.9075123071670532, + "rewards/rejected": -1.7001147270202637, + "step": 696 + }, + { + "epoch": 0.8051330113185783, + "grad_norm": 86.69046431842193, + "learning_rate": 1.4074399507872455e-07, + "logits/chosen": -1.46107017993927, + "logits/rejected": -1.5106903314590454, + "logps/chosen": -169.36878967285156, + "logps/rejected": -217.03964233398438, + "loss": 0.5998, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.9666664004325867, + "rewards/margins": 0.4315177798271179, + "rewards/rejected": -1.3981841802597046, + "step": 698 + }, + { + "epoch": 0.8074399826977147, + "grad_norm": 72.764039414317, + "learning_rate": 1.4039210048718947e-07, + "logits/chosen": -1.3857448101043701, + "logits/rejected": -1.38966703414917, + "logps/chosen": -236.34902954101562, + "logps/rejected": -287.2010803222656, + "loss": 0.5593, + "rewards/accuracies": 0.625, + "rewards/chosen": -1.1096588373184204, + "rewards/margins": 0.5392670631408691, + "rewards/rejected": -1.648926019668579, + "step": 700 + }, + { + "epoch": 0.8074399826977147, + "eval_logits/chosen": -1.4024839401245117, + "eval_logits/rejected": -1.3188287019729614, + "eval_logps/chosen": -193.01976013183594, + "eval_logps/rejected": -162.88392639160156, + "eval_loss": 0.581591010093689, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -0.7837581038475037, + "eval_rewards/margins": 0.45309457182884216, + "eval_rewards/rejected": -1.2368526458740234, + "eval_runtime": 28.9042, + "eval_samples_per_second": 3.46, + "eval_steps_per_second": 0.865, + "step": 700 + }, + { + "epoch": 0.8097469540768509, + "grad_norm": 92.59632644873196, + "learning_rate": 1.4003960718728458e-07, + "logits/chosen": -1.434832215309143, + "logits/rejected": -1.368481159210205, + "logps/chosen": -188.60244750976562, + "logps/rejected": -191.86891174316406, + "loss": 0.5808, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7094125747680664, + "rewards/margins": 0.36493563652038574, + "rewards/rejected": -1.0743482112884521, + "step": 702 + }, + { + "epoch": 0.8120539254559873, + "grad_norm": 88.02921453439615, + "learning_rate": 1.3968652040381087e-07, + "logits/chosen": -1.5649467706680298, + "logits/rejected": -1.6187723875045776, + "logps/chosen": -174.0366668701172, + "logps/rejected": -194.51516723632812, + "loss": 0.5795, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7656524777412415, + "rewards/margins": 0.09420950710773468, + "rewards/rejected": -0.8598620295524597, + "step": 704 + }, + { + "epoch": 0.8143608968351237, + "grad_norm": 70.53446281652334, + "learning_rate": 1.3933284537036626e-07, + "logits/chosen": -1.3043855428695679, + "logits/rejected": -1.4392000436782837, + "logps/chosen": -184.783935546875, + "logps/rejected": -343.3705139160156, + "loss": 0.56, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8058075904846191, + "rewards/margins": 0.9816871285438538, + "rewards/rejected": -1.7874946594238281, + "step": 706 + }, + { + "epoch": 0.81666786821426, + "grad_norm": 85.1907599000231, + "learning_rate": 1.3897858732926794e-07, + "logits/chosen": -1.5064363479614258, + "logits/rejected": -1.4735333919525146, + "logps/chosen": -206.25363159179688, + "logps/rejected": -199.1461944580078, + "loss": 0.6216, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7892999053001404, + "rewards/margins": 0.3833533525466919, + "rewards/rejected": -1.1726531982421875, + "step": 708 + }, + { + "epoch": 0.8189748395933963, + "grad_norm": 73.09038090478488, + "learning_rate": 1.3862375153147464e-07, + "logits/chosen": -1.545288324356079, + "logits/rejected": -1.5258712768554688, + "logps/chosen": -154.7598419189453, + "logps/rejected": -196.91165161132812, + "loss": 0.5838, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.47864556312561035, + "rewards/margins": 0.47909313440322876, + "rewards/rejected": -0.9577386379241943, + "step": 710 + }, + { + "epoch": 0.8212818109725326, + "grad_norm": 93.34170526258869, + "learning_rate": 1.3826834323650897e-07, + "logits/chosen": -1.4695608615875244, + "logits/rejected": -1.4735374450683594, + "logps/chosen": -188.8636474609375, + "logps/rejected": -209.92041015625, + "loss": 0.5469, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8135924935340881, + "rewards/margins": 0.24801144003868103, + "rewards/rejected": -1.0616040229797363, + "step": 712 + }, + { + "epoch": 0.823588782351669, + "grad_norm": 74.73130142354223, + "learning_rate": 1.3791236771237917e-07, + "logits/chosen": -1.384385108947754, + "logits/rejected": -1.5454891920089722, + "logps/chosen": -167.55824279785156, + "logps/rejected": -229.03851318359375, + "loss": 0.5864, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6291375756263733, + "rewards/margins": 0.5017825961112976, + "rewards/rejected": -1.130920171737671, + "step": 714 + }, + { + "epoch": 0.8258957537308053, + "grad_norm": 82.88128786393553, + "learning_rate": 1.3755583023550127e-07, + "logits/chosen": -1.4609194993972778, + "logits/rejected": -1.5298848152160645, + "logps/chosen": -197.1524658203125, + "logps/rejected": -219.78167724609375, + "loss": 0.5435, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7119307518005371, + "rewards/margins": 0.5048573613166809, + "rewards/rejected": -1.2167880535125732, + "step": 716 + }, + { + "epoch": 0.8282027251099416, + "grad_norm": 81.32166792287437, + "learning_rate": 1.3719873609062075e-07, + "logits/chosen": -1.3578637838363647, + "logits/rejected": -1.3476288318634033, + "logps/chosen": -183.71852111816406, + "logps/rejected": -201.42977905273438, + "loss": 0.6069, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7198283076286316, + "rewards/margins": 0.41808462142944336, + "rewards/rejected": -1.1379129886627197, + "step": 718 + }, + { + "epoch": 0.8305096964890779, + "grad_norm": 76.1412354910528, + "learning_rate": 1.3684109057073433e-07, + "logits/chosen": -1.5357989072799683, + "logits/rejected": -1.5547668933868408, + "logps/chosen": -158.44583129882812, + "logps/rejected": -186.08767700195312, + "loss": 0.5387, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.55253666639328, + "rewards/margins": 0.6118079423904419, + "rewards/rejected": -1.1643445491790771, + "step": 720 + }, + { + "epoch": 0.8328166678682143, + "grad_norm": 76.88330992572241, + "learning_rate": 1.3648289897701134e-07, + "logits/chosen": -1.4407174587249756, + "logits/rejected": -1.4180082082748413, + "logps/chosen": -229.4559326171875, + "logps/rejected": -275.36004638671875, + "loss": 0.5708, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9139447808265686, + "rewards/margins": 0.7458863258361816, + "rewards/rejected": -1.6598312854766846, + "step": 722 + }, + { + "epoch": 0.8351236392473506, + "grad_norm": 76.40403716300928, + "learning_rate": 1.361241666187153e-07, + "logits/chosen": -1.5596188306808472, + "logits/rejected": -1.4953889846801758, + "logps/chosen": -181.32513427734375, + "logps/rejected": -228.9872589111328, + "loss": 0.558, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.6869115233421326, + "rewards/margins": 0.6277297735214233, + "rewards/rejected": -1.3146413564682007, + "step": 724 + }, + { + "epoch": 0.8374306106264869, + "grad_norm": 88.00729362814612, + "learning_rate": 1.3576489881312516e-07, + "logits/chosen": -1.4083036184310913, + "logits/rejected": -1.4413796663284302, + "logps/chosen": -163.39768981933594, + "logps/rejected": -190.90277099609375, + "loss": 0.6253, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.7067568898200989, + "rewards/margins": 0.3021257817745209, + "rewards/rejected": -1.0088826417922974, + "step": 726 + }, + { + "epoch": 0.8397375820056232, + "grad_norm": 69.35704721685642, + "learning_rate": 1.354051008854565e-07, + "logits/chosen": -1.4755336046218872, + "logits/rejected": -1.4025901556015015, + "logps/chosen": -166.0547332763672, + "logps/rejected": -179.10647583007812, + "loss": 0.5798, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5479276776313782, + "rewards/margins": 0.6779038310050964, + "rewards/rejected": -1.2258315086364746, + "step": 728 + }, + { + "epoch": 0.8420445533847596, + "grad_norm": 63.6003273966082, + "learning_rate": 1.3504477816878258e-07, + "logits/chosen": -1.4906251430511475, + "logits/rejected": -1.524994134902954, + "logps/chosen": -152.8936309814453, + "logps/rejected": -154.26930236816406, + "loss": 0.5613, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.41379597783088684, + "rewards/margins": 0.31041520833969116, + "rewards/rejected": -0.7242112159729004, + "step": 730 + }, + { + "epoch": 0.8443515247638959, + "grad_norm": 72.20930120779202, + "learning_rate": 1.3468393600395524e-07, + "logits/chosen": -1.4875296354293823, + "logits/rejected": -1.4794172048568726, + "logps/chosen": -180.05352783203125, + "logps/rejected": -201.59454345703125, + "loss": 0.5522, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7309121489524841, + "rewards/margins": 0.2411407232284546, + "rewards/rejected": -0.972052812576294, + "step": 732 + }, + { + "epoch": 0.8466584961430322, + "grad_norm": 68.5642733850933, + "learning_rate": 1.3432257973952592e-07, + "logits/chosen": -1.4924743175506592, + "logits/rejected": -1.450230598449707, + "logps/chosen": -197.60543823242188, + "logps/rejected": -195.38174438476562, + "loss": 0.6159, + "rewards/accuracies": 0.5, + "rewards/chosen": -0.9123257994651794, + "rewards/margins": 0.038229409605264664, + "rewards/rejected": -0.9505552053451538, + "step": 734 + }, + { + "epoch": 0.8489654675221685, + "grad_norm": 89.1573385492241, + "learning_rate": 1.3396071473166612e-07, + "logits/chosen": -1.4010210037231445, + "logits/rejected": -1.3554859161376953, + "logps/chosen": -166.287841796875, + "logps/rejected": -207.92405700683594, + "loss": 0.5817, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.7026151418685913, + "rewards/margins": 0.763970136642456, + "rewards/rejected": -1.4665852785110474, + "step": 736 + }, + { + "epoch": 0.8512724389013049, + "grad_norm": 102.9368979611397, + "learning_rate": 1.3359834634408828e-07, + "logits/chosen": -1.4197382926940918, + "logits/rejected": -1.431471586227417, + "logps/chosen": -194.96832275390625, + "logps/rejected": -240.55545043945312, + "loss": 0.5919, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6298823356628418, + "rewards/margins": 0.6693976521492004, + "rewards/rejected": -1.299280047416687, + "step": 738 + }, + { + "epoch": 0.8535794102804412, + "grad_norm": 66.37386209857932, + "learning_rate": 1.3323547994796595e-07, + "logits/chosen": -1.3606467247009277, + "logits/rejected": -1.399714708328247, + "logps/chosen": -149.84036254882812, + "logps/rejected": -184.58444213867188, + "loss": 0.5977, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.527281641960144, + "rewards/margins": 0.46527186036109924, + "rewards/rejected": -0.9925534725189209, + "step": 740 + }, + { + "epoch": 0.8558863816595775, + "grad_norm": 64.67920251978872, + "learning_rate": 1.3287212092185464e-07, + "logits/chosen": -1.3525441884994507, + "logits/rejected": -1.3670110702514648, + "logps/chosen": -173.00009155273438, + "logps/rejected": -217.89830017089844, + "loss": 0.5148, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.4874517023563385, + "rewards/margins": 0.9208201766014099, + "rewards/rejected": -1.4082717895507812, + "step": 742 + }, + { + "epoch": 0.8581933530387139, + "grad_norm": 63.56760422068707, + "learning_rate": 1.3250827465161151e-07, + "logits/chosen": -1.5336980819702148, + "logits/rejected": -1.560868501663208, + "logps/chosen": -146.79489135742188, + "logps/rejected": -155.26324462890625, + "loss": 0.6079, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.4935835003852844, + "rewards/margins": 0.11392290890216827, + "rewards/rejected": -0.6075063943862915, + "step": 744 + }, + { + "epoch": 0.8605003244178502, + "grad_norm": 78.89522512382298, + "learning_rate": 1.3214394653031614e-07, + "logits/chosen": -1.4548529386520386, + "logits/rejected": -1.3734767436981201, + "logps/chosen": -234.0319061279297, + "logps/rejected": -297.91796875, + "loss": 0.5865, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8137804269790649, + "rewards/margins": 0.6477915048599243, + "rewards/rejected": -1.4615719318389893, + "step": 746 + }, + { + "epoch": 0.8628072957969866, + "grad_norm": 65.37034003085697, + "learning_rate": 1.3177914195819015e-07, + "logits/chosen": -1.5144624710083008, + "logits/rejected": -1.4655039310455322, + "logps/chosen": -158.4036407470703, + "logps/rejected": -191.18508911132812, + "loss": 0.555, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5954052209854126, + "rewards/margins": 0.38705208897590637, + "rewards/rejected": -0.9824572801589966, + "step": 748 + }, + { + "epoch": 0.8651142671761228, + "grad_norm": 59.16704488192239, + "learning_rate": 1.3141386634251734e-07, + "logits/chosen": -1.3658645153045654, + "logits/rejected": -1.4540934562683105, + "logps/chosen": -150.2242431640625, + "logps/rejected": -222.25180053710938, + "loss": 0.5299, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5700072646141052, + "rewards/margins": 0.9387847185134888, + "rewards/rejected": -1.5087921619415283, + "step": 750 + }, + { + "epoch": 0.8674212385552592, + "grad_norm": 67.80027199845135, + "learning_rate": 1.3104812509756348e-07, + "logits/chosen": -1.3619776964187622, + "logits/rejected": -1.4681222438812256, + "logps/chosen": -215.67242431640625, + "logps/rejected": -251.7977294921875, + "loss": 0.581, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6836814880371094, + "rewards/margins": 0.4148728549480438, + "rewards/rejected": -1.0985543727874756, + "step": 752 + }, + { + "epoch": 0.8697282099343955, + "grad_norm": 78.14387370484089, + "learning_rate": 1.3068192364449616e-07, + "logits/chosen": -1.4636235237121582, + "logits/rejected": -1.381880760192871, + "logps/chosen": -200.4328155517578, + "logps/rejected": -232.8759765625, + "loss": 0.6182, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7163984179496765, + "rewards/margins": 0.484641432762146, + "rewards/rejected": -1.2010399103164673, + "step": 754 + }, + { + "epoch": 0.8720351813135319, + "grad_norm": 81.47898503729805, + "learning_rate": 1.3031526741130435e-07, + "logits/chosen": -1.4324874877929688, + "logits/rejected": -1.4492231607437134, + "logps/chosen": -242.81385803222656, + "logps/rejected": -297.9952697753906, + "loss": 0.5814, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.759175717830658, + "rewards/margins": 0.6992828845977783, + "rewards/rejected": -1.458458423614502, + "step": 756 + }, + { + "epoch": 0.8743421526926681, + "grad_norm": 68.36284032683393, + "learning_rate": 1.2994816183271787e-07, + "logits/chosen": -1.4733608961105347, + "logits/rejected": -1.48493492603302, + "logps/chosen": -170.6515350341797, + "logps/rejected": -165.4122314453125, + "loss": 0.6076, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7016798257827759, + "rewards/margins": 0.40051549673080444, + "rewards/rejected": -1.1021952629089355, + "step": 758 + }, + { + "epoch": 0.8766491240718045, + "grad_norm": 61.599893474989166, + "learning_rate": 1.2958061235012705e-07, + "logits/chosen": -1.496275544166565, + "logits/rejected": -1.460773229598999, + "logps/chosen": -176.19471740722656, + "logps/rejected": -198.0939178466797, + "loss": 0.5757, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5562551021575928, + "rewards/margins": 0.5198081135749817, + "rewards/rejected": -1.0760632753372192, + "step": 760 + }, + { + "epoch": 0.8789560954509408, + "grad_norm": 70.46057628732943, + "learning_rate": 1.2921262441150183e-07, + "logits/chosen": -1.3644537925720215, + "logits/rejected": -1.445560097694397, + "logps/chosen": -151.073974609375, + "logps/rejected": -173.70281982421875, + "loss": 0.5721, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5794097781181335, + "rewards/margins": 0.4722179174423218, + "rewards/rejected": -1.0516277551651, + "step": 762 + }, + { + "epoch": 0.8812630668300772, + "grad_norm": 69.10199599212679, + "learning_rate": 1.2884420347131121e-07, + "logits/chosen": -1.4612513780593872, + "logits/rejected": -1.3745832443237305, + "logps/chosen": -196.2886962890625, + "logps/rejected": -216.65379333496094, + "loss": 0.5377, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7168931365013123, + "rewards/margins": 0.5213975310325623, + "rewards/rejected": -1.2382906675338745, + "step": 764 + }, + { + "epoch": 0.8835700382092134, + "grad_norm": 90.51296519002845, + "learning_rate": 1.284753549904423e-07, + "logits/chosen": -1.4165701866149902, + "logits/rejected": -1.3087973594665527, + "logps/chosen": -173.92013549804688, + "logps/rejected": -149.10337829589844, + "loss": 0.6138, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6343285441398621, + "rewards/margins": 0.3654475212097168, + "rewards/rejected": -0.9997760653495789, + "step": 766 + }, + { + "epoch": 0.8858770095883498, + "grad_norm": 65.98966257822153, + "learning_rate": 1.281060844361194e-07, + "logits/chosen": -1.4396263360977173, + "logits/rejected": -1.4435733556747437, + "logps/chosen": -174.42852783203125, + "logps/rejected": -210.5076904296875, + "loss": 0.5741, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.8263449668884277, + "rewards/margins": 0.2867377698421478, + "rewards/rejected": -1.1130828857421875, + "step": 768 + }, + { + "epoch": 0.8881839809674861, + "grad_norm": 74.46183309877627, + "learning_rate": 1.277363972818229e-07, + "logits/chosen": -1.4313443899154663, + "logits/rejected": -1.3114702701568604, + "logps/chosen": -224.059326171875, + "logps/rejected": -208.44281005859375, + "loss": 0.5699, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7303236126899719, + "rewards/margins": 0.44990167021751404, + "rewards/rejected": -1.1802253723144531, + "step": 770 + }, + { + "epoch": 0.8904909523466225, + "grad_norm": 73.75893335166967, + "learning_rate": 1.273662990072083e-07, + "logits/chosen": -1.5675609111785889, + "logits/rejected": -1.5368494987487793, + "logps/chosen": -171.8189239501953, + "logps/rejected": -178.9991912841797, + "loss": 0.5579, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6912405490875244, + "rewards/margins": 0.27446943521499634, + "rewards/rejected": -0.965709924697876, + "step": 772 + }, + { + "epoch": 0.8927979237257587, + "grad_norm": 74.45202982534715, + "learning_rate": 1.2699579509802477e-07, + "logits/chosen": -1.381720781326294, + "logits/rejected": -1.378785252571106, + "logps/chosen": -250.43006896972656, + "logps/rejected": -286.6543884277344, + "loss": 0.5975, + "rewards/accuracies": 0.5, + "rewards/chosen": -1.0578484535217285, + "rewards/margins": 0.05342524126172066, + "rewards/rejected": -1.1112737655639648, + "step": 774 + }, + { + "epoch": 0.8951048951048951, + "grad_norm": 70.06032358406004, + "learning_rate": 1.2662489104603408e-07, + "logits/chosen": -1.3842642307281494, + "logits/rejected": -1.447717547416687, + "logps/chosen": -168.9984893798828, + "logps/rejected": -207.52444458007812, + "loss": 0.5499, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.7638756632804871, + "rewards/margins": 0.5212388038635254, + "rewards/rejected": -1.2851144075393677, + "step": 776 + }, + { + "epoch": 0.8974118664840315, + "grad_norm": 66.63104285153635, + "learning_rate": 1.2625359234892904e-07, + "logits/chosen": -1.494588851928711, + "logits/rejected": -1.451404333114624, + "logps/chosen": -174.24356079101562, + "logps/rejected": -197.6580810546875, + "loss": 0.5362, + "rewards/accuracies": 0.59375, + "rewards/chosen": -0.6031234860420227, + "rewards/margins": 0.3164597749710083, + "rewards/rejected": -0.9195833206176758, + "step": 778 + }, + { + "epoch": 0.8997188378631678, + "grad_norm": 79.81280336065728, + "learning_rate": 1.2588190451025208e-07, + "logits/chosen": -1.5040947198867798, + "logits/rejected": -1.4643135070800781, + "logps/chosen": -162.25146484375, + "logps/rejected": -201.78736877441406, + "loss": 0.5741, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6632722616195679, + "rewards/margins": 0.5367907881736755, + "rewards/rejected": -1.2000629901885986, + "step": 780 + }, + { + "epoch": 0.9020258092423041, + "grad_norm": 52.09260056010907, + "learning_rate": 1.2550983303931355e-07, + "logits/chosen": -1.459741234779358, + "logits/rejected": -1.5611631870269775, + "logps/chosen": -142.47457885742188, + "logps/rejected": -182.74696350097656, + "loss": 0.527, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6918852925300598, + "rewards/margins": 0.42247286438941956, + "rewards/rejected": -1.1143580675125122, + "step": 782 + }, + { + "epoch": 0.9043327806214404, + "grad_norm": 85.88356052811159, + "learning_rate": 1.2513738345111027e-07, + "logits/chosen": -1.4681496620178223, + "logits/rejected": -1.5013573169708252, + "logps/chosen": -177.7476806640625, + "logps/rejected": -230.48324584960938, + "loss": 0.5762, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7633711695671082, + "rewards/margins": 0.5456692576408386, + "rewards/rejected": -1.3090405464172363, + "step": 784 + }, + { + "epoch": 0.9066397520005768, + "grad_norm": 65.36266724814506, + "learning_rate": 1.2476456126624362e-07, + "logits/chosen": -1.42805814743042, + "logits/rejected": -1.410811185836792, + "logps/chosen": -192.83355712890625, + "logps/rejected": -199.4910430908203, + "loss": 0.5464, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6136312484741211, + "rewards/margins": 0.6339128613471985, + "rewards/rejected": -1.2475441694259644, + "step": 786 + }, + { + "epoch": 0.908946723379713, + "grad_norm": 70.49473006417183, + "learning_rate": 1.2439137201083773e-07, + "logits/chosen": -1.395878553390503, + "logits/rejected": -1.4350143671035767, + "logps/chosen": -168.0191650390625, + "logps/rejected": -204.7823944091797, + "loss": 0.5237, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6597675681114197, + "rewards/margins": 0.4419274926185608, + "rewards/rejected": -1.10169517993927, + "step": 788 + }, + { + "epoch": 0.9112536947588494, + "grad_norm": 66.94572101908982, + "learning_rate": 1.2401782121645766e-07, + "logits/chosen": -1.4355082511901855, + "logits/rejected": -1.3577499389648438, + "logps/chosen": -182.4734344482422, + "logps/rejected": -229.62594604492188, + "loss": 0.5091, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.931696891784668, + "rewards/margins": 0.7762287259101868, + "rewards/rejected": -1.70792555809021, + "step": 790 + }, + { + "epoch": 0.9135606661379857, + "grad_norm": 61.38646622891244, + "learning_rate": 1.236439144200273e-07, + "logits/chosen": -1.4619312286376953, + "logits/rejected": -1.400052547454834, + "logps/chosen": -193.9796600341797, + "logps/rejected": -239.8403778076172, + "loss": 0.5438, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6676934957504272, + "rewards/margins": 0.9647277593612671, + "rewards/rejected": -1.6324213743209839, + "step": 792 + }, + { + "epoch": 0.9158676375171221, + "grad_norm": 64.0363496745867, + "learning_rate": 1.2326965716374745e-07, + "logits/chosen": -1.377258062362671, + "logits/rejected": -1.3179837465286255, + "logps/chosen": -163.70166015625, + "logps/rejected": -187.57347106933594, + "loss": 0.5332, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6885131597518921, + "rewards/margins": 0.731677770614624, + "rewards/rejected": -1.4201909303665161, + "step": 794 + }, + { + "epoch": 0.9181746088962583, + "grad_norm": 84.44705948583216, + "learning_rate": 1.2289505499501342e-07, + "logits/chosen": -1.402807593345642, + "logits/rejected": -1.5039292573928833, + "logps/chosen": -173.46568298339844, + "logps/rejected": -235.6153564453125, + "loss": 0.5977, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7485046982765198, + "rewards/margins": 0.44616127014160156, + "rewards/rejected": -1.1946659088134766, + "step": 796 + }, + { + "epoch": 0.9204815802753947, + "grad_norm": 78.69447391068418, + "learning_rate": 1.2252011346633304e-07, + "logits/chosen": -1.4286327362060547, + "logits/rejected": -1.5545501708984375, + "logps/chosen": -181.63096618652344, + "logps/rejected": -206.10789489746094, + "loss": 0.5791, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7195813655853271, + "rewards/margins": 0.3036889433860779, + "rewards/rejected": -1.0232703685760498, + "step": 798 + }, + { + "epoch": 0.922788551654531, + "grad_norm": 75.64682467171481, + "learning_rate": 1.2214483813524428e-07, + "logits/chosen": -1.4555621147155762, + "logits/rejected": -1.455336332321167, + "logps/chosen": -175.06056213378906, + "logps/rejected": -196.7834930419922, + "loss": 0.6186, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.8022370934486389, + "rewards/margins": 0.4286205768585205, + "rewards/rejected": -1.2308577299118042, + "step": 800 + }, + { + "epoch": 0.922788551654531, + "eval_logits/chosen": -1.3925344944000244, + "eval_logits/rejected": -1.3117806911468506, + "eval_logps/chosen": -194.27879333496094, + "eval_logps/rejected": -164.40199279785156, + "eval_loss": 0.5684230327606201, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": -0.9096614718437195, + "eval_rewards/margins": 0.47899794578552246, + "eval_rewards/rejected": -1.3886592388153076, + "eval_runtime": 27.1566, + "eval_samples_per_second": 3.682, + "eval_steps_per_second": 0.921, + "step": 800 + }, + { + "epoch": 0.9250955230336674, + "grad_norm": 58.026864033274386, + "learning_rate": 1.2176923456423282e-07, + "logits/chosen": -1.4152424335479736, + "logits/rejected": -1.4212266206741333, + "logps/chosen": -213.00340270996094, + "logps/rejected": -278.18646240234375, + "loss": 0.5195, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8872989416122437, + "rewards/margins": 0.8531396389007568, + "rewards/rejected": -1.74043869972229, + "step": 802 + }, + { + "epoch": 0.9274024944128036, + "grad_norm": 79.10112459107924, + "learning_rate": 1.2139330832064973e-07, + "logits/chosen": -1.3687729835510254, + "logits/rejected": -1.3873934745788574, + "logps/chosen": -171.1462860107422, + "logps/rejected": -211.0199737548828, + "loss": 0.6017, + "rewards/accuracies": 0.4375, + "rewards/chosen": -0.9144918918609619, + "rewards/margins": 0.24607527256011963, + "rewards/rejected": -1.160567283630371, + "step": 804 + }, + { + "epoch": 0.92970946579194, + "grad_norm": 76.51626942304289, + "learning_rate": 1.2101706497662877e-07, + "logits/chosen": -1.4041192531585693, + "logits/rejected": -1.489201307296753, + "logps/chosen": -116.04072570800781, + "logps/rejected": -152.79568481445312, + "loss": 0.6006, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.623583197593689, + "rewards/margins": 0.3285295367240906, + "rewards/rejected": -0.9521127343177795, + "step": 806 + }, + { + "epoch": 0.9320164371710763, + "grad_norm": 101.33844171967075, + "learning_rate": 1.2064051010900395e-07, + "logits/chosen": -1.5294184684753418, + "logits/rejected": -1.4096076488494873, + "logps/chosen": -190.36326599121094, + "logps/rejected": -191.35450744628906, + "loss": 0.5799, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8971662521362305, + "rewards/margins": 0.6536959409713745, + "rewards/rejected": -1.550862193107605, + "step": 808 + }, + { + "epoch": 0.9343234085502127, + "grad_norm": 83.05378892487924, + "learning_rate": 1.202636492992268e-07, + "logits/chosen": -1.3589162826538086, + "logits/rejected": -1.4270801544189453, + "logps/chosen": -175.5372314453125, + "logps/rejected": -191.54107666015625, + "loss": 0.5364, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.812555730342865, + "rewards/margins": 0.5130024552345276, + "rewards/rejected": -1.325558066368103, + "step": 810 + }, + { + "epoch": 0.936630379929349, + "grad_norm": 91.03023580750298, + "learning_rate": 1.1988648813328367e-07, + "logits/chosen": -1.458229660987854, + "logits/rejected": -1.499216079711914, + "logps/chosen": -161.9021759033203, + "logps/rejected": -194.4558563232422, + "loss": 0.6202, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8818404078483582, + "rewards/margins": 0.5195723176002502, + "rewards/rejected": -1.4014127254486084, + "step": 812 + }, + { + "epoch": 0.9389373513084853, + "grad_norm": 66.03519272128433, + "learning_rate": 1.1950903220161285e-07, + "logits/chosen": -1.4822351932525635, + "logits/rejected": -1.4912382364273071, + "logps/chosen": -217.2351837158203, + "logps/rejected": -242.7360076904297, + "loss": 0.5274, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6898525357246399, + "rewards/margins": 0.736096203327179, + "rewards/rejected": -1.4259487390518188, + "step": 814 + }, + { + "epoch": 0.9412443226876217, + "grad_norm": 75.29793707597936, + "learning_rate": 1.1913128709902181e-07, + "logits/chosen": -1.3654075860977173, + "logits/rejected": -1.2488131523132324, + "logps/chosen": -214.5912628173828, + "logps/rejected": -244.13392639160156, + "loss": 0.5761, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9588913321495056, + "rewards/margins": 0.6373323202133179, + "rewards/rejected": -1.5962237119674683, + "step": 816 + }, + { + "epoch": 0.943551294066758, + "grad_norm": 68.17305476915286, + "learning_rate": 1.1875325842460422e-07, + "logits/chosen": -1.5225324630737305, + "logits/rejected": -1.4224079847335815, + "logps/chosen": -172.602294921875, + "logps/rejected": -187.74075317382812, + "loss": 0.5471, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5260685682296753, + "rewards/margins": 0.7308987975120544, + "rewards/rejected": -1.256967544555664, + "step": 818 + }, + { + "epoch": 0.9458582654458944, + "grad_norm": 68.2006052745151, + "learning_rate": 1.1837495178165705e-07, + "logits/chosen": -1.3616454601287842, + "logits/rejected": -1.3527371883392334, + "logps/chosen": -141.04449462890625, + "logps/rejected": -203.32545471191406, + "loss": 0.5124, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.4041934609413147, + "rewards/margins": 0.8920172452926636, + "rewards/rejected": -1.296210765838623, + "step": 820 + }, + { + "epoch": 0.9481652368250306, + "grad_norm": 77.87367487197442, + "learning_rate": 1.1799637277759728e-07, + "logits/chosen": -1.3988251686096191, + "logits/rejected": -1.43572998046875, + "logps/chosen": -181.34315490722656, + "logps/rejected": -251.28089904785156, + "loss": 0.5431, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8347877264022827, + "rewards/margins": 0.829275369644165, + "rewards/rejected": -1.6640632152557373, + "step": 822 + }, + { + "epoch": 0.950472208204167, + "grad_norm": 76.077749149788, + "learning_rate": 1.1761752702387911e-07, + "logits/chosen": -1.4185755252838135, + "logits/rejected": -1.3347017765045166, + "logps/chosen": -160.37074279785156, + "logps/rejected": -183.9326171875, + "loss": 0.5697, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8654420375823975, + "rewards/margins": 0.6139148473739624, + "rewards/rejected": -1.4793568849563599, + "step": 824 + }, + { + "epoch": 0.9527791795833033, + "grad_norm": 75.00986364125768, + "learning_rate": 1.1723842013591043e-07, + "logits/chosen": -1.3774935007095337, + "logits/rejected": -1.3088113069534302, + "logps/chosen": -166.6764678955078, + "logps/rejected": -186.21182250976562, + "loss": 0.5373, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6048790812492371, + "rewards/margins": 0.6833386421203613, + "rewards/rejected": -1.2882177829742432, + "step": 826 + }, + { + "epoch": 0.9550861509624397, + "grad_norm": 72.996726199162, + "learning_rate": 1.1685905773296991e-07, + "logits/chosen": -1.362646222114563, + "logits/rejected": -1.376693606376648, + "logps/chosen": -195.9502410888672, + "logps/rejected": -225.694091796875, + "loss": 0.5749, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9920042157173157, + "rewards/margins": 0.47201159596443176, + "rewards/rejected": -1.4640157222747803, + "step": 828 + }, + { + "epoch": 0.9573931223415759, + "grad_norm": 75.95414823856552, + "learning_rate": 1.1647944543812354e-07, + "logits/chosen": -1.315481424331665, + "logits/rejected": -1.3268320560455322, + "logps/chosen": -131.01446533203125, + "logps/rejected": -158.54641723632812, + "loss": 0.5773, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.684681236743927, + "rewards/margins": 0.20931464433670044, + "rewards/rejected": -0.8939959406852722, + "step": 830 + }, + { + "epoch": 0.9597000937207123, + "grad_norm": 60.32263903162582, + "learning_rate": 1.1609958887814128e-07, + "logits/chosen": -1.5966579914093018, + "logits/rejected": -1.4850637912750244, + "logps/chosen": -175.0374755859375, + "logps/rejected": -158.18453979492188, + "loss": 0.5429, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.6850875616073608, + "rewards/margins": 0.4383313059806824, + "rewards/rejected": -1.1234188079833984, + "step": 832 + }, + { + "epoch": 0.9620070650998486, + "grad_norm": 73.91015165855379, + "learning_rate": 1.1571949368341369e-07, + "logits/chosen": -1.4515485763549805, + "logits/rejected": -1.4335086345672607, + "logps/chosen": -194.0410614013672, + "logps/rejected": -221.74459838867188, + "loss": 0.5637, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.9065202474594116, + "rewards/margins": 0.5008846521377563, + "rewards/rejected": -1.407404899597168, + "step": 834 + }, + { + "epoch": 0.964314036478985, + "grad_norm": 70.40231383528358, + "learning_rate": 1.1533916548786855e-07, + "logits/chosen": -1.3997491598129272, + "logits/rejected": -1.412211537361145, + "logps/chosen": -148.58609008789062, + "logps/rejected": -177.541748046875, + "loss": 0.5569, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6641438603401184, + "rewards/margins": 0.4562597870826721, + "rewards/rejected": -1.12040376663208, + "step": 836 + }, + { + "epoch": 0.9666210078581212, + "grad_norm": 102.06203365904534, + "learning_rate": 1.149586099288871e-07, + "logits/chosen": -1.5500867366790771, + "logits/rejected": -1.5184556245803833, + "logps/chosen": -221.6972198486328, + "logps/rejected": -220.07156372070312, + "loss": 0.5965, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.777428388595581, + "rewards/margins": 0.4735814929008484, + "rewards/rejected": -1.2510098218917847, + "step": 838 + }, + { + "epoch": 0.9689279792372576, + "grad_norm": 66.29447298049823, + "learning_rate": 1.1457783264722085e-07, + "logits/chosen": -1.4360487461090088, + "logits/rejected": -1.462703824043274, + "logps/chosen": -152.64370727539062, + "logps/rejected": -184.56980895996094, + "loss": 0.5717, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6025056838989258, + "rewards/margins": 0.5723504424095154, + "rewards/rejected": -1.1748559474945068, + "step": 840 + }, + { + "epoch": 0.9712349506163939, + "grad_norm": 78.55117522201758, + "learning_rate": 1.1419683928690765e-07, + "logits/chosen": -1.4210811853408813, + "logits/rejected": -1.3443081378936768, + "logps/chosen": -170.15673828125, + "logps/rejected": -172.01361083984375, + "loss": 0.6309, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8064141869544983, + "rewards/margins": 0.3521476089954376, + "rewards/rejected": -1.1585618257522583, + "step": 842 + }, + { + "epoch": 0.9735419219955302, + "grad_norm": 69.22598182098773, + "learning_rate": 1.1381563549518822e-07, + "logits/chosen": -1.49723482131958, + "logits/rejected": -1.410536527633667, + "logps/chosen": -210.95184326171875, + "logps/rejected": -203.29696655273438, + "loss": 0.5463, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6800968647003174, + "rewards/margins": 0.37646064162254333, + "rewards/rejected": -1.056557536125183, + "step": 844 + }, + { + "epoch": 0.9758488933746665, + "grad_norm": 75.78361862598538, + "learning_rate": 1.1343422692242233e-07, + "logits/chosen": -1.6359862089157104, + "logits/rejected": -1.5996215343475342, + "logps/chosen": -228.96900939941406, + "logps/rejected": -303.59930419921875, + "loss": 0.4869, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8022640943527222, + "rewards/margins": 0.9272950291633606, + "rewards/rejected": -1.729559063911438, + "step": 846 + }, + { + "epoch": 0.9781558647538029, + "grad_norm": 70.34054678321432, + "learning_rate": 1.1305261922200517e-07, + "logits/chosen": -1.516348123550415, + "logits/rejected": -1.415407657623291, + "logps/chosen": -143.39247131347656, + "logps/rejected": -152.77947998046875, + "loss": 0.5319, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.4436328709125519, + "rewards/margins": 0.6538999676704407, + "rewards/rejected": -1.097532868385315, + "step": 848 + }, + { + "epoch": 0.9804628361329393, + "grad_norm": 68.10947471952929, + "learning_rate": 1.1267081805028337e-07, + "logits/chosen": -1.435686469078064, + "logits/rejected": -1.481810212135315, + "logps/chosen": -241.48797607421875, + "logps/rejected": -261.9681701660156, + "loss": 0.484, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.825613260269165, + "rewards/margins": 0.9319115281105042, + "rewards/rejected": -1.7575247287750244, + "step": 850 + }, + { + "epoch": 0.9827698075120755, + "grad_norm": 59.37071682800829, + "learning_rate": 1.1228882906647141e-07, + "logits/chosen": -1.4485299587249756, + "logits/rejected": -1.4876492023468018, + "logps/chosen": -106.88033294677734, + "logps/rejected": -123.23741149902344, + "loss": 0.5289, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.5273531079292297, + "rewards/margins": 0.2863643765449524, + "rewards/rejected": -0.8137176036834717, + "step": 852 + }, + { + "epoch": 0.9850767788912119, + "grad_norm": 70.05549851743753, + "learning_rate": 1.1190665793256748e-07, + "logits/chosen": -1.4265426397323608, + "logits/rejected": -1.3960927724838257, + "logps/chosen": -188.1573944091797, + "logps/rejected": -207.80072021484375, + "loss": 0.5884, + "rewards/accuracies": 0.5625, + "rewards/chosen": -0.8124721050262451, + "rewards/margins": 0.4456351101398468, + "rewards/rejected": -1.2581071853637695, + "step": 854 + }, + { + "epoch": 0.9873837502703482, + "grad_norm": 73.14550405861347, + "learning_rate": 1.1152431031326976e-07, + "logits/chosen": -1.4437755346298218, + "logits/rejected": -1.4316751956939697, + "logps/chosen": -165.9943084716797, + "logps/rejected": -192.70034790039062, + "loss": 0.5473, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.4255121052265167, + "rewards/margins": 0.6612747311592102, + "rewards/rejected": -1.0867868661880493, + "step": 856 + }, + { + "epoch": 0.9896907216494846, + "grad_norm": 85.66970631714595, + "learning_rate": 1.1114179187589233e-07, + "logits/chosen": -1.3930505514144897, + "logits/rejected": -1.3498919010162354, + "logps/chosen": -256.72113037109375, + "logps/rejected": -278.62896728515625, + "loss": 0.5699, + "rewards/accuracies": 0.5625, + "rewards/chosen": -1.049241542816162, + "rewards/margins": 0.28325188159942627, + "rewards/rejected": -1.332493543624878, + "step": 858 + }, + { + "epoch": 0.9919976930286208, + "grad_norm": 71.05679162032278, + "learning_rate": 1.1075910829028114e-07, + "logits/chosen": -1.437722086906433, + "logits/rejected": -1.4826639890670776, + "logps/chosen": -202.7578887939453, + "logps/rejected": -261.2666015625, + "loss": 0.5684, + "rewards/accuracies": 0.65625, + "rewards/chosen": -0.6350253820419312, + "rewards/margins": 0.6514700651168823, + "rewards/rejected": -1.286495566368103, + "step": 860 + }, + { + "epoch": 0.9943046644077572, + "grad_norm": 70.36206426876662, + "learning_rate": 1.1037626522873019e-07, + "logits/chosen": -1.41848623752594, + "logits/rejected": -1.4980326890945435, + "logps/chosen": -144.5270233154297, + "logps/rejected": -200.35751342773438, + "loss": 0.5309, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.522635817527771, + "rewards/margins": 0.7352780699729919, + "rewards/rejected": -1.2579139471054077, + "step": 862 + }, + { + "epoch": 0.9966116357868935, + "grad_norm": 72.67382936260628, + "learning_rate": 1.0999326836589715e-07, + "logits/chosen": -1.3685966730117798, + "logits/rejected": -1.394378423690796, + "logps/chosen": -166.87631225585938, + "logps/rejected": -200.53883361816406, + "loss": 0.5929, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.7416751384735107, + "rewards/margins": 0.5859281420707703, + "rewards/rejected": -1.3276032209396362, + "step": 864 + }, + { + "epoch": 0.9989186071660299, + "grad_norm": 84.36611725847108, + "learning_rate": 1.0961012337871949e-07, + "logits/chosen": -1.3915830850601196, + "logits/rejected": -1.3707163333892822, + "logps/chosen": -165.9635009765625, + "logps/rejected": -182.629150390625, + "loss": 0.5651, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.719946026802063, + "rewards/margins": 0.30040302872657776, + "rewards/rejected": -1.0203490257263184, + "step": 866 + }, + { + "epoch": 1.0012255785451663, + "grad_norm": 65.19893923782902, + "learning_rate": 1.092268359463302e-07, + "logits/chosen": -1.380025029182434, + "logits/rejected": -1.3949352502822876, + "logps/chosen": -170.98512268066406, + "logps/rejected": -166.76988220214844, + "loss": 0.4741, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.654694139957428, + "rewards/margins": 0.7143049240112305, + "rewards/rejected": -1.3689990043640137, + "step": 868 + }, + { + "epoch": 1.0035325499243024, + "grad_norm": 48.4483138752335, + "learning_rate": 1.0884341174997366e-07, + "logits/chosen": -1.497901439666748, + "logits/rejected": -1.5079542398452759, + "logps/chosen": -142.2536163330078, + "logps/rejected": -170.02557373046875, + "loss": 0.4175, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.33470043540000916, + "rewards/margins": 0.9799707531929016, + "rewards/rejected": -1.3146711587905884, + "step": 870 + }, + { + "epoch": 1.0058395213034388, + "grad_norm": 47.35534970375215, + "learning_rate": 1.0845985647292139e-07, + "logits/chosen": -1.383826494216919, + "logits/rejected": -1.449791669845581, + "logps/chosen": -158.0416717529297, + "logps/rejected": -176.6147918701172, + "loss": 0.4029, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4849981963634491, + "rewards/margins": 0.8160945177078247, + "rewards/rejected": -1.3010927438735962, + "step": 872 + }, + { + "epoch": 1.0081464926825752, + "grad_norm": 43.77560135583366, + "learning_rate": 1.0807617580038795e-07, + "logits/chosen": -1.5267244577407837, + "logits/rejected": -1.5721828937530518, + "logps/chosen": -199.91258239746094, + "logps/rejected": -248.39645385742188, + "loss": 0.3876, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6073986291885376, + "rewards/margins": 1.2279101610183716, + "rewards/rejected": -1.8353086709976196, + "step": 874 + }, + { + "epoch": 1.0104534640617115, + "grad_norm": 42.52904656646403, + "learning_rate": 1.0769237541944638e-07, + "logits/chosen": -1.4312025308609009, + "logits/rejected": -1.4796687364578247, + "logps/chosen": -145.2313232421875, + "logps/rejected": -158.61622619628906, + "loss": 0.4168, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.3942721486091614, + "rewards/margins": 0.6692065000534058, + "rewards/rejected": -1.063478708267212, + "step": 876 + }, + { + "epoch": 1.0127604354408477, + "grad_norm": 48.90185425829871, + "learning_rate": 1.0730846101894426e-07, + "logits/chosen": -1.5106614828109741, + "logits/rejected": -1.4605977535247803, + "logps/chosen": -138.33749389648438, + "logps/rejected": -161.0500946044922, + "loss": 0.4457, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.4576440453529358, + "rewards/margins": 0.9095314741134644, + "rewards/rejected": -1.367175579071045, + "step": 878 + }, + { + "epoch": 1.015067406819984, + "grad_norm": 43.02658405721309, + "learning_rate": 1.0692443828941917e-07, + "logits/chosen": -1.5405513048171997, + "logits/rejected": -1.5392036437988281, + "logps/chosen": -182.1168670654297, + "logps/rejected": -233.71990966796875, + "loss": 0.4028, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6404014825820923, + "rewards/margins": 1.0966987609863281, + "rewards/rejected": -1.73710036277771, + "step": 880 + }, + { + "epoch": 1.0173743781991205, + "grad_norm": 48.32435717433864, + "learning_rate": 1.0654031292301431e-07, + "logits/chosen": -1.3465638160705566, + "logits/rejected": -1.3601791858673096, + "logps/chosen": -173.30892944335938, + "logps/rejected": -222.39129638671875, + "loss": 0.4313, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6509138345718384, + "rewards/margins": 0.8590379357337952, + "rewards/rejected": -1.5099515914916992, + "step": 882 + }, + { + "epoch": 1.0196813495782568, + "grad_norm": 42.7359721693661, + "learning_rate": 1.061560906133943e-07, + "logits/chosen": -1.4033329486846924, + "logits/rejected": -1.3475041389465332, + "logps/chosen": -179.88003540039062, + "logps/rejected": -126.02889251708984, + "loss": 0.4241, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.39270564913749695, + "rewards/margins": 0.8330918550491333, + "rewards/rejected": -1.2257975339889526, + "step": 884 + }, + { + "epoch": 1.0219883209573932, + "grad_norm": 58.308886811175576, + "learning_rate": 1.057717770556606e-07, + "logits/chosen": -1.3382686376571655, + "logits/rejected": -1.354683756828308, + "logps/chosen": -163.423828125, + "logps/rejected": -212.0486602783203, + "loss": 0.4546, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7182069420814514, + "rewards/margins": 0.8601030111312866, + "rewards/rejected": -1.5783098936080933, + "step": 886 + }, + { + "epoch": 1.0242952923365294, + "grad_norm": 52.41648679332183, + "learning_rate": 1.0538737794626732e-07, + "logits/chosen": -1.4164263010025024, + "logits/rejected": -1.4541833400726318, + "logps/chosen": -183.5728759765625, + "logps/rejected": -235.9500274658203, + "loss": 0.375, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.634645938873291, + "rewards/margins": 1.2829779386520386, + "rewards/rejected": -1.9176236391067505, + "step": 888 + }, + { + "epoch": 1.0266022637156658, + "grad_norm": 59.99002267055787, + "learning_rate": 1.0500289898293653e-07, + "logits/chosen": -1.2998998165130615, + "logits/rejected": -1.2930588722229004, + "logps/chosen": -127.99386596679688, + "logps/rejected": -172.67710876464844, + "loss": 0.4373, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6694655418395996, + "rewards/margins": 1.0086742639541626, + "rewards/rejected": -1.6781398057937622, + "step": 890 + }, + { + "epoch": 1.0289092350948021, + "grad_norm": 59.24537451441084, + "learning_rate": 1.0461834586457397e-07, + "logits/chosen": -1.4053970575332642, + "logits/rejected": -1.4072213172912598, + "logps/chosen": -207.63168334960938, + "logps/rejected": -327.9807434082031, + "loss": 0.4149, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.9034566879272461, + "rewards/margins": 1.59257972240448, + "rewards/rejected": -2.4960365295410156, + "step": 892 + }, + { + "epoch": 1.0312162064739385, + "grad_norm": 46.427849200004886, + "learning_rate": 1.0423372429118453e-07, + "logits/chosen": -1.4245054721832275, + "logits/rejected": -1.3622853755950928, + "logps/chosen": -121.53076171875, + "logps/rejected": -153.8699951171875, + "loss": 0.4026, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.4701806306838989, + "rewards/margins": 1.028322458267212, + "rewards/rejected": -1.4985029697418213, + "step": 894 + }, + { + "epoch": 1.0335231778530747, + "grad_norm": 50.69335447767206, + "learning_rate": 1.0384903996378782e-07, + "logits/chosen": -1.3767621517181396, + "logits/rejected": -1.2876626253128052, + "logps/chosen": -129.5188751220703, + "logps/rejected": -140.3291778564453, + "loss": 0.4704, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.6933719515800476, + "rewards/margins": 0.5130415558815002, + "rewards/rejected": -1.2064135074615479, + "step": 896 + }, + { + "epoch": 1.035830149232211, + "grad_norm": 51.48813587010882, + "learning_rate": 1.0346429858433352e-07, + "logits/chosen": -1.316386342048645, + "logits/rejected": -1.3732808828353882, + "logps/chosen": -123.05658721923828, + "logps/rejected": -217.4422607421875, + "loss": 0.3842, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6196972727775574, + "rewards/margins": 0.9186638593673706, + "rewards/rejected": -1.5383610725402832, + "step": 898 + }, + { + "epoch": 1.0381371206113474, + "grad_norm": 58.72057690388196, + "learning_rate": 1.0307950585561704e-07, + "logits/chosen": -1.3871917724609375, + "logits/rejected": -1.3122071027755737, + "logps/chosen": -149.32241821289062, + "logps/rejected": -177.53614807128906, + "loss": 0.435, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.593085765838623, + "rewards/margins": 0.9729093909263611, + "rewards/rejected": -1.565995216369629, + "step": 900 + }, + { + "epoch": 1.0381371206113474, + "eval_logits/chosen": -1.368806004524231, + "eval_logits/rejected": -1.2883577346801758, + "eval_logps/chosen": -195.90835571289062, + "eval_logps/rejected": -166.81427001953125, + "eval_loss": 0.5445123314857483, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -1.072619080543518, + "eval_rewards/margins": 0.5572704076766968, + "eval_rewards/rejected": -1.6298894882202148, + "eval_runtime": 25.9079, + "eval_samples_per_second": 3.86, + "eval_steps_per_second": 0.965, + "step": 900 + }, + { + "epoch": 1.0404440919904838, + "grad_norm": 46.08264467438493, + "learning_rate": 1.0269466748119488e-07, + "logits/chosen": -1.3400801420211792, + "logits/rejected": -1.3535000085830688, + "logps/chosen": -231.73153686523438, + "logps/rejected": -337.78509521484375, + "loss": 0.4217, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.846785306930542, + "rewards/margins": 1.6273854970932007, + "rewards/rejected": -2.474170684814453, + "step": 902 + }, + { + "epoch": 1.04275106336962, + "grad_norm": 47.478718630060946, + "learning_rate": 1.023097891653001e-07, + "logits/chosen": -1.4396333694458008, + "logits/rejected": -1.3391939401626587, + "logps/chosen": -192.85023498535156, + "logps/rejected": -182.81593322753906, + "loss": 0.4663, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.5741289854049683, + "rewards/margins": 0.8019107580184937, + "rewards/rejected": -1.376039743423462, + "step": 904 + }, + { + "epoch": 1.0450580347487564, + "grad_norm": 42.07373362101555, + "learning_rate": 1.0192487661275784e-07, + "logits/chosen": -1.3626071214675903, + "logits/rejected": -1.3448035717010498, + "logps/chosen": -198.46783447265625, + "logps/rejected": -268.89617919921875, + "loss": 0.3584, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.7600739002227783, + "rewards/margins": 1.7053577899932861, + "rewards/rejected": -2.4654316902160645, + "step": 906 + }, + { + "epoch": 1.0473650061278927, + "grad_norm": 64.80462314401966, + "learning_rate": 1.0153993552890068e-07, + "logits/chosen": -1.287585973739624, + "logits/rejected": -1.2358753681182861, + "logps/chosen": -201.56106567382812, + "logps/rejected": -247.15313720703125, + "loss": 0.4304, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9964087009429932, + "rewards/margins": 1.5290757417678833, + "rewards/rejected": -2.525484323501587, + "step": 908 + }, + { + "epoch": 1.0496719775070291, + "grad_norm": 49.86586606129721, + "learning_rate": 1.0115497161948408e-07, + "logits/chosen": -1.4171031713485718, + "logits/rejected": -1.4290010929107666, + "logps/chosen": -183.05397033691406, + "logps/rejected": -225.85943603515625, + "loss": 0.3901, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8562214970588684, + "rewards/margins": 1.18411123752594, + "rewards/rejected": -2.040332794189453, + "step": 910 + }, + { + "epoch": 1.0519789488861653, + "grad_norm": 53.027457701933855, + "learning_rate": 1.0076999059060187e-07, + "logits/chosen": -1.299911379814148, + "logits/rejected": -1.2465267181396484, + "logps/chosen": -120.93717193603516, + "logps/rejected": -130.535400390625, + "loss": 0.4346, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7218230962753296, + "rewards/margins": 0.7116464376449585, + "rewards/rejected": -1.4334694147109985, + "step": 912 + }, + { + "epoch": 1.0542859202653017, + "grad_norm": 66.83942678385118, + "learning_rate": 1.0038499814860157e-07, + "logits/chosen": -1.4031901359558105, + "logits/rejected": -1.3700772523880005, + "logps/chosen": -198.3015899658203, + "logps/rejected": -224.3856658935547, + "loss": 0.4385, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9429548978805542, + "rewards/margins": 0.9581265449523926, + "rewards/rejected": -1.9010815620422363, + "step": 914 + }, + { + "epoch": 1.056592891644438, + "grad_norm": 39.525727639140996, + "learning_rate": 1e-07, + "logits/chosen": -1.2172764539718628, + "logits/rejected": -1.089323878288269, + "logps/chosen": -138.57373046875, + "logps/rejected": -164.6386260986328, + "loss": 0.4169, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.6064477562904358, + "rewards/margins": 1.2073055505752563, + "rewards/rejected": -1.813753366470337, + "step": 916 + }, + { + "epoch": 1.0588998630235744, + "grad_norm": 38.99856681963842, + "learning_rate": 9.961500185139842e-08, + "logits/chosen": -1.3283095359802246, + "logits/rejected": -1.3477778434753418, + "logps/chosen": -203.667236328125, + "logps/rejected": -270.05352783203125, + "loss": 0.3598, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8687477707862854, + "rewards/margins": 1.5828710794448853, + "rewards/rejected": -2.4516186714172363, + "step": 918 + }, + { + "epoch": 1.0612068344027108, + "grad_norm": 43.560042003470414, + "learning_rate": 9.923000940939814e-08, + "logits/chosen": -1.3665335178375244, + "logits/rejected": -1.3108726739883423, + "logps/chosen": -132.06370544433594, + "logps/rejected": -142.1192626953125, + "loss": 0.4212, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.608191967010498, + "rewards/margins": 0.7716971635818481, + "rewards/rejected": -1.3798891305923462, + "step": 920 + }, + { + "epoch": 1.063513805781847, + "grad_norm": 42.934097495699255, + "learning_rate": 9.884502838051594e-08, + "logits/chosen": -1.3540059328079224, + "logits/rejected": -1.2924156188964844, + "logps/chosen": -124.7350845336914, + "logps/rejected": -100.70684814453125, + "loss": 0.4639, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5759610533714294, + "rewards/margins": 0.6847679615020752, + "rewards/rejected": -1.2607290744781494, + "step": 922 + }, + { + "epoch": 1.0658207771609833, + "grad_norm": 53.04637102765547, + "learning_rate": 9.846006447109932e-08, + "logits/chosen": -1.4333351850509644, + "logits/rejected": -1.3867840766906738, + "logps/chosen": -116.47769165039062, + "logps/rejected": -108.24977111816406, + "loss": 0.4403, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.5576272010803223, + "rewards/margins": 0.5203125476837158, + "rewards/rejected": -1.077939748764038, + "step": 924 + }, + { + "epoch": 1.0681277485401197, + "grad_norm": 47.85818098965212, + "learning_rate": 9.807512338724216e-08, + "logits/chosen": -1.435934066772461, + "logits/rejected": -1.355026125907898, + "logps/chosen": -135.8258056640625, + "logps/rejected": -158.5557861328125, + "loss": 0.4239, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7292733192443848, + "rewards/margins": 1.0533298254013062, + "rewards/rejected": -1.782603144645691, + "step": 926 + }, + { + "epoch": 1.070434719919256, + "grad_norm": 72.70327780863528, + "learning_rate": 9.769021083469989e-08, + "logits/chosen": -1.4023041725158691, + "logits/rejected": -1.3046897649765015, + "logps/chosen": -141.689453125, + "logps/rejected": -162.58108520507812, + "loss": 0.408, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6851909756660461, + "rewards/margins": 1.2052912712097168, + "rewards/rejected": -1.8904823064804077, + "step": 928 + }, + { + "epoch": 1.0727416912983923, + "grad_norm": 53.100154938901355, + "learning_rate": 9.730533251880515e-08, + "logits/chosen": -1.4096457958221436, + "logits/rejected": -1.4489352703094482, + "logps/chosen": -193.12330627441406, + "logps/rejected": -209.87611389160156, + "loss": 0.372, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.7944599390029907, + "rewards/margins": 1.150054931640625, + "rewards/rejected": -1.9445148706436157, + "step": 930 + }, + { + "epoch": 1.0750486626775286, + "grad_norm": 45.4444435369967, + "learning_rate": 9.692049414438298e-08, + "logits/chosen": -1.5730786323547363, + "logits/rejected": -1.509131669998169, + "logps/chosen": -170.0063934326172, + "logps/rejected": -200.8542938232422, + "loss": 0.4116, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8289997577667236, + "rewards/margins": 1.3618550300598145, + "rewards/rejected": -2.190855026245117, + "step": 932 + }, + { + "epoch": 1.077355634056665, + "grad_norm": 54.74440505132641, + "learning_rate": 9.653570141566652e-08, + "logits/chosen": -1.3093923330307007, + "logits/rejected": -1.4106438159942627, + "logps/chosen": -144.6334686279297, + "logps/rejected": -195.14810180664062, + "loss": 0.4191, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6524811387062073, + "rewards/margins": 1.1764483451843262, + "rewards/rejected": -1.8289295434951782, + "step": 934 + }, + { + "epoch": 1.0796626054358014, + "grad_norm": 60.39174139928238, + "learning_rate": 9.61509600362122e-08, + "logits/chosen": -1.405623435974121, + "logits/rejected": -1.45893132686615, + "logps/chosen": -167.4293670654297, + "logps/rejected": -190.83563232421875, + "loss": 0.4699, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.5997176766395569, + "rewards/margins": 0.7248048186302185, + "rewards/rejected": -1.3245224952697754, + "step": 936 + }, + { + "epoch": 1.0819695768149376, + "grad_norm": 57.80810696219112, + "learning_rate": 9.576627570881549e-08, + "logits/chosen": -1.3913531303405762, + "logits/rejected": -1.3687852621078491, + "logps/chosen": -196.201416015625, + "logps/rejected": -240.41012573242188, + "loss": 0.4425, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9165946245193481, + "rewards/margins": 0.9089750051498413, + "rewards/rejected": -1.8255695104599, + "step": 938 + }, + { + "epoch": 1.084276548194074, + "grad_norm": 47.94477989169309, + "learning_rate": 9.538165413542607e-08, + "logits/chosen": -1.3398971557617188, + "logits/rejected": -1.3986927270889282, + "logps/chosen": -222.11538696289062, + "logps/rejected": -260.91241455078125, + "loss": 0.3776, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.653100848197937, + "rewards/margins": 1.5059380531311035, + "rewards/rejected": -2.159038543701172, + "step": 940 + }, + { + "epoch": 1.0865835195732103, + "grad_norm": 55.94671589153818, + "learning_rate": 9.499710101706346e-08, + "logits/chosen": -1.4237765073776245, + "logits/rejected": -1.4448227882385254, + "logps/chosen": -210.11026000976562, + "logps/rejected": -244.41043090820312, + "loss": 0.4016, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.4846809208393097, + "rewards/margins": 1.2182962894439697, + "rewards/rejected": -1.702976942062378, + "step": 942 + }, + { + "epoch": 1.0888904909523467, + "grad_norm": 51.245921498733374, + "learning_rate": 9.461262205373268e-08, + "logits/chosen": -1.3350170850753784, + "logits/rejected": -1.3754874467849731, + "logps/chosen": -167.61691284179688, + "logps/rejected": -182.33018493652344, + "loss": 0.3982, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8486964702606201, + "rewards/margins": 1.005102276802063, + "rewards/rejected": -1.8537986278533936, + "step": 944 + }, + { + "epoch": 1.0911974623314828, + "grad_norm": 53.58036009588695, + "learning_rate": 9.422822294433938e-08, + "logits/chosen": -1.2717465162277222, + "logits/rejected": -1.2129467725753784, + "logps/chosen": -150.95550537109375, + "logps/rejected": -160.45558166503906, + "loss": 0.4122, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.9215366840362549, + "rewards/margins": 0.8769155740737915, + "rewards/rejected": -1.7984521389007568, + "step": 946 + }, + { + "epoch": 1.0935044337106192, + "grad_norm": 42.3521804722769, + "learning_rate": 9.38439093866057e-08, + "logits/chosen": -1.3253931999206543, + "logits/rejected": -1.3314661979675293, + "logps/chosen": -137.23956298828125, + "logps/rejected": -147.44735717773438, + "loss": 0.4522, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.5181431770324707, + "rewards/margins": 0.8911628723144531, + "rewards/rejected": -1.4093059301376343, + "step": 948 + }, + { + "epoch": 1.0958114050897556, + "grad_norm": 52.84833057426872, + "learning_rate": 9.345968707698568e-08, + "logits/chosen": -1.51572585105896, + "logits/rejected": -1.5353983640670776, + "logps/chosen": -162.04800415039062, + "logps/rejected": -193.7029266357422, + "loss": 0.3981, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.5837193131446838, + "rewards/margins": 0.8988637328147888, + "rewards/rejected": -1.482582926750183, + "step": 950 + }, + { + "epoch": 1.098118376468892, + "grad_norm": 51.08843143746294, + "learning_rate": 9.307556171058084e-08, + "logits/chosen": -1.3710289001464844, + "logits/rejected": -1.501839280128479, + "logps/chosen": -177.76930236816406, + "logps/rejected": -265.036865234375, + "loss": 0.3462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6180956363677979, + "rewards/margins": 1.6954776048660278, + "rewards/rejected": -2.3135733604431152, + "step": 952 + }, + { + "epoch": 1.1004253478480281, + "grad_norm": 39.48580147628757, + "learning_rate": 9.269153898105571e-08, + "logits/chosen": -1.3733117580413818, + "logits/rejected": -1.370259404182434, + "logps/chosen": -196.75180053710938, + "logps/rejected": -220.63250732421875, + "loss": 0.3896, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.897487223148346, + "rewards/margins": 1.2682368755340576, + "rewards/rejected": -2.165724039077759, + "step": 954 + }, + { + "epoch": 1.1027323192271645, + "grad_norm": 40.58851230363069, + "learning_rate": 9.230762458055362e-08, + "logits/chosen": -1.3409641981124878, + "logits/rejected": -1.5087528228759766, + "logps/chosen": -162.93719482421875, + "logps/rejected": -228.28309631347656, + "loss": 0.4086, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7766309380531311, + "rewards/margins": 1.0290602445602417, + "rewards/rejected": -1.8056910037994385, + "step": 956 + }, + { + "epoch": 1.105039290606301, + "grad_norm": 48.07092159296336, + "learning_rate": 9.192382419961207e-08, + "logits/chosen": -1.2765452861785889, + "logits/rejected": -1.3672202825546265, + "logps/chosen": -156.22647094726562, + "logps/rejected": -200.10728454589844, + "loss": 0.4416, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.6270304918289185, + "rewards/margins": 0.8705189824104309, + "rewards/rejected": -1.4975494146347046, + "step": 958 + }, + { + "epoch": 1.1073462619854373, + "grad_norm": 58.85426890701801, + "learning_rate": 9.15401435270786e-08, + "logits/chosen": -1.4625585079193115, + "logits/rejected": -1.4515876770019531, + "logps/chosen": -175.11221313476562, + "logps/rejected": -179.67308044433594, + "loss": 0.433, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8706232905387878, + "rewards/margins": 0.8775316476821899, + "rewards/rejected": -1.748154878616333, + "step": 960 + }, + { + "epoch": 1.1096532333645737, + "grad_norm": 48.13444718101857, + "learning_rate": 9.115658825002634e-08, + "logits/chosen": -1.4449965953826904, + "logits/rejected": -1.4944627285003662, + "logps/chosen": -148.5018768310547, + "logps/rejected": -192.43560791015625, + "loss": 0.4478, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6375682353973389, + "rewards/margins": 1.2029744386672974, + "rewards/rejected": -1.8405426740646362, + "step": 962 + }, + { + "epoch": 1.1119602047437098, + "grad_norm": 45.630310836419994, + "learning_rate": 9.077316405366981e-08, + "logits/chosen": -1.5309690237045288, + "logits/rejected": -1.4158992767333984, + "logps/chosen": -237.97152709960938, + "logps/rejected": -307.7381286621094, + "loss": 0.438, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.753577470779419, + "rewards/margins": 1.5821527242660522, + "rewards/rejected": -2.3357303142547607, + "step": 964 + }, + { + "epoch": 1.1142671761228462, + "grad_norm": 54.60688529032658, + "learning_rate": 9.03898766212805e-08, + "logits/chosen": -1.425299048423767, + "logits/rejected": -1.4033348560333252, + "logps/chosen": -216.15237426757812, + "logps/rejected": -262.237548828125, + "loss": 0.4143, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.9375750422477722, + "rewards/margins": 1.173370122909546, + "rewards/rejected": -2.110945224761963, + "step": 966 + }, + { + "epoch": 1.1165741475019826, + "grad_norm": 47.343958384978336, + "learning_rate": 9.000673163410286e-08, + "logits/chosen": -1.194286823272705, + "logits/rejected": -1.2353427410125732, + "logps/chosen": -137.03073120117188, + "logps/rejected": -158.67300415039062, + "loss": 0.4048, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8332533240318298, + "rewards/margins": 0.733294665813446, + "rewards/rejected": -1.5665481090545654, + "step": 968 + }, + { + "epoch": 1.118881118881119, + "grad_norm": 48.41236984702601, + "learning_rate": 8.962373477126982e-08, + "logits/chosen": -1.3657209873199463, + "logits/rejected": -1.4510833024978638, + "logps/chosen": -112.81056213378906, + "logps/rejected": -161.59786987304688, + "loss": 0.4003, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.6659589409828186, + "rewards/margins": 0.8242032527923584, + "rewards/rejected": -1.4901621341705322, + "step": 970 + }, + { + "epoch": 1.1211880902602551, + "grad_norm": 47.3495191045043, + "learning_rate": 8.924089170971887e-08, + "logits/chosen": -1.4114599227905273, + "logits/rejected": -1.4022752046585083, + "logps/chosen": -142.42562866210938, + "logps/rejected": -152.8037872314453, + "loss": 0.3903, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.6654877662658691, + "rewards/margins": 0.8193020820617676, + "rewards/rejected": -1.4847897291183472, + "step": 972 + }, + { + "epoch": 1.1234950616393915, + "grad_norm": 56.44959316652502, + "learning_rate": 8.885820812410769e-08, + "logits/chosen": -1.2863538265228271, + "logits/rejected": -1.3213986158370972, + "logps/chosen": -161.03211975097656, + "logps/rejected": -256.6575012207031, + "loss": 0.4055, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.6689996123313904, + "rewards/margins": 1.2615712881088257, + "rewards/rejected": -1.9305710792541504, + "step": 974 + }, + { + "epoch": 1.1258020330185279, + "grad_norm": 48.64694104803259, + "learning_rate": 8.847568968673024e-08, + "logits/chosen": -1.2597205638885498, + "logits/rejected": -1.2746870517730713, + "logps/chosen": -167.2078399658203, + "logps/rejected": -216.3764190673828, + "loss": 0.4015, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.7549591064453125, + "rewards/margins": 1.0452518463134766, + "rewards/rejected": -1.800210952758789, + "step": 976 + }, + { + "epoch": 1.1281090043976643, + "grad_norm": 54.54545983114275, + "learning_rate": 8.809334206743251e-08, + "logits/chosen": -1.3839998245239258, + "logits/rejected": -1.2565964460372925, + "logps/chosen": -220.92575073242188, + "logps/rejected": -232.58221435546875, + "loss": 0.3953, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8072776794433594, + "rewards/margins": 1.098924994468689, + "rewards/rejected": -1.9062025547027588, + "step": 978 + }, + { + "epoch": 1.1304159757768004, + "grad_norm": 50.669478447045044, + "learning_rate": 8.77111709335286e-08, + "logits/chosen": -1.3114320039749146, + "logits/rejected": -1.3967440128326416, + "logps/chosen": -183.023193359375, + "logps/rejected": -242.5177764892578, + "loss": 0.4064, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8590079545974731, + "rewards/margins": 1.4726592302322388, + "rewards/rejected": -2.331667184829712, + "step": 980 + }, + { + "epoch": 1.1327229471559368, + "grad_norm": 45.01192442719795, + "learning_rate": 8.732918194971663e-08, + "logits/chosen": -1.4155831336975098, + "logits/rejected": -1.362624168395996, + "logps/chosen": -142.08741760253906, + "logps/rejected": -175.91061401367188, + "loss": 0.3832, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.6290794014930725, + "rewards/margins": 1.268273115158081, + "rewards/rejected": -1.8973525762557983, + "step": 982 + }, + { + "epoch": 1.1350299185350732, + "grad_norm": 52.363728579221444, + "learning_rate": 8.694738077799486e-08, + "logits/chosen": -1.3025627136230469, + "logits/rejected": -1.4123029708862305, + "logps/chosen": -118.02716827392578, + "logps/rejected": -147.27581787109375, + "loss": 0.4048, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.43484801054000854, + "rewards/margins": 0.925700306892395, + "rewards/rejected": -1.3605482578277588, + "step": 984 + }, + { + "epoch": 1.1373368899142096, + "grad_norm": 48.92241704444725, + "learning_rate": 8.656577307757766e-08, + "logits/chosen": -1.397874355316162, + "logits/rejected": -1.3083161115646362, + "logps/chosen": -183.96243286132812, + "logps/rejected": -178.71238708496094, + "loss": 0.4146, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.0279752016067505, + "rewards/margins": 0.7176418304443359, + "rewards/rejected": -1.7456170320510864, + "step": 986 + }, + { + "epoch": 1.139643861293346, + "grad_norm": 47.66250134338094, + "learning_rate": 8.618436450481181e-08, + "logits/chosen": -1.4895576238632202, + "logits/rejected": -1.5260589122772217, + "logps/chosen": -224.04379272460938, + "logps/rejected": -272.7261962890625, + "loss": 0.3914, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.801969587802887, + "rewards/margins": 1.2178035974502563, + "rewards/rejected": -2.019773006439209, + "step": 988 + }, + { + "epoch": 1.141950832672482, + "grad_norm": 65.26400202215471, + "learning_rate": 8.580316071309234e-08, + "logits/chosen": -1.4646153450012207, + "logits/rejected": -1.486820936203003, + "logps/chosen": -154.2437744140625, + "logps/rejected": -184.03640747070312, + "loss": 0.4269, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8698714971542358, + "rewards/margins": 1.1085283756256104, + "rewards/rejected": -1.9783998727798462, + "step": 990 + }, + { + "epoch": 1.1442578040516185, + "grad_norm": 55.38310724678633, + "learning_rate": 8.542216735277917e-08, + "logits/chosen": -1.3877445459365845, + "logits/rejected": -1.426324725151062, + "logps/chosen": -226.8168182373047, + "logps/rejected": -320.26861572265625, + "loss": 0.406, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9063752293586731, + "rewards/margins": 1.4490259885787964, + "rewards/rejected": -2.3554012775421143, + "step": 992 + }, + { + "epoch": 1.1465647754307549, + "grad_norm": 55.30320117524687, + "learning_rate": 8.504139007111289e-08, + "logits/chosen": -1.524666666984558, + "logits/rejected": -1.4656498432159424, + "logps/chosen": -180.05706787109375, + "logps/rejected": -224.0005340576172, + "loss": 0.4497, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8488324880599976, + "rewards/margins": 1.1417200565338135, + "rewards/rejected": -1.9905524253845215, + "step": 994 + }, + { + "epoch": 1.1488717468098912, + "grad_norm": 59.50571311635345, + "learning_rate": 8.466083451213145e-08, + "logits/chosen": -1.3895516395568848, + "logits/rejected": -1.3458820581436157, + "logps/chosen": -222.20672607421875, + "logps/rejected": -246.43609619140625, + "loss": 0.3834, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9480006694793701, + "rewards/margins": 1.4348657131195068, + "rewards/rejected": -2.382866621017456, + "step": 996 + }, + { + "epoch": 1.1511787181890274, + "grad_norm": 52.90540723372322, + "learning_rate": 8.428050631658627e-08, + "logits/chosen": -1.463295340538025, + "logits/rejected": -1.527518630027771, + "logps/chosen": -169.41342163085938, + "logps/rejected": -211.99317932128906, + "loss": 0.409, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.7570784687995911, + "rewards/margins": 1.2448766231536865, + "rewards/rejected": -2.001955032348633, + "step": 998 + }, + { + "epoch": 1.1534856895681638, + "grad_norm": 62.46156202928193, + "learning_rate": 8.39004111218587e-08, + "logits/chosen": -1.4075658321380615, + "logits/rejected": -1.402864933013916, + "logps/chosen": -174.88134765625, + "logps/rejected": -177.6551055908203, + "loss": 0.3574, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.7704441547393799, + "rewards/margins": 0.945233166217804, + "rewards/rejected": -1.715677261352539, + "step": 1000 + }, + { + "epoch": 1.1534856895681638, + "eval_logits/chosen": -1.3622280359268188, + "eval_logits/rejected": -1.2871321439743042, + "eval_logps/chosen": -197.57440185546875, + "eval_logps/rejected": -168.7324676513672, + "eval_loss": 0.5431498885154724, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -1.239221215248108, + "eval_rewards/margins": 0.582485556602478, + "eval_rewards/rejected": -1.8217066526412964, + "eval_runtime": 23.3967, + "eval_samples_per_second": 4.274, + "eval_steps_per_second": 1.069, + "step": 1000 + }, + { + "epoch": 1.1557926609473002, + "grad_norm": 44.181005655906645, + "learning_rate": 8.352055456187644e-08, + "logits/chosen": -1.3333595991134644, + "logits/rejected": -1.2514859437942505, + "logps/chosen": -184.87631225585938, + "logps/rejected": -213.40603637695312, + "loss": 0.3575, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8501152992248535, + "rewards/margins": 1.3011317253112793, + "rewards/rejected": -2.151247262954712, + "step": 1002 + }, + { + "epoch": 1.1580996323264365, + "grad_norm": 42.72442121434693, + "learning_rate": 8.314094226703007e-08, + "logits/chosen": -1.4761807918548584, + "logits/rejected": -1.5279865264892578, + "logps/chosen": -138.0496826171875, + "logps/rejected": -223.12356567382812, + "loss": 0.4402, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.5688883662223816, + "rewards/margins": 1.421848177909851, + "rewards/rejected": -1.990736722946167, + "step": 1004 + }, + { + "epoch": 1.1604066037055727, + "grad_norm": 42.42576201307595, + "learning_rate": 8.276157986408958e-08, + "logits/chosen": -1.4122170209884644, + "logits/rejected": -1.5143085718154907, + "logps/chosen": -150.01748657226562, + "logps/rejected": -205.741943359375, + "loss": 0.408, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.709044873714447, + "rewards/margins": 1.0970107316970825, + "rewards/rejected": -1.8060553073883057, + "step": 1006 + }, + { + "epoch": 1.162713575084709, + "grad_norm": 46.98053894558455, + "learning_rate": 8.238247297612091e-08, + "logits/chosen": -1.4477453231811523, + "logits/rejected": -1.456710696220398, + "logps/chosen": -189.01307678222656, + "logps/rejected": -224.03646850585938, + "loss": 0.3941, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.5644448399543762, + "rewards/margins": 1.1727917194366455, + "rewards/rejected": -1.737236738204956, + "step": 1008 + }, + { + "epoch": 1.1650205464638455, + "grad_norm": 45.113641573334704, + "learning_rate": 8.200362722240272e-08, + "logits/chosen": -1.368615746498108, + "logits/rejected": -1.452118992805481, + "logps/chosen": -115.6017837524414, + "logps/rejected": -158.69747924804688, + "loss": 0.4656, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.7957598567008972, + "rewards/margins": 0.9295057654380798, + "rewards/rejected": -1.725265622138977, + "step": 1010 + }, + { + "epoch": 1.1673275178429818, + "grad_norm": 52.770081650362485, + "learning_rate": 8.162504821834295e-08, + "logits/chosen": -1.4406243562698364, + "logits/rejected": -1.3740017414093018, + "logps/chosen": -177.7287139892578, + "logps/rejected": -185.05763244628906, + "loss": 0.4228, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.7840394973754883, + "rewards/margins": 0.9432986378669739, + "rewards/rejected": -1.7273380756378174, + "step": 1012 + }, + { + "epoch": 1.1696344892221182, + "grad_norm": 40.23487524634427, + "learning_rate": 8.124674157539577e-08, + "logits/chosen": -1.3667372465133667, + "logits/rejected": -1.3167593479156494, + "logps/chosen": -205.84791564941406, + "logps/rejected": -246.4522705078125, + "loss": 0.3889, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8650726079940796, + "rewards/margins": 1.2019354104995728, + "rewards/rejected": -2.0670082569122314, + "step": 1014 + }, + { + "epoch": 1.1719414606012544, + "grad_norm": 64.51013858621744, + "learning_rate": 8.086871290097821e-08, + "logits/chosen": -1.3617969751358032, + "logits/rejected": -1.4326754808425903, + "logps/chosen": -137.392578125, + "logps/rejected": -219.85238647460938, + "loss": 0.4504, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.033658742904663, + "rewards/margins": 0.985666811466217, + "rewards/rejected": -2.0193252563476562, + "step": 1016 + }, + { + "epoch": 1.1742484319803907, + "grad_norm": 40.94400015951729, + "learning_rate": 8.049096779838717e-08, + "logits/chosen": -1.4027369022369385, + "logits/rejected": -1.3317725658416748, + "logps/chosen": -178.0692138671875, + "logps/rejected": -199.9445343017578, + "loss": 0.3641, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.67464679479599, + "rewards/margins": 1.4650667905807495, + "rewards/rejected": -2.1397135257720947, + "step": 1018 + }, + { + "epoch": 1.1765554033595271, + "grad_norm": 58.285176845680446, + "learning_rate": 8.011351186671635e-08, + "logits/chosen": -1.3443338871002197, + "logits/rejected": -1.4246101379394531, + "logps/chosen": -186.61431884765625, + "logps/rejected": -230.87966918945312, + "loss": 0.4095, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.9058922529220581, + "rewards/margins": 1.0936301946640015, + "rewards/rejected": -1.9995224475860596, + "step": 1020 + }, + { + "epoch": 1.1788623747386633, + "grad_norm": 40.74344848622249, + "learning_rate": 7.973635070077318e-08, + "logits/chosen": -1.3627556562423706, + "logits/rejected": -1.2945423126220703, + "logps/chosen": -169.3376922607422, + "logps/rejected": -196.9662628173828, + "loss": 0.3688, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8068114519119263, + "rewards/margins": 1.142544150352478, + "rewards/rejected": -1.9493556022644043, + "step": 1022 + }, + { + "epoch": 1.1811693461177997, + "grad_norm": 52.371921447490024, + "learning_rate": 7.935948989099605e-08, + "logits/chosen": -1.3821394443511963, + "logits/rejected": -1.385388970375061, + "logps/chosen": -188.00381469726562, + "logps/rejected": -241.38780212402344, + "loss": 0.4173, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.226588487625122, + "rewards/margins": 1.1497639417648315, + "rewards/rejected": -2.376352310180664, + "step": 1024 + }, + { + "epoch": 1.183476317496936, + "grad_norm": 47.37840376254983, + "learning_rate": 7.898293502337122e-08, + "logits/chosen": -1.3714951276779175, + "logits/rejected": -1.283757209777832, + "logps/chosen": -136.48751831054688, + "logps/rejected": -152.93016052246094, + "loss": 0.4274, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7810596227645874, + "rewards/margins": 0.9014545679092407, + "rewards/rejected": -1.6825141906738281, + "step": 1026 + }, + { + "epoch": 1.1857832888760724, + "grad_norm": 50.908179365332344, + "learning_rate": 7.860669167935028e-08, + "logits/chosen": -1.3071757555007935, + "logits/rejected": -1.287663221359253, + "logps/chosen": -230.11318969726562, + "logps/rejected": -298.1128234863281, + "loss": 0.4266, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.0475901365280151, + "rewards/margins": 1.5074639320373535, + "rewards/rejected": -2.555053949356079, + "step": 1028 + }, + { + "epoch": 1.1880902602552088, + "grad_norm": 44.365837449973405, + "learning_rate": 7.823076543576717e-08, + "logits/chosen": -1.4471302032470703, + "logits/rejected": -1.442579984664917, + "logps/chosen": -164.46011352539062, + "logps/rejected": -170.95596313476562, + "loss": 0.3882, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.5805782079696655, + "rewards/margins": 0.9794151782989502, + "rewards/rejected": -1.5599933862686157, + "step": 1030 + }, + { + "epoch": 1.190397231634345, + "grad_norm": 49.23836352683586, + "learning_rate": 7.785516186475574e-08, + "logits/chosen": -1.2814298868179321, + "logits/rejected": -1.3165457248687744, + "logps/chosen": -154.7665557861328, + "logps/rejected": -175.01173400878906, + "loss": 0.399, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9837753176689148, + "rewards/margins": 1.1613128185272217, + "rewards/rejected": -2.145087957382202, + "step": 1032 + }, + { + "epoch": 1.1927042030134813, + "grad_norm": 51.14719932201333, + "learning_rate": 7.747988653366697e-08, + "logits/chosen": -1.5139728784561157, + "logits/rejected": -1.4376604557037354, + "logps/chosen": -156.8878173828125, + "logps/rejected": -158.37210083007812, + "loss": 0.3949, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8450059294700623, + "rewards/margins": 0.7959898710250854, + "rewards/rejected": -1.6409958600997925, + "step": 1034 + }, + { + "epoch": 1.1950111743926177, + "grad_norm": 49.56300160493831, + "learning_rate": 7.710494500498662e-08, + "logits/chosen": -1.4593157768249512, + "logits/rejected": -1.3894437551498413, + "logps/chosen": -106.61005401611328, + "logps/rejected": -107.51506042480469, + "loss": 0.3957, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8094179034233093, + "rewards/margins": 0.61183100938797, + "rewards/rejected": -1.4212487936019897, + "step": 1036 + }, + { + "epoch": 1.197318145771754, + "grad_norm": 46.81170703509067, + "learning_rate": 7.673034283625257e-08, + "logits/chosen": -1.4491219520568848, + "logits/rejected": -1.5339434146881104, + "logps/chosen": -157.70791625976562, + "logps/rejected": -199.98257446289062, + "loss": 0.4217, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8838984966278076, + "rewards/margins": 1.085174798965454, + "rewards/rejected": -1.9690735340118408, + "step": 1038 + }, + { + "epoch": 1.1996251171508903, + "grad_norm": 59.481661469725495, + "learning_rate": 7.635608557997271e-08, + "logits/chosen": -1.426401138305664, + "logits/rejected": -1.4868569374084473, + "logps/chosen": -175.43914794921875, + "logps/rejected": -214.3563232421875, + "loss": 0.4172, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8752603530883789, + "rewards/margins": 1.1431567668914795, + "rewards/rejected": -2.0184173583984375, + "step": 1040 + }, + { + "epoch": 1.2019320885300266, + "grad_norm": 53.22583112987165, + "learning_rate": 7.598217878354236e-08, + "logits/chosen": -1.4433585405349731, + "logits/rejected": -1.4538565874099731, + "logps/chosen": -157.7897186279297, + "logps/rejected": -229.7288055419922, + "loss": 0.4108, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8286512494087219, + "rewards/margins": 1.2663472890853882, + "rewards/rejected": -2.094998598098755, + "step": 1042 + }, + { + "epoch": 1.204239059909163, + "grad_norm": 49.60437729117302, + "learning_rate": 7.560862798916228e-08, + "logits/chosen": -1.4474220275878906, + "logits/rejected": -1.4672783613204956, + "logps/chosen": -159.66246032714844, + "logps/rejected": -196.7609405517578, + "loss": 0.4004, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.754623293876648, + "rewards/margins": 1.0629355907440186, + "rewards/rejected": -1.8175588846206665, + "step": 1044 + }, + { + "epoch": 1.2065460312882994, + "grad_norm": 47.76416560805182, + "learning_rate": 7.52354387337564e-08, + "logits/chosen": -1.3766390085220337, + "logits/rejected": -1.453482985496521, + "logps/chosen": -127.11229705810547, + "logps/rejected": -186.68060302734375, + "loss": 0.4089, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.5222644209861755, + "rewards/margins": 1.1581171751022339, + "rewards/rejected": -1.6803816556930542, + "step": 1046 + }, + { + "epoch": 1.2088530026674356, + "grad_norm": 44.83004511877691, + "learning_rate": 7.486261654888972e-08, + "logits/chosen": -1.2795186042785645, + "logits/rejected": -1.3099991083145142, + "logps/chosen": -132.81536865234375, + "logps/rejected": -216.51211547851562, + "loss": 0.4054, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7403107285499573, + "rewards/margins": 1.632882833480835, + "rewards/rejected": -2.3731932640075684, + "step": 1048 + }, + { + "epoch": 1.211159974046572, + "grad_norm": 46.48503074566374, + "learning_rate": 7.449016696068645e-08, + "logits/chosen": -1.3228113651275635, + "logits/rejected": -1.3149969577789307, + "logps/chosen": -153.4790496826172, + "logps/rejected": -168.47897338867188, + "loss": 0.3849, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8511451482772827, + "rewards/margins": 0.9793115854263306, + "rewards/rejected": -1.8304567337036133, + "step": 1050 + }, + { + "epoch": 1.2134669454257083, + "grad_norm": 45.82113493125643, + "learning_rate": 7.411809548974791e-08, + "logits/chosen": -1.3215563297271729, + "logits/rejected": -1.419891119003296, + "logps/chosen": -229.16896057128906, + "logps/rejected": -293.1673278808594, + "loss": 0.364, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1792607307434082, + "rewards/margins": 1.518965482711792, + "rewards/rejected": -2.6982262134552, + "step": 1052 + }, + { + "epoch": 1.2157739168048447, + "grad_norm": 49.08087412471043, + "learning_rate": 7.374640765107095e-08, + "logits/chosen": -1.4140170812606812, + "logits/rejected": -1.4835015535354614, + "logps/chosen": -223.50379943847656, + "logps/rejected": -291.9320373535156, + "loss": 0.4479, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.0911102294921875, + "rewards/margins": 1.4864944219589233, + "rewards/rejected": -2.5776045322418213, + "step": 1054 + }, + { + "epoch": 1.218080888183981, + "grad_norm": 56.10565437870734, + "learning_rate": 7.337510895396591e-08, + "logits/chosen": -1.2541605234146118, + "logits/rejected": -1.3272130489349365, + "logps/chosen": -124.03968048095703, + "logps/rejected": -178.76068115234375, + "loss": 0.3705, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6784568428993225, + "rewards/margins": 1.2771177291870117, + "rewards/rejected": -1.9555747509002686, + "step": 1056 + }, + { + "epoch": 1.2203878595631172, + "grad_norm": 55.75804088847802, + "learning_rate": 7.300420490197523e-08, + "logits/chosen": -1.311789631843567, + "logits/rejected": -1.3173766136169434, + "logps/chosen": -174.1820526123047, + "logps/rejected": -234.0801239013672, + "loss": 0.4168, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9961414933204651, + "rewards/margins": 1.0488609075546265, + "rewards/rejected": -2.0450022220611572, + "step": 1058 + }, + { + "epoch": 1.2226948309422536, + "grad_norm": 40.489341178222624, + "learning_rate": 7.263370099279171e-08, + "logits/chosen": -1.188340663909912, + "logits/rejected": -1.1492575407028198, + "logps/chosen": -163.66763305664062, + "logps/rejected": -190.305908203125, + "loss": 0.3879, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9333768486976624, + "rewards/margins": 1.2128130197525024, + "rewards/rejected": -2.1461899280548096, + "step": 1060 + }, + { + "epoch": 1.22500180232139, + "grad_norm": 50.037739270264275, + "learning_rate": 7.226360271817708e-08, + "logits/chosen": -1.3506156206130981, + "logits/rejected": -1.3269940614700317, + "logps/chosen": -206.15762329101562, + "logps/rejected": -231.0203094482422, + "loss": 0.3727, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.9508618712425232, + "rewards/margins": 1.099410891532898, + "rewards/rejected": -2.0502727031707764, + "step": 1062 + }, + { + "epoch": 1.2273087737005262, + "grad_norm": 49.869486100814115, + "learning_rate": 7.189391556388058e-08, + "logits/chosen": -1.4331234693527222, + "logits/rejected": -1.3977779150009155, + "logps/chosen": -209.97015380859375, + "logps/rejected": -260.2592468261719, + "loss": 0.4164, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1883045434951782, + "rewards/margins": 1.4966893196105957, + "rewards/rejected": -2.6849939823150635, + "step": 1064 + }, + { + "epoch": 1.2296157450796625, + "grad_norm": 49.45531524736339, + "learning_rate": 7.152464500955768e-08, + "logits/chosen": -1.3637323379516602, + "logits/rejected": -1.3475160598754883, + "logps/chosen": -194.37664794921875, + "logps/rejected": -210.33851623535156, + "loss": 0.3775, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.1909425258636475, + "rewards/margins": 1.4508752822875977, + "rewards/rejected": -2.641818046569824, + "step": 1066 + }, + { + "epoch": 1.231922716458799, + "grad_norm": 52.71504231247347, + "learning_rate": 7.115579652868878e-08, + "logits/chosen": -1.2816321849822998, + "logits/rejected": -1.2634057998657227, + "logps/chosen": -153.44662475585938, + "logps/rejected": -207.44244384765625, + "loss": 0.3713, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.833113431930542, + "rewards/margins": 1.5628983974456787, + "rewards/rejected": -2.3960118293762207, + "step": 1068 + }, + { + "epoch": 1.2342296878379353, + "grad_norm": 56.75539650441376, + "learning_rate": 7.078737558849818e-08, + "logits/chosen": -1.3372987508773804, + "logits/rejected": -1.3544334173202515, + "logps/chosen": -152.42861938476562, + "logps/rejected": -215.0513916015625, + "loss": 0.4061, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9685344696044922, + "rewards/margins": 1.3745617866516113, + "rewards/rejected": -2.3430962562561035, + "step": 1070 + }, + { + "epoch": 1.2365366592170717, + "grad_norm": 54.12178163249736, + "learning_rate": 7.041938764987296e-08, + "logits/chosen": -1.4193816184997559, + "logits/rejected": -1.4047815799713135, + "logps/chosen": -206.84002685546875, + "logps/rejected": -220.5758819580078, + "loss": 0.3785, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9423998594284058, + "rewards/margins": 1.1570698022842407, + "rewards/rejected": -2.0994696617126465, + "step": 1072 + }, + { + "epoch": 1.2388436305962078, + "grad_norm": 41.420273500877144, + "learning_rate": 7.005183816728213e-08, + "logits/chosen": -1.4170185327529907, + "logits/rejected": -1.4868308305740356, + "logps/chosen": -205.6683807373047, + "logps/rejected": -309.4522705078125, + "loss": 0.3775, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.0644909143447876, + "rewards/margins": 1.8807792663574219, + "rewards/rejected": -2.945270299911499, + "step": 1074 + }, + { + "epoch": 1.2411506019753442, + "grad_norm": 54.934353878039, + "learning_rate": 6.968473258869565e-08, + "logits/chosen": -1.352636694908142, + "logits/rejected": -1.449910283088684, + "logps/chosen": -168.9800567626953, + "logps/rejected": -243.92965698242188, + "loss": 0.3853, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8759716749191284, + "rewards/margins": 1.3743195533752441, + "rewards/rejected": -2.250291585922241, + "step": 1076 + }, + { + "epoch": 1.2434575733544806, + "grad_norm": 48.00874815986513, + "learning_rate": 6.931807635550383e-08, + "logits/chosen": -1.519219994544983, + "logits/rejected": -1.5146820545196533, + "logps/chosen": -205.1129913330078, + "logps/rejected": -249.6070098876953, + "loss": 0.3314, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9951453804969788, + "rewards/margins": 1.2512072324752808, + "rewards/rejected": -2.2463526725769043, + "step": 1078 + }, + { + "epoch": 1.245764544733617, + "grad_norm": 50.334923119590876, + "learning_rate": 6.89518749024365e-08, + "logits/chosen": -1.4772987365722656, + "logits/rejected": -1.4446271657943726, + "logps/chosen": -154.9414520263672, + "logps/rejected": -164.93331909179688, + "loss": 0.3882, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.6665012240409851, + "rewards/margins": 0.8699983954429626, + "rewards/rejected": -1.5364995002746582, + "step": 1080 + }, + { + "epoch": 1.2480715161127531, + "grad_norm": 57.712585093330645, + "learning_rate": 6.858613365748267e-08, + "logits/chosen": -1.2888410091400146, + "logits/rejected": -1.3315074443817139, + "logps/chosen": -234.24783325195312, + "logps/rejected": -301.7956848144531, + "loss": 0.4517, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.1589394807815552, + "rewards/margins": 1.467663288116455, + "rewards/rejected": -2.6266026496887207, + "step": 1082 + }, + { + "epoch": 1.2503784874918895, + "grad_norm": 47.1157877073686, + "learning_rate": 6.822085804180984e-08, + "logits/chosen": -1.4558112621307373, + "logits/rejected": -1.355411171913147, + "logps/chosen": -205.1348876953125, + "logps/rejected": -189.77523803710938, + "loss": 0.3582, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7983089685440063, + "rewards/margins": 1.0706913471221924, + "rewards/rejected": -1.8690004348754883, + "step": 1084 + }, + { + "epoch": 1.252685458871026, + "grad_norm": 52.21642067042654, + "learning_rate": 6.785605346968386e-08, + "logits/chosen": -1.4896191358566284, + "logits/rejected": -1.5129355192184448, + "logps/chosen": -217.2392120361328, + "logps/rejected": -240.3934783935547, + "loss": 0.4285, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.158021330833435, + "rewards/margins": 1.2918341159820557, + "rewards/rejected": -2.4498555660247803, + "step": 1086 + }, + { + "epoch": 1.2549924302501623, + "grad_norm": 54.48719637145695, + "learning_rate": 6.749172534838848e-08, + "logits/chosen": -1.4208894968032837, + "logits/rejected": -1.3335964679718018, + "logps/chosen": -180.71263122558594, + "logps/rejected": -172.00143432617188, + "loss": 0.3962, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.8040263056755066, + "rewards/margins": 0.9753020405769348, + "rewards/rejected": -1.7793283462524414, + "step": 1088 + }, + { + "epoch": 1.2572994016292984, + "grad_norm": 44.86151315350074, + "learning_rate": 6.712787907814541e-08, + "logits/chosen": -1.3919479846954346, + "logits/rejected": -1.3973674774169922, + "logps/chosen": -234.341796875, + "logps/rejected": -334.7096862792969, + "loss": 0.3718, + "rewards/accuracies": 1.0, + "rewards/chosen": -1.0190818309783936, + "rewards/margins": 2.0405924320220947, + "rewards/rejected": -3.0596742630004883, + "step": 1090 + }, + { + "epoch": 1.2596063730084348, + "grad_norm": 49.585862978376184, + "learning_rate": 6.676452005203405e-08, + "logits/chosen": -1.3598554134368896, + "logits/rejected": -1.435031533241272, + "logps/chosen": -164.2373809814453, + "logps/rejected": -212.1796875, + "loss": 0.4162, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9823134541511536, + "rewards/margins": 1.0851922035217285, + "rewards/rejected": -2.0675055980682373, + "step": 1092 + }, + { + "epoch": 1.2619133443875712, + "grad_norm": 61.87058570329516, + "learning_rate": 6.640165365591175e-08, + "logits/chosen": -1.420425295829773, + "logits/rejected": -1.3986551761627197, + "logps/chosen": -179.66464233398438, + "logps/rejected": -229.3308563232422, + "loss": 0.4133, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8618680238723755, + "rewards/margins": 1.1394115686416626, + "rewards/rejected": -2.001279830932617, + "step": 1094 + }, + { + "epoch": 1.2642203157667076, + "grad_norm": 38.55062311194488, + "learning_rate": 6.603928526833386e-08, + "logits/chosen": -1.4957396984100342, + "logits/rejected": -1.484830617904663, + "logps/chosen": -195.9167938232422, + "logps/rejected": -239.69021606445312, + "loss": 0.3935, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8891066312789917, + "rewards/margins": 1.3260498046875, + "rewards/rejected": -2.2151565551757812, + "step": 1096 + }, + { + "epoch": 1.266527287145844, + "grad_norm": 47.6968545203006, + "learning_rate": 6.567742026047405e-08, + "logits/chosen": -1.3937017917633057, + "logits/rejected": -1.4098970890045166, + "logps/chosen": -198.03074645996094, + "logps/rejected": -259.1086120605469, + "loss": 0.4311, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.115444302558899, + "rewards/margins": 1.3988248109817505, + "rewards/rejected": -2.5142691135406494, + "step": 1098 + }, + { + "epoch": 1.26883425852498, + "grad_norm": 53.01531141976359, + "learning_rate": 6.531606399604472e-08, + "logits/chosen": -1.3574663400650024, + "logits/rejected": -1.3760308027267456, + "logps/chosen": -195.03289794921875, + "logps/rejected": -241.59120178222656, + "loss": 0.3629, + "rewards/accuracies": 0.96875, + "rewards/chosen": -1.0627293586730957, + "rewards/margins": 1.5820095539093018, + "rewards/rejected": -2.6447389125823975, + "step": 1100 + }, + { + "epoch": 1.26883425852498, + "eval_logits/chosen": -1.346447467803955, + "eval_logits/rejected": -1.2698445320129395, + "eval_logps/chosen": -198.6750030517578, + "eval_logps/rejected": -170.5380096435547, + "eval_loss": 0.5291120409965515, + "eval_rewards/accuracies": 0.7599999904632568, + "eval_rewards/chosen": -1.3492814302444458, + "eval_rewards/margins": 0.6529796719551086, + "eval_rewards/rejected": -2.002261161804199, + "eval_runtime": 23.3211, + "eval_samples_per_second": 4.288, + "eval_steps_per_second": 1.072, + "step": 1100 + }, + { + "epoch": 1.2711412299041165, + "grad_norm": 54.927019109306244, + "learning_rate": 6.49552218312174e-08, + "logits/chosen": -1.49580717086792, + "logits/rejected": -1.5456180572509766, + "logps/chosen": -191.8947296142578, + "logps/rejected": -221.13339233398438, + "loss": 0.4337, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8337838649749756, + "rewards/margins": 1.0560129880905151, + "rewards/rejected": -1.8897968530654907, + "step": 1102 + }, + { + "epoch": 1.2734482012832529, + "grad_norm": 57.11044799181979, + "learning_rate": 6.459489911454348e-08, + "logits/chosen": -1.2576816082000732, + "logits/rejected": -1.3321532011032104, + "logps/chosen": -153.68167114257812, + "logps/rejected": -187.43959045410156, + "loss": 0.4551, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.0336787700653076, + "rewards/margins": 0.5679832100868225, + "rewards/rejected": -1.601662039756775, + "step": 1104 + }, + { + "epoch": 1.275755172662389, + "grad_norm": 48.9077605697985, + "learning_rate": 6.423510118687482e-08, + "logits/chosen": -1.2373907566070557, + "logits/rejected": -1.2709932327270508, + "logps/chosen": -142.4616241455078, + "logps/rejected": -163.26995849609375, + "loss": 0.4101, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.8073585033416748, + "rewards/margins": 0.8551322817802429, + "rewards/rejected": -1.662490725517273, + "step": 1106 + }, + { + "epoch": 1.2780621440415254, + "grad_norm": 56.507420732209305, + "learning_rate": 6.387583338128471e-08, + "logits/chosen": -1.2857414484024048, + "logits/rejected": -1.331521987915039, + "logps/chosen": -150.70591735839844, + "logps/rejected": -171.67141723632812, + "loss": 0.4416, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8073572516441345, + "rewards/margins": 0.8103399276733398, + "rewards/rejected": -1.6176972389221191, + "step": 1108 + }, + { + "epoch": 1.2803691154206618, + "grad_norm": 45.69922325435096, + "learning_rate": 6.351710102298867e-08, + "logits/chosen": -1.3792985677719116, + "logits/rejected": -1.452039361000061, + "logps/chosen": -207.93603515625, + "logps/rejected": -253.76393127441406, + "loss": 0.3717, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9924658536911011, + "rewards/margins": 1.279854655265808, + "rewards/rejected": -2.27232027053833, + "step": 1110 + }, + { + "epoch": 1.2826760867997982, + "grad_norm": 46.67725956292131, + "learning_rate": 6.31589094292657e-08, + "logits/chosen": -1.3903954029083252, + "logits/rejected": -1.4184892177581787, + "logps/chosen": -219.1831512451172, + "logps/rejected": -280.4320068359375, + "loss": 0.4046, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.0702323913574219, + "rewards/margins": 1.5914549827575684, + "rewards/rejected": -2.6616873741149902, + "step": 1112 + }, + { + "epoch": 1.2849830581789345, + "grad_norm": 40.80571923993709, + "learning_rate": 6.280126390937924e-08, + "logits/chosen": -1.4774876832962036, + "logits/rejected": -1.4684927463531494, + "logps/chosen": -215.90835571289062, + "logps/rejected": -221.57406616210938, + "loss": 0.436, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9599077701568604, + "rewards/margins": 0.8318207263946533, + "rewards/rejected": -1.7917284965515137, + "step": 1114 + }, + { + "epoch": 1.2872900295580707, + "grad_norm": 42.68311460025384, + "learning_rate": 6.244416976449875e-08, + "logits/chosen": -1.3221067190170288, + "logits/rejected": -1.2649778127670288, + "logps/chosen": -119.57810974121094, + "logps/rejected": -151.20071411132812, + "loss": 0.3855, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.7455071806907654, + "rewards/margins": 0.9170046448707581, + "rewards/rejected": -1.6625118255615234, + "step": 1116 + }, + { + "epoch": 1.289597000937207, + "grad_norm": 43.61686616170438, + "learning_rate": 6.208763228762082e-08, + "logits/chosen": -1.2151230573654175, + "logits/rejected": -1.2297194004058838, + "logps/chosen": -164.2678985595703, + "logps/rejected": -242.52076721191406, + "loss": 0.3943, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.0782736539840698, + "rewards/margins": 1.6307076215744019, + "rewards/rejected": -2.708981513977051, + "step": 1118 + }, + { + "epoch": 1.2919039723163435, + "grad_norm": 54.35054138906459, + "learning_rate": 6.173165676349102e-08, + "logits/chosen": -1.2725952863693237, + "logits/rejected": -1.4444892406463623, + "logps/chosen": -166.21847534179688, + "logps/rejected": -293.0514831542969, + "loss": 0.4145, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.8468817472457886, + "rewards/margins": 1.518345594406128, + "rewards/rejected": -2.365227222442627, + "step": 1120 + }, + { + "epoch": 1.2942109436954798, + "grad_norm": 44.69616309273757, + "learning_rate": 6.137624846852535e-08, + "logits/chosen": -1.2640550136566162, + "logits/rejected": -1.1721779108047485, + "logps/chosen": -126.40272521972656, + "logps/rejected": -149.834228515625, + "loss": 0.3984, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9074777364730835, + "rewards/margins": 1.2042250633239746, + "rewards/rejected": -2.1117029190063477, + "step": 1122 + }, + { + "epoch": 1.2965179150746162, + "grad_norm": 44.46454668934902, + "learning_rate": 6.102141267073207e-08, + "logits/chosen": -1.3089536428451538, + "logits/rejected": -1.4028515815734863, + "logps/chosen": -139.8980255126953, + "logps/rejected": -190.38258361816406, + "loss": 0.3884, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9231299161911011, + "rewards/margins": 1.0761425495147705, + "rewards/rejected": -1.9992725849151611, + "step": 1124 + }, + { + "epoch": 1.2988248864537524, + "grad_norm": 55.74989650178749, + "learning_rate": 6.066715462963375e-08, + "logits/chosen": -1.4593031406402588, + "logits/rejected": -1.5641745328903198, + "logps/chosen": -196.91680908203125, + "logps/rejected": -257.47308349609375, + "loss": 0.4039, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.2544538974761963, + "rewards/margins": 1.0345784425735474, + "rewards/rejected": -2.289032220840454, + "step": 1126 + }, + { + "epoch": 1.3011318578328888, + "grad_norm": 51.02973123233033, + "learning_rate": 6.031347959618913e-08, + "logits/chosen": -1.357025384902954, + "logits/rejected": -1.4425618648529053, + "logps/chosen": -173.85137939453125, + "logps/rejected": -200.96022033691406, + "loss": 0.4391, + "rewards/accuracies": 0.71875, + "rewards/chosen": -0.8829444646835327, + "rewards/margins": 1.008034110069275, + "rewards/rejected": -1.8909785747528076, + "step": 1128 + }, + { + "epoch": 1.3034388292120251, + "grad_norm": 41.84275287977337, + "learning_rate": 5.996039281271543e-08, + "logits/chosen": -1.2947957515716553, + "logits/rejected": -1.2887905836105347, + "logps/chosen": -139.00042724609375, + "logps/rejected": -151.3178253173828, + "loss": 0.4256, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.8678557276725769, + "rewards/margins": 0.6157978177070618, + "rewards/rejected": -1.4836535453796387, + "step": 1130 + }, + { + "epoch": 1.3057458005911613, + "grad_norm": 51.77131279496888, + "learning_rate": 5.96078995128105e-08, + "logits/chosen": -1.3757154941558838, + "logits/rejected": -1.4178260564804077, + "logps/chosen": -150.56947326660156, + "logps/rejected": -227.6977996826172, + "loss": 0.3699, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.034728765487671, + "rewards/margins": 1.7096223831176758, + "rewards/rejected": -2.7443511486053467, + "step": 1132 + }, + { + "epoch": 1.3080527719702977, + "grad_norm": 47.700143499181216, + "learning_rate": 5.925600492127547e-08, + "logits/chosen": -1.3131159543991089, + "logits/rejected": -1.310932993888855, + "logps/chosen": -141.53134155273438, + "logps/rejected": -191.7972412109375, + "loss": 0.3982, + "rewards/accuracies": 0.6875, + "rewards/chosen": -0.870035707950592, + "rewards/margins": 1.2973705530166626, + "rewards/rejected": -2.1674063205718994, + "step": 1134 + }, + { + "epoch": 1.310359743349434, + "grad_norm": 53.07658826682632, + "learning_rate": 5.8904714254037025e-08, + "logits/chosen": -1.436853289604187, + "logits/rejected": -1.5420252084732056, + "logps/chosen": -137.68533325195312, + "logps/rejected": -197.42489624023438, + "loss": 0.4101, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.8586965799331665, + "rewards/margins": 1.3288297653198242, + "rewards/rejected": -2.1875264644622803, + "step": 1136 + }, + { + "epoch": 1.3126667147285704, + "grad_norm": 55.98499802869115, + "learning_rate": 5.855403271807032e-08, + "logits/chosen": -1.4441746473312378, + "logits/rejected": -1.3740431070327759, + "logps/chosen": -141.81146240234375, + "logps/rejected": -157.10916137695312, + "loss": 0.3657, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9153862595558167, + "rewards/margins": 1.5047463178634644, + "rewards/rejected": -2.420132637023926, + "step": 1138 + }, + { + "epoch": 1.3149736861077068, + "grad_norm": 47.88409037346829, + "learning_rate": 5.82039655113217e-08, + "logits/chosen": -1.5311238765716553, + "logits/rejected": -1.4044201374053955, + "logps/chosen": -165.2921600341797, + "logps/rejected": -202.717041015625, + "loss": 0.3879, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.142822027206421, + "rewards/margins": 1.5451483726501465, + "rewards/rejected": -2.6879703998565674, + "step": 1140 + }, + { + "epoch": 1.317280657486843, + "grad_norm": 63.28691341194904, + "learning_rate": 5.785451782263161e-08, + "logits/chosen": -1.3691322803497314, + "logits/rejected": -1.3665211200714111, + "logps/chosen": -185.71286010742188, + "logps/rejected": -199.6193084716797, + "loss": 0.4569, + "rewards/accuracies": 0.8125, + "rewards/chosen": -1.1284613609313965, + "rewards/margins": 0.9253698587417603, + "rewards/rejected": -2.053831100463867, + "step": 1142 + }, + { + "epoch": 1.3195876288659794, + "grad_norm": 47.39309353229901, + "learning_rate": 5.750569483165784e-08, + "logits/chosen": -1.4143561124801636, + "logits/rejected": -1.307342767715454, + "logps/chosen": -236.60182189941406, + "logps/rejected": -257.746337890625, + "loss": 0.4071, + "rewards/accuracies": 0.71875, + "rewards/chosen": -1.2457427978515625, + "rewards/margins": 1.2368875741958618, + "rewards/rejected": -2.482630491256714, + "step": 1144 + }, + { + "epoch": 1.3218946002451157, + "grad_norm": 47.09697442795077, + "learning_rate": 5.7157501708798584e-08, + "logits/chosen": -1.5032621622085571, + "logits/rejected": -1.3734570741653442, + "logps/chosen": -182.57345581054688, + "logps/rejected": -169.54696655273438, + "loss": 0.4129, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.912488579750061, + "rewards/margins": 0.9649824500083923, + "rewards/rejected": -1.8774710893630981, + "step": 1146 + }, + { + "epoch": 1.3242015716242521, + "grad_norm": 57.0952473659576, + "learning_rate": 5.6809943615115904e-08, + "logits/chosen": -1.4295306205749512, + "logits/rejected": -1.399375557899475, + "logps/chosen": -137.4228057861328, + "logps/rejected": -176.73545837402344, + "loss": 0.3865, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7859193086624146, + "rewards/margins": 1.589166283607483, + "rewards/rejected": -2.3750853538513184, + "step": 1148 + }, + { + "epoch": 1.3265085430033885, + "grad_norm": 57.54839610092353, + "learning_rate": 5.646302570225918e-08, + "logits/chosen": -1.2610923051834106, + "logits/rejected": -1.1784062385559082, + "logps/chosen": -173.5068359375, + "logps/rejected": -212.621337890625, + "loss": 0.4041, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9680019021034241, + "rewards/margins": 1.6587209701538086, + "rewards/rejected": -2.626722812652588, + "step": 1150 + }, + { + "epoch": 1.3288155143825247, + "grad_norm": 48.13045472735045, + "learning_rate": 5.6116753112388794e-08, + "logits/chosen": -1.4241957664489746, + "logits/rejected": -1.4249969720840454, + "logps/chosen": -174.09970092773438, + "logps/rejected": -180.14329528808594, + "loss": 0.415, + "rewards/accuracies": 0.78125, + "rewards/chosen": -0.8689387440681458, + "rewards/margins": 0.9074426889419556, + "rewards/rejected": -1.776381254196167, + "step": 1152 + }, + { + "epoch": 1.331122485761661, + "grad_norm": 54.2150965617913, + "learning_rate": 5.577113097809989e-08, + "logits/chosen": -1.3543493747711182, + "logits/rejected": -1.3709462881088257, + "logps/chosen": -142.47586059570312, + "logps/rejected": -179.7193145751953, + "loss": 0.3837, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.8821631073951721, + "rewards/margins": 1.1474261283874512, + "rewards/rejected": -2.0295891761779785, + "step": 1154 + }, + { + "epoch": 1.3334294571407974, + "grad_norm": 52.86369603141297, + "learning_rate": 5.542616442234618e-08, + "logits/chosen": -1.365761637687683, + "logits/rejected": -1.441951036453247, + "logps/chosen": -169.7147979736328, + "logps/rejected": -230.1123046875, + "loss": 0.4563, + "rewards/accuracies": 0.75, + "rewards/chosen": -1.0090858936309814, + "rewards/margins": 1.1310818195343018, + "rewards/rejected": -2.140167713165283, + "step": 1156 + }, + { + "epoch": 1.3357364285199336, + "grad_norm": 48.68345037696806, + "learning_rate": 5.508185855836425e-08, + "logits/chosen": -1.3327983617782593, + "logits/rejected": -1.4392744302749634, + "logps/chosen": -133.7701416015625, + "logps/rejected": -191.17843627929688, + "loss": 0.4077, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9508491158485413, + "rewards/margins": 1.3994641304016113, + "rewards/rejected": -2.350313186645508, + "step": 1158 + }, + { + "epoch": 1.33804339989907, + "grad_norm": 42.64070787602433, + "learning_rate": 5.473821848959761e-08, + "logits/chosen": -1.2654824256896973, + "logits/rejected": -1.3428267240524292, + "logps/chosen": -177.55340576171875, + "logps/rejected": -255.42984008789062, + "loss": 0.3802, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9991359114646912, + "rewards/margins": 1.5646129846572876, + "rewards/rejected": -2.563748836517334, + "step": 1160 + }, + { + "epoch": 1.3403503712782063, + "grad_norm": 50.28942241228028, + "learning_rate": 5.4395249309621097e-08, + "logits/chosen": -1.3695107698440552, + "logits/rejected": -1.2706151008605957, + "logps/chosen": -283.1875, + "logps/rejected": -286.9220886230469, + "loss": 0.38, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.4288372993469238, + "rewards/margins": 1.4134087562561035, + "rewards/rejected": -2.8422460556030273, + "step": 1162 + }, + { + "epoch": 1.3426573426573427, + "grad_norm": 61.971878969120056, + "learning_rate": 5.405295610206524e-08, + "logits/chosen": -1.3642442226409912, + "logits/rejected": -1.3541247844696045, + "logps/chosen": -174.163330078125, + "logps/rejected": -198.29196166992188, + "loss": 0.4006, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.9269206523895264, + "rewards/margins": 1.0355504751205444, + "rewards/rejected": -1.9624711275100708, + "step": 1164 + }, + { + "epoch": 1.344964314036479, + "grad_norm": 46.86768761320917, + "learning_rate": 5.371134394054115e-08, + "logits/chosen": -1.4383530616760254, + "logits/rejected": -1.397727131843567, + "logps/chosen": -225.30628967285156, + "logps/rejected": -246.29644775390625, + "loss": 0.4128, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9159881472587585, + "rewards/margins": 1.293990135192871, + "rewards/rejected": -2.2099781036376953, + "step": 1166 + }, + { + "epoch": 1.3472712854156152, + "grad_norm": 53.19774662060618, + "learning_rate": 5.337041788856518e-08, + "logits/chosen": -1.3878076076507568, + "logits/rejected": -1.3727601766586304, + "logps/chosen": -161.39529418945312, + "logps/rejected": -191.12062072753906, + "loss": 0.407, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.8259797096252441, + "rewards/margins": 1.1816279888153076, + "rewards/rejected": -2.0076074600219727, + "step": 1168 + }, + { + "epoch": 1.3495782567947516, + "grad_norm": 60.247505085915364, + "learning_rate": 5.303018299948389e-08, + "logits/chosen": -1.3918073177337646, + "logits/rejected": -1.3807213306427002, + "logps/chosen": -133.32913208007812, + "logps/rejected": -169.5771942138672, + "loss": 0.3973, + "rewards/accuracies": 0.90625, + "rewards/chosen": -0.6024202108383179, + "rewards/margins": 1.2600902318954468, + "rewards/rejected": -1.8625102043151855, + "step": 1170 + }, + { + "epoch": 1.351885228173888, + "grad_norm": 47.80602272845366, + "learning_rate": 5.2690644316399004e-08, + "logits/chosen": -1.3061091899871826, + "logits/rejected": -1.3677294254302979, + "logps/chosen": -173.61183166503906, + "logps/rejected": -228.7135772705078, + "loss": 0.3373, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.1685351133346558, + "rewards/margins": 1.3943965435028076, + "rewards/rejected": -2.562931537628174, + "step": 1172 + }, + { + "epoch": 1.3541921995530242, + "grad_norm": 52.65074333120445, + "learning_rate": 5.235180687209295e-08, + "logits/chosen": -1.3143095970153809, + "logits/rejected": -1.373111367225647, + "logps/chosen": -237.4772491455078, + "logps/rejected": -283.6535339355469, + "loss": 0.4258, + "rewards/accuracies": 0.84375, + "rewards/chosen": -1.1781258583068848, + "rewards/margins": 1.684722900390625, + "rewards/rejected": -2.8628485202789307, + "step": 1174 + }, + { + "epoch": 1.3564991709321605, + "grad_norm": 56.602930524428096, + "learning_rate": 5.201367568895407e-08, + "logits/chosen": -1.2158329486846924, + "logits/rejected": -1.2403156757354736, + "logps/chosen": -219.708740234375, + "logps/rejected": -289.25732421875, + "loss": 0.3716, + "rewards/accuracies": 0.90625, + "rewards/chosen": -1.003746747970581, + "rewards/margins": 2.028501272201538, + "rewards/rejected": -3.03224778175354, + "step": 1176 + }, + { + "epoch": 1.358806142311297, + "grad_norm": 42.6278186261498, + "learning_rate": 5.167625577890222e-08, + "logits/chosen": -1.4404326677322388, + "logits/rejected": -1.4168187379837036, + "logps/chosen": -189.86143493652344, + "logps/rejected": -238.81646728515625, + "loss": 0.3508, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.7041274309158325, + "rewards/margins": 1.4839988946914673, + "rewards/rejected": -2.188126564025879, + "step": 1178 + }, + { + "epoch": 1.3611131136904333, + "grad_norm": 46.23876462891979, + "learning_rate": 5.133955214331438e-08, + "logits/chosen": -1.359468698501587, + "logits/rejected": -1.2669312953948975, + "logps/chosen": -175.90753173828125, + "logps/rejected": -184.07627868652344, + "loss": 0.3872, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.9324202537536621, + "rewards/margins": 1.3268898725509644, + "rewards/rejected": -2.259310007095337, + "step": 1180 + }, + { + "epoch": 1.3634200850695697, + "grad_norm": 42.009432528251, + "learning_rate": 5.1003569772950714e-08, + "logits/chosen": -1.413282036781311, + "logits/rejected": -1.3845648765563965, + "logps/chosen": -162.90414428710938, + "logps/rejected": -288.2789611816406, + "loss": 0.3854, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.790794849395752, + "rewards/margins": 1.7329788208007812, + "rewards/rejected": -2.523773670196533, + "step": 1182 + }, + { + "epoch": 1.3657270564487058, + "grad_norm": 59.51613798378988, + "learning_rate": 5.0668313647880465e-08, + "logits/chosen": -1.4198514223098755, + "logits/rejected": -1.367343783378601, + "logps/chosen": -153.51954650878906, + "logps/rejected": -163.91555786132812, + "loss": 0.4409, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9148359298706055, + "rewards/margins": 0.9861717224121094, + "rewards/rejected": -1.9010077714920044, + "step": 1184 + }, + { + "epoch": 1.3680340278278422, + "grad_norm": 43.64496759806874, + "learning_rate": 5.033378873740819e-08, + "logits/chosen": -1.4157196283340454, + "logits/rejected": -1.3928645849227905, + "logps/chosen": -151.0714111328125, + "logps/rejected": -140.967529296875, + "loss": 0.3607, + "rewards/accuracies": 0.8125, + "rewards/chosen": -0.9113080501556396, + "rewards/margins": 0.7878793478012085, + "rewards/rejected": -1.6991872787475586, + "step": 1186 + }, + { + "epoch": 1.3703409992069786, + "grad_norm": 59.24746681628499, + "learning_rate": 5.000000000000002e-08, + "logits/chosen": -1.2240504026412964, + "logits/rejected": -1.2518386840820312, + "logps/chosen": -197.45103454589844, + "logps/rejected": -240.2637176513672, + "loss": 0.3703, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9717167615890503, + "rewards/margins": 1.4621995687484741, + "rewards/rejected": -2.4339160919189453, + "step": 1188 + }, + { + "epoch": 1.372647970586115, + "grad_norm": 69.8619637568134, + "learning_rate": 4.966695238321027e-08, + "logits/chosen": -1.359329104423523, + "logits/rejected": -1.3603085279464722, + "logps/chosen": -226.86892700195312, + "logps/rejected": -452.2628173828125, + "loss": 0.4596, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.174229621887207, + "rewards/margins": 1.8406164646148682, + "rewards/rejected": -3.014845848083496, + "step": 1190 + }, + { + "epoch": 1.3749549419652514, + "grad_norm": 39.85416303865178, + "learning_rate": 4.933465082360807e-08, + "logits/chosen": -1.328731894493103, + "logits/rejected": -1.313866138458252, + "logps/chosen": -143.9809112548828, + "logps/rejected": -182.53477478027344, + "loss": 0.3694, + "rewards/accuracies": 0.84375, + "rewards/chosen": -0.59998619556427, + "rewards/margins": 1.3576034307479858, + "rewards/rejected": -1.9575895071029663, + "step": 1192 + }, + { + "epoch": 1.3772619133443875, + "grad_norm": 50.47296010963747, + "learning_rate": 4.90031002467042e-08, + "logits/chosen": -1.4128611087799072, + "logits/rejected": -1.3620035648345947, + "logps/chosen": -213.6893310546875, + "logps/rejected": -255.20004272460938, + "loss": 0.3618, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.9902510046958923, + "rewards/margins": 1.6613683700561523, + "rewards/rejected": -2.6516194343566895, + "step": 1194 + }, + { + "epoch": 1.379568884723524, + "grad_norm": 44.05325481629088, + "learning_rate": 4.867230556687796e-08, + "logits/chosen": -1.2905137538909912, + "logits/rejected": -1.2903690338134766, + "logps/chosen": -130.3699951171875, + "logps/rejected": -239.37860107421875, + "loss": 0.3685, + "rewards/accuracies": 0.75, + "rewards/chosen": -0.9421904683113098, + "rewards/margins": 1.2194817066192627, + "rewards/rejected": -2.1616721153259277, + "step": 1196 + }, + { + "epoch": 1.3818758561026603, + "grad_norm": 58.075075554298664, + "learning_rate": 4.8342271687304504e-08, + "logits/chosen": -1.3242213726043701, + "logits/rejected": -1.311003565788269, + "logps/chosen": -150.2444610595703, + "logps/rejected": -197.29025268554688, + "loss": 0.451, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.9144178032875061, + "rewards/margins": 1.1128352880477905, + "rewards/rejected": -2.0272531509399414, + "step": 1198 + }, + { + "epoch": 1.3841828274817964, + "grad_norm": 37.508119781233376, + "learning_rate": 4.801300349988219e-08, + "logits/chosen": -1.3149864673614502, + "logits/rejected": -1.423133373260498, + "logps/chosen": -187.515380859375, + "logps/rejected": -250.7738037109375, + "loss": 0.372, + "rewards/accuracies": 0.96875, + "rewards/chosen": -0.6898128986358643, + "rewards/margins": 1.6321946382522583, + "rewards/rejected": -2.322007417678833, + "step": 1200 + }, + { + "epoch": 1.3841828274817964, + "eval_logits/chosen": -1.3467315435409546, + "eval_logits/rejected": -1.2710996866226196, + "eval_logps/chosen": -199.28550720214844, + "eval_logps/rejected": -170.8891143798828, + "eval_loss": 0.5353700518608093, + "eval_rewards/accuracies": 0.6800000071525574, + "eval_rewards/chosen": -1.410333275794983, + "eval_rewards/margins": 0.6270380616188049, + "eval_rewards/rejected": -2.0373716354370117, + "eval_runtime": 23.5786, + "eval_samples_per_second": 4.241, + "eval_steps_per_second": 1.06, + "step": 1200 } ], "logging_steps": 2,