diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,7203 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 1000, + "global_step": 478, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0020920502092050207, + "grad_norm": 4.2377470321442345, + "learning_rate": 1.0416666666666666e-08, + "logits/chosen": -2.516148090362549, + "logits/rejected": -2.4595022201538086, + "logps/chosen": -1.2051799297332764, + "logps/rejected": -1.1685211658477783, + "loss": -0.0047, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3819315433502197, + "rewards/margins": -0.0013609901070594788, + "rewards/rejected": 0.3832925260066986, + "step": 1 + }, + { + "epoch": 0.0041841004184100415, + "grad_norm": 4.616200091383757, + "learning_rate": 2.083333333333333e-08, + "logits/chosen": -2.4128036499023438, + "logits/rejected": -2.479793071746826, + "logps/chosen": -1.2863150835037231, + "logps/rejected": -1.0149686336517334, + "loss": -0.0103, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.34382855892181396, + "rewards/margins": -0.057850055396556854, + "rewards/rejected": 0.4016786217689514, + "step": 2 + }, + { + "epoch": 0.006276150627615063, + "grad_norm": 3.700184874168768, + "learning_rate": 3.125e-08, + "logits/chosen": -2.678924083709717, + "logits/rejected": -2.668107032775879, + "logps/chosen": -0.8378489017486572, + "logps/rejected": -1.1761590242385864, + "loss": -0.0625, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44425511360168457, + "rewards/margins": 0.11912594735622406, + "rewards/rejected": 0.3251291811466217, + "step": 3 + }, + { + "epoch": 0.008368200836820083, + "grad_norm": 3.754675176110523, + "learning_rate": 4.166666666666666e-08, + "logits/chosen": -2.6950135231018066, + "logits/rejected": -2.7032535076141357, + "logps/chosen": -0.9283154010772705, + "logps/rejected": -0.842531681060791, + "loss": -0.0123, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4067751169204712, + "rewards/margins": -0.049619611352682114, + "rewards/rejected": 0.4563947319984436, + "step": 4 + }, + { + "epoch": 0.010460251046025104, + "grad_norm": 3.75972351843305, + "learning_rate": 5.208333333333333e-08, + "logits/chosen": -2.7333128452301025, + "logits/rejected": -2.6697793006896973, + "logps/chosen": -0.8777127265930176, + "logps/rejected": -1.043122410774231, + "loss": -0.0324, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4418213367462158, + "rewards/margins": 0.02298763580620289, + "rewards/rejected": 0.41883373260498047, + "step": 5 + }, + { + "epoch": 0.012552301255230125, + "grad_norm": 3.7912148566272696, + "learning_rate": 6.25e-08, + "logits/chosen": -2.8148372173309326, + "logits/rejected": -2.755297899246216, + "logps/chosen": -1.0822432041168213, + "logps/rejected": -1.2278616428375244, + "loss": -0.015, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.35870179533958435, + "rewards/margins": 0.02633710391819477, + "rewards/rejected": 0.3323647081851959, + "step": 6 + }, + { + "epoch": 0.014644351464435146, + "grad_norm": 4.260569724370287, + "learning_rate": 7.291666666666667e-08, + "logits/chosen": -2.7257280349731445, + "logits/rejected": -2.6599254608154297, + "logps/chosen": -1.0147705078125, + "logps/rejected": -0.9999724626541138, + "loss": -0.0232, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38983920216560364, + "rewards/margins": 0.007026904262602329, + "rewards/rejected": 0.3828122913837433, + "step": 7 + }, + { + "epoch": 0.016736401673640166, + "grad_norm": 4.363722516784404, + "learning_rate": 8.333333333333333e-08, + "logits/chosen": -2.8212952613830566, + "logits/rejected": -2.65653920173645, + "logps/chosen": -0.971604585647583, + "logps/rejected": -1.1535077095031738, + "loss": -0.0233, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4037427604198456, + "rewards/margins": 0.0658140704035759, + "rewards/rejected": 0.3379287123680115, + "step": 8 + }, + { + "epoch": 0.01882845188284519, + "grad_norm": 4.498762808376351, + "learning_rate": 9.375e-08, + "logits/chosen": -2.692415237426758, + "logits/rejected": -2.6526036262512207, + "logps/chosen": -0.9371532201766968, + "logps/rejected": -1.1771211624145508, + "loss": -0.0324, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4131394028663635, + "rewards/margins": 0.04278501123189926, + "rewards/rejected": 0.37035441398620605, + "step": 9 + }, + { + "epoch": 0.02092050209205021, + "grad_norm": 4.174864879865781, + "learning_rate": 1.0416666666666667e-07, + "logits/chosen": -2.5466957092285156, + "logits/rejected": -2.448245048522949, + "logps/chosen": -1.09194016456604, + "logps/rejected": -1.0789859294891357, + "loss": -0.0433, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.382916659116745, + "rewards/margins": 0.029688388109207153, + "rewards/rejected": 0.35322827100753784, + "step": 10 + }, + { + "epoch": 0.02301255230125523, + "grad_norm": 3.8103486541857396, + "learning_rate": 1.1458333333333332e-07, + "logits/chosen": -2.755258083343506, + "logits/rejected": -2.5368123054504395, + "logps/chosen": -0.8456466794013977, + "logps/rejected": -1.1355555057525635, + "loss": -0.0339, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44307994842529297, + "rewards/margins": 0.10706950724124908, + "rewards/rejected": 0.3360104560852051, + "step": 11 + }, + { + "epoch": 0.02510460251046025, + "grad_norm": 3.6062416909583246, + "learning_rate": 1.25e-07, + "logits/chosen": -2.8309640884399414, + "logits/rejected": -2.8085203170776367, + "logps/chosen": -1.0716187953948975, + "logps/rejected": -0.9773526191711426, + "loss": -0.0023, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3665681481361389, + "rewards/margins": -0.025565478950738907, + "rewards/rejected": 0.3921336233615875, + "step": 12 + }, + { + "epoch": 0.027196652719665274, + "grad_norm": 4.894676212438303, + "learning_rate": 1.3541666666666666e-07, + "logits/chosen": -2.646806001663208, + "logits/rejected": -2.4753153324127197, + "logps/chosen": -0.9714970588684082, + "logps/rejected": -1.3113722801208496, + "loss": -0.0457, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.406857430934906, + "rewards/margins": 0.11795446276664734, + "rewards/rejected": 0.2889029383659363, + "step": 13 + }, + { + "epoch": 0.029288702928870293, + "grad_norm": 3.971879581235679, + "learning_rate": 1.4583333333333335e-07, + "logits/chosen": -2.527503252029419, + "logits/rejected": -2.5251338481903076, + "logps/chosen": -1.6883941888809204, + "logps/rejected": -1.28494393825531, + "loss": -0.0479, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3116728961467743, + "rewards/margins": 0.010951630771160126, + "rewards/rejected": 0.30072125792503357, + "step": 14 + }, + { + "epoch": 0.03138075313807531, + "grad_norm": 4.3835348530172755, + "learning_rate": 1.5624999999999999e-07, + "logits/chosen": -2.7007007598876953, + "logits/rejected": -2.6135330200195312, + "logps/chosen": -0.9208273887634277, + "logps/rejected": -1.1965676546096802, + "loss": -0.0258, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4235038161277771, + "rewards/margins": 0.0690305233001709, + "rewards/rejected": 0.3544732928276062, + "step": 15 + }, + { + "epoch": 0.03347280334728033, + "grad_norm": 4.802927181660066, + "learning_rate": 1.6666666666666665e-07, + "logits/chosen": -2.7196192741394043, + "logits/rejected": -2.704707145690918, + "logps/chosen": -1.0318691730499268, + "logps/rejected": -1.019579291343689, + "loss": -0.0248, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.37734121084213257, + "rewards/margins": -0.018189536407589912, + "rewards/rejected": 0.3955307602882385, + "step": 16 + }, + { + "epoch": 0.03556485355648536, + "grad_norm": 3.624971192344114, + "learning_rate": 1.7708333333333334e-07, + "logits/chosen": -2.555588960647583, + "logits/rejected": -2.5805137157440186, + "logps/chosen": -0.8474312424659729, + "logps/rejected": -0.9811149835586548, + "loss": -0.0411, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44956469535827637, + "rewards/margins": 0.05308142676949501, + "rewards/rejected": 0.39648327231407166, + "step": 17 + }, + { + "epoch": 0.03765690376569038, + "grad_norm": 4.118639811082369, + "learning_rate": 1.875e-07, + "logits/chosen": -2.728419303894043, + "logits/rejected": -2.6613481044769287, + "logps/chosen": -0.9432893991470337, + "logps/rejected": -1.0357279777526855, + "loss": -0.0093, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40460628271102905, + "rewards/margins": 0.005164351314306259, + "rewards/rejected": 0.3994419276714325, + "step": 18 + }, + { + "epoch": 0.0397489539748954, + "grad_norm": 4.70344861713647, + "learning_rate": 1.9791666666666664e-07, + "logits/chosen": -2.632000684738159, + "logits/rejected": -2.435832977294922, + "logps/chosen": -1.014740228652954, + "logps/rejected": -1.2633649110794067, + "loss": -0.044, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3806372284889221, + "rewards/margins": 0.07575057446956635, + "rewards/rejected": 0.30488666892051697, + "step": 19 + }, + { + "epoch": 0.04184100418410042, + "grad_norm": 4.0685356493856935, + "learning_rate": 2.0833333333333333e-07, + "logits/chosen": -2.7355053424835205, + "logits/rejected": -2.6346383094787598, + "logps/chosen": -1.0173437595367432, + "logps/rejected": -1.0325090885162354, + "loss": -0.0442, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3999063968658447, + "rewards/margins": 0.006554506719112396, + "rewards/rejected": 0.39335188269615173, + "step": 20 + }, + { + "epoch": 0.043933054393305436, + "grad_norm": 4.535495847198634, + "learning_rate": 2.1875e-07, + "logits/chosen": -2.667809247970581, + "logits/rejected": -2.7679128646850586, + "logps/chosen": -0.9066150188446045, + "logps/rejected": -1.042630672454834, + "loss": -0.0631, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4318792223930359, + "rewards/margins": 0.058360084891319275, + "rewards/rejected": 0.3735191226005554, + "step": 21 + }, + { + "epoch": 0.04602510460251046, + "grad_norm": 5.626761266687518, + "learning_rate": 2.2916666666666663e-07, + "logits/chosen": -2.7311110496520996, + "logits/rejected": -2.657087802886963, + "logps/chosen": -0.8237791657447815, + "logps/rejected": -0.9569952487945557, + "loss": -0.0306, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4589225649833679, + "rewards/margins": 0.0617038793861866, + "rewards/rejected": 0.3972187042236328, + "step": 22 + }, + { + "epoch": 0.04811715481171548, + "grad_norm": 4.31976215520191, + "learning_rate": 2.3958333333333335e-07, + "logits/chosen": -2.5237855911254883, + "logits/rejected": -2.485886573791504, + "logps/chosen": -1.017724633216858, + "logps/rejected": -1.073349118232727, + "loss": -0.0512, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4005371034145355, + "rewards/margins": 0.02935650758445263, + "rewards/rejected": 0.37118059396743774, + "step": 23 + }, + { + "epoch": 0.0502092050209205, + "grad_norm": 4.392600704421736, + "learning_rate": 2.5e-07, + "logits/chosen": -2.5538222789764404, + "logits/rejected": -2.4816856384277344, + "logps/chosen": -1.0112228393554688, + "logps/rejected": -1.232728362083435, + "loss": -0.0263, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.39029499888420105, + "rewards/margins": 0.03938312828540802, + "rewards/rejected": 0.35091185569763184, + "step": 24 + }, + { + "epoch": 0.05230125523012552, + "grad_norm": 4.19926916757524, + "learning_rate": 2.604166666666667e-07, + "logits/chosen": -2.5918517112731934, + "logits/rejected": -2.532200813293457, + "logps/chosen": -0.8845524787902832, + "logps/rejected": -0.7892788648605347, + "loss": -0.0345, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4380142092704773, + "rewards/margins": -0.0516534224152565, + "rewards/rejected": 0.4896676540374756, + "step": 25 + }, + { + "epoch": 0.05439330543933055, + "grad_norm": 6.458780023037348, + "learning_rate": 2.708333333333333e-07, + "logits/chosen": -2.755127429962158, + "logits/rejected": -2.6158037185668945, + "logps/chosen": -0.9921547174453735, + "logps/rejected": -1.0211769342422485, + "loss": -0.0173, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3931275010108948, + "rewards/margins": 0.010015063919126987, + "rewards/rejected": 0.38311243057250977, + "step": 26 + }, + { + "epoch": 0.056485355648535567, + "grad_norm": 5.258249431893916, + "learning_rate": 2.8125e-07, + "logits/chosen": -2.682135581970215, + "logits/rejected": -2.555710792541504, + "logps/chosen": -0.941645622253418, + "logps/rejected": -1.0752073526382446, + "loss": -0.0419, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4056612253189087, + "rewards/margins": 0.021622449159622192, + "rewards/rejected": 0.3840388059616089, + "step": 27 + }, + { + "epoch": 0.058577405857740586, + "grad_norm": 4.888961752916909, + "learning_rate": 2.916666666666667e-07, + "logits/chosen": -2.7251815795898438, + "logits/rejected": -2.621352195739746, + "logps/chosen": -0.9367978572845459, + "logps/rejected": -1.0780291557312012, + "loss": -0.0295, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4540524184703827, + "rewards/margins": 0.06387701630592346, + "rewards/rejected": 0.39017540216445923, + "step": 28 + }, + { + "epoch": 0.060669456066945605, + "grad_norm": 3.776672366733632, + "learning_rate": 3.020833333333333e-07, + "logits/chosen": -2.8015384674072266, + "logits/rejected": -2.614657402038574, + "logps/chosen": -0.8712697625160217, + "logps/rejected": -0.9271025061607361, + "loss": -0.0434, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4297908544540405, + "rewards/margins": 0.002205170691013336, + "rewards/rejected": 0.4275856614112854, + "step": 29 + }, + { + "epoch": 0.06276150627615062, + "grad_norm": 5.648597556731242, + "learning_rate": 3.1249999999999997e-07, + "logits/chosen": -2.6692140102386475, + "logits/rejected": -2.67612624168396, + "logps/chosen": -0.7322691679000854, + "logps/rejected": -1.0025076866149902, + "loss": -0.0516, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5026121139526367, + "rewards/margins": 0.10446584224700928, + "rewards/rejected": 0.39814627170562744, + "step": 30 + }, + { + "epoch": 0.06485355648535565, + "grad_norm": 4.03210676684634, + "learning_rate": 3.2291666666666666e-07, + "logits/chosen": -2.7173359394073486, + "logits/rejected": -2.658566951751709, + "logps/chosen": -0.7469158172607422, + "logps/rejected": -0.8179668188095093, + "loss": -0.0637, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.48906075954437256, + "rewards/margins": 0.03415411710739136, + "rewards/rejected": 0.4549066722393036, + "step": 31 + }, + { + "epoch": 0.06694560669456066, + "grad_norm": 4.276452711129558, + "learning_rate": 3.333333333333333e-07, + "logits/chosen": -2.8544907569885254, + "logits/rejected": -2.737621307373047, + "logps/chosen": -0.9466021060943604, + "logps/rejected": -0.9423712491989136, + "loss": -0.0211, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40121617913246155, + "rewards/margins": -0.005811708979308605, + "rewards/rejected": 0.4070279002189636, + "step": 32 + }, + { + "epoch": 0.06903765690376569, + "grad_norm": 4.391804275431816, + "learning_rate": 3.4375e-07, + "logits/chosen": -2.7092764377593994, + "logits/rejected": -2.723233699798584, + "logps/chosen": -0.7280018329620361, + "logps/rejected": -0.8956056833267212, + "loss": -0.0404, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5123242139816284, + "rewards/margins": 0.07963782548904419, + "rewards/rejected": 0.4326864182949066, + "step": 33 + }, + { + "epoch": 0.07112970711297072, + "grad_norm": 4.277257017884931, + "learning_rate": 3.541666666666667e-07, + "logits/chosen": -2.6140832901000977, + "logits/rejected": -2.687009811401367, + "logps/chosen": -0.944001317024231, + "logps/rejected": -1.021315336227417, + "loss": -0.039, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4302595257759094, + "rewards/margins": 0.0396876186132431, + "rewards/rejected": 0.3905719220638275, + "step": 34 + }, + { + "epoch": 0.07322175732217573, + "grad_norm": 4.903229579454535, + "learning_rate": 3.645833333333333e-07, + "logits/chosen": -2.6962552070617676, + "logits/rejected": -2.609804630279541, + "logps/chosen": -0.9382450580596924, + "logps/rejected": -1.2346339225769043, + "loss": -0.0714, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4204990267753601, + "rewards/margins": 0.09388677775859833, + "rewards/rejected": 0.3266122341156006, + "step": 35 + }, + { + "epoch": 0.07531380753138076, + "grad_norm": 4.611264633523621, + "learning_rate": 3.75e-07, + "logits/chosen": -2.652193069458008, + "logits/rejected": -2.685980796813965, + "logps/chosen": -0.8813173770904541, + "logps/rejected": -1.0486414432525635, + "loss": -0.033, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.42412978410720825, + "rewards/margins": 0.0556933730840683, + "rewards/rejected": 0.36843645572662354, + "step": 36 + }, + { + "epoch": 0.07740585774058577, + "grad_norm": 4.492060757383237, + "learning_rate": 3.8541666666666665e-07, + "logits/chosen": -2.502615451812744, + "logits/rejected": -2.550412178039551, + "logps/chosen": -0.8484123945236206, + "logps/rejected": -0.8843783140182495, + "loss": -0.0398, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.45689207315444946, + "rewards/margins": 0.010739020071923733, + "rewards/rejected": 0.44615304470062256, + "step": 37 + }, + { + "epoch": 0.0794979079497908, + "grad_norm": 4.489890717193878, + "learning_rate": 3.958333333333333e-07, + "logits/chosen": -2.6360530853271484, + "logits/rejected": -2.594210147857666, + "logps/chosen": -0.9129270315170288, + "logps/rejected": -0.8058098554611206, + "loss": -0.0472, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4257778525352478, + "rewards/margins": -0.04449187219142914, + "rewards/rejected": 0.47026970982551575, + "step": 38 + }, + { + "epoch": 0.08158995815899582, + "grad_norm": 6.383961680323578, + "learning_rate": 4.0625e-07, + "logits/chosen": -2.7180564403533936, + "logits/rejected": -2.7788286209106445, + "logps/chosen": -0.7782948613166809, + "logps/rejected": -1.1029548645019531, + "loss": -0.0721, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4835827648639679, + "rewards/margins": 0.11756639182567596, + "rewards/rejected": 0.36601632833480835, + "step": 39 + }, + { + "epoch": 0.08368200836820083, + "grad_norm": 4.574297558165053, + "learning_rate": 4.1666666666666667e-07, + "logits/chosen": -2.6807751655578613, + "logits/rejected": -2.6066842079162598, + "logps/chosen": -0.7192291617393494, + "logps/rejected": -1.1977105140686035, + "loss": -0.0487, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5016716718673706, + "rewards/margins": 0.15893656015396118, + "rewards/rejected": 0.3427351117134094, + "step": 40 + }, + { + "epoch": 0.08577405857740586, + "grad_norm": 4.670542855578557, + "learning_rate": 4.270833333333333e-07, + "logits/chosen": -2.750822067260742, + "logits/rejected": -2.598273754119873, + "logps/chosen": -0.8036954402923584, + "logps/rejected": -0.8826763033866882, + "loss": -0.0595, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.47965848445892334, + "rewards/margins": 0.03851592540740967, + "rewards/rejected": 0.4411425292491913, + "step": 41 + }, + { + "epoch": 0.08786610878661087, + "grad_norm": 5.6444060526232205, + "learning_rate": 4.375e-07, + "logits/chosen": -2.7300307750701904, + "logits/rejected": -2.8059144020080566, + "logps/chosen": -0.871497631072998, + "logps/rejected": -1.0136091709136963, + "loss": -0.0639, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.44451814889907837, + "rewards/margins": 0.012301255017518997, + "rewards/rejected": 0.4322168827056885, + "step": 42 + }, + { + "epoch": 0.0899581589958159, + "grad_norm": 4.7276809756186555, + "learning_rate": 4.479166666666667e-07, + "logits/chosen": -2.8357558250427246, + "logits/rejected": -2.8672068119049072, + "logps/chosen": -0.8881194591522217, + "logps/rejected": -0.800782322883606, + "loss": -0.0444, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.42225128412246704, + "rewards/margins": -0.049045875668525696, + "rewards/rejected": 0.47129717469215393, + "step": 43 + }, + { + "epoch": 0.09205020920502092, + "grad_norm": 5.168209747900235, + "learning_rate": 4.5833333333333327e-07, + "logits/chosen": -2.6327948570251465, + "logits/rejected": -2.5793819427490234, + "logps/chosen": -0.8625282049179077, + "logps/rejected": -0.9531125426292419, + "loss": -0.0613, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4718647599220276, + "rewards/margins": 0.022850003093481064, + "rewards/rejected": 0.44901472330093384, + "step": 44 + }, + { + "epoch": 0.09414225941422594, + "grad_norm": 5.748488517798206, + "learning_rate": 4.6874999999999996e-07, + "logits/chosen": -2.7688827514648438, + "logits/rejected": -2.7678184509277344, + "logps/chosen": -0.9253497123718262, + "logps/rejected": -1.012501835823059, + "loss": -0.0772, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.44503989815711975, + "rewards/margins": 0.041792869567871094, + "rewards/rejected": 0.40324705839157104, + "step": 45 + }, + { + "epoch": 0.09623430962343096, + "grad_norm": 5.4053973810033265, + "learning_rate": 4.791666666666667e-07, + "logits/chosen": -2.6743528842926025, + "logits/rejected": -2.6169381141662598, + "logps/chosen": -0.9037365913391113, + "logps/rejected": -1.1369562149047852, + "loss": -0.039, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.42915260791778564, + "rewards/margins": 0.09240510314702988, + "rewards/rejected": 0.33674749732017517, + "step": 46 + }, + { + "epoch": 0.09832635983263599, + "grad_norm": 5.260086542730235, + "learning_rate": 4.895833333333333e-07, + "logits/chosen": -2.643252372741699, + "logits/rejected": -2.6564273834228516, + "logps/chosen": -0.711982250213623, + "logps/rejected": -0.9268280863761902, + "loss": -0.0643, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5240494608879089, + "rewards/margins": 0.1118624359369278, + "rewards/rejected": 0.41218701004981995, + "step": 47 + }, + { + "epoch": 0.100418410041841, + "grad_norm": 4.998054698179204, + "learning_rate": 5e-07, + "logits/chosen": -2.601804256439209, + "logits/rejected": -2.592262029647827, + "logps/chosen": -0.9648156762123108, + "logps/rejected": -1.016688346862793, + "loss": -0.0447, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4200252592563629, + "rewards/margins": 0.016244657337665558, + "rewards/rejected": 0.40378057956695557, + "step": 48 + }, + { + "epoch": 0.10251046025104603, + "grad_norm": 5.789088857902208, + "learning_rate": 4.999933277714308e-07, + "logits/chosen": -2.596520185470581, + "logits/rejected": -2.591111421585083, + "logps/chosen": -0.9277907609939575, + "logps/rejected": -0.9234358668327332, + "loss": -0.0426, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4210990071296692, + "rewards/margins": 0.0025722142308950424, + "rewards/rejected": 0.4185267984867096, + "step": 49 + }, + { + "epoch": 0.10460251046025104, + "grad_norm": 5.3386843580079235, + "learning_rate": 4.999733114418725e-07, + "logits/chosen": -2.7607293128967285, + "logits/rejected": -2.6781206130981445, + "logps/chosen": -0.699967622756958, + "logps/rejected": -0.9246343374252319, + "loss": -0.0689, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.532953679561615, + "rewards/margins": 0.11697375774383545, + "rewards/rejected": 0.41597992181777954, + "step": 50 + }, + { + "epoch": 0.10669456066945607, + "grad_norm": 5.916200359756027, + "learning_rate": 4.999399520797532e-07, + "logits/chosen": -2.6939072608947754, + "logits/rejected": -2.692051887512207, + "logps/chosen": -0.8488726615905762, + "logps/rejected": -1.0264009237289429, + "loss": -0.0331, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.44875848293304443, + "rewards/margins": 0.03353102505207062, + "rewards/rejected": 0.415227472782135, + "step": 51 + }, + { + "epoch": 0.1087866108786611, + "grad_norm": 6.06972427169266, + "learning_rate": 4.998932514657231e-07, + "logits/chosen": -2.7914843559265137, + "logits/rejected": -2.7075424194335938, + "logps/chosen": -0.9562678337097168, + "logps/rejected": -1.2075998783111572, + "loss": -0.0542, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4133673906326294, + "rewards/margins": 0.04625946655869484, + "rewards/rejected": 0.36710792779922485, + "step": 52 + }, + { + "epoch": 0.1108786610878661, + "grad_norm": 5.089942679391605, + "learning_rate": 4.998332120925598e-07, + "logits/chosen": -2.753302574157715, + "logits/rejected": -2.7128305435180664, + "logps/chosen": -0.8735638856887817, + "logps/rejected": -0.884076714515686, + "loss": -0.04, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.454987108707428, + "rewards/margins": 0.017186364158988, + "rewards/rejected": 0.4378007650375366, + "step": 53 + }, + { + "epoch": 0.11297071129707113, + "grad_norm": 6.052397500406735, + "learning_rate": 4.997598371650346e-07, + "logits/chosen": -2.581367015838623, + "logits/rejected": -2.5348823070526123, + "logps/chosen": -1.1102932691574097, + "logps/rejected": -1.1976356506347656, + "loss": -0.0424, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36265528202056885, + "rewards/margins": 0.038111936300992966, + "rewards/rejected": 0.3245433568954468, + "step": 54 + }, + { + "epoch": 0.11506276150627615, + "grad_norm": 7.582726283106986, + "learning_rate": 4.996731305997416e-07, + "logits/chosen": -2.6058666706085205, + "logits/rejected": -2.5393056869506836, + "logps/chosen": -0.8814643621444702, + "logps/rejected": -1.1655168533325195, + "loss": -0.0653, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4387378692626953, + "rewards/margins": 0.08904074132442474, + "rewards/rejected": 0.34969714283943176, + "step": 55 + }, + { + "epoch": 0.11715481171548117, + "grad_norm": 6.882707748741758, + "learning_rate": 4.995730970248893e-07, + "logits/chosen": -2.767383575439453, + "logits/rejected": -2.616359233856201, + "logps/chosen": -0.7647976279258728, + "logps/rejected": -0.9351394772529602, + "loss": -0.1056, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4906325042247772, + "rewards/margins": 0.06625062227249146, + "rewards/rejected": 0.42438188195228577, + "step": 56 + }, + { + "epoch": 0.1192468619246862, + "grad_norm": 5.54458889124346, + "learning_rate": 4.994597417800523e-07, + "logits/chosen": -2.6890878677368164, + "logits/rejected": -2.6082592010498047, + "logps/chosen": -1.0842435359954834, + "logps/rejected": -1.4014520645141602, + "loss": -0.0148, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3719639182090759, + "rewards/margins": 0.05614328384399414, + "rewards/rejected": 0.3158206641674042, + "step": 57 + }, + { + "epoch": 0.12133891213389121, + "grad_norm": 6.9373603325608695, + "learning_rate": 4.993330709158879e-07, + "logits/chosen": -2.7441017627716064, + "logits/rejected": -2.5297117233276367, + "logps/chosen": -1.0221364498138428, + "logps/rejected": -1.1408171653747559, + "loss": -0.0863, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.39980924129486084, + "rewards/margins": 0.021861765533685684, + "rewards/rejected": 0.37794747948646545, + "step": 58 + }, + { + "epoch": 0.12343096234309624, + "grad_norm": 6.321561447114049, + "learning_rate": 4.991930911938115e-07, + "logits/chosen": -2.4681005477905273, + "logits/rejected": -2.4990711212158203, + "logps/chosen": -1.0633734464645386, + "logps/rejected": -1.114372730255127, + "loss": -0.0631, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41653022170066833, + "rewards/margins": 0.05260680615901947, + "rewards/rejected": 0.36392340064048767, + "step": 59 + }, + { + "epoch": 0.12552301255230125, + "grad_norm": 7.821830122083976, + "learning_rate": 4.990398100856366e-07, + "logits/chosen": -2.7289726734161377, + "logits/rejected": -2.6343679428100586, + "logps/chosen": -0.9890714883804321, + "logps/rejected": -1.0246083736419678, + "loss": -0.0631, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.39659732580184937, + "rewards/margins": -0.005929629318416119, + "rewards/rejected": 0.40252697467803955, + "step": 60 + }, + { + "epoch": 0.12761506276150628, + "grad_norm": 5.543365313785047, + "learning_rate": 4.988732357731762e-07, + "logits/chosen": -2.670830011367798, + "logits/rejected": -2.719724655151367, + "logps/chosen": -0.8051760792732239, + "logps/rejected": -1.1917810440063477, + "loss": -0.0634, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4671447277069092, + "rewards/margins": 0.12195159494876862, + "rewards/rejected": 0.34519311785697937, + "step": 61 + }, + { + "epoch": 0.1297071129707113, + "grad_norm": 6.158656545510282, + "learning_rate": 4.986933771478051e-07, + "logits/chosen": -2.5070548057556152, + "logits/rejected": -2.5650229454040527, + "logps/chosen": -0.8839155435562134, + "logps/rejected": -1.074038028717041, + "loss": -0.0583, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4455738067626953, + "rewards/margins": 0.05701454356312752, + "rewards/rejected": 0.3885592222213745, + "step": 62 + }, + { + "epoch": 0.13179916317991633, + "grad_norm": 6.601566833855826, + "learning_rate": 4.985002438099865e-07, + "logits/chosen": -2.7178516387939453, + "logits/rejected": -2.614670753479004, + "logps/chosen": -0.9592885971069336, + "logps/rejected": -1.2345027923583984, + "loss": -0.0602, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41584277153015137, + "rewards/margins": 0.07030172646045685, + "rewards/rejected": 0.3455410599708557, + "step": 63 + }, + { + "epoch": 0.13389121338912133, + "grad_norm": 5.110638476831358, + "learning_rate": 4.982938460687582e-07, + "logits/chosen": -2.6705799102783203, + "logits/rejected": -2.535891532897949, + "logps/chosen": -1.0088186264038086, + "logps/rejected": -1.3167262077331543, + "loss": -0.0636, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.379120796918869, + "rewards/margins": 0.07601502537727356, + "rewards/rejected": 0.30310577154159546, + "step": 64 + }, + { + "epoch": 0.13598326359832635, + "grad_norm": 6.39928910360316, + "learning_rate": 4.980741949411839e-07, + "logits/chosen": -2.5929765701293945, + "logits/rejected": -2.5973377227783203, + "logps/chosen": -1.2004098892211914, + "logps/rejected": -1.2652997970581055, + "loss": -0.0478, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3200799822807312, + "rewards/margins": 0.005203430540859699, + "rewards/rejected": 0.314876526594162, + "step": 65 + }, + { + "epoch": 0.13807531380753138, + "grad_norm": 5.664762551027314, + "learning_rate": 4.978413021517633e-07, + "logits/chosen": -2.7091989517211914, + "logits/rejected": -2.6626734733581543, + "logps/chosen": -0.9367485046386719, + "logps/rejected": -1.2151732444763184, + "loss": -0.0621, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.43186360597610474, + "rewards/margins": 0.11135440319776535, + "rewards/rejected": 0.32050925493240356, + "step": 66 + }, + { + "epoch": 0.1401673640167364, + "grad_norm": 6.952965346657238, + "learning_rate": 4.975951801318083e-07, + "logits/chosen": -2.5098729133605957, + "logits/rejected": -2.414456844329834, + "logps/chosen": -0.8484750986099243, + "logps/rejected": -0.9892241358757019, + "loss": -0.0918, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4698827266693115, + "rewards/margins": 0.06307347118854523, + "rewards/rejected": 0.4068092703819275, + "step": 67 + }, + { + "epoch": 0.14225941422594143, + "grad_norm": 6.436841023614087, + "learning_rate": 4.973358420187775e-07, + "logits/chosen": -2.7483773231506348, + "logits/rejected": -2.7305080890655518, + "logps/chosen": -1.0968064069747925, + "logps/rejected": -0.9938352108001709, + "loss": -0.066, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3873424530029297, + "rewards/margins": -0.041090674698352814, + "rewards/rejected": 0.4284331202507019, + "step": 68 + }, + { + "epoch": 0.14435146443514643, + "grad_norm": 21.319389236605137, + "learning_rate": 4.970633016555764e-07, + "logits/chosen": -2.6799473762512207, + "logits/rejected": -2.551419734954834, + "logps/chosen": -0.8439831733703613, + "logps/rejected": -1.3273664712905884, + "loss": -0.0879, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.46591490507125854, + "rewards/margins": 0.14239582419395447, + "rewards/rejected": 0.3235190808773041, + "step": 69 + }, + { + "epoch": 0.14644351464435146, + "grad_norm": 6.7666133308501, + "learning_rate": 4.967775735898179e-07, + "logits/chosen": -2.6862854957580566, + "logits/rejected": -2.689319133758545, + "logps/chosen": -0.9402115345001221, + "logps/rejected": -1.3846523761749268, + "loss": -0.0862, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4224025309085846, + "rewards/margins": 0.1333645135164261, + "rewards/rejected": 0.2890380024909973, + "step": 70 + }, + { + "epoch": 0.14853556485355648, + "grad_norm": 6.681866135481183, + "learning_rate": 4.964786730730454e-07, + "logits/chosen": -2.7253994941711426, + "logits/rejected": -2.776564598083496, + "logps/chosen": -1.0395264625549316, + "logps/rejected": -1.0701773166656494, + "loss": -0.0683, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3953511118888855, + "rewards/margins": 0.028210703283548355, + "rewards/rejected": 0.36714041233062744, + "step": 71 + }, + { + "epoch": 0.1506276150627615, + "grad_norm": 7.723837222390713, + "learning_rate": 4.961666160599197e-07, + "logits/chosen": -2.6773457527160645, + "logits/rejected": -2.59285306930542, + "logps/chosen": -0.9283171892166138, + "logps/rejected": -1.0602527856826782, + "loss": -0.0795, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4250871241092682, + "rewards/margins": 0.025532707571983337, + "rewards/rejected": 0.39955443143844604, + "step": 72 + }, + { + "epoch": 0.15271966527196654, + "grad_norm": 6.789946293344587, + "learning_rate": 4.958414192073665e-07, + "logits/chosen": -2.546414613723755, + "logits/rejected": -2.4631576538085938, + "logps/chosen": -0.9542316794395447, + "logps/rejected": -1.1909294128417969, + "loss": -0.0186, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4049053192138672, + "rewards/margins": 0.05104460567235947, + "rewards/rejected": 0.3538607060909271, + "step": 73 + }, + { + "epoch": 0.15481171548117154, + "grad_norm": 7.548673490492515, + "learning_rate": 4.955030998736876e-07, + "logits/chosen": -2.74623966217041, + "logits/rejected": -2.6763722896575928, + "logps/chosen": -1.0954599380493164, + "logps/rejected": -1.460294485092163, + "loss": -0.068, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3597659468650818, + "rewards/margins": 0.06558110564947128, + "rewards/rejected": 0.2941848635673523, + "step": 74 + }, + { + "epoch": 0.15690376569037656, + "grad_norm": 7.870243634302642, + "learning_rate": 4.951516761176343e-07, + "logits/chosen": -2.6385092735290527, + "logits/rejected": -2.5714449882507324, + "logps/chosen": -0.8212575912475586, + "logps/rejected": -1.3720622062683105, + "loss": -0.0754, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4903402030467987, + "rewards/margins": 0.16850018501281738, + "rewards/rejected": 0.3218400180339813, + "step": 75 + }, + { + "epoch": 0.1589958158995816, + "grad_norm": 7.740159216532024, + "learning_rate": 4.947871666974437e-07, + "logits/chosen": -2.746283531188965, + "logits/rejected": -2.6653685569763184, + "logps/chosen": -1.0666463375091553, + "logps/rejected": -1.5027016401290894, + "loss": -0.0814, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3813169598579407, + "rewards/margins": 0.11095122247934341, + "rewards/rejected": 0.27036574482917786, + "step": 76 + }, + { + "epoch": 0.16108786610878661, + "grad_norm": 7.976694731779206, + "learning_rate": 4.944095910698372e-07, + "logits/chosen": -2.6141693592071533, + "logits/rejected": -2.6142563819885254, + "logps/chosen": -1.0128289461135864, + "logps/rejected": -1.2213771343231201, + "loss": -0.0688, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3892231583595276, + "rewards/margins": 0.02057480998337269, + "rewards/rejected": 0.36864835023880005, + "step": 77 + }, + { + "epoch": 0.16317991631799164, + "grad_norm": 12.788059053105092, + "learning_rate": 4.940189693889818e-07, + "logits/chosen": -2.5218615531921387, + "logits/rejected": -2.475954055786133, + "logps/chosen": -1.17843759059906, + "logps/rejected": -1.431633472442627, + "loss": -0.079, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3605992794036865, + "rewards/margins": 0.08995417505502701, + "rewards/rejected": 0.2706451416015625, + "step": 78 + }, + { + "epoch": 0.16527196652719664, + "grad_norm": 7.9687380323133175, + "learning_rate": 4.936153225054146e-07, + "logits/chosen": -2.453888416290283, + "logits/rejected": -2.4906387329101562, + "logps/chosen": -1.0334270000457764, + "logps/rejected": -1.2166098356246948, + "loss": -0.0811, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40555617213249207, + "rewards/margins": 0.07155266404151917, + "rewards/rejected": 0.3340035080909729, + "step": 79 + }, + { + "epoch": 0.16736401673640167, + "grad_norm": 7.103423794773496, + "learning_rate": 4.931986719649298e-07, + "logits/chosen": -2.58834171295166, + "logits/rejected": -2.602987766265869, + "logps/chosen": -0.9456465840339661, + "logps/rejected": -1.0439833402633667, + "loss": -0.062, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4474758505821228, + "rewards/margins": 0.05571586266160011, + "rewards/rejected": 0.391759991645813, + "step": 80 + }, + { + "epoch": 0.1694560669456067, + "grad_norm": 8.009957315662204, + "learning_rate": 4.927690400074286e-07, + "logits/chosen": -2.6588869094848633, + "logits/rejected": -2.6168625354766846, + "logps/chosen": -0.9099507927894592, + "logps/rejected": -1.2698771953582764, + "loss": -0.0448, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.45280569791793823, + "rewards/margins": 0.09746626019477844, + "rewards/rejected": 0.3553394079208374, + "step": 81 + }, + { + "epoch": 0.17154811715481172, + "grad_norm": 7.621644329163717, + "learning_rate": 4.923264495657319e-07, + "logits/chosen": -2.6462247371673584, + "logits/rejected": -2.540278434753418, + "logps/chosen": -0.9409681558609009, + "logps/rejected": -1.4178993701934814, + "loss": -0.1086, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4165344536304474, + "rewards/margins": 0.11931312829256058, + "rewards/rejected": 0.2972213327884674, + "step": 82 + }, + { + "epoch": 0.17364016736401675, + "grad_norm": 6.497792969659968, + "learning_rate": 4.918709242643563e-07, + "logits/chosen": -2.6598472595214844, + "logits/rejected": -2.5916948318481445, + "logps/chosen": -0.94767165184021, + "logps/rejected": -1.136979341506958, + "loss": -0.0992, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39976003766059875, + "rewards/margins": 0.04650614410638809, + "rewards/rejected": 0.35325387120246887, + "step": 83 + }, + { + "epoch": 0.17573221757322174, + "grad_norm": 5.447470282604431, + "learning_rate": 4.914024884182534e-07, + "logits/chosen": -2.7488155364990234, + "logits/rejected": -2.7466177940368652, + "logps/chosen": -0.9067939519882202, + "logps/rejected": -1.152085542678833, + "loss": -0.0763, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4365091919898987, + "rewards/margins": 0.1028919368982315, + "rewards/rejected": 0.33361726999282837, + "step": 84 + }, + { + "epoch": 0.17782426778242677, + "grad_norm": 9.656999379927793, + "learning_rate": 4.909211670315114e-07, + "logits/chosen": -2.494537115097046, + "logits/rejected": -2.4501397609710693, + "logps/chosen": -1.0578653812408447, + "logps/rejected": -1.2545416355133057, + "loss": -0.0752, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40090253949165344, + "rewards/margins": 0.0684221088886261, + "rewards/rejected": 0.33248043060302734, + "step": 85 + }, + { + "epoch": 0.1799163179916318, + "grad_norm": 6.686833143408991, + "learning_rate": 4.904269857960208e-07, + "logits/chosen": -2.5811610221862793, + "logits/rejected": -2.5554800033569336, + "logps/chosen": -0.9244771003723145, + "logps/rejected": -0.9853077530860901, + "loss": -0.0852, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.43252524733543396, + "rewards/margins": 0.02168216183781624, + "rewards/rejected": 0.4108430743217468, + "step": 86 + }, + { + "epoch": 0.18200836820083682, + "grad_norm": 9.23900218513629, + "learning_rate": 4.899199710901028e-07, + "logits/chosen": -2.7979397773742676, + "logits/rejected": -2.7201385498046875, + "logps/chosen": -0.9246209859848022, + "logps/rejected": -1.1718610525131226, + "loss": -0.0907, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4397488832473755, + "rewards/margins": 0.09631224721670151, + "rewards/rejected": 0.34343665838241577, + "step": 87 + }, + { + "epoch": 0.18410041841004185, + "grad_norm": 9.234428289424285, + "learning_rate": 4.894001499771015e-07, + "logits/chosen": -2.5294344425201416, + "logits/rejected": -2.4418015480041504, + "logps/chosen": -1.093963384628296, + "logps/rejected": -2.0233564376831055, + "loss": -0.0818, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39119696617126465, + "rewards/margins": 0.1488228142261505, + "rewards/rejected": 0.24237412214279175, + "step": 88 + }, + { + "epoch": 0.18619246861924685, + "grad_norm": 12.083165310097053, + "learning_rate": 4.888675502039391e-07, + "logits/chosen": -2.485434055328369, + "logits/rejected": -2.4613311290740967, + "logps/chosen": -1.15748929977417, + "logps/rejected": -1.3351922035217285, + "loss": -0.0771, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.33443617820739746, + "rewards/margins": -0.015506994910538197, + "rewards/rejected": 0.3499431908130646, + "step": 89 + }, + { + "epoch": 0.18828451882845187, + "grad_norm": 5.745618394681697, + "learning_rate": 4.883222001996351e-07, + "logits/chosen": -2.551848888397217, + "logits/rejected": -2.6118860244750977, + "logps/chosen": -1.160627841949463, + "logps/rejected": -1.5292195081710815, + "loss": -0.0656, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36388182640075684, + "rewards/margins": 0.056162893772125244, + "rewards/rejected": 0.3077189028263092, + "step": 90 + }, + { + "epoch": 0.1903765690376569, + "grad_norm": 6.048392960412249, + "learning_rate": 4.877641290737883e-07, + "logits/chosen": -2.544996738433838, + "logits/rejected": -2.5666582584381104, + "logps/chosen": -1.1059722900390625, + "logps/rejected": -1.0704509019851685, + "loss": -0.0755, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4008127450942993, + "rewards/margins": -0.0058725010603666306, + "rewards/rejected": 0.4066852629184723, + "step": 91 + }, + { + "epoch": 0.19246861924686193, + "grad_norm": 7.750849031271762, + "learning_rate": 4.871933666150239e-07, + "logits/chosen": -2.5515635013580322, + "logits/rejected": -2.622760534286499, + "logps/chosen": -1.0890980958938599, + "logps/rejected": -1.2788047790527344, + "loss": -0.0606, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3743430972099304, + "rewards/margins": 0.04916682466864586, + "rewards/rejected": 0.32517626881599426, + "step": 92 + }, + { + "epoch": 0.19456066945606695, + "grad_norm": 6.133733012127484, + "learning_rate": 4.866099432894024e-07, + "logits/chosen": -2.6669888496398926, + "logits/rejected": -2.5700578689575195, + "logps/chosen": -0.6280112266540527, + "logps/rejected": -1.3432271480560303, + "loss": -0.0976, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.56821608543396, + "rewards/margins": 0.256414532661438, + "rewards/rejected": 0.311801552772522, + "step": 93 + }, + { + "epoch": 0.19665271966527198, + "grad_norm": 7.408700272090894, + "learning_rate": 4.860138902387939e-07, + "logits/chosen": -2.6542887687683105, + "logits/rejected": -2.572908401489258, + "logps/chosen": -0.769626259803772, + "logps/rejected": -0.9871947169303894, + "loss": -0.0605, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.5172251462936401, + "rewards/margins": 0.06419370323419571, + "rewards/rejected": 0.45303142070770264, + "step": 94 + }, + { + "epoch": 0.19874476987447698, + "grad_norm": 5.427432507236315, + "learning_rate": 4.854052392792161e-07, + "logits/chosen": -2.437551975250244, + "logits/rejected": -2.353766918182373, + "logps/chosen": -1.1325689554214478, + "logps/rejected": -1.2256555557250977, + "loss": -0.0596, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36106187105178833, + "rewards/margins": 0.02263234183192253, + "rewards/rejected": 0.3384295105934143, + "step": 95 + }, + { + "epoch": 0.200836820083682, + "grad_norm": 8.701179547204811, + "learning_rate": 4.847840228991356e-07, + "logits/chosen": -2.6898810863494873, + "logits/rejected": -2.622732639312744, + "logps/chosen": -0.8455623388290405, + "logps/rejected": -1.298283338546753, + "loss": -0.0478, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47173362970352173, + "rewards/margins": 0.14020122587680817, + "rewards/rejected": 0.33153235912323, + "step": 96 + }, + { + "epoch": 0.20292887029288703, + "grad_norm": 9.967523481438473, + "learning_rate": 4.841502742577338e-07, + "logits/chosen": -2.6468942165374756, + "logits/rejected": -2.573026180267334, + "logps/chosen": -1.0249431133270264, + "logps/rejected": -1.4148008823394775, + "loss": -0.0399, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3901897668838501, + "rewards/margins": 0.03252024948596954, + "rewards/rejected": 0.35766950249671936, + "step": 97 + }, + { + "epoch": 0.20502092050209206, + "grad_norm": 7.633760266532135, + "learning_rate": 4.83504027183137e-07, + "logits/chosen": -2.7580676078796387, + "logits/rejected": -2.746480941772461, + "logps/chosen": -0.9797180891036987, + "logps/rejected": -1.188408374786377, + "loss": -0.0867, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.42061182856559753, + "rewards/margins": 0.09095914661884308, + "rewards/rejected": 0.32965266704559326, + "step": 98 + }, + { + "epoch": 0.20711297071129708, + "grad_norm": 9.235928116007752, + "learning_rate": 4.828453161706108e-07, + "logits/chosen": -2.788665771484375, + "logits/rejected": -2.838707447052002, + "logps/chosen": -0.8068978786468506, + "logps/rejected": -1.2045230865478516, + "loss": -0.1209, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.48726317286491394, + "rewards/margins": 0.1484646499156952, + "rewards/rejected": 0.33879852294921875, + "step": 99 + }, + { + "epoch": 0.20920502092050208, + "grad_norm": 10.315885970527864, + "learning_rate": 4.821741763807186e-07, + "logits/chosen": -2.7912654876708984, + "logits/rejected": -2.681771755218506, + "logps/chosen": -1.0527245998382568, + "logps/rejected": -1.1655709743499756, + "loss": -0.0844, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3953736424446106, + "rewards/margins": -0.006716027855873108, + "rewards/rejected": 0.4020896852016449, + "step": 100 + }, + { + "epoch": 0.2112970711297071, + "grad_norm": 10.580144647139496, + "learning_rate": 4.81490643637445e-07, + "logits/chosen": -2.5700273513793945, + "logits/rejected": -2.616337299346924, + "logps/chosen": -0.8749042749404907, + "logps/rejected": -1.345421314239502, + "loss": -0.107, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.46963128447532654, + "rewards/margins": 0.13532906770706177, + "rewards/rejected": 0.3343021869659424, + "step": 101 + }, + { + "epoch": 0.21338912133891214, + "grad_norm": 7.233574775615707, + "learning_rate": 4.807947544262838e-07, + "logits/chosen": -2.5869598388671875, + "logits/rejected": -2.5668182373046875, + "logps/chosen": -1.0655555725097656, + "logps/rejected": -1.5141360759735107, + "loss": -0.1207, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3982890546321869, + "rewards/margins": 0.09518561512231827, + "rewards/rejected": 0.30310338735580444, + "step": 102 + }, + { + "epoch": 0.21548117154811716, + "grad_norm": 8.314364804924969, + "learning_rate": 4.800865458922898e-07, + "logits/chosen": -2.965330123901367, + "logits/rejected": -2.9011240005493164, + "logps/chosen": -0.8784880638122559, + "logps/rejected": -1.2078497409820557, + "loss": -0.0707, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4488745331764221, + "rewards/margins": 0.09526941180229187, + "rewards/rejected": 0.35360509157180786, + "step": 103 + }, + { + "epoch": 0.2175732217573222, + "grad_norm": 7.240646906240465, + "learning_rate": 4.793660558380969e-07, + "logits/chosen": -2.8919410705566406, + "logits/rejected": -2.908057451248169, + "logps/chosen": -1.297257661819458, + "logps/rejected": -1.2795225381851196, + "loss": -0.0768, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31585755944252014, + "rewards/margins": -0.03636983036994934, + "rewards/rejected": 0.3522273600101471, + "step": 104 + }, + { + "epoch": 0.2196652719665272, + "grad_norm": 13.885756163895579, + "learning_rate": 4.786333227218995e-07, + "logits/chosen": -2.655101776123047, + "logits/rejected": -2.53055477142334, + "logps/chosen": -1.0020012855529785, + "logps/rejected": -1.7252869606018066, + "loss": -0.131, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4196831285953522, + "rewards/margins": 0.16872525215148926, + "rewards/rejected": 0.2509578466415405, + "step": 105 + }, + { + "epoch": 0.2217573221757322, + "grad_norm": 5.7928363890391195, + "learning_rate": 4.778883856554003e-07, + "logits/chosen": -2.6295785903930664, + "logits/rejected": -2.560263156890869, + "logps/chosen": -0.9247240424156189, + "logps/rejected": -1.521117925643921, + "loss": -0.1343, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42524152994155884, + "rewards/margins": 0.1525372862815857, + "rewards/rejected": 0.27270421385765076, + "step": 106 + }, + { + "epoch": 0.22384937238493724, + "grad_norm": 10.72110537794039, + "learning_rate": 4.771312844017224e-07, + "logits/chosen": -2.7769789695739746, + "logits/rejected": -2.6794891357421875, + "logps/chosen": -1.0048589706420898, + "logps/rejected": -1.5365948677062988, + "loss": -0.1032, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40189632773399353, + "rewards/margins": 0.09801431000232697, + "rewards/rejected": 0.30388203263282776, + "step": 107 + }, + { + "epoch": 0.22594142259414227, + "grad_norm": 7.750878315038949, + "learning_rate": 4.7636205937328664e-07, + "logits/chosen": -2.4595589637756348, + "logits/rejected": -2.406452178955078, + "logps/chosen": -0.9035621285438538, + "logps/rejected": -1.3715593814849854, + "loss": -0.0912, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4404214024543762, + "rewards/margins": 0.10055311024188995, + "rewards/rejected": 0.33986830711364746, + "step": 108 + }, + { + "epoch": 0.2280334728033473, + "grad_norm": 8.013088010734279, + "learning_rate": 4.755807516296547e-07, + "logits/chosen": -2.734206199645996, + "logits/rejected": -2.6361465454101562, + "logps/chosen": -1.279822826385498, + "logps/rejected": -1.9017040729522705, + "loss": -0.0997, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.30485787987709045, + "rewards/margins": 0.04763868451118469, + "rewards/rejected": 0.25721919536590576, + "step": 109 + }, + { + "epoch": 0.2301255230125523, + "grad_norm": 10.361495087309358, + "learning_rate": 4.747874028753375e-07, + "logits/chosen": -2.663100242614746, + "logits/rejected": -2.608872890472412, + "logps/chosen": -1.1176433563232422, + "logps/rejected": -1.4879114627838135, + "loss": -0.0849, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39366304874420166, + "rewards/margins": 0.08537843823432922, + "rewards/rejected": 0.30828458070755005, + "step": 110 + }, + { + "epoch": 0.23221757322175732, + "grad_norm": 11.489203651128713, + "learning_rate": 4.739820554575686e-07, + "logits/chosen": -2.7516872882843018, + "logits/rejected": -2.7274041175842285, + "logps/chosen": -1.2375142574310303, + "logps/rejected": -1.3109242916107178, + "loss": -0.0329, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.35172292590141296, + "rewards/margins": 0.028698619455099106, + "rewards/rejected": 0.32302427291870117, + "step": 111 + }, + { + "epoch": 0.23430962343096234, + "grad_norm": 7.973354310908626, + "learning_rate": 4.731647523640445e-07, + "logits/chosen": -2.761908531188965, + "logits/rejected": -2.641033172607422, + "logps/chosen": -0.9457468390464783, + "logps/rejected": -1.5787360668182373, + "loss": -0.1048, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.416814386844635, + "rewards/margins": 0.12927794456481934, + "rewards/rejected": 0.2875364422798157, + "step": 112 + }, + { + "epoch": 0.23640167364016737, + "grad_norm": 8.416401572506357, + "learning_rate": 4.723355372206297e-07, + "logits/chosen": -2.749239921569824, + "logits/rejected": -2.667978525161743, + "logps/chosen": -0.738419771194458, + "logps/rejected": -1.6119505167007446, + "loss": -0.1138, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.515089213848114, + "rewards/margins": 0.2025749832391739, + "rewards/rejected": 0.31251418590545654, + "step": 113 + }, + { + "epoch": 0.2384937238493724, + "grad_norm": 8.941134480770472, + "learning_rate": 4.714944542890278e-07, + "logits/chosen": -2.7977280616760254, + "logits/rejected": -2.7237446308135986, + "logps/chosen": -1.195307970046997, + "logps/rejected": -1.7132177352905273, + "loss": -0.1042, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3529362082481384, + "rewards/margins": 0.005412375554442406, + "rewards/rejected": 0.3475238084793091, + "step": 114 + }, + { + "epoch": 0.2405857740585774, + "grad_norm": 8.619242640102716, + "learning_rate": 4.706415484644195e-07, + "logits/chosen": -2.7845406532287598, + "logits/rejected": -2.7088088989257812, + "logps/chosen": -1.0112650394439697, + "logps/rejected": -1.4609363079071045, + "loss": -0.0881, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4237975776195526, + "rewards/margins": 0.11371254920959473, + "rewards/rejected": 0.3100849986076355, + "step": 115 + }, + { + "epoch": 0.24267782426778242, + "grad_norm": 15.569708812933248, + "learning_rate": 4.6977686527306555e-07, + "logits/chosen": -2.7467026710510254, + "logits/rejected": -2.6555519104003906, + "logps/chosen": -1.3848240375518799, + "logps/rejected": -1.6398433446884155, + "loss": -0.0615, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.32322824001312256, + "rewards/margins": 0.04797099530696869, + "rewards/rejected": 0.2752572298049927, + "step": 116 + }, + { + "epoch": 0.24476987447698745, + "grad_norm": 7.401683574617976, + "learning_rate": 4.6890045086987707e-07, + "logits/chosen": -2.8689966201782227, + "logits/rejected": -2.7104268074035645, + "logps/chosen": -0.9026917219161987, + "logps/rejected": -1.6604235172271729, + "loss": -0.0947, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4347726106643677, + "rewards/margins": 0.16332250833511353, + "rewards/rejected": 0.27145010232925415, + "step": 117 + }, + { + "epoch": 0.24686192468619247, + "grad_norm": 7.629104260103111, + "learning_rate": 4.680123520359519e-07, + "logits/chosen": -2.6418347358703613, + "logits/rejected": -2.5293517112731934, + "logps/chosen": -1.1596897840499878, + "logps/rejected": -1.622258186340332, + "loss": -0.1004, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3867058753967285, + "rewards/margins": 0.13067524135112762, + "rewards/rejected": 0.2560306191444397, + "step": 118 + }, + { + "epoch": 0.2489539748953975, + "grad_norm": 8.780545352344028, + "learning_rate": 4.671126161760772e-07, + "logits/chosen": -2.7948989868164062, + "logits/rejected": -2.7727484703063965, + "logps/chosen": -1.3009717464447021, + "logps/rejected": -2.0594563484191895, + "loss": -0.0531, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3208620548248291, + "rewards/margins": 0.0681702196598053, + "rewards/rejected": 0.2526918053627014, + "step": 119 + }, + { + "epoch": 0.2510460251046025, + "grad_norm": 8.141226660227753, + "learning_rate": 4.662012913161997e-07, + "logits/chosen": -2.8210859298706055, + "logits/rejected": -2.7449235916137695, + "logps/chosen": -1.0188672542572021, + "logps/rejected": -1.4339046478271484, + "loss": -0.0647, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41782328486442566, + "rewards/margins": 0.07959190011024475, + "rewards/rejected": 0.3382313847541809, + "step": 120 + }, + { + "epoch": 0.25313807531380755, + "grad_norm": 8.356022071440359, + "learning_rate": 4.6527842610086124e-07, + "logits/chosen": -2.499788761138916, + "logits/rejected": -2.588809013366699, + "logps/chosen": -1.0250036716461182, + "logps/rejected": -1.3243354558944702, + "loss": -0.0935, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.43090829253196716, + "rewards/margins": 0.053986430168151855, + "rewards/rejected": 0.3769218921661377, + "step": 121 + }, + { + "epoch": 0.25523012552301255, + "grad_norm": 8.069688534688002, + "learning_rate": 4.6434406979060327e-07, + "logits/chosen": -2.7623844146728516, + "logits/rejected": -2.692293167114258, + "logps/chosen": -1.4179835319519043, + "logps/rejected": -1.441861867904663, + "loss": -0.0593, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.2650303244590759, + "rewards/margins": -0.03164853900671005, + "rewards/rejected": 0.2966788411140442, + "step": 122 + }, + { + "epoch": 0.25732217573221755, + "grad_norm": 13.346564467347195, + "learning_rate": 4.6339827225933657e-07, + "logits/chosen": -2.778604030609131, + "logits/rejected": -2.7227749824523926, + "logps/chosen": -0.8697854280471802, + "logps/rejected": -1.2359058856964111, + "loss": -0.0895, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.45396870374679565, + "rewards/margins": 0.10900671780109406, + "rewards/rejected": 0.3449620008468628, + "step": 123 + }, + { + "epoch": 0.2594142259414226, + "grad_norm": 7.9417348062875055, + "learning_rate": 4.6244108399167977e-07, + "logits/chosen": -2.67881441116333, + "logits/rejected": -2.4599831104278564, + "logps/chosen": -1.3219633102416992, + "logps/rejected": -1.7368388175964355, + "loss": -0.0727, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3538292646408081, + "rewards/margins": 0.05511477217078209, + "rewards/rejected": 0.2987144887447357, + "step": 124 + }, + { + "epoch": 0.2615062761506276, + "grad_norm": 9.180985197503443, + "learning_rate": 4.614725560802639e-07, + "logits/chosen": -2.7183783054351807, + "logits/rejected": -2.6761581897735596, + "logps/chosen": -0.976803183555603, + "logps/rejected": -1.0104469060897827, + "loss": -0.0768, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4247945249080658, + "rewards/margins": 0.011689772829413414, + "rewards/rejected": 0.41310471296310425, + "step": 125 + }, + { + "epoch": 0.26359832635983266, + "grad_norm": 8.86553923626234, + "learning_rate": 4.60492740223006e-07, + "logits/chosen": -2.7192089557647705, + "logits/rejected": -2.6593987941741943, + "logps/chosen": -1.2293155193328857, + "logps/rejected": -1.3931902647018433, + "loss": -0.0804, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.37537840008735657, + "rewards/margins": 0.05658911168575287, + "rewards/rejected": 0.3187893033027649, + "step": 126 + }, + { + "epoch": 0.26569037656903766, + "grad_norm": 9.100936304907126, + "learning_rate": 4.595016887203488e-07, + "logits/chosen": -2.7341713905334473, + "logits/rejected": -2.625302314758301, + "logps/chosen": -1.227184772491455, + "logps/rejected": -1.8654588460922241, + "loss": -0.0752, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34461599588394165, + "rewards/margins": 0.10101988911628723, + "rewards/rejected": 0.24359610676765442, + "step": 127 + }, + { + "epoch": 0.26778242677824265, + "grad_norm": 10.631080592900888, + "learning_rate": 4.584994544724695e-07, + "logits/chosen": -2.548196315765381, + "logits/rejected": -2.4987902641296387, + "logps/chosen": -1.1368122100830078, + "logps/rejected": -2.0986969470977783, + "loss": -0.1204, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3744341731071472, + "rewards/margins": 0.12809962034225464, + "rewards/rejected": 0.24633455276489258, + "step": 128 + }, + { + "epoch": 0.2698744769874477, + "grad_norm": 13.719744878682432, + "learning_rate": 4.574860909764559e-07, + "logits/chosen": -2.5039968490600586, + "logits/rejected": -2.5294370651245117, + "logps/chosen": -1.5889067649841309, + "logps/rejected": -1.7118114233016968, + "loss": -0.1109, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3196980655193329, + "rewards/margins": 0.10617660731077194, + "rewards/rejected": 0.21352145075798035, + "step": 129 + }, + { + "epoch": 0.2719665271966527, + "grad_norm": 15.366716836335764, + "learning_rate": 4.5646165232345103e-07, + "logits/chosen": -2.7375288009643555, + "logits/rejected": -2.6242637634277344, + "logps/chosen": -1.133746862411499, + "logps/rejected": -1.0736860036849976, + "loss": -0.105, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.41827988624572754, + "rewards/margins": -0.03402038663625717, + "rewards/rejected": 0.4523002803325653, + "step": 130 + }, + { + "epoch": 0.27405857740585776, + "grad_norm": 15.603864611701567, + "learning_rate": 4.554261931957657e-07, + "logits/chosen": -2.7278056144714355, + "logits/rejected": -2.5975325107574463, + "logps/chosen": -0.9498021602630615, + "logps/rejected": -1.1297776699066162, + "loss": -0.0668, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4634658396244049, + "rewards/margins": 0.07323861122131348, + "rewards/rejected": 0.39022722840309143, + "step": 131 + }, + { + "epoch": 0.27615062761506276, + "grad_norm": 10.531086272373418, + "learning_rate": 4.5437976886395955e-07, + "logits/chosen": -2.705713987350464, + "logits/rejected": -2.652479410171509, + "logps/chosen": -1.1467740535736084, + "logps/rejected": -1.4576821327209473, + "loss": -0.0988, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41082507371902466, + "rewards/margins": 0.09115354716777802, + "rewards/rejected": 0.31967151165008545, + "step": 132 + }, + { + "epoch": 0.27824267782426776, + "grad_norm": 7.399259334304283, + "learning_rate": 4.5332243518389136e-07, + "logits/chosen": -2.725745677947998, + "logits/rejected": -2.7743215560913086, + "logps/chosen": -1.1210857629776, + "logps/rejected": -1.1685874462127686, + "loss": -0.093, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.4526345133781433, + "rewards/margins": 0.03685787320137024, + "rewards/rejected": 0.41577666997909546, + "step": 133 + }, + { + "epoch": 0.2803347280334728, + "grad_norm": 7.297983481533, + "learning_rate": 4.5225424859373684e-07, + "logits/chosen": -2.640211582183838, + "logits/rejected": -2.570061206817627, + "logps/chosen": -0.9512884020805359, + "logps/rejected": -1.7315542697906494, + "loss": -0.1001, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.406036376953125, + "rewards/margins": 0.10577087104320526, + "rewards/rejected": 0.30026549100875854, + "step": 134 + }, + { + "epoch": 0.2824267782426778, + "grad_norm": 8.614770102330974, + "learning_rate": 4.511752661109768e-07, + "logits/chosen": -2.5034797191619873, + "logits/rejected": -2.482116460800171, + "logps/chosen": -0.8747888207435608, + "logps/rejected": -1.3776814937591553, + "loss": -0.0835, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5053682327270508, + "rewards/margins": 0.13714160025119781, + "rewards/rejected": 0.36822664737701416, + "step": 135 + }, + { + "epoch": 0.28451882845188287, + "grad_norm": 11.511732667746932, + "learning_rate": 4.5008554532935316e-07, + "logits/chosen": -2.2463009357452393, + "logits/rejected": -2.418936014175415, + "logps/chosen": -1.3389081954956055, + "logps/rejected": -1.4335492849349976, + "loss": -0.1004, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.33481383323669434, + "rewards/margins": 0.04980039596557617, + "rewards/rejected": 0.28501343727111816, + "step": 136 + }, + { + "epoch": 0.28661087866108786, + "grad_norm": 7.62656299018194, + "learning_rate": 4.4898514441579493e-07, + "logits/chosen": -2.730118751525879, + "logits/rejected": -2.7033190727233887, + "logps/chosen": -1.2007243633270264, + "logps/rejected": -1.3021783828735352, + "loss": -0.0571, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.34427547454833984, + "rewards/margins": 0.009376163594424725, + "rewards/rejected": 0.33489930629730225, + "step": 137 + }, + { + "epoch": 0.28870292887029286, + "grad_norm": 10.818515405254571, + "learning_rate": 4.478741221073135e-07, + "logits/chosen": -2.742260694503784, + "logits/rejected": -2.5423145294189453, + "logps/chosen": -0.9980774521827698, + "logps/rejected": -1.3085711002349854, + "loss": -0.0653, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4080244302749634, + "rewards/margins": 0.05671762675046921, + "rewards/rejected": 0.3513067960739136, + "step": 138 + }, + { + "epoch": 0.2907949790794979, + "grad_norm": 11.277914045373983, + "learning_rate": 4.467525377078671e-07, + "logits/chosen": -2.218235492706299, + "logits/rejected": -2.426461696624756, + "logps/chosen": -1.1532562971115112, + "logps/rejected": -1.4017515182495117, + "loss": -0.0578, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4083666205406189, + "rewards/margins": 0.056754596531391144, + "rewards/rejected": 0.35161203145980835, + "step": 139 + }, + { + "epoch": 0.2928870292887029, + "grad_norm": 14.584077361972543, + "learning_rate": 4.456204510851956e-07, + "logits/chosen": -2.7155497074127197, + "logits/rejected": -2.58384370803833, + "logps/chosen": -1.0665032863616943, + "logps/rejected": -1.7976462841033936, + "loss": -0.1131, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38510066270828247, + "rewards/margins": 0.15970441699028015, + "rewards/rejected": 0.22539621591567993, + "step": 140 + }, + { + "epoch": 0.29497907949790797, + "grad_norm": 13.90438178725477, + "learning_rate": 4.444779226676246e-07, + "logits/chosen": -2.7173407077789307, + "logits/rejected": -2.7211666107177734, + "logps/chosen": -1.0276908874511719, + "logps/rejected": -1.4164080619812012, + "loss": -0.1078, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.40510475635528564, + "rewards/margins": 0.08837657421827316, + "rewards/rejected": 0.3167282044887543, + "step": 141 + }, + { + "epoch": 0.29707112970711297, + "grad_norm": 12.235049719980324, + "learning_rate": 4.4332501344084005e-07, + "logits/chosen": -2.552258014678955, + "logits/rejected": -2.592665195465088, + "logps/chosen": -0.9130370616912842, + "logps/rejected": -1.2232286930084229, + "loss": -0.0768, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4437675476074219, + "rewards/margins": 0.04495351016521454, + "rewards/rejected": 0.39881402254104614, + "step": 142 + }, + { + "epoch": 0.29916317991631797, + "grad_norm": 21.089753672362736, + "learning_rate": 4.4216178494463295e-07, + "logits/chosen": -2.7459824085235596, + "logits/rejected": -2.619235038757324, + "logps/chosen": -1.0598134994506836, + "logps/rejected": -1.9267642498016357, + "loss": -0.1071, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.36971956491470337, + "rewards/margins": 0.14974790811538696, + "rewards/rejected": 0.2199716567993164, + "step": 143 + }, + { + "epoch": 0.301255230125523, + "grad_norm": 14.159186928029824, + "learning_rate": 4.4098829926961477e-07, + "logits/chosen": -2.5897159576416016, + "logits/rejected": -2.516749858856201, + "logps/chosen": -0.7705448269844055, + "logps/rejected": -1.9500865936279297, + "loss": -0.0738, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5220432281494141, + "rewards/margins": 0.24915724992752075, + "rewards/rejected": 0.2728859782218933, + "step": 144 + }, + { + "epoch": 0.303347280334728, + "grad_norm": 25.852536388047355, + "learning_rate": 4.398046190539024e-07, + "logits/chosen": -2.5979108810424805, + "logits/rejected": -2.5280308723449707, + "logps/chosen": -1.0106014013290405, + "logps/rejected": -2.4643092155456543, + "loss": -0.1157, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3909301459789276, + "rewards/margins": 0.12234397977590561, + "rewards/rejected": 0.2685861587524414, + "step": 145 + }, + { + "epoch": 0.3054393305439331, + "grad_norm": 37.83743366982954, + "learning_rate": 4.3861080747977566e-07, + "logits/chosen": -2.741292953491211, + "logits/rejected": -2.7124412059783936, + "logps/chosen": -1.1303961277008057, + "logps/rejected": -2.0690033435821533, + "loss": -0.1239, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38765209913253784, + "rewards/margins": 0.16262786090373993, + "rewards/rejected": 0.22502422332763672, + "step": 146 + }, + { + "epoch": 0.3075313807531381, + "grad_norm": 15.24433968980372, + "learning_rate": 4.37406928270304e-07, + "logits/chosen": -2.6818463802337646, + "logits/rejected": -2.603290557861328, + "logps/chosen": -1.3489582538604736, + "logps/rejected": -1.6901756525039673, + "loss": -0.0567, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.31866884231567383, + "rewards/margins": 0.022746426984667778, + "rewards/rejected": 0.2959223985671997, + "step": 147 + }, + { + "epoch": 0.30962343096234307, + "grad_norm": 22.776756565596372, + "learning_rate": 4.3619304568594546e-07, + "logits/chosen": -2.6372740268707275, + "logits/rejected": -2.6938648223876953, + "logps/chosen": -1.3485358953475952, + "logps/rejected": -1.5879672765731812, + "loss": -0.0709, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33045515418052673, + "rewards/margins": 0.09428148716688156, + "rewards/rejected": 0.23617368936538696, + "step": 148 + }, + { + "epoch": 0.3117154811715481, + "grad_norm": 14.832493676126875, + "learning_rate": 4.349692245211165e-07, + "logits/chosen": -2.8095154762268066, + "logits/rejected": -2.6838107109069824, + "logps/chosen": -1.02547025680542, + "logps/rejected": -1.4738640785217285, + "loss": -0.0792, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.414419949054718, + "rewards/margins": 0.08410301804542542, + "rewards/rejected": 0.3303169012069702, + "step": 149 + }, + { + "epoch": 0.3138075313807531, + "grad_norm": 18.310776769390138, + "learning_rate": 4.337355301007335e-07, + "logits/chosen": -2.5838117599487305, + "logits/rejected": -2.431469440460205, + "logps/chosen": -1.4929683208465576, + "logps/rejected": -1.7874231338500977, + "loss": -0.0836, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3493340015411377, + "rewards/margins": 0.10493455827236176, + "rewards/rejected": 0.24439939856529236, + "step": 150 + }, + { + "epoch": 0.3158995815899582, + "grad_norm": 9.964326552753194, + "learning_rate": 4.324920282767256e-07, + "logits/chosen": -2.6689319610595703, + "logits/rejected": -2.5450775623321533, + "logps/chosen": -1.054506778717041, + "logps/rejected": -1.81797194480896, + "loss": -0.1199, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3724176287651062, + "rewards/margins": 0.1260986328125, + "rewards/rejected": 0.2463189959526062, + "step": 151 + }, + { + "epoch": 0.3179916317991632, + "grad_norm": 14.781469879773143, + "learning_rate": 4.312387854245201e-07, + "logits/chosen": -2.4934237003326416, + "logits/rejected": -2.5184531211853027, + "logps/chosen": -1.202325701713562, + "logps/rejected": -1.8260574340820312, + "loss": -0.1168, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.34124889969825745, + "rewards/margins": 0.11295446753501892, + "rewards/rejected": 0.22829441726207733, + "step": 152 + }, + { + "epoch": 0.3200836820083682, + "grad_norm": 13.249819772322319, + "learning_rate": 4.2997586843949896e-07, + "logits/chosen": -2.612116813659668, + "logits/rejected": -2.5220794677734375, + "logps/chosen": -1.3319296836853027, + "logps/rejected": -1.688657283782959, + "loss": -0.1192, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.31724148988723755, + "rewards/margins": 0.07862979173660278, + "rewards/rejected": 0.23861169815063477, + "step": 153 + }, + { + "epoch": 0.32217573221757323, + "grad_norm": 9.882053994484261, + "learning_rate": 4.287033447334286e-07, + "logits/chosen": -2.6856942176818848, + "logits/rejected": -2.6089882850646973, + "logps/chosen": -1.3274683952331543, + "logps/rejected": -2.0857741832733154, + "loss": -0.0792, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3238699436187744, + "rewards/margins": 0.08010490238666534, + "rewards/rejected": 0.24376502633094788, + "step": 154 + }, + { + "epoch": 0.32426778242677823, + "grad_norm": 20.74435826093936, + "learning_rate": 4.2742128223086115e-07, + "logits/chosen": -2.469430446624756, + "logits/rejected": -2.3157687187194824, + "logps/chosen": -1.229305624961853, + "logps/rejected": -1.7900751829147339, + "loss": -0.0827, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3742397427558899, + "rewards/margins": 0.09423865377902985, + "rewards/rejected": 0.28000104427337646, + "step": 155 + }, + { + "epoch": 0.3263598326359833, + "grad_norm": 17.4329604936243, + "learning_rate": 4.261297493655092e-07, + "logits/chosen": -2.436098575592041, + "logits/rejected": -2.3914339542388916, + "logps/chosen": -1.217616081237793, + "logps/rejected": -1.8459382057189941, + "loss": -0.1423, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4322535991668701, + "rewards/margins": 0.1469898223876953, + "rewards/rejected": 0.2852637767791748, + "step": 156 + }, + { + "epoch": 0.3284518828451883, + "grad_norm": 17.563732823445005, + "learning_rate": 4.2482881507659244e-07, + "logits/chosen": -2.4747214317321777, + "logits/rejected": -2.4113662242889404, + "logps/chosen": -1.379757046699524, + "logps/rejected": -1.7662575244903564, + "loss": -0.1266, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.37260758876800537, + "rewards/margins": 0.06532790511846542, + "rewards/rejected": 0.30727970600128174, + "step": 157 + }, + { + "epoch": 0.3305439330543933, + "grad_norm": 8.540815754507376, + "learning_rate": 4.235185488051585e-07, + "logits/chosen": -2.3964672088623047, + "logits/rejected": -2.3802266120910645, + "logps/chosen": -1.4923787117004395, + "logps/rejected": -1.987583875656128, + "loss": -0.1227, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3038066029548645, + "rewards/margins": 0.06362194567918777, + "rewards/rejected": 0.24018466472625732, + "step": 158 + }, + { + "epoch": 0.33263598326359833, + "grad_norm": 9.25947209544519, + "learning_rate": 4.2219902049037554e-07, + "logits/chosen": -2.479762077331543, + "logits/rejected": -2.4210665225982666, + "logps/chosen": -1.38946533203125, + "logps/rejected": -1.7284841537475586, + "loss": -0.0544, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.29429730772972107, + "rewards/margins": 0.0212214644998312, + "rewards/rejected": 0.27307581901550293, + "step": 159 + }, + { + "epoch": 0.33472803347280333, + "grad_norm": 12.44176743490638, + "learning_rate": 4.2087030056579986e-07, + "logits/chosen": -2.319620132446289, + "logits/rejected": -2.326211452484131, + "logps/chosen": -0.8733711242675781, + "logps/rejected": -1.2630772590637207, + "loss": -0.0928, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4575880169868469, + "rewards/margins": 0.08910935372114182, + "rewards/rejected": 0.3684786856174469, + "step": 160 + }, + { + "epoch": 0.3368200836820084, + "grad_norm": 23.431018635077386, + "learning_rate": 4.1953245995561577e-07, + "logits/chosen": -2.76431941986084, + "logits/rejected": -2.6128687858581543, + "logps/chosen": -0.8526709079742432, + "logps/rejected": -1.4783575534820557, + "loss": -0.1187, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4951631426811218, + "rewards/margins": 0.10067776590585709, + "rewards/rejected": 0.39448535442352295, + "step": 161 + }, + { + "epoch": 0.3389121338912134, + "grad_norm": 15.623223043488867, + "learning_rate": 4.1818557007085e-07, + "logits/chosen": -2.620527982711792, + "logits/rejected": -2.532945156097412, + "logps/chosen": -1.390239953994751, + "logps/rejected": -1.7109453678131104, + "loss": -0.091, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33600008487701416, + "rewards/margins": 0.07325442135334015, + "rewards/rejected": 0.2627456486225128, + "step": 162 + }, + { + "epoch": 0.3410041841004184, + "grad_norm": 12.224980553609598, + "learning_rate": 4.1682970280555987e-07, + "logits/chosen": -2.6704134941101074, + "logits/rejected": -2.5877957344055176, + "logps/chosen": -0.9367567896842957, + "logps/rejected": -1.9059950113296509, + "loss": -0.1361, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.46736544370651245, + "rewards/margins": 0.20542237162590027, + "rewards/rejected": 0.26194310188293457, + "step": 163 + }, + { + "epoch": 0.34309623430962344, + "grad_norm": 12.801839720123297, + "learning_rate": 4.154649305329958e-07, + "logits/chosen": -2.562652111053467, + "logits/rejected": -2.575808048248291, + "logps/chosen": -1.4792587757110596, + "logps/rejected": -1.5863935947418213, + "loss": -0.1133, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3257550001144409, + "rewards/margins": 0.05095459520816803, + "rewards/rejected": 0.2748003900051117, + "step": 164 + }, + { + "epoch": 0.34518828451882844, + "grad_norm": 12.577792600421537, + "learning_rate": 4.140913261017382e-07, + "logits/chosen": -2.5608842372894287, + "logits/rejected": -2.502084732055664, + "logps/chosen": -0.9237871170043945, + "logps/rejected": -1.2350677251815796, + "loss": -0.0699, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.44716089963912964, + "rewards/margins": 0.09571586549282074, + "rewards/rejected": 0.3514450192451477, + "step": 165 + }, + { + "epoch": 0.3472803347280335, + "grad_norm": 25.043359615671452, + "learning_rate": 4.127089628318089e-07, + "logits/chosen": -2.6735382080078125, + "logits/rejected": -2.5704262256622314, + "logps/chosen": -1.2471284866333008, + "logps/rejected": -1.6530365943908691, + "loss": -0.0821, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3380359411239624, + "rewards/margins": 0.031453195959329605, + "rewards/rejected": 0.3065827488899231, + "step": 166 + }, + { + "epoch": 0.3493723849372385, + "grad_norm": 14.126453426156447, + "learning_rate": 4.113179145107575e-07, + "logits/chosen": -2.7621216773986816, + "logits/rejected": -2.6524105072021484, + "logps/chosen": -0.7899508476257324, + "logps/rejected": -1.3169074058532715, + "loss": -0.084, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4821957051753998, + "rewards/margins": 0.10919322073459625, + "rewards/rejected": 0.3730024993419647, + "step": 167 + }, + { + "epoch": 0.3514644351464435, + "grad_norm": 8.684294843492529, + "learning_rate": 4.099182553897228e-07, + "logits/chosen": -2.5085854530334473, + "logits/rejected": -2.3533246517181396, + "logps/chosen": -1.205338716506958, + "logps/rejected": -1.7073439359664917, + "loss": -0.09, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3430725932121277, + "rewards/margins": 0.0864902213215828, + "rewards/rejected": 0.2565823793411255, + "step": 168 + }, + { + "epoch": 0.35355648535564854, + "grad_norm": 10.933055039872562, + "learning_rate": 4.0851006017946945e-07, + "logits/chosen": -2.7597827911376953, + "logits/rejected": -2.5856375694274902, + "logps/chosen": -1.1774203777313232, + "logps/rejected": -2.170247793197632, + "loss": -0.1448, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39301756024360657, + "rewards/margins": 0.15624675154685974, + "rewards/rejected": 0.23677079379558563, + "step": 169 + }, + { + "epoch": 0.35564853556485354, + "grad_norm": 11.726931388893034, + "learning_rate": 4.070934040463998e-07, + "logits/chosen": -2.6450562477111816, + "logits/rejected": -2.5120420455932617, + "logps/chosen": -1.0560662746429443, + "logps/rejected": -1.673842191696167, + "loss": -0.0984, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4195786118507385, + "rewards/margins": 0.17711637914180756, + "rewards/rejected": 0.24246223270893097, + "step": 170 + }, + { + "epoch": 0.3577405857740586, + "grad_norm": 10.741121040812216, + "learning_rate": 4.056683626085422e-07, + "logits/chosen": -2.6740009784698486, + "logits/rejected": -2.5694196224212646, + "logps/chosen": -0.98111891746521, + "logps/rejected": -1.4604544639587402, + "loss": -0.1277, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41521942615509033, + "rewards/margins": 0.03261597827076912, + "rewards/rejected": 0.3826034665107727, + "step": 171 + }, + { + "epoch": 0.3598326359832636, + "grad_norm": 12.230786885035174, + "learning_rate": 4.042350119315141e-07, + "logits/chosen": -2.764572858810425, + "logits/rejected": -2.6556856632232666, + "logps/chosen": -1.0518836975097656, + "logps/rejected": -1.255865454673767, + "loss": -0.0712, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3684941828250885, + "rewards/margins": 0.009007595479488373, + "rewards/rejected": 0.35948657989501953, + "step": 172 + }, + { + "epoch": 0.3619246861924686, + "grad_norm": 14.615276527777311, + "learning_rate": 4.027934285244623e-07, + "logits/chosen": -2.2994682788848877, + "logits/rejected": -2.2448410987854004, + "logps/chosen": -1.4600330591201782, + "logps/rejected": -1.3843920230865479, + "loss": -0.0957, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3354830741882324, + "rewards/margins": -0.007916823029518127, + "rewards/rejected": 0.34339988231658936, + "step": 173 + }, + { + "epoch": 0.36401673640167365, + "grad_norm": 17.754495400703274, + "learning_rate": 4.0134368933597864e-07, + "logits/chosen": -2.5806937217712402, + "logits/rejected": -2.5036139488220215, + "logps/chosen": -1.0033631324768066, + "logps/rejected": -1.66176438331604, + "loss": -0.1324, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43012845516204834, + "rewards/margins": 0.09277307987213135, + "rewards/rejected": 0.337355375289917, + "step": 174 + }, + { + "epoch": 0.36610878661087864, + "grad_norm": 36.05147178915256, + "learning_rate": 3.9988587174999306e-07, + "logits/chosen": -2.775186538696289, + "logits/rejected": -2.6564929485321045, + "logps/chosen": -1.3308205604553223, + "logps/rejected": -1.4604084491729736, + "loss": -0.1109, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3784557282924652, + "rewards/margins": 0.07769946753978729, + "rewards/rejected": 0.3007562458515167, + "step": 175 + }, + { + "epoch": 0.3682008368200837, + "grad_norm": 8.497342760203038, + "learning_rate": 3.9842005358164267e-07, + "logits/chosen": -2.674741268157959, + "logits/rejected": -2.57438063621521, + "logps/chosen": -1.2102327346801758, + "logps/rejected": -1.3143596649169922, + "loss": -0.0969, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36521029472351074, + "rewards/margins": 0.0020803166553378105, + "rewards/rejected": 0.36313000321388245, + "step": 176 + }, + { + "epoch": 0.3702928870292887, + "grad_norm": 10.985890442670515, + "learning_rate": 3.9694631307311825e-07, + "logits/chosen": -2.6628575325012207, + "logits/rejected": -2.4451780319213867, + "logps/chosen": -0.8929375410079956, + "logps/rejected": -1.682444453239441, + "loss": -0.0701, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4489145576953888, + "rewards/margins": 0.19804592430591583, + "rewards/rejected": 0.25086861848831177, + "step": 177 + }, + { + "epoch": 0.3723849372384937, + "grad_norm": 11.399134258390868, + "learning_rate": 3.954647288894882e-07, + "logits/chosen": -2.5748448371887207, + "logits/rejected": -2.4727160930633545, + "logps/chosen": -0.9902825355529785, + "logps/rejected": -1.9029393196105957, + "loss": -0.0998, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40634915232658386, + "rewards/margins": 0.1376405507326126, + "rewards/rejected": 0.26870861649513245, + "step": 178 + }, + { + "epoch": 0.37447698744769875, + "grad_norm": 17.72907495829811, + "learning_rate": 3.9397538011449896e-07, + "logits/chosen": -2.5171046257019043, + "logits/rejected": -2.4481072425842285, + "logps/chosen": -1.2923526763916016, + "logps/rejected": -1.5680699348449707, + "loss": -0.1458, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37985485792160034, + "rewards/margins": 0.10051047801971436, + "rewards/rejected": 0.279344379901886, + "step": 179 + }, + { + "epoch": 0.37656903765690375, + "grad_norm": 13.449254488225826, + "learning_rate": 3.9247834624635404e-07, + "logits/chosen": -2.5202701091766357, + "logits/rejected": -2.5015406608581543, + "logps/chosen": -1.03229820728302, + "logps/rejected": -1.5690972805023193, + "loss": -0.0937, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4546021819114685, + "rewards/margins": 0.09776400029659271, + "rewards/rejected": 0.3568382263183594, + "step": 180 + }, + { + "epoch": 0.3786610878661088, + "grad_norm": 27.750872074205276, + "learning_rate": 3.9097370719347065e-07, + "logits/chosen": -2.4913382530212402, + "logits/rejected": -2.458721160888672, + "logps/chosen": -1.2403842210769653, + "logps/rejected": -1.6784918308258057, + "loss": -0.0936, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4432637095451355, + "rewards/margins": 0.09729886054992676, + "rewards/rejected": 0.34596481919288635, + "step": 181 + }, + { + "epoch": 0.3807531380753138, + "grad_norm": 10.808603632088001, + "learning_rate": 3.894615432702143e-07, + "logits/chosen": -2.595398187637329, + "logits/rejected": -2.599726915359497, + "logps/chosen": -1.179286003112793, + "logps/rejected": -1.8391259908676147, + "loss": -0.1385, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3741753101348877, + "rewards/margins": 0.12403026968240738, + "rewards/rejected": 0.2501450181007385, + "step": 182 + }, + { + "epoch": 0.38284518828451886, + "grad_norm": 30.462769845824713, + "learning_rate": 3.879419351926115e-07, + "logits/chosen": -2.8511743545532227, + "logits/rejected": -2.758679151535034, + "logps/chosen": -1.5136802196502686, + "logps/rejected": -1.9941767454147339, + "loss": -0.1107, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3550741672515869, + "rewards/margins": 0.037071578204631805, + "rewards/rejected": 0.3180025815963745, + "step": 183 + }, + { + "epoch": 0.38493723849372385, + "grad_norm": 20.80095620525014, + "learning_rate": 3.864149640740416e-07, + "logits/chosen": -2.501633882522583, + "logits/rejected": -2.5069472789764404, + "logps/chosen": -0.8953936100006104, + "logps/rejected": -2.3201770782470703, + "loss": -0.1123, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.47941428422927856, + "rewards/margins": 0.2701411843299866, + "rewards/rejected": 0.20927312970161438, + "step": 184 + }, + { + "epoch": 0.38702928870292885, + "grad_norm": 10.92510807498925, + "learning_rate": 3.848807114209074e-07, + "logits/chosen": -2.6651406288146973, + "logits/rejected": -2.560551166534424, + "logps/chosen": -1.2817895412445068, + "logps/rejected": -1.9381226301193237, + "loss": -0.0943, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.32449212670326233, + "rewards/margins": 0.07041622698307037, + "rewards/rejected": 0.25407588481903076, + "step": 185 + }, + { + "epoch": 0.3891213389121339, + "grad_norm": 12.044202139443744, + "learning_rate": 3.833392591282838e-07, + "logits/chosen": -2.7797350883483887, + "logits/rejected": -2.7092323303222656, + "logps/chosen": -1.0567940473556519, + "logps/rejected": -1.476905107498169, + "loss": -0.0953, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39627107977867126, + "rewards/margins": 0.1051664873957634, + "rewards/rejected": 0.29110458493232727, + "step": 186 + }, + { + "epoch": 0.3912133891213389, + "grad_norm": 18.417800983461795, + "learning_rate": 3.8179068947554705e-07, + "logits/chosen": -2.527510643005371, + "logits/rejected": -2.4806833267211914, + "logps/chosen": -1.0980507135391235, + "logps/rejected": -1.4430592060089111, + "loss": -0.1071, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37078559398651123, + "rewards/margins": 0.07708199322223663, + "rewards/rejected": 0.293703556060791, + "step": 187 + }, + { + "epoch": 0.39330543933054396, + "grad_norm": 10.619572229193354, + "learning_rate": 3.8023508512198257e-07, + "logits/chosen": -2.6688337326049805, + "logits/rejected": -2.5510902404785156, + "logps/chosen": -1.0620704889297485, + "logps/rejected": -1.6755778789520264, + "loss": -0.1078, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3925957679748535, + "rewards/margins": 0.10504680871963501, + "rewards/rejected": 0.2875489592552185, + "step": 188 + }, + { + "epoch": 0.39539748953974896, + "grad_norm": 13.725621477241875, + "learning_rate": 3.786725291023728e-07, + "logits/chosen": -2.6156606674194336, + "logits/rejected": -2.729259490966797, + "logps/chosen": -1.2324016094207764, + "logps/rejected": -1.929736852645874, + "loss": -0.114, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36698228120803833, + "rewards/margins": 0.09162534028291702, + "rewards/rejected": 0.2753569483757019, + "step": 189 + }, + { + "epoch": 0.39748953974895396, + "grad_norm": 17.142971438072294, + "learning_rate": 3.7710310482256523e-07, + "logits/chosen": -2.6821255683898926, + "logits/rejected": -2.609424114227295, + "logps/chosen": -1.1436941623687744, + "logps/rejected": -1.7539355754852295, + "loss": -0.1211, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3664313554763794, + "rewards/margins": 0.1225995272397995, + "rewards/rejected": 0.2438318431377411, + "step": 190 + }, + { + "epoch": 0.399581589958159, + "grad_norm": 10.808377047655993, + "learning_rate": 3.7552689605501986e-07, + "logits/chosen": -2.8227596282958984, + "logits/rejected": -2.7516233921051025, + "logps/chosen": -1.1025787591934204, + "logps/rejected": -1.4772980213165283, + "loss": -0.1236, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3938792943954468, + "rewards/margins": 0.12577448785305023, + "rewards/rejected": 0.26810479164123535, + "step": 191 + }, + { + "epoch": 0.401673640167364, + "grad_norm": 9.051612364731888, + "learning_rate": 3.7394398693433794e-07, + "logits/chosen": -2.725497007369995, + "logits/rejected": -2.604818820953369, + "logps/chosen": -1.1002895832061768, + "logps/rejected": -1.7089571952819824, + "loss": -0.1199, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4390842914581299, + "rewards/margins": 0.12097970396280289, + "rewards/rejected": 0.3181045949459076, + "step": 192 + }, + { + "epoch": 0.40376569037656906, + "grad_norm": 7.1679248444670325, + "learning_rate": 3.7235446195277136e-07, + "logits/chosen": -2.573514938354492, + "logits/rejected": -2.376397132873535, + "logps/chosen": -1.1410503387451172, + "logps/rejected": -1.619983196258545, + "loss": -0.1093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36380043625831604, + "rewards/margins": 0.03999757021665573, + "rewards/rejected": 0.3238028883934021, + "step": 193 + }, + { + "epoch": 0.40585774058577406, + "grad_norm": 13.32634634691774, + "learning_rate": 3.7075840595571194e-07, + "logits/chosen": -2.483315944671631, + "logits/rejected": -2.51577091217041, + "logps/chosen": -1.5389375686645508, + "logps/rejected": -1.8842053413391113, + "loss": -0.0454, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.31732869148254395, + "rewards/margins": 0.020423030480742455, + "rewards/rejected": 0.29690563678741455, + "step": 194 + }, + { + "epoch": 0.40794979079497906, + "grad_norm": 6.509324354981061, + "learning_rate": 3.691559041371631e-07, + "logits/chosen": -2.787278652191162, + "logits/rejected": -2.7446131706237793, + "logps/chosen": -1.2347677946090698, + "logps/rejected": -1.3611440658569336, + "loss": -0.0695, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3504650890827179, + "rewards/margins": 0.03077811747789383, + "rewards/rejected": 0.3196869492530823, + "step": 195 + }, + { + "epoch": 0.4100418410041841, + "grad_norm": 9.647934576973586, + "learning_rate": 3.6754704203519204e-07, + "logits/chosen": -2.79685640335083, + "logits/rejected": -2.7126359939575195, + "logps/chosen": -0.9442129731178284, + "logps/rejected": -1.1807160377502441, + "loss": -0.1063, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.435344934463501, + "rewards/margins": 0.0697101503610611, + "rewards/rejected": 0.3656347990036011, + "step": 196 + }, + { + "epoch": 0.4121338912133891, + "grad_norm": 7.90674670924584, + "learning_rate": 3.659319055273644e-07, + "logits/chosen": -2.6790804862976074, + "logits/rejected": -2.638993740081787, + "logps/chosen": -1.2018253803253174, + "logps/rejected": -1.8742297887802124, + "loss": -0.1294, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3638248145580292, + "rewards/margins": 0.11370320618152618, + "rewards/rejected": 0.2501215934753418, + "step": 197 + }, + { + "epoch": 0.41422594142259417, + "grad_norm": 7.339508519985613, + "learning_rate": 3.643105808261596e-07, + "logits/chosen": -2.8512470722198486, + "logits/rejected": -2.8233699798583984, + "logps/chosen": -0.9459013938903809, + "logps/rejected": -1.6173968315124512, + "loss": -0.1092, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43863311409950256, + "rewards/margins": 0.17252716422080994, + "rewards/rejected": 0.2661059498786926, + "step": 198 + }, + { + "epoch": 0.41631799163179917, + "grad_norm": 8.240459127134638, + "learning_rate": 3.626831544743697e-07, + "logits/chosen": -2.6968235969543457, + "logits/rejected": -2.6434929370880127, + "logps/chosen": -0.81470787525177, + "logps/rejected": -2.106025218963623, + "loss": -0.0904, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4797627925872803, + "rewards/margins": 0.17039833962917328, + "rewards/rejected": 0.3093644380569458, + "step": 199 + }, + { + "epoch": 0.41841004184100417, + "grad_norm": 6.44111013570436, + "learning_rate": 3.610497133404795e-07, + "logits/chosen": -2.7517199516296387, + "logits/rejected": -2.6663498878479004, + "logps/chosen": -1.2116317749023438, + "logps/rejected": -1.853126883506775, + "loss": -0.0994, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3775237798690796, + "rewards/margins": 0.12298993766307831, + "rewards/rejected": 0.2545338273048401, + "step": 200 + }, + { + "epoch": 0.4205020920502092, + "grad_norm": 13.368515581855178, + "learning_rate": 3.594103446140297e-07, + "logits/chosen": -2.706216335296631, + "logits/rejected": -2.6910247802734375, + "logps/chosen": -1.0522258281707764, + "logps/rejected": -2.026273727416992, + "loss": -0.1004, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39808568358421326, + "rewards/margins": 0.06647545844316483, + "rewards/rejected": 0.33161020278930664, + "step": 201 + }, + { + "epoch": 0.4225941422594142, + "grad_norm": 10.440505640054965, + "learning_rate": 3.5776513580096313e-07, + "logits/chosen": -2.8171496391296387, + "logits/rejected": -2.7153167724609375, + "logps/chosen": -1.0142191648483276, + "logps/rejected": -1.8539962768554688, + "loss": -0.112, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.419700026512146, + "rewards/margins": 0.18621891736984253, + "rewards/rejected": 0.23348110914230347, + "step": 202 + }, + { + "epoch": 0.4246861924686193, + "grad_norm": 12.957381005981867, + "learning_rate": 3.5611417471895376e-07, + "logits/chosen": -2.5837535858154297, + "logits/rejected": -2.5695419311523438, + "logps/chosen": -1.0546152591705322, + "logps/rejected": -1.3897829055786133, + "loss": -0.1011, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4112699031829834, + "rewards/margins": 0.07434143126010895, + "rewards/rejected": 0.33692848682403564, + "step": 203 + }, + { + "epoch": 0.42677824267782427, + "grad_norm": 11.867645234753555, + "learning_rate": 3.5445754949271924e-07, + "logits/chosen": -2.698544979095459, + "logits/rejected": -2.6663873195648193, + "logps/chosen": -1.3959226608276367, + "logps/rejected": -1.8983134031295776, + "loss": -0.1, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32857778668403625, + "rewards/margins": 0.07399465888738632, + "rewards/rejected": 0.25458312034606934, + "step": 204 + }, + { + "epoch": 0.42887029288702927, + "grad_norm": 11.364438697927202, + "learning_rate": 3.5279534854931674e-07, + "logits/chosen": -2.551753044128418, + "logits/rejected": -2.554995536804199, + "logps/chosen": -0.8723805546760559, + "logps/rejected": -1.1063939332962036, + "loss": -0.1265, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.5010254979133606, + "rewards/margins": 0.12164342403411865, + "rewards/rejected": 0.37938204407691956, + "step": 205 + }, + { + "epoch": 0.4309623430962343, + "grad_norm": 8.03697196685394, + "learning_rate": 3.511276606134234e-07, + "logits/chosen": -2.748340606689453, + "logits/rejected": -2.7050065994262695, + "logps/chosen": -1.1861228942871094, + "logps/rejected": -1.7552168369293213, + "loss": -0.1095, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3975922465324402, + "rewards/margins": 0.14437630772590637, + "rewards/rejected": 0.2532159090042114, + "step": 206 + }, + { + "epoch": 0.4330543933054393, + "grad_norm": 13.082858779626731, + "learning_rate": 3.4945457470259987e-07, + "logits/chosen": -2.761401653289795, + "logits/rejected": -2.730571746826172, + "logps/chosen": -1.2381715774536133, + "logps/rejected": -1.7709894180297852, + "loss": -0.099, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.32908713817596436, + "rewards/margins": -0.023325469344854355, + "rewards/rejected": 0.352412611246109, + "step": 207 + }, + { + "epoch": 0.4351464435146444, + "grad_norm": 10.455595309048718, + "learning_rate": 3.4777618012253895e-07, + "logits/chosen": -2.8608574867248535, + "logits/rejected": -2.6432385444641113, + "logps/chosen": -1.0563664436340332, + "logps/rejected": -1.7969838380813599, + "loss": -0.0882, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4247441291809082, + "rewards/margins": 0.1520257294178009, + "rewards/rejected": 0.2727183699607849, + "step": 208 + }, + { + "epoch": 0.4372384937238494, + "grad_norm": 9.638630516945211, + "learning_rate": 3.4609256646229903e-07, + "logits/chosen": -2.3889663219451904, + "logits/rejected": -2.1771650314331055, + "logps/chosen": -1.2622408866882324, + "logps/rejected": -1.732050895690918, + "loss": -0.095, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4195970594882965, + "rewards/margins": 0.1015428677201271, + "rewards/rejected": 0.31805419921875, + "step": 209 + }, + { + "epoch": 0.4393305439330544, + "grad_norm": 9.12468771204172, + "learning_rate": 3.4440382358952115e-07, + "logits/chosen": -2.6972784996032715, + "logits/rejected": -2.6877102851867676, + "logps/chosen": -1.0190376043319702, + "logps/rejected": -1.5476694107055664, + "loss": -0.0858, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.42147475481033325, + "rewards/margins": 0.12910205125808716, + "rewards/rejected": 0.2923726737499237, + "step": 210 + }, + { + "epoch": 0.44142259414225943, + "grad_norm": 32.3610083786221, + "learning_rate": 3.4271004164563294e-07, + "logits/chosen": -2.7253365516662598, + "logits/rejected": -2.6379623413085938, + "logps/chosen": -1.332204818725586, + "logps/rejected": -1.8204054832458496, + "loss": -0.1333, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.32043054699897766, + "rewards/margins": 0.06172497570514679, + "rewards/rejected": 0.25870558619499207, + "step": 211 + }, + { + "epoch": 0.4435146443514644, + "grad_norm": 10.842115823256194, + "learning_rate": 3.410113110410366e-07, + "logits/chosen": -2.586061477661133, + "logits/rejected": -2.564213514328003, + "logps/chosen": -2.031338691711426, + "logps/rejected": -2.1223268508911133, + "loss": -0.0833, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3001391291618347, + "rewards/margins": 0.08526147902011871, + "rewards/rejected": 0.2148776352405548, + "step": 212 + }, + { + "epoch": 0.4456066945606695, + "grad_norm": 9.514760245882242, + "learning_rate": 3.3930772245028317e-07, + "logits/chosen": -2.7689568996429443, + "logits/rejected": -2.7311832904815674, + "logps/chosen": -1.108777642250061, + "logps/rejected": -1.4788711071014404, + "loss": -0.0744, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36296340823173523, + "rewards/margins": 0.10396925359964371, + "rewards/rejected": 0.2589941620826721, + "step": 213 + }, + { + "epoch": 0.4476987447698745, + "grad_norm": 18.402743823262046, + "learning_rate": 3.3759936680723233e-07, + "logits/chosen": -2.7321577072143555, + "logits/rejected": -2.66558837890625, + "logps/chosen": -1.065527081489563, + "logps/rejected": -2.6747887134552, + "loss": -0.1122, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36968982219696045, + "rewards/margins": 0.13903871178627014, + "rewards/rejected": 0.2306510955095291, + "step": 214 + }, + { + "epoch": 0.4497907949790795, + "grad_norm": 18.17225957514382, + "learning_rate": 3.3588633530019866e-07, + "logits/chosen": -2.7257394790649414, + "logits/rejected": -2.6584632396698, + "logps/chosen": -1.2919788360595703, + "logps/rejected": -2.163170099258423, + "loss": -0.0922, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.324113667011261, + "rewards/margins": 0.10252688080072403, + "rewards/rejected": 0.22158677875995636, + "step": 215 + }, + { + "epoch": 0.45188284518828453, + "grad_norm": 9.915365267290412, + "learning_rate": 3.341687193670843e-07, + "logits/chosen": -2.619964599609375, + "logits/rejected": -2.6678147315979004, + "logps/chosen": -1.0964298248291016, + "logps/rejected": -1.6010725498199463, + "loss": -0.1037, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.363621324300766, + "rewards/margins": 0.09185197204351425, + "rewards/rejected": 0.27176934480667114, + "step": 216 + }, + { + "epoch": 0.45397489539748953, + "grad_norm": 12.895209885390825, + "learning_rate": 3.3244661069049806e-07, + "logits/chosen": -2.659514904022217, + "logits/rejected": -2.7047770023345947, + "logps/chosen": -1.2583937644958496, + "logps/rejected": -1.487951636314392, + "loss": -0.1083, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3770939111709595, + "rewards/margins": 0.06072164326906204, + "rewards/rejected": 0.316372275352478, + "step": 217 + }, + { + "epoch": 0.4560669456066946, + "grad_norm": 10.495110963013996, + "learning_rate": 3.3072010119286155e-07, + "logits/chosen": -2.5267844200134277, + "logits/rejected": -2.553617477416992, + "logps/chosen": -1.634291410446167, + "logps/rejected": -2.40535569190979, + "loss": -0.1146, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3271016478538513, + "rewards/margins": 0.12336497008800507, + "rewards/rejected": 0.20373669266700745, + "step": 218 + }, + { + "epoch": 0.4581589958158996, + "grad_norm": 17.247831788836987, + "learning_rate": 3.289892830315028e-07, + "logits/chosen": -2.6873631477355957, + "logits/rejected": -2.706193447113037, + "logps/chosen": -1.4748320579528809, + "logps/rejected": -1.753096103668213, + "loss": -0.111, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3177873194217682, + "rewards/margins": 0.0883328765630722, + "rewards/rejected": 0.2294544279575348, + "step": 219 + }, + { + "epoch": 0.4602510460251046, + "grad_norm": 11.987244502458385, + "learning_rate": 3.272542485937368e-07, + "logits/chosen": -2.706116199493408, + "logits/rejected": -2.677891254425049, + "logps/chosen": -1.3741241693496704, + "logps/rejected": -2.315911293029785, + "loss": -0.1119, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.2902843952178955, + "rewards/margins": 0.04889319837093353, + "rewards/rejected": 0.24139118194580078, + "step": 220 + }, + { + "epoch": 0.46234309623430964, + "grad_norm": 12.418005612952522, + "learning_rate": 3.2551509049193444e-07, + "logits/chosen": -2.2188100814819336, + "logits/rejected": -2.2306671142578125, + "logps/chosen": -1.245575189590454, + "logps/rejected": -1.4142975807189941, + "loss": -0.0828, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.36663031578063965, + "rewards/margins": 0.045284636318683624, + "rewards/rejected": 0.32134565711021423, + "step": 221 + }, + { + "epoch": 0.46443514644351463, + "grad_norm": 20.459625679835504, + "learning_rate": 3.2377190155857864e-07, + "logits/chosen": -2.6385624408721924, + "logits/rejected": -2.5443832874298096, + "logps/chosen": -1.0716688632965088, + "logps/rejected": -2.228652000427246, + "loss": -0.1143, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3814164102077484, + "rewards/margins": 0.11980819702148438, + "rewards/rejected": 0.26160821318626404, + "step": 222 + }, + { + "epoch": 0.4665271966527197, + "grad_norm": 9.310053766507147, + "learning_rate": 3.220247748413094e-07, + "logits/chosen": -2.6578550338745117, + "logits/rejected": -2.5362112522125244, + "logps/chosen": -1.8237850666046143, + "logps/rejected": -2.437105655670166, + "loss": -0.1295, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.29475274682044983, + "rewards/margins": 0.08544115722179413, + "rewards/rejected": 0.2093116044998169, + "step": 223 + }, + { + "epoch": 0.4686192468619247, + "grad_norm": 26.131595636840544, + "learning_rate": 3.2027380359795706e-07, + "logits/chosen": -2.605294942855835, + "logits/rejected": -2.6494808197021484, + "logps/chosen": -1.6993333101272583, + "logps/rejected": -2.158817768096924, + "loss": -0.1396, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.29801005125045776, + "rewards/margins": 0.04826948419213295, + "rewards/rejected": 0.24974055588245392, + "step": 224 + }, + { + "epoch": 0.4707112970711297, + "grad_norm": 15.164313053378295, + "learning_rate": 3.185190812915646e-07, + "logits/chosen": -2.7706186771392822, + "logits/rejected": -2.7250330448150635, + "logps/chosen": -1.3935312032699585, + "logps/rejected": -1.8978990316390991, + "loss": -0.0839, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37241330742836, + "rewards/margins": 0.08906591683626175, + "rewards/rejected": 0.28334739804267883, + "step": 225 + }, + { + "epoch": 0.47280334728033474, + "grad_norm": 12.335340044509186, + "learning_rate": 3.167607015853983e-07, + "logits/chosen": -2.734358787536621, + "logits/rejected": -2.7213616371154785, + "logps/chosen": -1.0013186931610107, + "logps/rejected": -1.8874804973602295, + "loss": -0.1055, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3800060451030731, + "rewards/margins": 0.12340433895587921, + "rewards/rejected": 0.2566016912460327, + "step": 226 + }, + { + "epoch": 0.47489539748953974, + "grad_norm": 13.155370252466945, + "learning_rate": 3.149987583379485e-07, + "logits/chosen": -2.5443460941314697, + "logits/rejected": -2.6617841720581055, + "logps/chosen": -1.8027722835540771, + "logps/rejected": -1.8264943361282349, + "loss": -0.0883, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.29171156883239746, + "rewards/margins": -0.008143829181790352, + "rewards/rejected": 0.29985541105270386, + "step": 227 + }, + { + "epoch": 0.4769874476987448, + "grad_norm": 9.64227092918857, + "learning_rate": 3.1323334559792015e-07, + "logits/chosen": -2.615062713623047, + "logits/rejected": -2.5575170516967773, + "logps/chosen": -1.3669350147247314, + "logps/rejected": -2.0923495292663574, + "loss": -0.0975, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3245178461074829, + "rewards/margins": 0.047479961067438126, + "rewards/rejected": 0.2770378887653351, + "step": 228 + }, + { + "epoch": 0.4790794979079498, + "grad_norm": 27.480598402768067, + "learning_rate": 3.114645575992116e-07, + "logits/chosen": -2.754039764404297, + "logits/rejected": -2.71734619140625, + "logps/chosen": -1.1088240146636963, + "logps/rejected": -1.2668546438217163, + "loss": -0.1377, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3654661178588867, + "rewards/margins": 0.04204658791422844, + "rewards/rejected": 0.3234195411205292, + "step": 229 + }, + { + "epoch": 0.4811715481171548, + "grad_norm": 29.703059245061358, + "learning_rate": 3.096924887558854e-07, + "logits/chosen": -2.4283628463745117, + "logits/rejected": -2.3639044761657715, + "logps/chosen": -1.721587061882019, + "logps/rejected": -1.948933720588684, + "loss": -0.095, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2999853789806366, + "rewards/margins": 0.036863137036561966, + "rewards/rejected": 0.2631222605705261, + "step": 230 + }, + { + "epoch": 0.48326359832635984, + "grad_norm": 16.287734133486, + "learning_rate": 3.079172336571286e-07, + "logits/chosen": -2.5601136684417725, + "logits/rejected": -2.435715675354004, + "logps/chosen": -1.1398509740829468, + "logps/rejected": -1.23728346824646, + "loss": -0.1092, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.37845513224601746, + "rewards/margins": 0.035441212356090546, + "rewards/rejected": 0.3430139422416687, + "step": 231 + }, + { + "epoch": 0.48535564853556484, + "grad_norm": 21.134919315037024, + "learning_rate": 3.061388870622033e-07, + "logits/chosen": -2.6344518661499023, + "logits/rejected": -2.6067075729370117, + "logps/chosen": -1.2356458902359009, + "logps/rejected": -1.4378910064697266, + "loss": -0.0805, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3725782036781311, + "rewards/margins": 0.08114241808652878, + "rewards/rejected": 0.29143577814102173, + "step": 232 + }, + { + "epoch": 0.4874476987447699, + "grad_norm": 16.14848191061866, + "learning_rate": 3.0435754389538925e-07, + "logits/chosen": -2.6492137908935547, + "logits/rejected": -2.6512625217437744, + "logps/chosen": -0.9026902914047241, + "logps/rejected": -1.7054013013839722, + "loss": -0.1189, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45766186714172363, + "rewards/margins": 0.2092771828174591, + "rewards/rejected": 0.24838465452194214, + "step": 233 + }, + { + "epoch": 0.4895397489539749, + "grad_norm": 13.080452441868529, + "learning_rate": 3.0257329924091654e-07, + "logits/chosen": -2.4482171535491943, + "logits/rejected": -2.4012904167175293, + "logps/chosen": -1.2280887365341187, + "logps/rejected": -2.007750988006592, + "loss": -0.1044, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.34713730216026306, + "rewards/margins": 0.0972638949751854, + "rewards/rejected": 0.24987338483333588, + "step": 234 + }, + { + "epoch": 0.4916317991631799, + "grad_norm": 14.805770182211953, + "learning_rate": 3.007862483378906e-07, + "logits/chosen": -2.4385581016540527, + "logits/rejected": -2.5235042572021484, + "logps/chosen": -0.982105553150177, + "logps/rejected": -2.118596315383911, + "loss": -0.1241, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4126356542110443, + "rewards/margins": 0.11890283226966858, + "rewards/rejected": 0.29373282194137573, + "step": 235 + }, + { + "epoch": 0.49372384937238495, + "grad_norm": 17.000928068072675, + "learning_rate": 2.989964865752079e-07, + "logits/chosen": -2.6185555458068848, + "logits/rejected": -2.569633960723877, + "logps/chosen": -1.3694891929626465, + "logps/rejected": -2.023630380630493, + "loss": -0.1363, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4494704008102417, + "rewards/margins": 0.11591412127017975, + "rewards/rejected": 0.33355626463890076, + "step": 236 + }, + { + "epoch": 0.49581589958158995, + "grad_norm": 11.97132435495436, + "learning_rate": 2.97204109486465e-07, + "logits/chosen": -2.6144487857818604, + "logits/rejected": -2.5463814735412598, + "logps/chosen": -1.415244460105896, + "logps/rejected": -2.0929644107818604, + "loss": -0.0947, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34909066557884216, + "rewards/margins": 0.054297953844070435, + "rewards/rejected": 0.29479271173477173, + "step": 237 + }, + { + "epoch": 0.497907949790795, + "grad_norm": 13.176263587191198, + "learning_rate": 2.954092127448591e-07, + "logits/chosen": -2.405916690826416, + "logits/rejected": -2.5460305213928223, + "logps/chosen": -1.3031535148620605, + "logps/rejected": -1.7486495971679688, + "loss": -0.0982, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3606809377670288, + "rewards/margins": 0.08783643692731857, + "rewards/rejected": 0.27284449338912964, + "step": 238 + }, + { + "epoch": 0.5, + "grad_norm": 10.727719660590079, + "learning_rate": 2.9361189215808057e-07, + "logits/chosen": -2.478989601135254, + "logits/rejected": -2.439676284790039, + "logps/chosen": -1.472078800201416, + "logps/rejected": -1.6831552982330322, + "loss": -0.1018, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35690170526504517, + "rewards/margins": 0.0915931761264801, + "rewards/rejected": 0.26530852913856506, + "step": 239 + }, + { + "epoch": 0.502092050209205, + "grad_norm": 6.3268417752750254, + "learning_rate": 2.9181224366319943e-07, + "logits/chosen": -2.3423075675964355, + "logits/rejected": -2.273186683654785, + "logps/chosen": -0.9384135007858276, + "logps/rejected": -2.9800286293029785, + "loss": -0.1227, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4423084259033203, + "rewards/margins": 0.2813805341720581, + "rewards/rejected": 0.1609278917312622, + "step": 240 + }, + { + "epoch": 0.50418410041841, + "grad_norm": 12.995796697837475, + "learning_rate": 2.900103633215447e-07, + "logits/chosen": -2.729633331298828, + "logits/rejected": -2.6653881072998047, + "logps/chosen": -0.8296223878860474, + "logps/rejected": -1.6522765159606934, + "loss": -0.1, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.48354005813598633, + "rewards/margins": 0.20459306240081787, + "rewards/rejected": 0.27894699573516846, + "step": 241 + }, + { + "epoch": 0.5062761506276151, + "grad_norm": 18.75630509223207, + "learning_rate": 2.882063473135763e-07, + "logits/chosen": -2.5082345008850098, + "logits/rejected": -2.5162205696105957, + "logps/chosen": -1.2960155010223389, + "logps/rejected": -2.2314138412475586, + "loss": -0.1344, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4037836790084839, + "rewards/margins": 0.16876041889190674, + "rewards/rejected": 0.23502326011657715, + "step": 242 + }, + { + "epoch": 0.5083682008368201, + "grad_norm": 8.482528682337579, + "learning_rate": 2.864002919337512e-07, + "logits/chosen": -2.416985511779785, + "logits/rejected": -2.3670668601989746, + "logps/chosen": -1.2545527219772339, + "logps/rejected": -1.3394420146942139, + "loss": -0.1234, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3523995876312256, + "rewards/margins": 0.04634542763233185, + "rewards/rejected": 0.30605414509773254, + "step": 243 + }, + { + "epoch": 0.5104602510460251, + "grad_norm": 10.719105172459228, + "learning_rate": 2.8459229358538404e-07, + "logits/chosen": -2.5898609161376953, + "logits/rejected": -2.541146755218506, + "logps/chosen": -1.346826434135437, + "logps/rejected": -1.5069986581802368, + "loss": -0.1222, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3515220880508423, + "rewards/margins": 0.023752853274345398, + "rewards/rejected": 0.3277692198753357, + "step": 244 + }, + { + "epoch": 0.5125523012552301, + "grad_norm": 11.8510977063605, + "learning_rate": 2.827824487755007e-07, + "logits/chosen": -2.5046603679656982, + "logits/rejected": -2.5287513732910156, + "logps/chosen": -0.9352712035179138, + "logps/rejected": -1.6383126974105835, + "loss": -0.1216, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43307042121887207, + "rewards/margins": 0.14084957540035248, + "rewards/rejected": 0.2922208607196808, + "step": 245 + }, + { + "epoch": 0.5146443514644351, + "grad_norm": 15.503410702429422, + "learning_rate": 2.8097085410968694e-07, + "logits/chosen": -2.634338855743408, + "logits/rejected": -2.51353120803833, + "logps/chosen": -1.4211082458496094, + "logps/rejected": -2.5676441192626953, + "loss": -0.0967, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.36743271350860596, + "rewards/margins": 0.1599361002445221, + "rewards/rejected": 0.20749661326408386, + "step": 246 + }, + { + "epoch": 0.5167364016736402, + "grad_norm": 14.807369583073374, + "learning_rate": 2.7915760628693253e-07, + "logits/chosen": -2.6446118354797363, + "logits/rejected": -2.5478878021240234, + "logps/chosen": -1.0431950092315674, + "logps/rejected": -2.3127846717834473, + "loss": -0.1089, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.4418485760688782, + "rewards/margins": 0.20923131704330444, + "rewards/rejected": 0.23261725902557373, + "step": 247 + }, + { + "epoch": 0.5188284518828452, + "grad_norm": 101.83732528062832, + "learning_rate": 2.7734280209446865e-07, + "logits/chosen": -2.5180978775024414, + "logits/rejected": -2.453972339630127, + "logps/chosen": -0.8324543237686157, + "logps/rejected": -1.6864728927612305, + "loss": -0.1014, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46302682161331177, + "rewards/margins": 0.18120057880878448, + "rewards/rejected": 0.2818262279033661, + "step": 248 + }, + { + "epoch": 0.5209205020920502, + "grad_norm": 9.351081141015694, + "learning_rate": 2.755265384026023e-07, + "logits/chosen": -2.600846290588379, + "logits/rejected": -2.6015748977661133, + "logps/chosen": -1.2091116905212402, + "logps/rejected": -1.1987733840942383, + "loss": -0.0511, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3433293104171753, + "rewards/margins": 0.025443512946367264, + "rewards/rejected": 0.31788578629493713, + "step": 249 + }, + { + "epoch": 0.5230125523012552, + "grad_norm": 6.870687623461676, + "learning_rate": 2.7370891215954565e-07, + "logits/chosen": -2.494401454925537, + "logits/rejected": -2.4487247467041016, + "logps/chosen": -1.596292495727539, + "logps/rejected": -1.5297508239746094, + "loss": -0.1328, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.34035491943359375, + "rewards/margins": 0.039452340453863144, + "rewards/rejected": 0.3009026050567627, + "step": 250 + }, + { + "epoch": 0.5251046025104602, + "grad_norm": 13.661153720373681, + "learning_rate": 2.7189002038624057e-07, + "logits/chosen": -2.5769824981689453, + "logits/rejected": -2.481782913208008, + "logps/chosen": -1.1278202533721924, + "logps/rejected": -1.621917963027954, + "loss": -0.0929, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37641996145248413, + "rewards/margins": 0.02839614823460579, + "rewards/rejected": 0.34802380204200745, + "step": 251 + }, + { + "epoch": 0.5271966527196653, + "grad_norm": 24.04163690716132, + "learning_rate": 2.7006996017118027e-07, + "logits/chosen": -2.618551254272461, + "logits/rejected": -2.5014870166778564, + "logps/chosen": -1.0564649105072021, + "logps/rejected": -1.7426464557647705, + "loss": -0.0936, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3888087868690491, + "rewards/margins": 0.12157319486141205, + "rewards/rejected": 0.2672356367111206, + "step": 252 + }, + { + "epoch": 0.5292887029288703, + "grad_norm": 14.053938365943903, + "learning_rate": 2.682488286652269e-07, + "logits/chosen": -2.354100227355957, + "logits/rejected": -2.320981502532959, + "logps/chosen": -1.0166618824005127, + "logps/rejected": -2.0148091316223145, + "loss": -0.1304, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4282534122467041, + "rewards/margins": 0.19922780990600586, + "rewards/rejected": 0.22902560234069824, + "step": 253 + }, + { + "epoch": 0.5313807531380753, + "grad_norm": 9.783298850662142, + "learning_rate": 2.6642672307642573e-07, + "logits/chosen": -2.5941338539123535, + "logits/rejected": -2.4625022411346436, + "logps/chosen": -1.620993971824646, + "logps/rejected": -1.5014917850494385, + "loss": -0.1108, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.350075900554657, + "rewards/margins": 0.060651615262031555, + "rewards/rejected": 0.2894243001937866, + "step": 254 + }, + { + "epoch": 0.5334728033472803, + "grad_norm": 43.27055232189887, + "learning_rate": 2.646037406648165e-07, + "logits/chosen": -2.4940779209136963, + "logits/rejected": -2.4904818534851074, + "logps/chosen": -1.324521541595459, + "logps/rejected": -1.9451937675476074, + "loss": -0.0874, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32973212003707886, + "rewards/margins": 0.11690256744623184, + "rewards/rejected": 0.2128295600414276, + "step": 255 + }, + { + "epoch": 0.5355648535564853, + "grad_norm": 32.69660335713406, + "learning_rate": 2.6277997873724176e-07, + "logits/chosen": -2.576993465423584, + "logits/rejected": -2.549190044403076, + "logps/chosen": -1.1641985177993774, + "logps/rejected": -1.3727514743804932, + "loss": -0.1371, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3553909659385681, + "rewards/margins": 0.04894494265317917, + "rewards/rejected": 0.30644604563713074, + "step": 256 + }, + { + "epoch": 0.5376569037656904, + "grad_norm": 17.152292597708968, + "learning_rate": 2.609555346421532e-07, + "logits/chosen": -2.582218647003174, + "logits/rejected": -2.4460763931274414, + "logps/chosen": -0.8872560262680054, + "logps/rejected": -2.193643093109131, + "loss": -0.1292, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4860767722129822, + "rewards/margins": 0.2561631202697754, + "rewards/rejected": 0.2299136519432068, + "step": 257 + }, + { + "epoch": 0.5397489539748954, + "grad_norm": 20.420927555489797, + "learning_rate": 2.5913050576441473e-07, + "logits/chosen": -2.537754535675049, + "logits/rejected": -2.533719062805176, + "logps/chosen": -1.4319506883621216, + "logps/rejected": -1.8760106563568115, + "loss": -0.1314, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.41588470339775085, + "rewards/margins": 0.10261406004428864, + "rewards/rejected": 0.313270628452301, + "step": 258 + }, + { + "epoch": 0.5418410041841004, + "grad_norm": 24.83960755558801, + "learning_rate": 2.5730498952010496e-07, + "logits/chosen": -2.3227603435516357, + "logits/rejected": -2.322937488555908, + "logps/chosen": -1.1083595752716064, + "logps/rejected": -1.3232736587524414, + "loss": -0.1028, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.375250905752182, + "rewards/margins": 0.0630226731300354, + "rewards/rejected": 0.3122282326221466, + "step": 259 + }, + { + "epoch": 0.5439330543933054, + "grad_norm": 14.323617407100764, + "learning_rate": 2.55479083351317e-07, + "logits/chosen": -2.5259079933166504, + "logits/rejected": -2.4510679244995117, + "logps/chosen": -0.9364595413208008, + "logps/rejected": -2.3588366508483887, + "loss": -0.0809, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4500068426132202, + "rewards/margins": 0.18414944410324097, + "rewards/rejected": 0.26585739850997925, + "step": 260 + }, + { + "epoch": 0.5460251046025104, + "grad_norm": 17.10022407964933, + "learning_rate": 2.536528847209573e-07, + "logits/chosen": -2.4352588653564453, + "logits/rejected": -2.2389445304870605, + "logps/chosen": -1.1492376327514648, + "logps/rejected": -1.8517694473266602, + "loss": -0.124, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4028159976005554, + "rewards/margins": 0.12413935363292694, + "rewards/rejected": 0.2786766290664673, + "step": 261 + }, + { + "epoch": 0.5481171548117155, + "grad_norm": 10.190814456155364, + "learning_rate": 2.5182649110754325e-07, + "logits/chosen": -2.412586212158203, + "logits/rejected": -2.4488370418548584, + "logps/chosen": -1.2817234992980957, + "logps/rejected": -1.8377283811569214, + "loss": -0.1022, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38602763414382935, + "rewards/margins": 0.09022464603185654, + "rewards/rejected": 0.2958029508590698, + "step": 262 + }, + { + "epoch": 0.5502092050209205, + "grad_norm": 8.793628588219915, + "learning_rate": 2.5e-07, + "logits/chosen": -2.462949275970459, + "logits/rejected": -2.5068588256835938, + "logps/chosen": -1.3072988986968994, + "logps/rejected": -1.7679378986358643, + "loss": -0.0738, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3691004514694214, + "rewards/margins": 0.07263248413801193, + "rewards/rejected": 0.29646795988082886, + "step": 263 + }, + { + "epoch": 0.5523012552301255, + "grad_norm": 15.380045241350123, + "learning_rate": 2.4817350889245673e-07, + "logits/chosen": -2.341583013534546, + "logits/rejected": -2.3073818683624268, + "logps/chosen": -1.3789433240890503, + "logps/rejected": -2.094804048538208, + "loss": -0.1176, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3920101821422577, + "rewards/margins": 0.10554394125938416, + "rewards/rejected": 0.28646624088287354, + "step": 264 + }, + { + "epoch": 0.5543933054393305, + "grad_norm": 8.58809117665816, + "learning_rate": 2.463471152790427e-07, + "logits/chosen": -2.4648337364196777, + "logits/rejected": -2.2489168643951416, + "logps/chosen": -1.2276692390441895, + "logps/rejected": -2.249476909637451, + "loss": -0.1457, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.48874348402023315, + "rewards/margins": 0.22283288836479187, + "rewards/rejected": 0.2659105658531189, + "step": 265 + }, + { + "epoch": 0.5564853556485355, + "grad_norm": 11.286756360470665, + "learning_rate": 2.44520916648683e-07, + "logits/chosen": -2.3162903785705566, + "logits/rejected": -2.3765616416931152, + "logps/chosen": -1.3577649593353271, + "logps/rejected": -2.117098808288574, + "loss": -0.1085, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3324105441570282, + "rewards/margins": 0.09969653189182281, + "rewards/rejected": 0.23271401226520538, + "step": 266 + }, + { + "epoch": 0.5585774058577406, + "grad_norm": 15.174465033011408, + "learning_rate": 2.426950104798951e-07, + "logits/chosen": -2.4470949172973633, + "logits/rejected": -2.42891788482666, + "logps/chosen": -1.2459077835083008, + "logps/rejected": -1.9684381484985352, + "loss": -0.11, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.40111368894577026, + "rewards/margins": 0.14265714585781097, + "rewards/rejected": 0.2584565281867981, + "step": 267 + }, + { + "epoch": 0.5606694560669456, + "grad_norm": 14.450224054308865, + "learning_rate": 2.4086949423558525e-07, + "logits/chosen": -2.407813549041748, + "logits/rejected": -2.3005266189575195, + "logps/chosen": -1.276888370513916, + "logps/rejected": -2.612614631652832, + "loss": -0.112, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39179039001464844, + "rewards/margins": 0.14099331200122833, + "rewards/rejected": 0.2507970631122589, + "step": 268 + }, + { + "epoch": 0.5627615062761506, + "grad_norm": 17.153862681122654, + "learning_rate": 2.3904446535784686e-07, + "logits/chosen": -2.3327040672302246, + "logits/rejected": -2.223817825317383, + "logps/chosen": -1.4948809146881104, + "logps/rejected": -2.7149105072021484, + "loss": -0.1135, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3385244309902191, + "rewards/margins": 0.10032963752746582, + "rewards/rejected": 0.2381947934627533, + "step": 269 + }, + { + "epoch": 0.5648535564853556, + "grad_norm": 16.515843039005382, + "learning_rate": 2.3722002126275822e-07, + "logits/chosen": -2.504027843475342, + "logits/rejected": -2.405820369720459, + "logps/chosen": -0.9141228199005127, + "logps/rejected": -1.5929572582244873, + "loss": -0.1279, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42940253019332886, + "rewards/margins": 0.11588534712791443, + "rewards/rejected": 0.3135172128677368, + "step": 270 + }, + { + "epoch": 0.5669456066945606, + "grad_norm": 10.230006000297033, + "learning_rate": 2.353962593351835e-07, + "logits/chosen": -2.282273292541504, + "logits/rejected": -2.239558696746826, + "logps/chosen": -1.2985116243362427, + "logps/rejected": -2.147219181060791, + "loss": -0.1401, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.33533766865730286, + "rewards/margins": 0.10907959938049316, + "rewards/rejected": 0.2262580394744873, + "step": 271 + }, + { + "epoch": 0.5690376569037657, + "grad_norm": 29.235986727351534, + "learning_rate": 2.3357327692357428e-07, + "logits/chosen": -2.3464512825012207, + "logits/rejected": -2.2939443588256836, + "logps/chosen": -0.868351936340332, + "logps/rejected": -2.0506629943847656, + "loss": -0.1217, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46230238676071167, + "rewards/margins": 0.1395689845085144, + "rewards/rejected": 0.32273340225219727, + "step": 272 + }, + { + "epoch": 0.5711297071129707, + "grad_norm": 13.500310258479074, + "learning_rate": 2.317511713347731e-07, + "logits/chosen": -2.6007192134857178, + "logits/rejected": -2.519045352935791, + "logps/chosen": -0.9955112934112549, + "logps/rejected": -2.4560132026672363, + "loss": -0.1451, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4444941282272339, + "rewards/margins": 0.19739052653312683, + "rewards/rejected": 0.24710360169410706, + "step": 273 + }, + { + "epoch": 0.5732217573221757, + "grad_norm": 16.783212103053867, + "learning_rate": 2.2993003982881973e-07, + "logits/chosen": -2.2793025970458984, + "logits/rejected": -2.195977210998535, + "logps/chosen": -1.0133543014526367, + "logps/rejected": -2.022432804107666, + "loss": -0.1397, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.38419491052627563, + "rewards/margins": 0.14749623835086823, + "rewards/rejected": 0.2366986870765686, + "step": 274 + }, + { + "epoch": 0.5753138075313807, + "grad_norm": 10.232857115492903, + "learning_rate": 2.2810997961375938e-07, + "logits/chosen": -2.620478630065918, + "logits/rejected": -2.368046283721924, + "logps/chosen": -0.8385424613952637, + "logps/rejected": -1.756354570388794, + "loss": -0.1605, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4779413044452667, + "rewards/margins": 0.16384732723236084, + "rewards/rejected": 0.31409400701522827, + "step": 275 + }, + { + "epoch": 0.5774058577405857, + "grad_norm": 9.441049880789874, + "learning_rate": 2.2629108784045436e-07, + "logits/chosen": -2.2932982444763184, + "logits/rejected": -2.1024169921875, + "logps/chosen": -1.0233564376831055, + "logps/rejected": -1.8084924221038818, + "loss": -0.1577, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.39267247915267944, + "rewards/margins": 0.17666901648044586, + "rewards/rejected": 0.21600347757339478, + "step": 276 + }, + { + "epoch": 0.5794979079497908, + "grad_norm": 11.23380249432245, + "learning_rate": 2.2447346159739768e-07, + "logits/chosen": -2.1626408100128174, + "logits/rejected": -2.164907455444336, + "logps/chosen": -1.130321979522705, + "logps/rejected": -1.553159475326538, + "loss": -0.0833, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3877987563610077, + "rewards/margins": 0.061269715428352356, + "rewards/rejected": 0.32652902603149414, + "step": 277 + }, + { + "epoch": 0.5815899581589958, + "grad_norm": 15.096559124916823, + "learning_rate": 2.2265719790553146e-07, + "logits/chosen": -2.020289897918701, + "logits/rejected": -2.020287036895752, + "logps/chosen": -1.4989287853240967, + "logps/rejected": -1.5906364917755127, + "loss": -0.1001, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3539296090602875, + "rewards/margins": 0.014707334339618683, + "rewards/rejected": 0.3392222821712494, + "step": 278 + }, + { + "epoch": 0.5836820083682008, + "grad_norm": 10.528790195156935, + "learning_rate": 2.2084239371306752e-07, + "logits/chosen": -2.0334033966064453, + "logits/rejected": -1.881544589996338, + "logps/chosen": -0.9108498096466064, + "logps/rejected": -1.809361457824707, + "loss": -0.0869, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4324657917022705, + "rewards/margins": 0.1985461413860321, + "rewards/rejected": 0.23391962051391602, + "step": 279 + }, + { + "epoch": 0.5857740585774058, + "grad_norm": 10.4888057073251, + "learning_rate": 2.19029145890313e-07, + "logits/chosen": -2.1263463497161865, + "logits/rejected": -1.9130442142486572, + "logps/chosen": -0.9421380758285522, + "logps/rejected": -2.6061289310455322, + "loss": -0.136, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4234393835067749, + "rewards/margins": 0.15696920454502106, + "rewards/rejected": 0.26647016406059265, + "step": 280 + }, + { + "epoch": 0.5878661087866108, + "grad_norm": 10.261955546016459, + "learning_rate": 2.172175512244993e-07, + "logits/chosen": -1.8044922351837158, + "logits/rejected": -1.7952802181243896, + "logps/chosen": -1.216686725616455, + "logps/rejected": -1.8076926469802856, + "loss": -0.1096, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35631251335144043, + "rewards/margins": 0.12221503257751465, + "rewards/rejected": 0.2340974658727646, + "step": 281 + }, + { + "epoch": 0.5899581589958159, + "grad_norm": 9.452698854031054, + "learning_rate": 2.154077064146159e-07, + "logits/chosen": -2.2612810134887695, + "logits/rejected": -2.135688543319702, + "logps/chosen": -1.256422519683838, + "logps/rejected": -2.5949811935424805, + "loss": -0.1141, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3647221326828003, + "rewards/margins": 0.1851029098033905, + "rewards/rejected": 0.17961925268173218, + "step": 282 + }, + { + "epoch": 0.5920502092050209, + "grad_norm": 12.99298258428552, + "learning_rate": 2.1359970806624884e-07, + "logits/chosen": -2.015012741088867, + "logits/rejected": -1.8471283912658691, + "logps/chosen": -1.1125614643096924, + "logps/rejected": -3.3590145111083984, + "loss": -0.1401, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39575979113578796, + "rewards/margins": 0.21206815540790558, + "rewards/rejected": 0.1836916208267212, + "step": 283 + }, + { + "epoch": 0.5941422594142259, + "grad_norm": 11.402998787496921, + "learning_rate": 2.1179365268642375e-07, + "logits/chosen": -1.88407301902771, + "logits/rejected": -1.6973862648010254, + "logps/chosen": -1.3595623970031738, + "logps/rejected": -1.9901232719421387, + "loss": -0.1053, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3310593366622925, + "rewards/margins": 0.025551561266183853, + "rewards/rejected": 0.3055077791213989, + "step": 284 + }, + { + "epoch": 0.5962343096234309, + "grad_norm": 12.544217446992135, + "learning_rate": 2.0998963667845536e-07, + "logits/chosen": -2.0112407207489014, + "logits/rejected": -1.8508641719818115, + "logps/chosen": -1.3053799867630005, + "logps/rejected": -1.789559006690979, + "loss": -0.0668, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.36560434103012085, + "rewards/margins": 0.05719471722841263, + "rewards/rejected": 0.3084096312522888, + "step": 285 + }, + { + "epoch": 0.5983263598326359, + "grad_norm": 9.227582418093855, + "learning_rate": 2.0818775633680055e-07, + "logits/chosen": -2.041214942932129, + "logits/rejected": -1.606358528137207, + "logps/chosen": -1.0451130867004395, + "logps/rejected": -3.085855484008789, + "loss": -0.1153, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41002923250198364, + "rewards/margins": 0.2596610188484192, + "rewards/rejected": 0.15036822855472565, + "step": 286 + }, + { + "epoch": 0.600418410041841, + "grad_norm": 10.879171338781033, + "learning_rate": 2.0638810784191946e-07, + "logits/chosen": -1.8780803680419922, + "logits/rejected": -1.659158706665039, + "logps/chosen": -1.0779826641082764, + "logps/rejected": -1.470560073852539, + "loss": -0.1425, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4114425778388977, + "rewards/margins": 0.05348636955022812, + "rewards/rejected": 0.357956200838089, + "step": 287 + }, + { + "epoch": 0.602510460251046, + "grad_norm": 24.091905733565373, + "learning_rate": 2.0459078725514089e-07, + "logits/chosen": -1.4315662384033203, + "logits/rejected": -1.4450922012329102, + "logps/chosen": -1.0725127458572388, + "logps/rejected": -1.8540666103363037, + "loss": -0.105, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.4384384751319885, + "rewards/margins": 0.028723157942295074, + "rewards/rejected": 0.40971535444259644, + "step": 288 + }, + { + "epoch": 0.604602510460251, + "grad_norm": 7.747661853144717, + "learning_rate": 2.027958905135349e-07, + "logits/chosen": -1.7538211345672607, + "logits/rejected": -1.6950899362564087, + "logps/chosen": -1.7596023082733154, + "logps/rejected": -2.0811004638671875, + "loss": -0.0933, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3173483610153198, + "rewards/margins": 0.06009531766176224, + "rewards/rejected": 0.2572530210018158, + "step": 289 + }, + { + "epoch": 0.606694560669456, + "grad_norm": 12.087581194789344, + "learning_rate": 2.0100351342479216e-07, + "logits/chosen": -1.7584887742996216, + "logits/rejected": -1.574345588684082, + "logps/chosen": -0.9665372967720032, + "logps/rejected": -2.1095662117004395, + "loss": -0.1048, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4177166223526001, + "rewards/margins": 0.2137906551361084, + "rewards/rejected": 0.2039259523153305, + "step": 290 + }, + { + "epoch": 0.608786610878661, + "grad_norm": 8.499022932555569, + "learning_rate": 1.9921375166210945e-07, + "logits/chosen": -1.7692201137542725, + "logits/rejected": -1.7351100444793701, + "logps/chosen": -1.3378679752349854, + "logps/rejected": -2.4489097595214844, + "loss": -0.165, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.32389402389526367, + "rewards/margins": 0.0668807253241539, + "rewards/rejected": 0.2570132911205292, + "step": 291 + }, + { + "epoch": 0.6108786610878661, + "grad_norm": 9.761211893478931, + "learning_rate": 1.9742670075908349e-07, + "logits/chosen": -1.6519404649734497, + "logits/rejected": -1.5875215530395508, + "logps/chosen": -1.360038161277771, + "logps/rejected": -1.602811336517334, + "loss": -0.099, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37947362661361694, + "rewards/margins": 0.0840110331773758, + "rewards/rejected": 0.29546260833740234, + "step": 292 + }, + { + "epoch": 0.6129707112970711, + "grad_norm": 13.035324813489078, + "learning_rate": 1.9564245610461078e-07, + "logits/chosen": -1.7272558212280273, + "logits/rejected": -1.4798274040222168, + "logps/chosen": -1.3200814723968506, + "logps/rejected": -1.9456639289855957, + "loss": -0.102, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.31104105710983276, + "rewards/margins": 0.06010560691356659, + "rewards/rejected": 0.250935435295105, + "step": 293 + }, + { + "epoch": 0.6150627615062761, + "grad_norm": 12.373073851835231, + "learning_rate": 1.938611129377967e-07, + "logits/chosen": -1.4790561199188232, + "logits/rejected": -1.4483731985092163, + "logps/chosen": -1.1588627099990845, + "logps/rejected": -1.8893986940383911, + "loss": -0.1182, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40000268816947937, + "rewards/margins": 0.1388007402420044, + "rewards/rejected": 0.261201947927475, + "step": 294 + }, + { + "epoch": 0.6171548117154811, + "grad_norm": 13.570956779826611, + "learning_rate": 1.920827663428714e-07, + "logits/chosen": -1.3182231187820435, + "logits/rejected": -1.210131287574768, + "logps/chosen": -1.8925442695617676, + "logps/rejected": -2.5153698921203613, + "loss": -0.1257, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22584283351898193, + "rewards/margins": 0.044571515172719955, + "rewards/rejected": 0.18127131462097168, + "step": 295 + }, + { + "epoch": 0.6192468619246861, + "grad_norm": 12.244728666938812, + "learning_rate": 1.9030751124411448e-07, + "logits/chosen": -1.202715277671814, + "logits/rejected": -1.2500357627868652, + "logps/chosen": -1.0252957344055176, + "logps/rejected": -2.197416305541992, + "loss": -0.1301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4244975447654724, + "rewards/margins": 0.19387859106063843, + "rewards/rejected": 0.2306189239025116, + "step": 296 + }, + { + "epoch": 0.6213389121338913, + "grad_norm": 11.318104404213964, + "learning_rate": 1.8853544240078836e-07, + "logits/chosen": -1.6210575103759766, + "logits/rejected": -1.6088294982910156, + "logps/chosen": -0.868645966053009, + "logps/rejected": -2.105026960372925, + "loss": -0.0987, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.461618572473526, + "rewards/margins": 0.15263152122497559, + "rewards/rejected": 0.3089870810508728, + "step": 297 + }, + { + "epoch": 0.6234309623430963, + "grad_norm": 13.963854313724484, + "learning_rate": 1.8676665440207977e-07, + "logits/chosen": -1.3747458457946777, + "logits/rejected": -1.2494533061981201, + "logps/chosen": -1.5411226749420166, + "logps/rejected": -2.833916664123535, + "loss": -0.1238, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.30651533603668213, + "rewards/margins": 0.11065477132797241, + "rewards/rejected": 0.1958606094121933, + "step": 298 + }, + { + "epoch": 0.6255230125523012, + "grad_norm": 12.924584175076303, + "learning_rate": 1.850012416620515e-07, + "logits/chosen": -1.06024169921875, + "logits/rejected": -1.0010329484939575, + "logps/chosen": -1.9282222986221313, + "logps/rejected": -2.097105026245117, + "loss": -0.1203, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2811262607574463, + "rewards/margins": 0.0608399361371994, + "rewards/rejected": 0.2202863246202469, + "step": 299 + }, + { + "epoch": 0.6276150627615062, + "grad_norm": 14.77377885396883, + "learning_rate": 1.8323929841460178e-07, + "logits/chosen": -1.4455621242523193, + "logits/rejected": -1.5248115062713623, + "logps/chosen": -1.1223115921020508, + "logps/rejected": -1.5705149173736572, + "loss": -0.1138, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.37588104605674744, + "rewards/margins": 0.06724551320075989, + "rewards/rejected": 0.30863553285598755, + "step": 300 + }, + { + "epoch": 0.6297071129707112, + "grad_norm": 12.758467910292707, + "learning_rate": 1.8148091870843552e-07, + "logits/chosen": -1.3640754222869873, + "logits/rejected": -1.0591566562652588, + "logps/chosen": -1.2672115564346313, + "logps/rejected": -2.286756992340088, + "loss": -0.1324, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44048044085502625, + "rewards/margins": 0.191227987408638, + "rewards/rejected": 0.24925243854522705, + "step": 301 + }, + { + "epoch": 0.6317991631799164, + "grad_norm": 12.745982416001658, + "learning_rate": 1.7972619640204294e-07, + "logits/chosen": -1.4261343479156494, + "logits/rejected": -1.451894760131836, + "logps/chosen": -1.2100127935409546, + "logps/rejected": -1.9714622497558594, + "loss": -0.1214, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40259698033332825, + "rewards/margins": 0.1440141350030899, + "rewards/rejected": 0.25858286023139954, + "step": 302 + }, + { + "epoch": 0.6338912133891214, + "grad_norm": 42.33215506608496, + "learning_rate": 1.779752251586906e-07, + "logits/chosen": -1.2698376178741455, + "logits/rejected": -1.4541172981262207, + "logps/chosen": -1.0781916379928589, + "logps/rejected": -2.400745153427124, + "loss": -0.1352, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.40851694345474243, + "rewards/margins": 0.07796123623847961, + "rewards/rejected": 0.3305557072162628, + "step": 303 + }, + { + "epoch": 0.6359832635983264, + "grad_norm": 11.646923762516986, + "learning_rate": 1.7622809844142137e-07, + "logits/chosen": -1.804600477218628, + "logits/rejected": -1.4111582040786743, + "logps/chosen": -1.0599024295806885, + "logps/rejected": -1.7829433679580688, + "loss": -0.141, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.43998903036117554, + "rewards/margins": 0.12246303260326385, + "rewards/rejected": 0.31752604246139526, + "step": 304 + }, + { + "epoch": 0.6380753138075314, + "grad_norm": 17.322291060310846, + "learning_rate": 1.7448490950806548e-07, + "logits/chosen": -0.9987385272979736, + "logits/rejected": -1.0168724060058594, + "logps/chosen": -1.1187671422958374, + "logps/rejected": -1.6882683038711548, + "loss": -0.1489, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4853442311286926, + "rewards/margins": 0.17401915788650513, + "rewards/rejected": 0.3113250732421875, + "step": 305 + }, + { + "epoch": 0.6401673640167364, + "grad_norm": 48.50507886583888, + "learning_rate": 1.7274575140626315e-07, + "logits/chosen": -1.501349925994873, + "logits/rejected": -1.0330767631530762, + "logps/chosen": -0.9468430280685425, + "logps/rejected": -2.1863956451416016, + "loss": -0.1216, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4197174608707428, + "rewards/margins": 0.1767096370458603, + "rewards/rejected": 0.2430078387260437, + "step": 306 + }, + { + "epoch": 0.6422594142259415, + "grad_norm": 33.49834381108919, + "learning_rate": 1.7101071696849718e-07, + "logits/chosen": -0.9369056224822998, + "logits/rejected": -0.749775767326355, + "logps/chosen": -1.204647421836853, + "logps/rejected": -2.528010606765747, + "loss": -0.1797, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.41651594638824463, + "rewards/margins": 0.20134611427783966, + "rewards/rejected": 0.21516986191272736, + "step": 307 + }, + { + "epoch": 0.6443514644351465, + "grad_norm": 9.165843117450295, + "learning_rate": 1.692798988071385e-07, + "logits/chosen": -1.354283094406128, + "logits/rejected": -1.3673646450042725, + "logps/chosen": -1.1920933723449707, + "logps/rejected": -3.002389907836914, + "loss": -0.0909, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3539876341819763, + "rewards/margins": 0.09995920956134796, + "rewards/rejected": 0.25402843952178955, + "step": 308 + }, + { + "epoch": 0.6464435146443515, + "grad_norm": 39.53221497092988, + "learning_rate": 1.6755338930950192e-07, + "logits/chosen": -1.3984010219573975, + "logits/rejected": -1.153122067451477, + "logps/chosen": -1.0089532136917114, + "logps/rejected": -2.453733444213867, + "loss": -0.1695, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4267140030860901, + "rewards/margins": 0.13178855180740356, + "rewards/rejected": 0.2949254512786865, + "step": 309 + }, + { + "epoch": 0.6485355648535565, + "grad_norm": 22.69036225057899, + "learning_rate": 1.6583128063291573e-07, + "logits/chosen": -1.415753960609436, + "logits/rejected": -1.2994680404663086, + "logps/chosen": -0.9845906496047974, + "logps/rejected": -1.4354872703552246, + "loss": -0.1213, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39985141158103943, + "rewards/margins": 0.08858242630958557, + "rewards/rejected": 0.31126895546913147, + "step": 310 + }, + { + "epoch": 0.6506276150627615, + "grad_norm": 11.735736386990032, + "learning_rate": 1.6411366469980134e-07, + "logits/chosen": -0.8768999576568604, + "logits/rejected": -0.7540444135665894, + "logps/chosen": -1.6087219715118408, + "logps/rejected": -2.7079243659973145, + "loss": -0.1349, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3981132507324219, + "rewards/margins": 0.15352198481559753, + "rewards/rejected": 0.24459126591682434, + "step": 311 + }, + { + "epoch": 0.6527196652719666, + "grad_norm": 36.93233096234923, + "learning_rate": 1.6240063319276764e-07, + "logits/chosen": -1.090734601020813, + "logits/rejected": -0.999484658241272, + "logps/chosen": -1.0787606239318848, + "logps/rejected": -2.9695515632629395, + "loss": -0.1377, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4704767167568207, + "rewards/margins": 0.2525039315223694, + "rewards/rejected": 0.21797281503677368, + "step": 312 + }, + { + "epoch": 0.6548117154811716, + "grad_norm": 12.512648020048866, + "learning_rate": 1.606922775497168e-07, + "logits/chosen": -1.3213591575622559, + "logits/rejected": -0.8296306133270264, + "logps/chosen": -1.1784396171569824, + "logps/rejected": -2.6272573471069336, + "loss": -0.134, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33794206380844116, + "rewards/margins": 0.11368609964847565, + "rewards/rejected": 0.22425594925880432, + "step": 313 + }, + { + "epoch": 0.6569037656903766, + "grad_norm": 33.98215330381615, + "learning_rate": 1.5898868895896332e-07, + "logits/chosen": -1.473046898841858, + "logits/rejected": -0.959273099899292, + "logps/chosen": -1.1853554248809814, + "logps/rejected": -2.4030003547668457, + "loss": -0.1059, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4078238606452942, + "rewards/margins": 0.12516064941883087, + "rewards/rejected": 0.2826631963253021, + "step": 314 + }, + { + "epoch": 0.6589958158995816, + "grad_norm": 11.777819227549552, + "learning_rate": 1.572899583543671e-07, + "logits/chosen": -1.2090966701507568, + "logits/rejected": -0.9002936482429504, + "logps/chosen": -1.2504746913909912, + "logps/rejected": -3.4912562370300293, + "loss": -0.1467, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3819003105163574, + "rewards/margins": 0.18813729286193848, + "rewards/rejected": 0.19376301765441895, + "step": 315 + }, + { + "epoch": 0.6610878661087866, + "grad_norm": 13.64501155485151, + "learning_rate": 1.5559617641047885e-07, + "logits/chosen": -1.0406343936920166, + "logits/rejected": -0.9147306680679321, + "logps/chosen": -1.650768518447876, + "logps/rejected": -2.1240224838256836, + "loss": -0.129, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2685588300228119, + "rewards/margins": 0.01526784710586071, + "rewards/rejected": 0.25329098105430603, + "step": 316 + }, + { + "epoch": 0.6631799163179917, + "grad_norm": 17.681504166772957, + "learning_rate": 1.5390743353770108e-07, + "logits/chosen": -0.7469021081924438, + "logits/rejected": -0.7831078767776489, + "logps/chosen": -1.285473346710205, + "logps/rejected": -1.6377302408218384, + "loss": -0.0996, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4055963158607483, + "rewards/margins": 0.03280503302812576, + "rewards/rejected": 0.3727912902832031, + "step": 317 + }, + { + "epoch": 0.6652719665271967, + "grad_norm": 23.101940308036763, + "learning_rate": 1.5222381987746102e-07, + "logits/chosen": -0.8882025480270386, + "logits/rejected": -0.9792768955230713, + "logps/chosen": -1.6120126247406006, + "logps/rejected": -2.5856385231018066, + "loss": -0.1461, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3878062963485718, + "rewards/margins": 0.1964162290096283, + "rewards/rejected": 0.1913900524377823, + "step": 318 + }, + { + "epoch": 0.6673640167364017, + "grad_norm": 32.62994604510173, + "learning_rate": 1.5054542529740008e-07, + "logits/chosen": -1.0213111639022827, + "logits/rejected": -0.7104781866073608, + "logps/chosen": -1.222242832183838, + "logps/rejected": -3.8115944862365723, + "loss": -0.1556, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3644482493400574, + "rewards/margins": 0.15496206283569336, + "rewards/rejected": 0.209486186504364, + "step": 319 + }, + { + "epoch": 0.6694560669456067, + "grad_norm": 13.051692014015307, + "learning_rate": 1.488723393865766e-07, + "logits/chosen": -1.4067423343658447, + "logits/rejected": -1.2527976036071777, + "logps/chosen": -1.1540534496307373, + "logps/rejected": -2.087472438812256, + "loss": -0.1364, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5028185844421387, + "rewards/margins": 0.1457022875547409, + "rewards/rejected": 0.3571162819862366, + "step": 320 + }, + { + "epoch": 0.6715481171548117, + "grad_norm": 24.77692590150978, + "learning_rate": 1.472046514506832e-07, + "logits/chosen": -1.0069209337234497, + "logits/rejected": -0.8981058597564697, + "logps/chosen": -1.629122018814087, + "logps/rejected": -1.7869584560394287, + "loss": -0.0875, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.31096604466438293, + "rewards/margins": 0.06203940510749817, + "rewards/rejected": 0.24892663955688477, + "step": 321 + }, + { + "epoch": 0.6736401673640168, + "grad_norm": 17.05484586075126, + "learning_rate": 1.4554245050728084e-07, + "logits/chosen": -1.1591291427612305, + "logits/rejected": -0.8904301524162292, + "logps/chosen": -1.1888909339904785, + "logps/rejected": -2.005044937133789, + "loss": -0.1115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.405101478099823, + "rewards/margins": 0.13060510158538818, + "rewards/rejected": 0.2744963467121124, + "step": 322 + }, + { + "epoch": 0.6757322175732218, + "grad_norm": 11.322008720616964, + "learning_rate": 1.4388582528104627e-07, + "logits/chosen": -0.8311055302619934, + "logits/rejected": -0.7126474380493164, + "logps/chosen": -1.374341607093811, + "logps/rejected": -2.404649019241333, + "loss": -0.0939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37304264307022095, + "rewards/margins": 0.05022872984409332, + "rewards/rejected": 0.3228139281272888, + "step": 323 + }, + { + "epoch": 0.6778242677824268, + "grad_norm": 14.488311265640249, + "learning_rate": 1.422348641990369e-07, + "logits/chosen": -1.0909366607666016, + "logits/rejected": -0.9740039706230164, + "logps/chosen": -0.7895447015762329, + "logps/rejected": -2.2779781818389893, + "loss": -0.1645, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.49242639541625977, + "rewards/margins": 0.2595653533935547, + "rewards/rejected": 0.23286104202270508, + "step": 324 + }, + { + "epoch": 0.6799163179916318, + "grad_norm": 9.655770397929885, + "learning_rate": 1.4058965538597032e-07, + "logits/chosen": -1.1042437553405762, + "logits/rejected": -1.1286996603012085, + "logps/chosen": -1.1713753938674927, + "logps/rejected": -1.9938318729400635, + "loss": -0.0939, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38073980808258057, + "rewards/margins": 0.1361331045627594, + "rewards/rejected": 0.24460668861865997, + "step": 325 + }, + { + "epoch": 0.6820083682008368, + "grad_norm": 13.037016765822761, + "learning_rate": 1.3895028665952057e-07, + "logits/chosen": -0.8333834409713745, + "logits/rejected": -0.8366643190383911, + "logps/chosen": -1.1039419174194336, + "logps/rejected": -2.8780007362365723, + "loss": -0.1369, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41021984815597534, + "rewards/margins": 0.18466591835021973, + "rewards/rejected": 0.22555390000343323, + "step": 326 + }, + { + "epoch": 0.6841004184100419, + "grad_norm": 14.265017443250814, + "learning_rate": 1.3731684552563027e-07, + "logits/chosen": -0.5523157119750977, + "logits/rejected": -0.542705774307251, + "logps/chosen": -0.7236326336860657, + "logps/rejected": -1.5846632719039917, + "loss": -0.1217, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.5142608880996704, + "rewards/margins": 0.1967753767967224, + "rewards/rejected": 0.317485511302948, + "step": 327 + }, + { + "epoch": 0.6861924686192469, + "grad_norm": 10.0387668669992, + "learning_rate": 1.3568941917384036e-07, + "logits/chosen": -0.8576334118843079, + "logits/rejected": -0.7204816937446594, + "logps/chosen": -1.0021593570709229, + "logps/rejected": -2.57088565826416, + "loss": -0.1258, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.42235422134399414, + "rewards/margins": 0.15459594130516052, + "rewards/rejected": 0.26775825023651123, + "step": 328 + }, + { + "epoch": 0.6882845188284519, + "grad_norm": 10.581183347813186, + "learning_rate": 1.3406809447263568e-07, + "logits/chosen": -1.0595145225524902, + "logits/rejected": -0.8824669122695923, + "logps/chosen": -1.6525616645812988, + "logps/rejected": -2.7960574626922607, + "loss": -0.0999, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.24829578399658203, + "rewards/margins": 0.045187197625637054, + "rewards/rejected": 0.20310857892036438, + "step": 329 + }, + { + "epoch": 0.6903765690376569, + "grad_norm": 17.448093710233614, + "learning_rate": 1.3245295796480788e-07, + "logits/chosen": -0.8543559908866882, + "logits/rejected": -0.49344658851623535, + "logps/chosen": -1.0859427452087402, + "logps/rejected": -3.5764997005462646, + "loss": -0.1058, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41744154691696167, + "rewards/margins": 0.21425436437129974, + "rewards/rejected": 0.20318715274333954, + "step": 330 + }, + { + "epoch": 0.6924686192468619, + "grad_norm": 16.730200153254856, + "learning_rate": 1.3084409586283694e-07, + "logits/chosen": -1.2231147289276123, + "logits/rejected": -0.7146123051643372, + "logps/chosen": -2.151677131652832, + "logps/rejected": -2.705674409866333, + "loss": -0.106, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3055073022842407, + "rewards/margins": 0.035416390746831894, + "rewards/rejected": 0.27009090781211853, + "step": 331 + }, + { + "epoch": 0.694560669456067, + "grad_norm": 13.156926153571998, + "learning_rate": 1.2924159404428801e-07, + "logits/chosen": -0.7178107500076294, + "logits/rejected": -0.36026304960250854, + "logps/chosen": -1.1973762512207031, + "logps/rejected": -3.018155574798584, + "loss": -0.1277, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.367809534072876, + "rewards/margins": 0.16245080530643463, + "rewards/rejected": 0.20535872876644135, + "step": 332 + }, + { + "epoch": 0.696652719665272, + "grad_norm": 14.96194258618006, + "learning_rate": 1.2764553804722867e-07, + "logits/chosen": -0.6460230946540833, + "logits/rejected": -0.623856782913208, + "logps/chosen": -1.4669647216796875, + "logps/rejected": -1.899721622467041, + "loss": -0.0925, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.41646426916122437, + "rewards/margins": 0.09561848640441895, + "rewards/rejected": 0.3208458125591278, + "step": 333 + }, + { + "epoch": 0.698744769874477, + "grad_norm": 10.920052001046809, + "learning_rate": 1.2605601306566204e-07, + "logits/chosen": -1.2113707065582275, + "logits/rejected": -0.9189119338989258, + "logps/chosen": -1.2759509086608887, + "logps/rejected": -2.9539761543273926, + "loss": -0.1303, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3761143088340759, + "rewards/margins": 0.11043473333120346, + "rewards/rejected": 0.26567959785461426, + "step": 334 + }, + { + "epoch": 0.700836820083682, + "grad_norm": 12.026438516396775, + "learning_rate": 1.2447310394498017e-07, + "logits/chosen": -0.3498826324939728, + "logits/rejected": -0.2449064552783966, + "logps/chosen": -1.938281536102295, + "logps/rejected": -2.0428366661071777, + "loss": -0.1389, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3499003052711487, + "rewards/margins": 0.08844777196645737, + "rewards/rejected": 0.2614525258541107, + "step": 335 + }, + { + "epoch": 0.702928870292887, + "grad_norm": 10.793477695654982, + "learning_rate": 1.2289689517743472e-07, + "logits/chosen": -0.5094469785690308, + "logits/rejected": -0.2816145718097687, + "logps/chosen": -1.3401274681091309, + "logps/rejected": -2.9226505756378174, + "loss": -0.1241, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.37236952781677246, + "rewards/margins": 0.20558218657970428, + "rewards/rejected": 0.16678734123706818, + "step": 336 + }, + { + "epoch": 0.7050209205020921, + "grad_norm": 10.41218722859423, + "learning_rate": 1.213274708976271e-07, + "logits/chosen": -0.6510695815086365, + "logits/rejected": -0.739233136177063, + "logps/chosen": -1.454422950744629, + "logps/rejected": -2.9711012840270996, + "loss": -0.1458, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.29552698135375977, + "rewards/margins": 0.11168254911899567, + "rewards/rejected": 0.1838444173336029, + "step": 337 + }, + { + "epoch": 0.7071129707112971, + "grad_norm": 17.793530880939215, + "learning_rate": 1.1976491487801746e-07, + "logits/chosen": -0.9943718910217285, + "logits/rejected": -0.8745123147964478, + "logps/chosen": -1.1566295623779297, + "logps/rejected": -3.3676934242248535, + "loss": -0.1562, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38487157225608826, + "rewards/margins": 0.10210902243852615, + "rewards/rejected": 0.2827625572681427, + "step": 338 + }, + { + "epoch": 0.7092050209205021, + "grad_norm": 16.609393373696328, + "learning_rate": 1.1820931052445297e-07, + "logits/chosen": -0.9806454181671143, + "logits/rejected": -0.5159525871276855, + "logps/chosen": -1.153395414352417, + "logps/rejected": -3.263624429702759, + "loss": -0.1205, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3399513363838196, + "rewards/margins": 0.1355111300945282, + "rewards/rejected": 0.204440176486969, + "step": 339 + }, + { + "epoch": 0.7112970711297071, + "grad_norm": 18.60139351823708, + "learning_rate": 1.1666074087171627e-07, + "logits/chosen": -0.8198652267456055, + "logits/rejected": -0.4478958547115326, + "logps/chosen": -1.401387333869934, + "logps/rejected": -3.135948657989502, + "loss": -0.1524, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.35824739933013916, + "rewards/margins": 0.18047404289245605, + "rewards/rejected": 0.1777733713388443, + "step": 340 + }, + { + "epoch": 0.7133891213389121, + "grad_norm": 11.26052872387053, + "learning_rate": 1.1511928857909264e-07, + "logits/chosen": -0.6239542961120605, + "logits/rejected": -0.25442153215408325, + "logps/chosen": -1.1070218086242676, + "logps/rejected": -2.229344367980957, + "loss": -0.0917, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3640150725841522, + "rewards/margins": 0.13586899638175964, + "rewards/rejected": 0.22814607620239258, + "step": 341 + }, + { + "epoch": 0.7154811715481172, + "grad_norm": 13.071141355219329, + "learning_rate": 1.1358503592595837e-07, + "logits/chosen": -0.30327486991882324, + "logits/rejected": -0.11083739995956421, + "logps/chosen": -2.030562400817871, + "logps/rejected": -3.745598793029785, + "loss": -0.1459, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.30320000648498535, + "rewards/margins": 0.10328814387321472, + "rewards/rejected": 0.19991187751293182, + "step": 342 + }, + { + "epoch": 0.7175732217573222, + "grad_norm": 35.84080060882081, + "learning_rate": 1.120580648073885e-07, + "logits/chosen": -0.28081023693084717, + "logits/rejected": 0.07680314779281616, + "logps/chosen": -1.5106937885284424, + "logps/rejected": -2.175105094909668, + "loss": -0.1121, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.38606128096580505, + "rewards/margins": 0.07659916579723358, + "rewards/rejected": 0.30946213006973267, + "step": 343 + }, + { + "epoch": 0.7196652719665272, + "grad_norm": 12.317160716663446, + "learning_rate": 1.1053845672978565e-07, + "logits/chosen": -1.054025411605835, + "logits/rejected": -0.6850357055664062, + "logps/chosen": -1.185429334640503, + "logps/rejected": -2.5092053413391113, + "loss": -0.1204, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.38027000427246094, + "rewards/margins": 0.06081349402666092, + "rewards/rejected": 0.3194565176963806, + "step": 344 + }, + { + "epoch": 0.7217573221757322, + "grad_norm": 16.46100980347196, + "learning_rate": 1.090262928065293e-07, + "logits/chosen": -1.0338250398635864, + "logits/rejected": -0.9750658273696899, + "logps/chosen": -0.9890057444572449, + "logps/rejected": -1.7097139358520508, + "loss": -0.108, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.42490583658218384, + "rewards/margins": 0.10276540368795395, + "rewards/rejected": 0.3221404552459717, + "step": 345 + }, + { + "epoch": 0.7238493723849372, + "grad_norm": 19.269194310225696, + "learning_rate": 1.0752165375364591e-07, + "logits/chosen": 0.17747777700424194, + "logits/rejected": 0.3185412287712097, + "logps/chosen": -1.8695478439331055, + "logps/rejected": -2.6686084270477295, + "loss": -0.1522, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3071568012237549, + "rewards/margins": 0.05259630084037781, + "rewards/rejected": 0.25456053018569946, + "step": 346 + }, + { + "epoch": 0.7259414225941423, + "grad_norm": 13.005216285581936, + "learning_rate": 1.060246198855011e-07, + "logits/chosen": -0.6884080171585083, + "logits/rejected": 0.031216230243444443, + "logps/chosen": -1.4426461458206177, + "logps/rejected": -2.789125919342041, + "loss": -0.0979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36756807565689087, + "rewards/margins": 0.15506842732429504, + "rewards/rejected": 0.21249966323375702, + "step": 347 + }, + { + "epoch": 0.7280334728033473, + "grad_norm": 22.121610737491377, + "learning_rate": 1.0453527111051183e-07, + "logits/chosen": -1.3621861934661865, + "logits/rejected": -0.784406304359436, + "logps/chosen": -1.073019027709961, + "logps/rejected": -3.2600555419921875, + "loss": -0.1192, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4461975693702698, + "rewards/margins": 0.14909616112709045, + "rewards/rejected": 0.2971014380455017, + "step": 348 + }, + { + "epoch": 0.7301255230125523, + "grad_norm": 10.880303503739961, + "learning_rate": 1.0305368692688174e-07, + "logits/chosen": -0.3833010792732239, + "logits/rejected": -0.0003327280282974243, + "logps/chosen": -2.284611225128174, + "logps/rejected": -2.494032144546509, + "loss": -0.1503, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.31178295612335205, + "rewards/margins": 0.11959420889616013, + "rewards/rejected": 0.19218875467777252, + "step": 349 + }, + { + "epoch": 0.7322175732217573, + "grad_norm": 17.100515864214906, + "learning_rate": 1.0157994641835734e-07, + "logits/chosen": -0.2467648983001709, + "logits/rejected": -0.18056590855121613, + "logps/chosen": -2.0999393463134766, + "logps/rejected": -2.614711046218872, + "loss": -0.1453, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.37043696641921997, + "rewards/margins": 0.08482887595891953, + "rewards/rejected": 0.28560805320739746, + "step": 350 + }, + { + "epoch": 0.7343096234309623, + "grad_norm": 28.629645757692103, + "learning_rate": 1.0011412825000693e-07, + "logits/chosen": -1.1214914321899414, + "logits/rejected": -0.7452467083930969, + "logps/chosen": -1.2375651597976685, + "logps/rejected": -2.4114208221435547, + "loss": -0.0998, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3861534595489502, + "rewards/margins": 0.0308099165558815, + "rewards/rejected": 0.3553435802459717, + "step": 351 + }, + { + "epoch": 0.7364016736401674, + "grad_norm": 12.024191314088965, + "learning_rate": 9.865631066402136e-08, + "logits/chosen": -0.7538611888885498, + "logits/rejected": 0.19186115264892578, + "logps/chosen": -1.4035093784332275, + "logps/rejected": -4.363006591796875, + "loss": -0.1394, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.3268209993839264, + "rewards/margins": 0.18925338983535767, + "rewards/rejected": 0.13756762444972992, + "step": 352 + }, + { + "epoch": 0.7384937238493724, + "grad_norm": 15.314163055416728, + "learning_rate": 9.720657147553767e-08, + "logits/chosen": -0.4137316942214966, + "logits/rejected": -0.34769952297210693, + "logps/chosen": -2.0995492935180664, + "logps/rejected": -2.4585986137390137, + "loss": -0.093, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28392699360847473, + "rewards/margins": 0.05607965961098671, + "rewards/rejected": 0.22784735262393951, + "step": 353 + }, + { + "epoch": 0.7405857740585774, + "grad_norm": 16.82859831332979, + "learning_rate": 9.57649880684859e-08, + "logits/chosen": -0.16023807227611542, + "logits/rejected": -0.07635152339935303, + "logps/chosen": -1.9627785682678223, + "logps/rejected": -1.6600819826126099, + "loss": -0.1113, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.337715744972229, + "rewards/margins": -0.00830451026558876, + "rewards/rejected": 0.3460202217102051, + "step": 354 + }, + { + "epoch": 0.7426778242677824, + "grad_norm": 17.97313380627762, + "learning_rate": 9.433163739145771e-08, + "logits/chosen": -0.5657111406326294, + "logits/rejected": -0.043357543647289276, + "logps/chosen": -1.0140752792358398, + "logps/rejected": -2.526522636413574, + "loss": -0.1323, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.40150293707847595, + "rewards/margins": 0.20014238357543945, + "rewards/rejected": 0.2013605535030365, + "step": 355 + }, + { + "epoch": 0.7447698744769874, + "grad_norm": 21.86574720605052, + "learning_rate": 9.290659595360017e-08, + "logits/chosen": -0.07870801538228989, + "logits/rejected": -0.09383751451969147, + "logps/chosen": -1.0062921047210693, + "logps/rejected": -2.758099317550659, + "loss": -0.1152, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.45154866576194763, + "rewards/margins": 0.24462944269180298, + "rewards/rejected": 0.20691920816898346, + "step": 356 + }, + { + "epoch": 0.7468619246861925, + "grad_norm": 11.516028470925384, + "learning_rate": 9.148993982053058e-08, + "logits/chosen": -0.04537372291088104, + "logits/rejected": 0.28835880756378174, + "logps/chosen": -1.351635217666626, + "logps/rejected": -2.764091968536377, + "loss": -0.1137, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.37033578753471375, + "rewards/margins": 0.13014227151870728, + "rewards/rejected": 0.24019351601600647, + "step": 357 + }, + { + "epoch": 0.7489539748953975, + "grad_norm": 17.569715002068367, + "learning_rate": 9.008174461027723e-08, + "logits/chosen": -0.7079002261161804, + "logits/rejected": -0.3616589903831482, + "logps/chosen": -1.2153748273849487, + "logps/rejected": -2.612088680267334, + "loss": -0.1094, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.38530054688453674, + "rewards/margins": 0.10429896414279938, + "rewards/rejected": 0.28100159764289856, + "step": 358 + }, + { + "epoch": 0.7510460251046025, + "grad_norm": 11.265866090520127, + "learning_rate": 8.868208548924253e-08, + "logits/chosen": -0.3887017071247101, + "logits/rejected": -0.23636029660701752, + "logps/chosen": -0.9961744546890259, + "logps/rejected": -2.2566308975219727, + "loss": -0.1156, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4412422776222229, + "rewards/margins": 0.13439972698688507, + "rewards/rejected": 0.30684253573417664, + "step": 359 + }, + { + "epoch": 0.7531380753138075, + "grad_norm": 17.208413235507344, + "learning_rate": 8.729103716819111e-08, + "logits/chosen": -0.8207125663757324, + "logits/rejected": -0.43636664748191833, + "logps/chosen": -2.236236095428467, + "logps/rejected": -2.8618106842041016, + "loss": -0.1482, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3289279341697693, + "rewards/margins": 0.11165976524353027, + "rewards/rejected": 0.2172681838274002, + "step": 360 + }, + { + "epoch": 0.7552301255230126, + "grad_norm": 10.315633401806636, + "learning_rate": 8.590867389826179e-08, + "logits/chosen": -0.05628104507923126, + "logits/rejected": 0.12740841507911682, + "logps/chosen": -0.8035219311714172, + "logps/rejected": -2.464738368988037, + "loss": -0.1833, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5340550541877747, + "rewards/margins": 0.24898886680603027, + "rewards/rejected": 0.2850662171840668, + "step": 361 + }, + { + "epoch": 0.7573221757322176, + "grad_norm": 44.322778195778604, + "learning_rate": 8.453506946700417e-08, + "logits/chosen": -0.7813249826431274, + "logits/rejected": -0.02990594506263733, + "logps/chosen": -0.9563344120979309, + "logps/rejected": -3.9870352745056152, + "loss": -0.143, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.42556053400039673, + "rewards/margins": 0.20505291223526, + "rewards/rejected": 0.2205076366662979, + "step": 362 + }, + { + "epoch": 0.7594142259414226, + "grad_norm": 12.418318943630904, + "learning_rate": 8.317029719444016e-08, + "logits/chosen": -0.534734845161438, + "logits/rejected": 0.015497885644435883, + "logps/chosen": -1.012712001800537, + "logps/rejected": -2.580301284790039, + "loss": -0.1126, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.41058653593063354, + "rewards/margins": 0.1472766250371933, + "rewards/rejected": 0.26330992579460144, + "step": 363 + }, + { + "epoch": 0.7615062761506276, + "grad_norm": 14.003047066565102, + "learning_rate": 8.181442992915e-08, + "logits/chosen": -0.52370285987854, + "logits/rejected": 0.1399349719285965, + "logps/chosen": -0.8610312938690186, + "logps/rejected": -3.0481395721435547, + "loss": -0.1402, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.4761735200881958, + "rewards/margins": 0.25835907459259033, + "rewards/rejected": 0.21781444549560547, + "step": 364 + }, + { + "epoch": 0.7635983263598326, + "grad_norm": 27.853135912779752, + "learning_rate": 8.046754004438428e-08, + "logits/chosen": -0.38113510608673096, + "logits/rejected": -0.05555605888366699, + "logps/chosen": -0.7260129451751709, + "logps/rejected": -1.8457127809524536, + "loss": -0.1276, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.5552431344985962, + "rewards/margins": 0.20019498467445374, + "rewards/rejected": 0.35504812002182007, + "step": 365 + }, + { + "epoch": 0.7656903765690377, + "grad_norm": 14.711785420166494, + "learning_rate": 7.912969943420017e-08, + "logits/chosen": -1.2947417497634888, + "logits/rejected": -0.7500016689300537, + "logps/chosen": -1.219745397567749, + "logps/rejected": -1.6371362209320068, + "loss": -0.1534, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.36199867725372314, + "rewards/margins": 0.12112035602331161, + "rewards/rejected": 0.24087829887866974, + "step": 366 + }, + { + "epoch": 0.7677824267782427, + "grad_norm": 11.746982917584226, + "learning_rate": 7.780097950962447e-08, + "logits/chosen": -0.529086172580719, + "logits/rejected": -0.30336976051330566, + "logps/chosen": -1.449676513671875, + "logps/rejected": -1.9631965160369873, + "loss": -0.148, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3482644557952881, + "rewards/margins": 0.08891519904136658, + "rewards/rejected": 0.2593492269515991, + "step": 367 + }, + { + "epoch": 0.7698744769874477, + "grad_norm": 18.018124250700236, + "learning_rate": 7.648145119484151e-08, + "logits/chosen": -0.7858597040176392, + "logits/rejected": -0.34618625044822693, + "logps/chosen": -1.6457475423812866, + "logps/rejected": -2.3563523292541504, + "loss": -0.1509, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.37516093254089355, + "rewards/margins": 0.11906899511814117, + "rewards/rejected": 0.2560918927192688, + "step": 368 + }, + { + "epoch": 0.7719665271966527, + "grad_norm": 14.496994233180718, + "learning_rate": 7.517118492340748e-08, + "logits/chosen": -0.7869740128517151, + "logits/rejected": -0.7773195505142212, + "logps/chosen": -1.2835150957107544, + "logps/rejected": -2.575712203979492, + "loss": -0.0977, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31397247314453125, + "rewards/margins": 0.12309419363737106, + "rewards/rejected": 0.19087830185890198, + "step": 369 + }, + { + "epoch": 0.7740585774058577, + "grad_norm": 25.21651844990452, + "learning_rate": 7.387025063449081e-08, + "logits/chosen": 0.0035008257254958153, + "logits/rejected": 0.4158563017845154, + "logps/chosen": -1.4392738342285156, + "logps/rejected": -3.455766439437866, + "loss": -0.1568, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.34259992837905884, + "rewards/margins": 0.16605782508850098, + "rewards/rejected": 0.17654210329055786, + "step": 370 + }, + { + "epoch": 0.7761506276150628, + "grad_norm": 12.841556705216126, + "learning_rate": 7.257871776913879e-08, + "logits/chosen": -1.1575801372528076, + "logits/rejected": -0.5147409439086914, + "logps/chosen": -0.9548391699790955, + "logps/rejected": -3.196890115737915, + "loss": -0.1368, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4133971333503723, + "rewards/margins": 0.14503958821296692, + "rewards/rejected": 0.2683575451374054, + "step": 371 + }, + { + "epoch": 0.7782426778242678, + "grad_norm": 14.050240424328877, + "learning_rate": 7.129665526657145e-08, + "logits/chosen": -0.8727412819862366, + "logits/rejected": -0.41230928897857666, + "logps/chosen": -1.1312958002090454, + "logps/rejected": -2.5632314682006836, + "loss": -0.1652, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39264926314353943, + "rewards/margins": 0.13636496663093567, + "rewards/rejected": 0.25628429651260376, + "step": 372 + }, + { + "epoch": 0.7803347280334728, + "grad_norm": 13.116388311344872, + "learning_rate": 7.002413156050108e-08, + "logits/chosen": -0.6466270685195923, + "logits/rejected": 0.143938809633255, + "logps/chosen": -1.2247765064239502, + "logps/rejected": -2.1847689151763916, + "loss": -0.1197, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3555133640766144, + "rewards/margins": 0.09387169033288956, + "rewards/rejected": 0.2616417109966278, + "step": 373 + }, + { + "epoch": 0.7824267782426778, + "grad_norm": 23.736331901716355, + "learning_rate": 6.876121457547995e-08, + "logits/chosen": -1.3204679489135742, + "logits/rejected": -1.0910612344741821, + "logps/chosen": -0.9571521282196045, + "logps/rejected": -2.0240230560302734, + "loss": -0.1045, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4545818269252777, + "rewards/margins": 0.09085176885128021, + "rewards/rejected": 0.3637300729751587, + "step": 374 + }, + { + "epoch": 0.7845188284518828, + "grad_norm": 24.818348846229174, + "learning_rate": 6.75079717232744e-08, + "logits/chosen": -0.06902044266462326, + "logits/rejected": 0.6411824226379395, + "logps/chosen": -1.6761717796325684, + "logps/rejected": -2.5190796852111816, + "loss": -0.0937, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.28744983673095703, + "rewards/margins": -0.014971615746617317, + "rewards/rejected": 0.3024214804172516, + "step": 375 + }, + { + "epoch": 0.7866108786610879, + "grad_norm": 16.412707062669462, + "learning_rate": 6.626446989926652e-08, + "logits/chosen": -0.7313116192817688, + "logits/rejected": -0.14149028062820435, + "logps/chosen": -1.0154411792755127, + "logps/rejected": -2.468728542327881, + "loss": -0.1287, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.39400801062583923, + "rewards/margins": 0.13069072365760803, + "rewards/rejected": 0.2633172869682312, + "step": 376 + }, + { + "epoch": 0.7887029288702929, + "grad_norm": 12.947593105914633, + "learning_rate": 6.503077547888352e-08, + "logits/chosen": -0.3051021099090576, + "logits/rejected": -0.031502705067396164, + "logps/chosen": -1.4966380596160889, + "logps/rejected": -1.756742000579834, + "loss": -0.1317, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2928207516670227, + "rewards/margins": 0.045941371470689774, + "rewards/rejected": 0.24687938392162323, + "step": 377 + }, + { + "epoch": 0.7907949790794979, + "grad_norm": 22.150989160775218, + "learning_rate": 6.380695431405453e-08, + "logits/chosen": -0.8521575331687927, + "logits/rejected": -0.49384939670562744, + "logps/chosen": -0.8695181012153625, + "logps/rejected": -2.132112979888916, + "loss": -0.0904, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.45235487818717957, + "rewards/margins": 0.09790237247943878, + "rewards/rejected": 0.354452520608902, + "step": 378 + }, + { + "epoch": 0.7928870292887029, + "grad_norm": 17.656613830598737, + "learning_rate": 6.259307172969606e-08, + "logits/chosen": -0.6174445152282715, + "logits/rejected": 0.07970046997070312, + "logps/chosen": -1.4945907592773438, + "logps/rejected": -3.1515700817108154, + "loss": -0.1211, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.357840359210968, + "rewards/margins": 0.05095478519797325, + "rewards/rejected": 0.30688557028770447, + "step": 379 + }, + { + "epoch": 0.7949790794979079, + "grad_norm": 15.194872161839614, + "learning_rate": 6.138919252022435e-08, + "logits/chosen": -0.35832735896110535, + "logits/rejected": -0.3035891056060791, + "logps/chosen": -1.24945867061615, + "logps/rejected": -1.789233684539795, + "loss": -0.0939, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3697297275066376, + "rewards/margins": 0.10971508920192719, + "rewards/rejected": 0.2600146532058716, + "step": 380 + }, + { + "epoch": 0.797071129707113, + "grad_norm": 19.862030211955297, + "learning_rate": 6.019538094609759e-08, + "logits/chosen": -0.2610815465450287, + "logits/rejected": -0.4965687394142151, + "logps/chosen": -0.9594101309776306, + "logps/rejected": -1.6051406860351562, + "loss": -0.1072, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4468555748462677, + "rewards/margins": 0.13224181532859802, + "rewards/rejected": 0.3146137595176697, + "step": 381 + }, + { + "epoch": 0.799163179916318, + "grad_norm": 16.771708981733756, + "learning_rate": 5.9011700730385224e-08, + "logits/chosen": -0.19101418554782867, + "logits/rejected": -0.03149036690592766, + "logps/chosen": -1.246549129486084, + "logps/rejected": -3.112955331802368, + "loss": -0.1322, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3705316185951233, + "rewards/margins": 0.0991310402750969, + "rewards/rejected": 0.2714005708694458, + "step": 382 + }, + { + "epoch": 0.801255230125523, + "grad_norm": 38.34589726312402, + "learning_rate": 5.7838215055366954e-08, + "logits/chosen": -1.1628611087799072, + "logits/rejected": -0.803926408290863, + "logps/chosen": -1.4910624027252197, + "logps/rejected": -1.700842261314392, + "loss": -0.1221, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3370550870895386, + "rewards/margins": 0.01984988898038864, + "rewards/rejected": 0.31720519065856934, + "step": 383 + }, + { + "epoch": 0.803347280334728, + "grad_norm": 15.644651216768494, + "learning_rate": 5.6674986559160004e-08, + "logits/chosen": -0.9479022026062012, + "logits/rejected": -0.10572785139083862, + "logps/chosen": -0.906363844871521, + "logps/rejected": -2.1339168548583984, + "loss": -0.1254, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4436798393726349, + "rewards/margins": 0.23987531661987305, + "rewards/rejected": 0.20380452275276184, + "step": 384 + }, + { + "epoch": 0.805439330543933, + "grad_norm": 19.05289765194646, + "learning_rate": 5.552207733237543e-08, + "logits/chosen": -0.6024627685546875, + "logits/rejected": -0.5707870721817017, + "logps/chosen": -0.8243768215179443, + "logps/rejected": -1.8555145263671875, + "loss": -0.1289, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4963781237602234, + "rewards/margins": 0.2237432599067688, + "rewards/rejected": 0.2726348638534546, + "step": 385 + }, + { + "epoch": 0.8075313807531381, + "grad_norm": 15.4528964096149, + "learning_rate": 5.4379548914804427e-08, + "logits/chosen": -0.016455668956041336, + "logits/rejected": 0.3638351559638977, + "logps/chosen": -1.2458593845367432, + "logps/rejected": -2.3567447662353516, + "loss": -0.1144, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3728832006454468, + "rewards/margins": 0.11920465528964996, + "rewards/rejected": 0.253678560256958, + "step": 386 + }, + { + "epoch": 0.8096234309623431, + "grad_norm": 22.637025787930654, + "learning_rate": 5.324746229213281e-08, + "logits/chosen": -0.13823582231998444, + "logits/rejected": 0.23312069475650787, + "logps/chosen": -1.2766966819763184, + "logps/rejected": -1.7796881198883057, + "loss": -0.0925, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3433237671852112, + "rewards/margins": 0.09567471593618393, + "rewards/rejected": 0.24764902889728546, + "step": 387 + }, + { + "epoch": 0.8117154811715481, + "grad_norm": 14.718678468178092, + "learning_rate": 5.212587789268649e-08, + "logits/chosen": -0.2695750594139099, + "logits/rejected": -0.005407601594924927, + "logps/chosen": -1.4397711753845215, + "logps/rejected": -1.629962682723999, + "loss": -0.0973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.33621859550476074, + "rewards/margins": 0.033973321318626404, + "rewards/rejected": 0.30224525928497314, + "step": 388 + }, + { + "epoch": 0.8138075313807531, + "grad_norm": 20.171091314172557, + "learning_rate": 5.101485558420504e-08, + "logits/chosen": 0.19560030102729797, + "logits/rejected": 0.34746694564819336, + "logps/chosen": -1.6387301683425903, + "logps/rejected": -4.318758010864258, + "loss": -0.0964, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3025053143501282, + "rewards/margins": 0.17127913236618042, + "rewards/rejected": 0.13122616708278656, + "step": 389 + }, + { + "epoch": 0.8158995815899581, + "grad_norm": 19.414432686359987, + "learning_rate": 4.991445467064689e-08, + "logits/chosen": -0.273246169090271, + "logits/rejected": -0.431201308965683, + "logps/chosen": -0.9642292857170105, + "logps/rejected": -1.8679355382919312, + "loss": -0.1217, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4235716760158539, + "rewards/margins": 0.08978331089019775, + "rewards/rejected": 0.33378836512565613, + "step": 390 + }, + { + "epoch": 0.8179916317991632, + "grad_norm": 16.85819113660886, + "learning_rate": 4.882473388902322e-08, + "logits/chosen": -0.500401496887207, + "logits/rejected": -0.2619732618331909, + "logps/chosen": -1.076308250427246, + "logps/rejected": -2.224181890487671, + "loss": -0.0881, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4291846454143524, + "rewards/margins": 0.11726579070091248, + "rewards/rejected": 0.31191885471343994, + "step": 391 + }, + { + "epoch": 0.8200836820083682, + "grad_norm": 15.5960410352012, + "learning_rate": 4.774575140626316e-08, + "logits/chosen": -0.5160663723945618, + "logits/rejected": 0.43592193722724915, + "logps/chosen": -0.9588484168052673, + "logps/rejected": -2.9696645736694336, + "loss": -0.1598, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.45355111360549927, + "rewards/margins": 0.257058709859848, + "rewards/rejected": 0.19649238884449005, + "step": 392 + }, + { + "epoch": 0.8221757322175732, + "grad_norm": 23.839231530419294, + "learning_rate": 4.667756481610866e-08, + "logits/chosen": -0.3702046275138855, + "logits/rejected": -0.35005468130111694, + "logps/chosen": -0.9770090579986572, + "logps/rejected": -1.1811765432357788, + "loss": -0.1075, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40616047382354736, + "rewards/margins": 0.03360208123922348, + "rewards/rejected": 0.3725584149360657, + "step": 393 + }, + { + "epoch": 0.8242677824267782, + "grad_norm": 16.344564093013464, + "learning_rate": 4.562023113604041e-08, + "logits/chosen": -0.8614406585693359, + "logits/rejected": -0.667637825012207, + "logps/chosen": -0.942477822303772, + "logps/rejected": -1.798526644706726, + "loss": -0.1132, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4376535415649414, + "rewards/margins": 0.10072365403175354, + "rewards/rejected": 0.33692991733551025, + "step": 394 + }, + { + "epoch": 0.8263598326359832, + "grad_norm": 18.59736703264403, + "learning_rate": 4.4573806804234335e-08, + "logits/chosen": -0.6730791330337524, + "logits/rejected": -0.10565708577632904, + "logps/chosen": -0.9278669953346252, + "logps/rejected": -2.0849101543426514, + "loss": -0.1275, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.4342162609100342, + "rewards/margins": 0.16311287879943848, + "rewards/rejected": 0.2711033821105957, + "step": 395 + }, + { + "epoch": 0.8284518828451883, + "grad_norm": 17.260327771943437, + "learning_rate": 4.3538347676548956e-08, + "logits/chosen": -0.22298240661621094, + "logits/rejected": -0.0226747989654541, + "logps/chosen": -1.0462150573730469, + "logps/rejected": -2.898242235183716, + "loss": -0.1041, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4490687847137451, + "rewards/margins": 0.21503090858459473, + "rewards/rejected": 0.23403790593147278, + "step": 396 + }, + { + "epoch": 0.8305439330543933, + "grad_norm": 10.738289775824189, + "learning_rate": 4.251390902354413e-08, + "logits/chosen": -0.19089946150779724, + "logits/rejected": -0.2588718831539154, + "logps/chosen": -1.3638062477111816, + "logps/rejected": -1.6540424823760986, + "loss": -0.1205, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.34393906593322754, + "rewards/margins": 0.06465037912130356, + "rewards/rejected": 0.279288649559021, + "step": 397 + }, + { + "epoch": 0.8326359832635983, + "grad_norm": 14.000882257182658, + "learning_rate": 4.1500545527530544e-08, + "logits/chosen": -0.4493892192840576, + "logits/rejected": -0.09688322246074677, + "logps/chosen": -1.4314875602722168, + "logps/rejected": -1.8489952087402344, + "loss": -0.0961, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.358940064907074, + "rewards/margins": 0.13328352570533752, + "rewards/rejected": 0.22565653920173645, + "step": 398 + }, + { + "epoch": 0.8347280334728033, + "grad_norm": 15.545368370778581, + "learning_rate": 4.0498311279651196e-08, + "logits/chosen": -0.07205243408679962, + "logits/rejected": 0.23499715328216553, + "logps/chosen": -1.2373902797698975, + "logps/rejected": -3.2700839042663574, + "loss": -0.0877, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.34146398305892944, + "rewards/margins": 0.13711421191692352, + "rewards/rejected": 0.20434975624084473, + "step": 399 + }, + { + "epoch": 0.8368200836820083, + "grad_norm": 18.918377276587357, + "learning_rate": 3.9507259776993954e-08, + "logits/chosen": -0.5115982294082642, + "logits/rejected": -0.6578488349914551, + "logps/chosen": -1.8198411464691162, + "logps/rejected": -2.025270938873291, + "loss": -0.1423, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.41336411237716675, + "rewards/margins": 0.07312976568937302, + "rewards/rejected": 0.3402343690395355, + "step": 400 + }, + { + "epoch": 0.8389121338912134, + "grad_norm": 11.634156261583618, + "learning_rate": 3.8527443919736006e-08, + "logits/chosen": -0.9072321653366089, + "logits/rejected": -0.7701002359390259, + "logps/chosen": -1.0199377536773682, + "logps/rejected": -3.217071533203125, + "loss": -0.1353, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4319874048233032, + "rewards/margins": 0.19452467560768127, + "rewards/rejected": 0.23746272921562195, + "step": 401 + }, + { + "epoch": 0.8410041841004184, + "grad_norm": 12.4551196253915, + "learning_rate": 3.755891600832026e-08, + "logits/chosen": -0.25930002331733704, + "logits/rejected": 0.25398126244544983, + "logps/chosen": -2.0748448371887207, + "logps/rejected": -2.5072836875915527, + "loss": -0.1016, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.39124736189842224, + "rewards/margins": 0.09978035092353821, + "rewards/rejected": 0.29146701097488403, + "step": 402 + }, + { + "epoch": 0.8430962343096234, + "grad_norm": 10.222174114537948, + "learning_rate": 3.660172774066339e-08, + "logits/chosen": -0.05241062492132187, + "logits/rejected": 0.2334146946668625, + "logps/chosen": -1.0232222080230713, + "logps/rejected": -2.3795347213745117, + "loss": -0.137, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3805456757545471, + "rewards/margins": 0.16428259015083313, + "rewards/rejected": 0.21626310050487518, + "step": 403 + }, + { + "epoch": 0.8451882845188284, + "grad_norm": 15.992043901254412, + "learning_rate": 3.565593020939678e-08, + "logits/chosen": -0.27851253747940063, + "logits/rejected": -0.0054774656891822815, + "logps/chosen": -1.8243669271469116, + "logps/rejected": -2.437450885772705, + "loss": -0.1127, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33263060450553894, + "rewards/margins": 0.05638391897082329, + "rewards/rejected": 0.27624669671058655, + "step": 404 + }, + { + "epoch": 0.8472803347280334, + "grad_norm": 25.026948366336036, + "learning_rate": 3.472157389913874e-08, + "logits/chosen": -0.805883526802063, + "logits/rejected": -0.528130292892456, + "logps/chosen": -1.4973548650741577, + "logps/rejected": -2.4731860160827637, + "loss": -0.1533, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3797512650489807, + "rewards/margins": 0.11224304139614105, + "rewards/rejected": 0.26750820875167847, + "step": 405 + }, + { + "epoch": 0.8493723849372385, + "grad_norm": 18.95659135912971, + "learning_rate": 3.3798708683800305e-08, + "logits/chosen": -0.06515583395957947, + "logits/rejected": -0.24208559095859528, + "logps/chosen": -1.6230918169021606, + "logps/rejected": -2.0216987133026123, + "loss": -0.0905, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.2974805235862732, + "rewards/margins": 0.04249989986419678, + "rewards/rejected": 0.2549806237220764, + "step": 406 + }, + { + "epoch": 0.8514644351464435, + "grad_norm": 15.13226749066972, + "learning_rate": 3.288738382392273e-08, + "logits/chosen": -0.37732651829719543, + "logits/rejected": 0.23026621341705322, + "logps/chosen": -1.16318941116333, + "logps/rejected": -3.1813478469848633, + "loss": -0.1584, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3486320972442627, + "rewards/margins": 0.1388455033302307, + "rewards/rejected": 0.2097865641117096, + "step": 407 + }, + { + "epoch": 0.8535564853556485, + "grad_norm": 12.92143630698041, + "learning_rate": 3.198764796404807e-08, + "logits/chosen": -0.5920988321304321, + "logits/rejected": -0.5197505950927734, + "logps/chosen": -1.685182809829712, + "logps/rejected": -2.676419973373413, + "loss": -0.1647, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.32835954427719116, + "rewards/margins": 0.06703662127256393, + "rewards/rejected": 0.26132291555404663, + "step": 408 + }, + { + "epoch": 0.8556485355648535, + "grad_norm": 15.060307639983238, + "learning_rate": 3.109954913012294e-08, + "logits/chosen": -0.05348600819706917, + "logits/rejected": 0.5084730386734009, + "logps/chosen": -1.1922996044158936, + "logps/rejected": -3.422697067260742, + "loss": -0.1294, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.400797963142395, + "rewards/margins": 0.2237713634967804, + "rewards/rejected": 0.17702659964561462, + "step": 409 + }, + { + "epoch": 0.8577405857740585, + "grad_norm": 10.856906615820568, + "learning_rate": 3.022313472693447e-08, + "logits/chosen": -0.7400836944580078, + "logits/rejected": -0.9220219850540161, + "logps/chosen": -0.9642383456230164, + "logps/rejected": -1.5164411067962646, + "loss": -0.1233, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4663574695587158, + "rewards/margins": 0.1306409388780594, + "rewards/rejected": 0.33571651577949524, + "step": 410 + }, + { + "epoch": 0.8598326359832636, + "grad_norm": 11.044705837575496, + "learning_rate": 2.935845153558053e-08, + "logits/chosen": -0.6763345003128052, + "logits/rejected": -0.42656803131103516, + "logps/chosen": -1.0390863418579102, + "logps/rejected": -2.544398784637451, + "loss": -0.1433, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4070824384689331, + "rewards/margins": 0.17951026558876038, + "rewards/rejected": 0.22757217288017273, + "step": 411 + }, + { + "epoch": 0.8619246861924686, + "grad_norm": 14.117311799051569, + "learning_rate": 2.8505545710972107e-08, + "logits/chosen": -0.008791878819465637, + "logits/rejected": 0.23330970108509064, + "logps/chosen": -1.5099177360534668, + "logps/rejected": -1.9858248233795166, + "loss": -0.0957, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.3444157838821411, + "rewards/margins": 0.04557289183139801, + "rewards/rejected": 0.2988429069519043, + "step": 412 + }, + { + "epoch": 0.8640167364016736, + "grad_norm": 21.343019352885765, + "learning_rate": 2.766446277937029e-08, + "logits/chosen": -0.3148643970489502, + "logits/rejected": -0.20236754417419434, + "logps/chosen": -1.3142023086547852, + "logps/rejected": -2.6143746376037598, + "loss": -0.0726, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3833717703819275, + "rewards/margins": 0.0287872813642025, + "rewards/rejected": 0.3545844852924347, + "step": 413 + }, + { + "epoch": 0.8661087866108786, + "grad_norm": 9.868482152917004, + "learning_rate": 2.683524763595546e-08, + "logits/chosen": 0.10180674493312836, + "logits/rejected": -0.06073710322380066, + "logps/chosen": -0.9547271728515625, + "logps/rejected": -1.5085153579711914, + "loss": -0.1442, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4392520785331726, + "rewards/margins": 0.11705724895000458, + "rewards/rejected": 0.3221948742866516, + "step": 414 + }, + { + "epoch": 0.8682008368200836, + "grad_norm": 24.104841754222196, + "learning_rate": 2.601794454243139e-08, + "logits/chosen": -0.3390297591686249, + "logits/rejected": 0.024038255214691162, + "logps/chosen": -1.0783426761627197, + "logps/rejected": -2.5162158012390137, + "loss": -0.1205, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.435734361410141, + "rewards/margins": 0.15422660112380981, + "rewards/rejected": 0.2815077602863312, + "step": 415 + }, + { + "epoch": 0.8702928870292888, + "grad_norm": 14.28427980032814, + "learning_rate": 2.521259712466256e-08, + "logits/chosen": -1.2510604858398438, + "logits/rejected": -0.2848242223262787, + "logps/chosen": -0.8153286576271057, + "logps/rejected": -1.7719354629516602, + "loss": -0.1589, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.5010541677474976, + "rewards/margins": 0.23067042231559753, + "rewards/rejected": 0.27038371562957764, + "step": 416 + }, + { + "epoch": 0.8723849372384938, + "grad_norm": 25.588656821141594, + "learning_rate": 2.4419248370345285e-08, + "logits/chosen": 0.3352106511592865, + "logits/rejected": 0.3673442602157593, + "logps/chosen": -1.1408625841140747, + "logps/rejected": -3.054860830307007, + "loss": -0.126, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39492154121398926, + "rewards/margins": 0.11788132786750793, + "rewards/rejected": 0.2770402431488037, + "step": 417 + }, + { + "epoch": 0.8744769874476988, + "grad_norm": 10.077506491437301, + "learning_rate": 2.3637940626713342e-08, + "logits/chosen": 0.2162850946187973, + "logits/rejected": 0.19268055260181427, + "logps/chosen": -1.1083879470825195, + "logps/rejected": -2.3523571491241455, + "loss": -0.1301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.41468891501426697, + "rewards/margins": 0.19711005687713623, + "rewards/rejected": 0.21757885813713074, + "step": 418 + }, + { + "epoch": 0.8765690376569037, + "grad_norm": 14.531005857324054, + "learning_rate": 2.2868715598277578e-08, + "logits/chosen": -0.6414574384689331, + "logits/rejected": -0.19284185767173767, + "logps/chosen": -1.3791618347167969, + "logps/rejected": -1.8712342977523804, + "loss": -0.1405, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.35826003551483154, + "rewards/margins": 0.051309142261743546, + "rewards/rejected": 0.3069508671760559, + "step": 419 + }, + { + "epoch": 0.8786610878661087, + "grad_norm": 15.518123245027137, + "learning_rate": 2.2111614344599684e-08, + "logits/chosen": -0.6647897362709045, + "logits/rejected": -0.1362893283367157, + "logps/chosen": -1.5059080123901367, + "logps/rejected": -2.9713315963745117, + "loss": -0.122, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3615904450416565, + "rewards/margins": 0.1611703634262085, + "rewards/rejected": 0.2004200667142868, + "step": 420 + }, + { + "epoch": 0.8807531380753139, + "grad_norm": 18.186740715196795, + "learning_rate": 2.1366677278100486e-08, + "logits/chosen": -0.7590320110321045, + "logits/rejected": -0.011863499879837036, + "logps/chosen": -1.4375065565109253, + "logps/rejected": -2.3867299556732178, + "loss": -0.1402, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.40658998489379883, + "rewards/margins": 0.12375946342945099, + "rewards/rejected": 0.28283050656318665, + "step": 421 + }, + { + "epoch": 0.8828451882845189, + "grad_norm": 14.824655173088397, + "learning_rate": 2.0633944161903145e-08, + "logits/chosen": -0.28729909658432007, + "logits/rejected": 0.09423608332872391, + "logps/chosen": -1.6937799453735352, + "logps/rejected": -2.76290225982666, + "loss": -0.14, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33846837282180786, + "rewards/margins": 0.10761270672082901, + "rewards/rejected": 0.23085570335388184, + "step": 422 + }, + { + "epoch": 0.8849372384937239, + "grad_norm": 14.454926513569529, + "learning_rate": 1.991345410771017e-08, + "logits/chosen": -0.23677058517932892, + "logits/rejected": 0.2690431773662567, + "logps/chosen": -1.7377506494522095, + "logps/rejected": -2.0279836654663086, + "loss": -0.1366, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.31104129552841187, + "rewards/margins": 0.1051572784781456, + "rewards/rejected": 0.20588400959968567, + "step": 423 + }, + { + "epoch": 0.8870292887029289, + "grad_norm": 13.709071053517471, + "learning_rate": 1.9205245573716195e-08, + "logits/chosen": -0.5881615877151489, + "logits/rejected": -0.2020711451768875, + "logps/chosen": -1.4259986877441406, + "logps/rejected": -3.365917682647705, + "loss": -0.1618, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4160638153553009, + "rewards/margins": 0.1756630837917328, + "rewards/rejected": 0.2404007464647293, + "step": 424 + }, + { + "epoch": 0.8891213389121339, + "grad_norm": 15.42709086141289, + "learning_rate": 1.850935636255496e-08, + "logits/chosen": -0.4001805782318115, + "logits/rejected": 0.3592504858970642, + "logps/chosen": -1.2519683837890625, + "logps/rejected": -2.341618061065674, + "loss": -0.1113, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39130017161369324, + "rewards/margins": 0.1382875144481659, + "rewards/rejected": 0.25301262736320496, + "step": 425 + }, + { + "epoch": 0.891213389121339, + "grad_norm": 18.239206963415292, + "learning_rate": 1.7825823619281448e-08, + "logits/chosen": -0.4060337245464325, + "logits/rejected": -0.25071170926094055, + "logps/chosen": -0.7171350717544556, + "logps/rejected": -2.6098623275756836, + "loss": -0.1455, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.5387618541717529, + "rewards/margins": 0.19522696733474731, + "rewards/rejected": 0.3435348570346832, + "step": 426 + }, + { + "epoch": 0.893305439330544, + "grad_norm": 13.151951030853512, + "learning_rate": 1.7154683829389283e-08, + "logits/chosen": 0.037064895033836365, + "logits/rejected": 0.4181457757949829, + "logps/chosen": -2.020270347595215, + "logps/rejected": -2.110900402069092, + "loss": -0.1494, + "rewards/accuracies": 0.375, + "rewards/chosen": 0.3166952133178711, + "rewards/margins": 0.030689965933561325, + "rewards/rejected": 0.28600525856018066, + "step": 427 + }, + { + "epoch": 0.895397489539749, + "grad_norm": 30.11848854193685, + "learning_rate": 1.649597281686302e-08, + "logits/chosen": 0.27454110980033875, + "logits/rejected": 0.2602013349533081, + "logps/chosen": -1.5725305080413818, + "logps/rejected": -2.114314556121826, + "loss": -0.1403, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3760819137096405, + "rewards/margins": 0.09325071424245834, + "rewards/rejected": 0.28283122181892395, + "step": 428 + }, + { + "epoch": 0.897489539748954, + "grad_norm": 49.208669602647255, + "learning_rate": 1.584972574226623e-08, + "logits/chosen": -0.2047557532787323, + "logits/rejected": 0.5918644070625305, + "logps/chosen": -1.154144525527954, + "logps/rejected": -3.3555400371551514, + "loss": -0.139, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.41403210163116455, + "rewards/margins": 0.12810006737709045, + "rewards/rejected": 0.2859320640563965, + "step": 429 + }, + { + "epoch": 0.899581589958159, + "grad_norm": 20.955344541279917, + "learning_rate": 1.521597710086439e-08, + "logits/chosen": 0.13177312910556793, + "logits/rejected": 0.5117599964141846, + "logps/chosen": -1.241478681564331, + "logps/rejected": -1.3989903926849365, + "loss": -0.1086, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3930959701538086, + "rewards/margins": 0.03417598456144333, + "rewards/rejected": 0.3589199483394623, + "step": 430 + }, + { + "epoch": 0.9016736401673641, + "grad_norm": 14.619595042411454, + "learning_rate": 1.459476072078386e-08, + "logits/chosen": 0.24203670024871826, + "logits/rejected": 0.2092907428741455, + "logps/chosen": -1.1378140449523926, + "logps/rejected": -1.448889970779419, + "loss": -0.1118, + "rewards/accuracies": 0.3125, + "rewards/chosen": 0.4876505732536316, + "rewards/margins": 0.054524172097444534, + "rewards/rejected": 0.43312641978263855, + "step": 431 + }, + { + "epoch": 0.9037656903765691, + "grad_norm": 20.394218134088426, + "learning_rate": 1.3986109761206093e-08, + "logits/chosen": -0.41305041313171387, + "logits/rejected": -0.09600323438644409, + "logps/chosen": -1.150968313217163, + "logps/rejected": -2.4742114543914795, + "loss": -0.1391, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.36665695905685425, + "rewards/margins": 0.12305985391139984, + "rewards/rejected": 0.2435971200466156, + "step": 432 + }, + { + "epoch": 0.9058577405857741, + "grad_norm": 18.209878374392186, + "learning_rate": 1.3390056710597647e-08, + "logits/chosen": -0.21727851033210754, + "logits/rejected": 0.14106367528438568, + "logps/chosen": -1.2363967895507812, + "logps/rejected": -2.26884126663208, + "loss": -0.1446, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3572999835014343, + "rewards/margins": 0.11691320687532425, + "rewards/rejected": 0.24038679897785187, + "step": 433 + }, + { + "epoch": 0.9079497907949791, + "grad_norm": 11.047404016240506, + "learning_rate": 1.280663338497609e-08, + "logits/chosen": -0.43556827306747437, + "logits/rejected": -0.133283793926239, + "logps/chosen": -1.3448657989501953, + "logps/rejected": -2.900851011276245, + "loss": -0.1378, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.33240455389022827, + "rewards/margins": 0.14208939671516418, + "rewards/rejected": 0.19031518697738647, + "step": 434 + }, + { + "epoch": 0.9100418410041841, + "grad_norm": 18.197187537565902, + "learning_rate": 1.2235870926211616e-08, + "logits/chosen": -0.5432617664337158, + "logits/rejected": 0.13345128297805786, + "logps/chosen": -1.0295956134796143, + "logps/rejected": -3.0868406295776367, + "loss": -0.1274, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4138888418674469, + "rewards/margins": 0.20933057367801666, + "rewards/rejected": 0.20455826818943024, + "step": 435 + }, + { + "epoch": 0.9121338912133892, + "grad_norm": 17.261325822213802, + "learning_rate": 1.1677799800364957e-08, + "logits/chosen": -1.115002989768982, + "logits/rejected": -0.11034034192562103, + "logps/chosen": -1.1753132343292236, + "logps/rejected": -2.2985410690307617, + "loss": -0.1525, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3633139729499817, + "rewards/margins": 0.10394985973834991, + "rewards/rejected": 0.2593640685081482, + "step": 436 + }, + { + "epoch": 0.9142259414225942, + "grad_norm": 34.53685792981264, + "learning_rate": 1.1132449796060873e-08, + "logits/chosen": -0.3196355700492859, + "logits/rejected": -0.32760536670684814, + "logps/chosen": -1.0820286273956299, + "logps/rejected": -2.6322782039642334, + "loss": -0.1472, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.4171540141105652, + "rewards/margins": 0.19411149621009827, + "rewards/rejected": 0.22304251790046692, + "step": 437 + }, + { + "epoch": 0.9163179916317992, + "grad_norm": 23.26394464843546, + "learning_rate": 1.0599850022898537e-08, + "logits/chosen": -0.21112748980522156, + "logits/rejected": -0.7489008903503418, + "logps/chosen": -1.442376971244812, + "logps/rejected": -1.3414108753204346, + "loss": -0.1214, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.39732199907302856, + "rewards/margins": 0.025272248312830925, + "rewards/rejected": 0.3720497786998749, + "step": 438 + }, + { + "epoch": 0.9184100418410042, + "grad_norm": 16.039680866672114, + "learning_rate": 1.0080028909897232e-08, + "logits/chosen": 0.08464886993169785, + "logits/rejected": -0.09728383272886276, + "logps/chosen": -1.6759049892425537, + "logps/rejected": -2.3375589847564697, + "loss": -0.1067, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3139100968837738, + "rewards/margins": 0.07879811525344849, + "rewards/rejected": 0.2351119965314865, + "step": 439 + }, + { + "epoch": 0.9205020920502092, + "grad_norm": 14.377105293354811, + "learning_rate": 9.57301420397924e-09, + "logits/chosen": -0.8353943824768066, + "logits/rejected": -0.9240339994430542, + "logps/chosen": -1.6584469079971313, + "logps/rejected": -3.0757076740264893, + "loss": -0.1096, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.43850457668304443, + "rewards/margins": 0.1471424549818039, + "rewards/rejected": 0.29136213660240173, + "step": 440 + }, + { + "epoch": 0.9225941422594143, + "grad_norm": 18.35983958853663, + "learning_rate": 9.078832968488632e-09, + "logits/chosen": -0.48855262994766235, + "logits/rejected": -0.07575897127389908, + "logps/chosen": -1.3988516330718994, + "logps/rejected": -3.0933828353881836, + "loss": -0.1606, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.33013659715652466, + "rewards/margins": 0.0989568755030632, + "rewards/rejected": 0.23117972910404205, + "step": 441 + }, + { + "epoch": 0.9246861924686193, + "grad_norm": 12.572841961166487, + "learning_rate": 8.597511581746625e-09, + "logits/chosen": 0.07626624405384064, + "logits/rejected": 0.3884120285511017, + "logps/chosen": -1.4382637739181519, + "logps/rejected": -3.394392967224121, + "loss": -0.1189, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2986642122268677, + "rewards/margins": 0.108258455991745, + "rewards/rejected": 0.1904057264328003, + "step": 442 + }, + { + "epoch": 0.9267782426778243, + "grad_norm": 11.08282465711567, + "learning_rate": 8.129075735643698e-09, + "logits/chosen": 0.036822497844696045, + "logits/rejected": 0.4752381145954132, + "logps/chosen": -1.358346939086914, + "logps/rejected": -2.069066047668457, + "loss": -0.1501, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.3432375192642212, + "rewards/margins": 0.052591562271118164, + "rewards/rejected": 0.290645956993103, + "step": 443 + }, + { + "epoch": 0.9288702928870293, + "grad_norm": 16.456766535225622, + "learning_rate": 7.673550434268123e-09, + "logits/chosen": -0.1680659055709839, + "logits/rejected": 0.33423447608947754, + "logps/chosen": -1.4859250783920288, + "logps/rejected": -2.750816583633423, + "loss": -0.121, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3396714925765991, + "rewards/margins": 0.14422327280044556, + "rewards/rejected": 0.19544823467731476, + "step": 444 + }, + { + "epoch": 0.9309623430962343, + "grad_norm": 13.963716082092512, + "learning_rate": 7.230959992571367e-09, + "logits/chosen": -0.5167222023010254, + "logits/rejected": -0.10349351167678833, + "logps/chosen": -2.2473559379577637, + "logps/rejected": -1.9560575485229492, + "loss": -0.1235, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.3042965829372406, + "rewards/margins": 0.0013524312525987625, + "rewards/rejected": 0.302944153547287, + "step": 445 + }, + { + "epoch": 0.9330543933054394, + "grad_norm": 11.73510057260775, + "learning_rate": 6.801328035070136e-09, + "logits/chosen": -0.1064242273569107, + "logits/rejected": -0.47670555114746094, + "logps/chosen": -1.6268620491027832, + "logps/rejected": -2.1553955078125, + "loss": -0.1086, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.369182288646698, + "rewards/margins": 0.05523061379790306, + "rewards/rejected": 0.31395167112350464, + "step": 446 + }, + { + "epoch": 0.9351464435146444, + "grad_norm": 21.99025718659999, + "learning_rate": 6.38467749458535e-09, + "logits/chosen": -0.5817941427230835, + "logits/rejected": -0.2126074880361557, + "logps/chosen": -0.6642739176750183, + "logps/rejected": -1.7573022842407227, + "loss": -0.1693, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.581842303276062, + "rewards/margins": 0.2932204008102417, + "rewards/rejected": 0.2886218726634979, + "step": 447 + }, + { + "epoch": 0.9372384937238494, + "grad_norm": 20.176181702826447, + "learning_rate": 5.981030611018234e-09, + "logits/chosen": -0.23663604259490967, + "logits/rejected": -0.3075418472290039, + "logps/chosen": -0.8663835525512695, + "logps/rejected": -1.8129829168319702, + "loss": -0.1573, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4720425307750702, + "rewards/margins": 0.13673511147499084, + "rewards/rejected": 0.33530741930007935, + "step": 448 + }, + { + "epoch": 0.9393305439330544, + "grad_norm": 11.998624086972171, + "learning_rate": 5.590408930162799e-09, + "logits/chosen": -0.9552597999572754, + "logits/rejected": -0.2985736131668091, + "logps/chosen": -1.1250615119934082, + "logps/rejected": -2.544997215270996, + "loss": -0.1118, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.40850841999053955, + "rewards/margins": 0.10064040124416351, + "rewards/rejected": 0.30786800384521484, + "step": 449 + }, + { + "epoch": 0.9414225941422594, + "grad_norm": 17.17071890301872, + "learning_rate": 5.212833302556258e-09, + "logits/chosen": 0.05753253027796745, + "logits/rejected": 0.7037515044212341, + "logps/chosen": -1.349974274635315, + "logps/rejected": -1.7757717370986938, + "loss": -0.1235, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.40113726258277893, + "rewards/margins": 0.038026921451091766, + "rewards/rejected": 0.36311033368110657, + "step": 450 + }, + { + "epoch": 0.9435146443514645, + "grad_norm": 13.909230390768446, + "learning_rate": 4.848323882365668e-09, + "logits/chosen": -0.5183601379394531, + "logits/rejected": 0.12597893178462982, + "logps/chosen": -1.328192949295044, + "logps/rejected": -2.0817277431488037, + "loss": -0.0979, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3509393632411957, + "rewards/margins": 0.10210657864809036, + "rewards/rejected": 0.2488327920436859, + "step": 451 + }, + { + "epoch": 0.9456066945606695, + "grad_norm": 14.71491478198279, + "learning_rate": 4.496900126312431e-09, + "logits/chosen": -0.2766793668270111, + "logits/rejected": 0.13515634834766388, + "logps/chosen": -1.4848310947418213, + "logps/rejected": -2.497296094894409, + "loss": -0.0911, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3426211476325989, + "rewards/margins": 0.09051606804132462, + "rewards/rejected": 0.25210511684417725, + "step": 452 + }, + { + "epoch": 0.9476987447698745, + "grad_norm": 13.65972842123855, + "learning_rate": 4.158580792633482e-09, + "logits/chosen": -0.19712916016578674, + "logits/rejected": 0.3826109766960144, + "logps/chosen": -1.3620458841323853, + "logps/rejected": -3.217412233352661, + "loss": -0.1318, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.310494989156723, + "rewards/margins": 0.16613849997520447, + "rewards/rejected": 0.14435648918151855, + "step": 453 + }, + { + "epoch": 0.9497907949790795, + "grad_norm": 18.05968836248289, + "learning_rate": 3.833383940080231e-09, + "logits/chosen": 0.03498596325516701, + "logits/rejected": 0.11552795022726059, + "logps/chosen": -2.0567967891693115, + "logps/rejected": -1.346252679824829, + "loss": -0.1251, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.33275869488716125, + "rewards/margins": 0.012848753482103348, + "rewards/rejected": 0.319909930229187, + "step": 454 + }, + { + "epoch": 0.9518828451882845, + "grad_norm": 12.21768160390563, + "learning_rate": 3.521326926954532e-09, + "logits/chosen": -0.4226292371749878, + "logits/rejected": -0.23755401372909546, + "logps/chosen": -0.9351893067359924, + "logps/rejected": -2.4535017013549805, + "loss": -0.0938, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.4389905333518982, + "rewards/margins": 0.11947016417980194, + "rewards/rejected": 0.31952038407325745, + "step": 455 + }, + { + "epoch": 0.9539748953974896, + "grad_norm": 18.098045353171948, + "learning_rate": 3.2224264101821108e-09, + "logits/chosen": -0.09337201714515686, + "logits/rejected": 0.5950417518615723, + "logps/chosen": -1.2980998754501343, + "logps/rejected": -2.468187093734741, + "loss": -0.1534, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3416418433189392, + "rewards/margins": 0.13695421814918518, + "rewards/rejected": 0.20468764007091522, + "step": 456 + }, + { + "epoch": 0.9560669456066946, + "grad_norm": 26.130154479920918, + "learning_rate": 2.936698344423505e-09, + "logits/chosen": 0.07752484083175659, + "logits/rejected": 0.2245202213525772, + "logps/chosen": -1.3137603998184204, + "logps/rejected": -2.435314893722534, + "loss": -0.1571, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36322876811027527, + "rewards/margins": 0.16007305681705475, + "rewards/rejected": 0.2031557261943817, + "step": 457 + }, + { + "epoch": 0.9581589958158996, + "grad_norm": 10.469709392256348, + "learning_rate": 2.664157981222437e-09, + "logits/chosen": -0.37826135754585266, + "logits/rejected": -0.2887250483036041, + "logps/chosen": -1.4897507429122925, + "logps/rejected": -2.7404723167419434, + "loss": -0.1025, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.3629001975059509, + "rewards/margins": 0.11764570325613022, + "rewards/rejected": 0.2452545166015625, + "step": 458 + }, + { + "epoch": 0.9602510460251046, + "grad_norm": 19.507854738140097, + "learning_rate": 2.4048198681917154e-09, + "logits/chosen": 0.14118759334087372, + "logits/rejected": 0.5922274589538574, + "logps/chosen": -1.4373829364776611, + "logps/rejected": -2.7581567764282227, + "loss": -0.142, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.41010457277297974, + "rewards/margins": 0.15131796896457672, + "rewards/rejected": 0.2587866187095642, + "step": 459 + }, + { + "epoch": 0.9623430962343096, + "grad_norm": 18.02135980384631, + "learning_rate": 2.158697848236607e-09, + "logits/chosen": -0.6226168870925903, + "logits/rejected": -0.31880009174346924, + "logps/chosen": -1.2920911312103271, + "logps/rejected": -2.5839664936065674, + "loss": -0.1252, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.38381361961364746, + "rewards/margins": 0.1413259208202362, + "rewards/rejected": 0.24248769879341125, + "step": 460 + }, + { + "epoch": 0.9644351464435147, + "grad_norm": 17.032613549200434, + "learning_rate": 1.9258050588161766e-09, + "logits/chosen": 0.1924222707748413, + "logits/rejected": 0.715542733669281, + "logps/chosen": -1.2721333503723145, + "logps/rejected": -4.255842685699463, + "loss": -0.1525, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.3516067862510681, + "rewards/margins": 0.1841929852962494, + "rewards/rejected": 0.16741378605365753, + "step": 461 + }, + { + "epoch": 0.9665271966527197, + "grad_norm": 34.36895188270403, + "learning_rate": 1.7061539312417107e-09, + "logits/chosen": -0.29315185546875, + "logits/rejected": 0.2312111258506775, + "logps/chosen": -1.306030511856079, + "logps/rejected": -3.0991930961608887, + "loss": -0.1276, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.3960568308830261, + "rewards/margins": 0.16641941666603088, + "rewards/rejected": 0.22963739931583405, + "step": 462 + }, + { + "epoch": 0.9686192468619247, + "grad_norm": 19.667410540117263, + "learning_rate": 1.4997561900135236e-09, + "logits/chosen": -0.6279085874557495, + "logits/rejected": -0.18156293034553528, + "logps/chosen": -1.0435895919799805, + "logps/rejected": -2.9352269172668457, + "loss": -0.1557, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4187704920768738, + "rewards/margins": 0.16668733954429626, + "rewards/rejected": 0.2520831525325775, + "step": 463 + }, + { + "epoch": 0.9707112970711297, + "grad_norm": 37.90773614924713, + "learning_rate": 1.3066228521948219e-09, + "logits/chosen": -0.0598616898059845, + "logits/rejected": 0.47852182388305664, + "logps/chosen": -1.1657414436340332, + "logps/rejected": -2.9400830268859863, + "loss": -0.136, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.38903993368148804, + "rewards/margins": 0.2122332900762558, + "rewards/rejected": 0.17680665850639343, + "step": 464 + }, + { + "epoch": 0.9728033472803347, + "grad_norm": 32.18135370625648, + "learning_rate": 1.126764226823812e-09, + "logits/chosen": -0.33765825629234314, + "logits/rejected": 0.25973930954933167, + "logps/chosen": -1.73961341381073, + "logps/rejected": -2.8940162658691406, + "loss": -0.1681, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.3287394344806671, + "rewards/margins": 0.11775898933410645, + "rewards/rejected": 0.21098044514656067, + "step": 465 + }, + { + "epoch": 0.9748953974895398, + "grad_norm": 15.098617874452946, + "learning_rate": 9.60189914363363e-10, + "logits/chosen": 0.35089248418807983, + "logits/rejected": 0.9048178791999817, + "logps/chosen": -1.9484138488769531, + "logps/rejected": -2.5396575927734375, + "loss": -0.1436, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.44588300585746765, + "rewards/margins": 0.2035720944404602, + "rewards/rejected": 0.24231091141700745, + "step": 466 + }, + { + "epoch": 0.9769874476987448, + "grad_norm": 26.9169004762807, + "learning_rate": 8.069088061885276e-10, + "logits/chosen": 0.1534384936094284, + "logits/rejected": 0.5771427750587463, + "logps/chosen": -0.8172132968902588, + "logps/rejected": -3.5388541221618652, + "loss": -0.1277, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.46066659688949585, + "rewards/margins": 0.21784912049770355, + "rewards/rejected": 0.2428174614906311, + "step": 467 + }, + { + "epoch": 0.9790794979079498, + "grad_norm": 17.911143221851358, + "learning_rate": 6.66929084112089e-10, + "logits/chosen": -0.29413968324661255, + "logits/rejected": 0.13598132133483887, + "logps/chosen": -1.2387174367904663, + "logps/rejected": -1.7768938541412354, + "loss": -0.0878, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.4200688600540161, + "rewards/margins": 0.1287473887205124, + "rewards/rejected": 0.2913214862346649, + "step": 468 + }, + { + "epoch": 0.9811715481171548, + "grad_norm": 13.151996858359748, + "learning_rate": 5.402582199476036e-10, + "logits/chosen": -0.49788159132003784, + "logits/rejected": -0.18531841039657593, + "logps/chosen": -0.8361096382141113, + "logps/rejected": -3.4101362228393555, + "loss": -0.1225, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.4737487733364105, + "rewards/margins": 0.23892131447792053, + "rewards/rejected": 0.23482745885849, + "step": 469 + }, + { + "epoch": 0.9832635983263598, + "grad_norm": 16.63754739733106, + "learning_rate": 4.269029751107489e-10, + "logits/chosen": -0.5837739706039429, + "logits/rejected": -0.30460917949676514, + "logps/chosen": -0.9492983222007751, + "logps/rejected": -2.2384424209594727, + "loss": -0.152, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.47352638840675354, + "rewards/margins": 0.12935146689414978, + "rewards/rejected": 0.34417492151260376, + "step": 470 + }, + { + "epoch": 0.9853556485355649, + "grad_norm": 26.79474982462166, + "learning_rate": 3.2686940025836164e-10, + "logits/chosen": -0.20093563199043274, + "logits/rejected": -0.20078404247760773, + "logps/chosen": -1.623030662536621, + "logps/rejected": -1.8910236358642578, + "loss": -0.1135, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.3414144217967987, + "rewards/margins": 0.0637202039361, + "rewards/rejected": 0.2776942253112793, + "step": 471 + }, + { + "epoch": 0.9874476987447699, + "grad_norm": 26.02885361868872, + "learning_rate": 2.4016283496544607e-10, + "logits/chosen": -0.08121422678232193, + "logits/rejected": -0.05113856494426727, + "logps/chosen": -1.5861430168151855, + "logps/rejected": -2.3078389167785645, + "loss": -0.1663, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.35104459524154663, + "rewards/margins": 0.11668679118156433, + "rewards/rejected": 0.2343578040599823, + "step": 472 + }, + { + "epoch": 0.9895397489539749, + "grad_norm": 21.891539506236764, + "learning_rate": 1.6678790744015236e-10, + "logits/chosen": -0.7576691508293152, + "logits/rejected": -0.04828827083110809, + "logps/chosen": -1.8233122825622559, + "logps/rejected": -2.146613121032715, + "loss": -0.1304, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.268475741147995, + "rewards/margins": 0.07678797096014023, + "rewards/rejected": 0.19168779253959656, + "step": 473 + }, + { + "epoch": 0.9916317991631799, + "grad_norm": 28.502708975557262, + "learning_rate": 1.0674853427683484e-10, + "logits/chosen": 0.16506695747375488, + "logits/rejected": -0.4553059935569763, + "logps/chosen": -1.0200731754302979, + "logps/rejected": -1.8700056076049805, + "loss": -0.1593, + "rewards/accuracies": 0.4375, + "rewards/chosen": 0.4241200089454651, + "rewards/margins": 0.11907336115837097, + "rewards/rejected": 0.30504661798477173, + "step": 474 + }, + { + "epoch": 0.9937238493723849, + "grad_norm": 18.524168156983546, + "learning_rate": 6.004792024680294e-11, + "logits/chosen": 0.46647167205810547, + "logits/rejected": 0.5237947702407837, + "logps/chosen": -1.175705909729004, + "logps/rejected": -2.5198819637298584, + "loss": -0.1397, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.4669370651245117, + "rewards/margins": 0.23335213959217072, + "rewards/rejected": 0.2335849404335022, + "step": 475 + }, + { + "epoch": 0.99581589958159, + "grad_norm": 18.476908526093876, + "learning_rate": 2.6688558127485604e-11, + "logits/chosen": -0.22382637858390808, + "logits/rejected": -0.0033440515398979187, + "logps/chosen": -1.3147752285003662, + "logps/rejected": -2.3031740188598633, + "loss": -0.1175, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.36867618560791016, + "rewards/margins": 0.1478295624256134, + "rewards/rejected": 0.22084660828113556, + "step": 476 + }, + { + "epoch": 0.997907949790795, + "grad_norm": 60.634529292954234, + "learning_rate": 6.672228569148952e-12, + "logits/chosen": -0.9228725433349609, + "logits/rejected": -0.574424147605896, + "logps/chosen": -1.5599159002304077, + "logps/rejected": -1.694455623626709, + "loss": -0.1238, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.28848904371261597, + "rewards/margins": 0.02120237797498703, + "rewards/rejected": 0.26728665828704834, + "step": 477 + }, + { + "epoch": 1.0, + "grad_norm": 12.74207673062675, + "learning_rate": 0.0, + "logits/chosen": -0.6011393070220947, + "logits/rejected": -0.007772140204906464, + "logps/chosen": -1.1837854385375977, + "logps/rejected": -2.4837894439697266, + "loss": -0.0854, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.35520443320274353, + "rewards/margins": 0.12285250425338745, + "rewards/rejected": 0.23235191404819489, + "step": 478 + } + ], + "logging_steps": 1, + "max_steps": 478, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 8, + "trial_name": null, + "trial_params": null +}