diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,6109 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.9999375663357682, + "eval_steps": 1000, + "global_step": 4004, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00024973465692701506, + "grad_norm": 0.609375, + "learning_rate": 1.2468827930174565e-08, + "logits/chosen": -0.33114343881607056, + "logits/rejected": -0.24089118838310242, + "logps/chosen": -44.38773727416992, + "logps/rejected": -68.85894775390625, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0024973465692701507, + "grad_norm": 0.609375, + "learning_rate": 1.2468827930174566e-07, + "logits/chosen": -0.4296959638595581, + "logits/rejected": -0.34308701753616333, + "logps/chosen": -43.235145568847656, + "logps/rejected": -80.90267944335938, + "loss": 0.6931, + "rewards/accuracies": 0.4444444477558136, + "rewards/chosen": 0.0005310365813784301, + "rewards/margins": 9.072302782442421e-05, + "rewards/rejected": 0.0004403134807944298, + "step": 10 + }, + { + "epoch": 0.004994693138540301, + "grad_norm": 0.7890625, + "learning_rate": 2.493765586034913e-07, + "logits/chosen": -0.4125714898109436, + "logits/rejected": -0.3169251084327698, + "logps/chosen": -42.952693939208984, + "logps/rejected": -78.09742736816406, + "loss": 0.6928, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.000491735409013927, + "rewards/margins": 0.0007636584923602641, + "rewards/rejected": -0.0012553940759971738, + "step": 20 + }, + { + "epoch": 0.0074920397078104516, + "grad_norm": 0.50390625, + "learning_rate": 3.7406483790523695e-07, + "logits/chosen": -0.4181899130344391, + "logits/rejected": -0.3332025110721588, + "logps/chosen": -44.16044235229492, + "logps/rejected": -71.77767181396484, + "loss": 0.6931, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": -0.00046690466115251184, + "rewards/margins": 0.0001037311740219593, + "rewards/rejected": -0.0005706357769668102, + "step": 30 + }, + { + "epoch": 0.009989386277080603, + "grad_norm": 0.66796875, + "learning_rate": 4.987531172069826e-07, + "logits/chosen": -0.43510785698890686, + "logits/rejected": -0.34268879890441895, + "logps/chosen": -43.815826416015625, + "logps/rejected": -80.65787506103516, + "loss": 0.6923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0002724333025980741, + "rewards/margins": 0.0016190242022275925, + "rewards/rejected": -0.0013465910451486707, + "step": 40 + }, + { + "epoch": 0.012486732846350752, + "grad_norm": 0.96484375, + "learning_rate": 6.234413965087283e-07, + "logits/chosen": -0.4381956160068512, + "logits/rejected": -0.32796674966812134, + "logps/chosen": -43.25028610229492, + "logps/rejected": -77.0926742553711, + "loss": 0.6927, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0003348322061356157, + "rewards/margins": 0.0009277343633584678, + "rewards/rejected": -0.0005929021863266826, + "step": 50 + }, + { + "epoch": 0.014984079415620903, + "grad_norm": 0.76171875, + "learning_rate": 7.481296758104739e-07, + "logits/chosen": -0.4058023989200592, + "logits/rejected": -0.31743547320365906, + "logps/chosen": -43.332618713378906, + "logps/rejected": -81.47147369384766, + "loss": 0.6923, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 
0.0009701108792796731, + "rewards/margins": 0.0017434615874662995, + "rewards/rejected": -0.0007733507081866264, + "step": 60 + }, + { + "epoch": 0.017481425984891052, + "grad_norm": 0.419921875, + "learning_rate": 8.728179551122195e-07, + "logits/chosen": -0.40699687600135803, + "logits/rejected": -0.33324044942855835, + "logps/chosen": -42.81806182861328, + "logps/rejected": -69.8255844116211, + "loss": 0.6918, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0005619878647848964, + "rewards/margins": 0.0026690722443163395, + "rewards/rejected": -0.0021070842631161213, + "step": 70 + }, + { + "epoch": 0.019978772554161205, + "grad_norm": 1.1171875, + "learning_rate": 9.975062344139653e-07, + "logits/chosen": -0.4060027003288269, + "logits/rejected": -0.31257936358451843, + "logps/chosen": -43.606048583984375, + "logps/rejected": -74.920654296875, + "loss": 0.691, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0015056885313242674, + "rewards/margins": 0.004372184630483389, + "rewards/rejected": -0.0028664960991591215, + "step": 80 + }, + { + "epoch": 0.022476119123431355, + "grad_norm": 0.515625, + "learning_rate": 1.1221945137157108e-06, + "logits/chosen": -0.39966338872909546, + "logits/rejected": -0.3299568295478821, + "logps/chosen": -42.9066047668457, + "logps/rejected": -67.96953582763672, + "loss": 0.6899, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.0013856906443834305, + "rewards/margins": 0.00656374916434288, + "rewards/rejected": -0.00517805851995945, + "step": 90 + }, + { + "epoch": 0.024973465692701504, + "grad_norm": 0.73046875, + "learning_rate": 1.2468827930174565e-06, + "logits/chosen": -0.4090539515018463, + "logits/rejected": -0.31136512756347656, + "logps/chosen": -44.48310089111328, + "logps/rejected": -84.36518096923828, + "loss": 0.688, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0026647746562957764, + "rewards/margins": 0.010295169427990913, + "rewards/rejected": -0.00763039430603385, + "step": 100 + }, + { + "epoch": 0.027470812261971653, + "grad_norm": 0.6484375, + "learning_rate": 1.3715710723192023e-06, + "logits/chosen": -0.40336164832115173, + "logits/rejected": -0.3107188642024994, + "logps/chosen": -43.532264709472656, + "logps/rejected": -78.85313415527344, + "loss": 0.6869, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0021588439121842384, + "rewards/margins": 0.012583871372044086, + "rewards/rejected": -0.010425028391182423, + "step": 110 + }, + { + "epoch": 0.029968158831241806, + "grad_norm": 0.62109375, + "learning_rate": 1.4962593516209478e-06, + "logits/chosen": -0.44534754753112793, + "logits/rejected": -0.3525586724281311, + "logps/chosen": -43.2638053894043, + "logps/rejected": -73.06291961669922, + "loss": 0.6844, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0026553391944617033, + "rewards/margins": 0.017566664144396782, + "rewards/rejected": -0.014911326579749584, + "step": 120 + }, + { + "epoch": 0.032465505400511956, + "grad_norm": 0.734375, + "learning_rate": 1.6209476309226935e-06, + "logits/chosen": -0.4450170397758484, + "logits/rejected": -0.356881707906723, + "logps/chosen": -43.020606994628906, + "logps/rejected": -74.61629486083984, + "loss": 0.6783, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004791858606040478, + "rewards/margins": 0.030012447386980057, + "rewards/rejected": -0.025220584124326706, + "step": 130 + }, + { + "epoch": 0.034962851969782105, + "grad_norm": 0.52734375, + "learning_rate": 
1.745635910224439e-06, + "logits/chosen": -0.4518131613731384, + "logits/rejected": -0.35589107871055603, + "logps/chosen": -44.260887145996094, + "logps/rejected": -89.10530853271484, + "loss": 0.6769, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.0043911924585700035, + "rewards/margins": 0.03277025744318962, + "rewards/rejected": -0.02837906777858734, + "step": 140 + }, + { + "epoch": 0.037460198539052254, + "grad_norm": 0.470703125, + "learning_rate": 1.8703241895261848e-06, + "logits/chosen": -0.39934635162353516, + "logits/rejected": -0.32321810722351074, + "logps/chosen": -41.51782989501953, + "logps/rejected": -74.04359436035156, + "loss": 0.67, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009082725271582603, + "rewards/margins": 0.04699797183275223, + "rewards/rejected": -0.037915244698524475, + "step": 150 + }, + { + "epoch": 0.03995754510832241, + "grad_norm": 0.703125, + "learning_rate": 1.9950124688279305e-06, + "logits/chosen": -0.40700763463974, + "logits/rejected": -0.33096417784690857, + "logps/chosen": -42.31119918823242, + "logps/rejected": -82.06526947021484, + "loss": 0.6641, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010724621824920177, + "rewards/margins": 0.059142522513866425, + "rewards/rejected": -0.04841790720820427, + "step": 160 + }, + { + "epoch": 0.04245489167759256, + "grad_norm": 0.443359375, + "learning_rate": 2.119700748129676e-06, + "logits/chosen": -0.4063618779182434, + "logits/rejected": -0.31147629022598267, + "logps/chosen": -43.24675750732422, + "logps/rejected": -74.64862060546875, + "loss": 0.6545, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009759762324392796, + "rewards/margins": 0.07911679148674011, + "rewards/rejected": -0.0693570226430893, + "step": 170 + }, + { + "epoch": 0.04495223824686271, + "grad_norm": 0.369140625, + "learning_rate": 2.2443890274314216e-06, + "logits/chosen": -0.3992343842983246, + "logits/rejected": -0.30265265703201294, + "logps/chosen": -42.563804626464844, + "logps/rejected": -89.08007049560547, + "loss": 0.6496, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016897384077310562, + "rewards/margins": 0.08930385112762451, + "rewards/rejected": -0.07240646332502365, + "step": 180 + }, + { + "epoch": 0.04744958481613286, + "grad_norm": 0.388671875, + "learning_rate": 2.3690773067331675e-06, + "logits/chosen": -0.393463671207428, + "logits/rejected": -0.29836633801460266, + "logps/chosen": -42.574581146240234, + "logps/rejected": -81.52467346191406, + "loss": 0.6448, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.01014000829309225, + "rewards/margins": 0.09943968802690506, + "rewards/rejected": -0.08929967135190964, + "step": 190 + }, + { + "epoch": 0.04994693138540301, + "grad_norm": 0.423828125, + "learning_rate": 2.493765586034913e-06, + "logits/chosen": -0.3705541491508484, + "logits/rejected": -0.2776363492012024, + "logps/chosen": -43.70671844482422, + "logps/rejected": -79.51457214355469, + "loss": 0.635, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011818965896964073, + "rewards/margins": 0.12041501700878143, + "rewards/rejected": -0.10859604924917221, + "step": 200 + }, + { + "epoch": 0.05244427795467316, + "grad_norm": 0.400390625, + "learning_rate": 2.6184538653366586e-06, + "logits/chosen": -0.3588668704032898, + "logits/rejected": -0.27049878239631653, + "logps/chosen": -41.42917251586914, + "logps/rejected": -80.90901947021484, + "loss": 0.6249, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.012408060021698475, + "rewards/margins": 0.14213527739048004, + 
"rewards/rejected": -0.12972721457481384, + "step": 210 + }, + { + "epoch": 0.05494162452394331, + "grad_norm": 0.376953125, + "learning_rate": 2.7431421446384045e-06, + "logits/chosen": -0.32904312014579773, + "logits/rejected": -0.24068386852741241, + "logps/chosen": -42.47250747680664, + "logps/rejected": -89.7323226928711, + "loss": 0.6272, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.011426225304603577, + "rewards/margins": 0.13695809245109558, + "rewards/rejected": -0.125531867146492, + "step": 220 + }, + { + "epoch": 0.05743897109321346, + "grad_norm": 0.4921875, + "learning_rate": 2.86783042394015e-06, + "logits/chosen": -0.3509990870952606, + "logits/rejected": -0.26932188868522644, + "logps/chosen": -41.524696350097656, + "logps/rejected": -84.92623901367188, + "loss": 0.6128, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.016151348128914833, + "rewards/margins": 0.16833417117595673, + "rewards/rejected": -0.15218281745910645, + "step": 230 + }, + { + "epoch": 0.05993631766248361, + "grad_norm": 0.494140625, + "learning_rate": 2.9925187032418956e-06, + "logits/chosen": -0.3635261356830597, + "logits/rejected": -0.26545146107673645, + "logps/chosen": -43.126625061035156, + "logps/rejected": -89.85209655761719, + "loss": 0.6002, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004656613804399967, + "rewards/margins": 0.19621676206588745, + "rewards/rejected": -0.19156016409397125, + "step": 240 + }, + { + "epoch": 0.06243366423175376, + "grad_norm": 0.435546875, + "learning_rate": 3.117206982543641e-06, + "logits/chosen": -0.3245137929916382, + "logits/rejected": -0.22121305763721466, + "logps/chosen": -42.189552307128906, + "logps/rejected": -91.37117767333984, + "loss": 0.5891, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009633781388401985, + "rewards/margins": 0.22138457000255585, + "rewards/rejected": -0.2117508202791214, + "step": 250 + }, + { + "epoch": 0.06493101080102391, + "grad_norm": 0.76171875, + "learning_rate": 3.241895261845387e-06, + "logits/chosen": -0.3298066258430481, + "logits/rejected": -0.2243480682373047, + "logps/chosen": -41.658103942871094, + "logps/rejected": -97.98243713378906, + "loss": 0.5608, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.014573690481483936, + "rewards/margins": 0.28632912039756775, + "rewards/rejected": -0.2717553973197937, + "step": 260 + }, + { + "epoch": 0.06742835737029407, + "grad_norm": 0.84765625, + "learning_rate": 3.3665835411471326e-06, + "logits/chosen": -0.274959921836853, + "logits/rejected": -0.15127086639404297, + "logps/chosen": -42.591773986816406, + "logps/rejected": -109.750732421875, + "loss": 0.5226, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010403521358966827, + "rewards/margins": 0.3793814182281494, + "rewards/rejected": -0.36897793412208557, + "step": 270 + }, + { + "epoch": 0.06992570393956421, + "grad_norm": 1.265625, + "learning_rate": 3.491271820448878e-06, + "logits/chosen": -0.26599782705307007, + "logits/rejected": -0.14541617035865784, + "logps/chosen": -43.68675994873047, + "logps/rejected": -131.44851684570312, + "loss": 0.4865, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.009956231340765953, + "rewards/margins": 0.4785284399986267, + "rewards/rejected": -0.4685722291469574, + "step": 280 + }, + { + "epoch": 0.07242305050883437, + "grad_norm": 1.8828125, + "learning_rate": 3.615960099750624e-06, + "logits/chosen": -0.2336564064025879, + "logits/rejected": -0.09945651143789291, + "logps/chosen": -42.83462142944336, + "logps/rejected": -155.51699829101562, + 
"loss": 0.3811, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.010509507730603218, + "rewards/margins": 0.8152663111686707, + "rewards/rejected": -0.804756760597229, + "step": 290 + }, + { + "epoch": 0.07492039707810451, + "grad_norm": 1.484375, + "learning_rate": 3.7406483790523696e-06, + "logits/chosen": -0.187991201877594, + "logits/rejected": -0.015538264997303486, + "logps/chosen": -53.440765380859375, + "logps/rejected": -225.8020477294922, + "loss": 0.2419, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.09434106200933456, + "rewards/margins": 1.4534434080123901, + "rewards/rejected": -1.5477845668792725, + "step": 300 + }, + { + "epoch": 0.07741774364737466, + "grad_norm": 0.94140625, + "learning_rate": 3.8653366583541155e-06, + "logits/chosen": -0.10128624737262726, + "logits/rejected": 0.10228855907917023, + "logps/chosen": -66.81230163574219, + "logps/rejected": -350.84722900390625, + "loss": 0.1818, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.22738003730773926, + "rewards/margins": 2.590130567550659, + "rewards/rejected": -2.8175110816955566, + "step": 310 + }, + { + "epoch": 0.07991509021664482, + "grad_norm": 0.66015625, + "learning_rate": 3.990024937655861e-06, + "logits/chosen": -0.06341538578271866, + "logits/rejected": 0.19574430584907532, + "logps/chosen": -63.102256774902344, + "logps/rejected": -506.8706970214844, + "loss": 0.1113, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19701920449733734, + "rewards/margins": 4.0467329025268555, + "rewards/rejected": -4.2437520027160645, + "step": 320 + }, + { + "epoch": 0.08241243678591496, + "grad_norm": 0.703125, + "learning_rate": 4.114713216957607e-06, + "logits/chosen": 0.03162340074777603, + "logits/rejected": 0.2972305417060852, + "logps/chosen": -55.010414123535156, + "logps/rejected": -444.8564453125, + "loss": 0.0957, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1186632290482521, + "rewards/margins": 3.6549084186553955, + "rewards/rejected": -3.773571729660034, + "step": 330 + }, + { + "epoch": 0.08490978335518512, + "grad_norm": 0.48046875, + "learning_rate": 4.239401496259352e-06, + "logits/chosen": 0.05119480937719345, + "logits/rejected": 0.35537463426589966, + "logps/chosen": -52.077064514160156, + "logps/rejected": -547.7008056640625, + "loss": 0.0761, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.08894447982311249, + "rewards/margins": 4.584813117980957, + "rewards/rejected": -4.673757076263428, + "step": 340 + }, + { + "epoch": 0.08740712992445526, + "grad_norm": 0.1796875, + "learning_rate": 4.364089775561098e-06, + "logits/chosen": 0.17311367392539978, + "logits/rejected": 0.49543648958206177, + "logps/chosen": -59.5765380859375, + "logps/rejected": -548.73876953125, + "loss": 0.0769, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1556529700756073, + "rewards/margins": 4.651310443878174, + "rewards/rejected": -4.806963920593262, + "step": 350 + }, + { + "epoch": 0.08990447649372542, + "grad_norm": 0.216796875, + "learning_rate": 4.488778054862843e-06, + "logits/chosen": 0.13120940327644348, + "logits/rejected": 0.5331201553344727, + "logps/chosen": -59.65636444091797, + "logps/rejected": -660.1800537109375, + "loss": 0.0462, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.15369465947151184, + "rewards/margins": 5.723788738250732, + "rewards/rejected": -5.877484321594238, + "step": 360 + }, + { + "epoch": 0.09240182306299556, + "grad_norm": 0.90625, + "learning_rate": 4.6134663341645895e-06, + "logits/chosen": 0.25265592336654663, + "logits/rejected": 
0.6620725989341736, + "logps/chosen": -61.8662109375, + "logps/rejected": -688.4534912109375, + "loss": 0.0476, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18499073386192322, + "rewards/margins": 5.994035720825195, + "rewards/rejected": -6.179026126861572, + "step": 370 + }, + { + "epoch": 0.09489916963226572, + "grad_norm": 1.0703125, + "learning_rate": 4.738154613466335e-06, + "logits/chosen": 0.21470895409584045, + "logits/rejected": 0.7127342224121094, + "logps/chosen": -76.92805480957031, + "logps/rejected": -1027.8746337890625, + "loss": 0.0318, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3377203345298767, + "rewards/margins": 9.173759460449219, + "rewards/rejected": -9.511480331420898, + "step": 380 + }, + { + "epoch": 0.09739651620153587, + "grad_norm": 0.013671875, + "learning_rate": 4.862842892768081e-06, + "logits/chosen": 0.2577429413795471, + "logits/rejected": 0.7398630380630493, + "logps/chosen": -87.82744598388672, + "logps/rejected": -828.4269409179688, + "loss": 0.0229, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44494834542274475, + "rewards/margins": 7.14224100112915, + "rewards/rejected": -7.5871901512146, + "step": 390 + }, + { + "epoch": 0.09989386277080602, + "grad_norm": 0.326171875, + "learning_rate": 4.987531172069826e-06, + "logits/chosen": 0.312338262796402, + "logits/rejected": 0.8188611268997192, + "logps/chosen": -83.17652130126953, + "logps/rejected": -929.5695190429688, + "loss": 0.037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39534783363342285, + "rewards/margins": 8.192750930786133, + "rewards/rejected": -8.588098526000977, + "step": 400 + }, + { + "epoch": 0.10239120934007617, + "grad_norm": 0.2412109375, + "learning_rate": 4.999923022460671e-06, + "logits/chosen": 0.2771604657173157, + "logits/rejected": 0.875481903553009, + "logps/chosen": -71.91412353515625, + "logps/rejected": -1142.7008056640625, + "loss": 0.0166, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29563969373703003, + "rewards/margins": 10.368195533752441, + "rewards/rejected": -10.663835525512695, + "step": 410 + }, + { + "epoch": 0.10488855590934631, + "grad_norm": 0.361328125, + "learning_rate": 4.999656933348981e-06, + "logits/chosen": 0.3595578372478485, + "logits/rejected": 0.8383792638778687, + "logps/chosen": -77.22844696044922, + "logps/rejected": -818.2454833984375, + "loss": 0.0347, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3375477194786072, + "rewards/margins": 7.169804573059082, + "rewards/rejected": -7.50735330581665, + "step": 420 + }, + { + "epoch": 0.10738590247861647, + "grad_norm": 0.006805419921875, + "learning_rate": 4.99920080255011e-06, + "logits/chosen": 0.3072226047515869, + "logits/rejected": 0.9365525245666504, + "logps/chosen": -69.96758270263672, + "logps/rejected": -1107.283447265625, + "loss": 0.011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.24378785490989685, + "rewards/margins": 9.968598365783691, + "rewards/rejected": -10.212385177612305, + "step": 430 + }, + { + "epoch": 0.10988324904788661, + "grad_norm": 0.9375, + "learning_rate": 4.998554664742362e-06, + "logits/chosen": 0.386096328496933, + "logits/rejected": 0.9298421740531921, + "logps/chosen": -77.93641662597656, + "logps/rejected": -925.2971801757812, + "loss": 0.0152, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3427829146385193, + "rewards/margins": 8.217004776000977, + "rewards/rejected": -8.559788703918457, + "step": 440 + }, + { + "epoch": 0.11238059561715677, + "grad_norm": 0.162109375, + "learning_rate": 
4.997718569049726e-06, + "logits/chosen": 0.38062307238578796, + "logits/rejected": 0.9510132670402527, + "logps/chosen": -62.76348876953125, + "logps/rejected": -1039.5853271484375, + "loss": 0.0178, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.19411073625087738, + "rewards/margins": 9.47815227508545, + "rewards/rejected": -9.672263145446777, + "step": 450 + }, + { + "epoch": 0.11487794218642693, + "grad_norm": 0.0625, + "learning_rate": 4.9966925790381404e-06, + "logits/chosen": 0.4757159352302551, + "logits/rejected": 1.018425703048706, + "logps/chosen": -81.46342468261719, + "logps/rejected": -907.7916870117188, + "loss": 0.0209, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38752180337905884, + "rewards/margins": 7.994225978851318, + "rewards/rejected": -8.381747245788574, + "step": 460 + }, + { + "epoch": 0.11737528875569707, + "grad_norm": 0.39453125, + "learning_rate": 4.995476772710657e-06, + "logits/chosen": 0.40233319997787476, + "logits/rejected": 1.0515995025634766, + "logps/chosen": -82.72390747070312, + "logps/rejected": -1207.831787109375, + "loss": 0.0199, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37652480602264404, + "rewards/margins": 10.886858940124512, + "rewards/rejected": -11.263383865356445, + "step": 470 + }, + { + "epoch": 0.11987263532496722, + "grad_norm": 0.109375, + "learning_rate": 4.994071242501516e-06, + "logits/chosen": 0.4317776560783386, + "logits/rejected": 1.0796253681182861, + "logps/chosen": -62.16728591918945, + "logps/rejected": -1022.6038208007812, + "loss": 0.0134, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.18862931430339813, + "rewards/margins": 9.357450485229492, + "rewards/rejected": -9.546079635620117, + "step": 480 + }, + { + "epoch": 0.12236998189423737, + "grad_norm": 3.3527612686157227e-06, + "learning_rate": 4.992476095269112e-06, + "logits/chosen": 0.4001534581184387, + "logits/rejected": 0.9869491457939148, + "logps/chosen": -64.50323486328125, + "logps/rejected": -1058.342041015625, + "loss": 0.0186, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.21038322150707245, + "rewards/margins": 9.569865226745605, + "rewards/rejected": -9.780248641967773, + "step": 490 + }, + { + "epoch": 0.12486732846350752, + "grad_norm": 0.166015625, + "learning_rate": 4.990691452287877e-06, + "logits/chosen": 0.513416051864624, + "logits/rejected": 1.122924566268921, + "logps/chosen": -86.93208312988281, + "logps/rejected": -1010.1483154296875, + "loss": 0.0175, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43185362219810486, + "rewards/margins": 8.921982765197754, + "rewards/rejected": -9.353837966918945, + "step": 500 + }, + { + "epoch": 0.12736467503277768, + "grad_norm": 0.1806640625, + "learning_rate": 4.988717449239056e-06, + "logits/chosen": 0.5288435220718384, + "logits/rejected": 1.184326410293579, + "logps/chosen": -75.24764251708984, + "logps/rejected": -1083.439697265625, + "loss": 0.0205, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31985238194465637, + "rewards/margins": 9.819514274597168, + "rewards/rejected": -10.139368057250977, + "step": 510 + }, + { + "epoch": 0.12986202160204782, + "grad_norm": 0.05126953125, + "learning_rate": 4.98655423620039e-06, + "logits/chosen": 0.45140591263771057, + "logits/rejected": 1.135999083518982, + "logps/chosen": -66.84233093261719, + "logps/rejected": -1121.6982421875, + "loss": 0.0088, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.23821225762367249, + "rewards/margins": 10.265534400939941, + "rewards/rejected": -10.503746032714844, + "step": 520 + 
}, + { + "epoch": 0.13235936817131796, + "grad_norm": 0.0130615234375, + "learning_rate": 4.984201977634711e-06, + "logits/chosen": 0.44299745559692383, + "logits/rejected": 1.2149170637130737, + "logps/chosen": -74.1891860961914, + "logps/rejected": -1348.673095703125, + "loss": 0.0081, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30132222175598145, + "rewards/margins": 12.441837310791016, + "rewards/rejected": -12.74316120147705, + "step": 530 + }, + { + "epoch": 0.13485671474058814, + "grad_norm": 0.1982421875, + "learning_rate": 4.9816608523774345e-06, + "logits/chosen": 0.4906342625617981, + "logits/rejected": 1.1866085529327393, + "logps/chosen": -61.67924880981445, + "logps/rejected": -1052.905029296875, + "loss": 0.013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.1749764382839203, + "rewards/margins": 9.563148498535156, + "rewards/rejected": -9.73812484741211, + "step": 540 + }, + { + "epoch": 0.13735406130985828, + "grad_norm": 0.02099609375, + "learning_rate": 4.978931053622964e-06, + "logits/chosen": 0.5177958607673645, + "logits/rejected": 1.2569612264633179, + "logps/chosen": -70.76200866699219, + "logps/rejected": -1278.626708984375, + "loss": 0.0133, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27674826979637146, + "rewards/margins": 11.792952537536621, + "rewards/rejected": -12.069701194763184, + "step": 550 + }, + { + "epoch": 0.13985140787912842, + "grad_norm": 0.0003528594970703125, + "learning_rate": 4.9760127889100044e-06, + "logits/chosen": 0.5248929262161255, + "logits/rejected": 1.2501459121704102, + "logps/chosen": -81.47541809082031, + "logps/rejected": -1154.671630859375, + "loss": 0.0119, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3913446366786957, + "rewards/margins": 10.44621467590332, + "rewards/rejected": -10.837559700012207, + "step": 560 + }, + { + "epoch": 0.1423487544483986, + "grad_norm": 0.09423828125, + "learning_rate": 4.972906280105781e-06, + "logits/chosen": 0.5316249132156372, + "logits/rejected": 1.3082139492034912, + "logps/chosen": -88.09521484375, + "logps/rejected": -1197.23486328125, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4347292482852936, + "rewards/margins": 10.762666702270508, + "rewards/rejected": -11.197395324707031, + "step": 570 + }, + { + "epoch": 0.14484610101766873, + "grad_norm": 0.0106201171875, + "learning_rate": 4.969611763389175e-06, + "logits/chosen": 0.5327505469322205, + "logits/rejected": 1.3031514883041382, + "logps/chosen": -73.19583129882812, + "logps/rejected": -1114.89990234375, + "loss": 0.0085, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30240899324417114, + "rewards/margins": 10.148442268371582, + "rewards/rejected": -10.450851440429688, + "step": 580 + }, + { + "epoch": 0.14734344758693887, + "grad_norm": 0.0751953125, + "learning_rate": 4.966129489232762e-06, + "logits/chosen": 0.47109970450401306, + "logits/rejected": 1.3012760877609253, + "logps/chosen": -71.85210418701172, + "logps/rejected": -1336.1424560546875, + "loss": 0.0071, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.277540385723114, + "rewards/margins": 12.241964340209961, + "rewards/rejected": -12.51950454711914, + "step": 590 + }, + { + "epoch": 0.14984079415620902, + "grad_norm": 0.193359375, + "learning_rate": 4.962459722383775e-06, + "logits/chosen": 0.4269895553588867, + "logits/rejected": 1.1828067302703857, + "logps/chosen": -71.79056549072266, + "logps/rejected": -1337.454345703125, + "loss": 0.0111, + "rewards/accuracies": 1.0, + "rewards/chosen": 
-0.2726261019706726, + "rewards/margins": 12.248571395874023, + "rewards/rejected": -12.521197319030762, + "step": 600 + }, + { + "epoch": 0.1523381407254792, + "grad_norm": 0.09619140625, + "learning_rate": 4.958602741843975e-06, + "logits/chosen": 0.4595261216163635, + "logits/rejected": 1.3015968799591064, + "logps/chosen": -78.28643035888672, + "logps/rejected": -1252.5814208984375, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3374534249305725, + "rewards/margins": 11.378314018249512, + "rewards/rejected": -11.715767860412598, + "step": 610 + }, + { + "epoch": 0.15483548729474933, + "grad_norm": 0.048583984375, + "learning_rate": 4.954558840848437e-06, + "logits/chosen": 0.5825181007385254, + "logits/rejected": 1.3565789461135864, + "logps/chosen": -76.19590759277344, + "logps/rejected": -1119.151611328125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3366764783859253, + "rewards/margins": 10.178030967712402, + "rewards/rejected": -10.514707565307617, + "step": 620 + }, + { + "epoch": 0.15733283386401947, + "grad_norm": 0.1572265625, + "learning_rate": 4.950328326843258e-06, + "logits/chosen": 0.5459114909172058, + "logits/rejected": 1.3613156080245972, + "logps/chosen": -82.85084533691406, + "logps/rejected": -1277.3514404296875, + "loss": 0.0104, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3968349099159241, + "rewards/margins": 11.52591323852539, + "rewards/rejected": -11.922747611999512, + "step": 630 + }, + { + "epoch": 0.15983018043328964, + "grad_norm": 0.005096435546875, + "learning_rate": 4.945911521462182e-06, + "logits/chosen": 0.5720694065093994, + "logits/rejected": 1.411368727684021, + "logps/chosen": -80.45169067382812, + "logps/rejected": -1358.6873779296875, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3696327209472656, + "rewards/margins": 12.431825637817383, + "rewards/rejected": -12.801457405090332, + "step": 640 + }, + { + "epoch": 0.16232752700255978, + "grad_norm": 0.1708984375, + "learning_rate": 4.941308760502149e-06, + "logits/chosen": 0.5064912438392639, + "logits/rejected": 1.2029752731323242, + "logps/chosen": -67.90926361083984, + "logps/rejected": -1026.6124267578125, + "loss": 0.0125, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2300504744052887, + "rewards/margins": 9.079760551452637, + "rewards/rejected": -9.30981159210205, + "step": 650 + }, + { + "epoch": 0.16482487357182993, + "grad_norm": 0.1787109375, + "learning_rate": 4.936520393897762e-06, + "logits/chosen": 0.4909030497074127, + "logits/rejected": 1.2922523021697998, + "logps/chosen": -69.4020004272461, + "logps/rejected": -1298.668212890625, + "loss": 0.0096, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.25019845366477966, + "rewards/margins": 11.917684555053711, + "rewards/rejected": -12.167882919311523, + "step": 660 + }, + { + "epoch": 0.16732222014110007, + "grad_norm": 0.00469970703125, + "learning_rate": 4.931546785694684e-06, + "logits/chosen": 0.5053218007087708, + "logits/rejected": 1.4669103622436523, + "logps/chosen": -86.32283782958984, + "logps/rejected": -1483.82763671875, + "loss": 0.0045, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42769041657447815, + "rewards/margins": 13.680854797363281, + "rewards/rejected": -14.108546257019043, + "step": 670 + }, + { + "epoch": 0.16981956671037024, + "grad_norm": 0.0238037109375, + "learning_rate": 4.926388314021964e-06, + "logits/chosen": 0.6257452368736267, + "logits/rejected": 1.5271151065826416, + "logps/chosen": -92.75479888916016, + 
"logps/rejected": -1237.61474609375, + "loss": 0.0038, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4862573742866516, + "rewards/margins": 11.188620567321777, + "rewards/rejected": -11.67487907409668, + "step": 680 + }, + { + "epoch": 0.17231691327964038, + "grad_norm": 0.000385284423828125, + "learning_rate": 4.921045371063283e-06, + "logits/chosen": 0.584161102771759, + "logits/rejected": 1.478125810623169, + "logps/chosen": -89.634033203125, + "logps/rejected": -1360.25537109375, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46992653608322144, + "rewards/margins": 12.42024040222168, + "rewards/rejected": -12.890167236328125, + "step": 690 + }, + { + "epoch": 0.17481425984891052, + "grad_norm": 0.1318359375, + "learning_rate": 4.915518363027142e-06, + "logits/chosen": 0.5938104391098022, + "logits/rejected": 1.4910205602645874, + "logps/chosen": -73.86201477050781, + "logps/rejected": -1182.0716552734375, + "loss": 0.0059, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3045103847980499, + "rewards/margins": 10.839627265930176, + "rewards/rejected": -11.144137382507324, + "step": 700 + }, + { + "epoch": 0.1773116064181807, + "grad_norm": 0.08740234375, + "learning_rate": 4.909807710115977e-06, + "logits/chosen": 0.543526828289032, + "logits/rejected": 1.467707633972168, + "logps/chosen": -81.62191009521484, + "logps/rejected": -1380.295166015625, + "loss": 0.0066, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38441941142082214, + "rewards/margins": 12.690566062927246, + "rewards/rejected": -13.074984550476074, + "step": 710 + }, + { + "epoch": 0.17980895298745084, + "grad_norm": 0.025146484375, + "learning_rate": 4.903913846494211e-06, + "logits/chosen": 0.4768219590187073, + "logits/rejected": 1.4783326387405396, + "logps/chosen": -79.43184661865234, + "logps/rejected": -1673.644287109375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3560616374015808, + "rewards/margins": 15.518526077270508, + "rewards/rejected": -15.87458610534668, + "step": 720 + }, + { + "epoch": 0.18230629955672098, + "grad_norm": 0.08154296875, + "learning_rate": 4.897837220255251e-06, + "logits/chosen": 0.5687042474746704, + "logits/rejected": 1.4331896305084229, + "logps/chosen": -86.38923645019531, + "logps/rejected": -1329.374755859375, + "loss": 0.008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4307560920715332, + "rewards/margins": 12.039878845214844, + "rewards/rejected": -12.470634460449219, + "step": 730 + }, + { + "epoch": 0.18480364612599112, + "grad_norm": 0.0101318359375, + "learning_rate": 4.891578293387413e-06, + "logits/chosen": 0.604946494102478, + "logits/rejected": 1.5590946674346924, + "logps/chosen": -80.6954345703125, + "logps/rejected": -1429.257080078125, + "loss": 0.0056, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37448957562446594, + "rewards/margins": 13.214704513549805, + "rewards/rejected": -13.589195251464844, + "step": 740 + }, + { + "epoch": 0.1873009926952613, + "grad_norm": 0.07177734375, + "learning_rate": 4.885137541738808e-06, + "logits/chosen": 0.5679504871368408, + "logits/rejected": 1.4625308513641357, + "logps/chosen": -74.07333374023438, + "logps/rejected": -1173.9857177734375, + "loss": 0.0055, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3022187054157257, + "rewards/margins": 10.615083694458008, + "rewards/rejected": -10.917302131652832, + "step": 750 + }, + { + "epoch": 0.18979833926453143, + "grad_norm": 0.000659942626953125, + "learning_rate": 4.878515454981153e-06, + 
"logits/chosen": 0.5600544214248657, + "logits/rejected": 1.5554611682891846, + "logps/chosen": -95.30448913574219, + "logps/rejected": -1539.63427734375, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5141724348068237, + "rewards/margins": 14.041679382324219, + "rewards/rejected": -14.555851936340332, + "step": 760 + }, + { + "epoch": 0.19229568583380158, + "grad_norm": 0.154296875, + "learning_rate": 4.8717125365725545e-06, + "logits/chosen": 0.6704256534576416, + "logits/rejected": 1.511325716972351, + "logps/chosen": -78.95833587646484, + "logps/rejected": -1096.5010986328125, + "loss": 0.0109, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35167983174324036, + "rewards/margins": 9.9315185546875, + "rewards/rejected": -10.283197402954102, + "step": 770 + }, + { + "epoch": 0.19479303240307175, + "grad_norm": 0.04443359375, + "learning_rate": 4.864729303719221e-06, + "logits/chosen": 0.49029749631881714, + "logits/rejected": 1.4827202558517456, + "logps/chosen": -78.9704360961914, + "logps/rejected": -1541.809326171875, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34706807136535645, + "rewards/margins": 14.254959106445312, + "rewards/rejected": -14.602025985717773, + "step": 780 + }, + { + "epoch": 0.1972903789723419, + "grad_norm": 0.11328125, + "learning_rate": 4.857566287336152e-06, + "logits/chosen": 0.5910658836364746, + "logits/rejected": 1.5483187437057495, + "logps/chosen": -99.37945556640625, + "logps/rejected": -1432.4755859375, + "loss": 0.0086, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5464431047439575, + "rewards/margins": 12.95887565612793, + "rewards/rejected": -13.505319595336914, + "step": 790 + }, + { + "epoch": 0.19978772554161203, + "grad_norm": 0.0011444091796875, + "learning_rate": 4.850224032006765e-06, + "logits/chosen": 0.6179195642471313, + "logits/rejected": 1.5901352167129517, + "logps/chosen": -81.73147583007812, + "logps/rejected": -1412.5142822265625, + "loss": 0.0054, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38581180572509766, + "rewards/margins": 13.030293464660645, + "rewards/rejected": -13.416107177734375, + "step": 800 + }, + { + "epoch": 0.20228507211088217, + "grad_norm": 0.1357421875, + "learning_rate": 4.8427030959414984e-06, + "logits/chosen": 0.5971755385398865, + "logits/rejected": 1.6486015319824219, + "logps/chosen": -74.70821380615234, + "logps/rejected": -1532.8193359375, + "loss": 0.0063, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3184259831905365, + "rewards/margins": 14.300427436828613, + "rewards/rejected": -14.618852615356445, + "step": 810 + }, + { + "epoch": 0.20478241868015234, + "grad_norm": 0.1689453125, + "learning_rate": 4.835004050935369e-06, + "logits/chosen": 0.6013739705085754, + "logits/rejected": 1.4875710010528564, + "logps/chosen": -74.9230728149414, + "logps/rejected": -1462.062255859375, + "loss": 0.0107, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31746476888656616, + "rewards/margins": 13.55724811553955, + "rewards/rejected": -13.874712944030762, + "step": 820 + }, + { + "epoch": 0.2072797652494225, + "grad_norm": 0.0279541015625, + "learning_rate": 4.8271274823245e-06, + "logits/chosen": 0.6184748411178589, + "logits/rejected": 1.5838812589645386, + "logps/chosen": -74.03543853759766, + "logps/rejected": -1436.349853515625, + "loss": 0.0048, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29689908027648926, + "rewards/margins": 13.336532592773438, + "rewards/rejected": -13.63343334197998, + "step": 830 + }, + { + "epoch": 
0.20977711181869263, + "grad_norm": 0.0888671875, + "learning_rate": 4.8190739889416264e-06, + "logits/chosen": 0.6181553602218628, + "logits/rejected": 1.6880038976669312, + "logps/chosen": -73.8195571899414, + "logps/rejected": -1627.819091796875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3135436177253723, + "rewards/margins": 15.254382133483887, + "rewards/rejected": -15.567927360534668, + "step": 840 + }, + { + "epoch": 0.2122744583879628, + "grad_norm": 0.0703125, + "learning_rate": 4.810844183070553e-06, + "logits/chosen": 0.5540085434913635, + "logits/rejected": 1.6211318969726562, + "logps/chosen": -72.0704116821289, + "logps/rejected": -1341.570068359375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29530271887779236, + "rewards/margins": 12.359551429748535, + "rewards/rejected": -12.654852867126465, + "step": 850 + }, + { + "epoch": 0.21477180495723294, + "grad_norm": 0.1435546875, + "learning_rate": 4.802438690399622e-06, + "logits/chosen": 0.600739598274231, + "logits/rejected": 1.643431305885315, + "logps/chosen": -70.41534423828125, + "logps/rejected": -1476.7939453125, + "loss": 0.0068, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.26624736189842224, + "rewards/margins": 13.719133377075195, + "rewards/rejected": -13.985379219055176, + "step": 860 + }, + { + "epoch": 0.21726915152650308, + "grad_norm": 0.000701904296875, + "learning_rate": 4.793858149974129e-06, + "logits/chosen": 0.6058120727539062, + "logits/rejected": 1.721599817276001, + "logps/chosen": -79.14794921875, + "logps/rejected": -1739.1048583984375, + "loss": 0.0036, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3524496853351593, + "rewards/margins": 16.286325454711914, + "rewards/rejected": -16.638776779174805, + "step": 870 + }, + { + "epoch": 0.21976649809577323, + "grad_norm": 0.047119140625, + "learning_rate": 4.785103214147747e-06, + "logits/chosen": 0.6141168475151062, + "logits/rejected": 1.7174230813980103, + "logps/chosen": -77.48551940917969, + "logps/rejected": -1538.275634765625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34762924909591675, + "rewards/margins": 14.33294677734375, + "rewards/rejected": -14.680575370788574, + "step": 880 + }, + { + "epoch": 0.2222638446650434, + "grad_norm": 0.000667572021484375, + "learning_rate": 4.776174548532926e-06, + "logits/chosen": 0.6287072896957397, + "logits/rejected": 1.6851770877838135, + "logps/chosen": -77.02873229980469, + "logps/rejected": -1560.05810546875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3427741825580597, + "rewards/margins": 14.53984260559082, + "rewards/rejected": -14.882616996765137, + "step": 890 + }, + { + "epoch": 0.22476119123431354, + "grad_norm": 0.07080078125, + "learning_rate": 4.767072831950288e-06, + "logits/chosen": 0.617357611656189, + "logits/rejected": 1.7079490423202515, + "logps/chosen": -77.236572265625, + "logps/rejected": -1560.383544921875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3377893567085266, + "rewards/margins": 14.535722732543945, + "rewards/rejected": -14.873510360717773, + "step": 900 + }, + { + "epoch": 0.22725853780358368, + "grad_norm": 0.01129150390625, + "learning_rate": 4.7577987563770226e-06, + "logits/chosen": 0.6263229250907898, + "logits/rejected": 1.674384355545044, + "logps/chosen": -81.606201171875, + "logps/rejected": -1556.177490234375, + "loss": 0.0057, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37518563866615295, + "rewards/margins": 
14.350992202758789, + "rewards/rejected": -14.726178169250488, + "step": 910 + }, + { + "epoch": 0.22975588437285385, + "grad_norm": 0.00042724609375, + "learning_rate": 4.748353026894273e-06, + "logits/chosen": 0.63312166929245, + "logits/rejected": 1.7262632846832275, + "logps/chosen": -73.91940307617188, + "logps/rejected": -1473.6871337890625, + "loss": 0.0037, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.29600244760513306, + "rewards/margins": 13.644391059875488, + "rewards/rejected": -13.940393447875977, + "step": 920 + }, + { + "epoch": 0.232253230942124, + "grad_norm": 0.00836181640625, + "learning_rate": 4.738736361633532e-06, + "logits/chosen": 0.6512018442153931, + "logits/rejected": 1.6386451721191406, + "logps/chosen": -75.5640869140625, + "logps/rejected": -1397.0894775390625, + "loss": 0.0042, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3240968585014343, + "rewards/margins": 12.940801620483398, + "rewards/rejected": -13.264900207519531, + "step": 930 + }, + { + "epoch": 0.23475057751139414, + "grad_norm": 0.00250244140625, + "learning_rate": 4.728949491722046e-06, + "logits/chosen": 0.6666821837425232, + "logits/rejected": 1.6732994318008423, + "logps/chosen": -76.89160919189453, + "logps/rejected": -1383.048095703125, + "loss": 0.0052, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3491043150424957, + "rewards/margins": 12.806585311889648, + "rewards/rejected": -13.155691146850586, + "step": 940 + }, + { + "epoch": 0.2372479240806643, + "grad_norm": 0.07421875, + "learning_rate": 4.718993161227231e-06, + "logits/chosen": 0.5883976221084595, + "logits/rejected": 1.7654300928115845, + "logps/chosen": -84.6438217163086, + "logps/rejected": -1749.6275634765625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3954206109046936, + "rewards/margins": 16.34560775756836, + "rewards/rejected": -16.74102783203125, + "step": 950 + }, + { + "epoch": 0.23974527064993445, + "grad_norm": 0.006500244140625, + "learning_rate": 4.708868127100098e-06, + "logits/chosen": 0.666793167591095, + "logits/rejected": 1.6790577173233032, + "logps/chosen": -75.14250183105469, + "logps/rejected": -1408.4847412109375, + "loss": 0.0034, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3262697458267212, + "rewards/margins": 13.068046569824219, + "rewards/rejected": -13.394315719604492, + "step": 960 + }, + { + "epoch": 0.2422426172192046, + "grad_norm": 0.0001659393310546875, + "learning_rate": 4.6985751591177075e-06, + "logits/chosen": 0.62273108959198, + "logits/rejected": 1.7520809173583984, + "logps/chosen": -87.58243560791016, + "logps/rejected": -1661.821533203125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4351579546928406, + "rewards/margins": 15.40393352508545, + "rewards/rejected": -15.839093208312988, + "step": 970 + }, + { + "epoch": 0.24473996378847473, + "grad_norm": 0.00885009765625, + "learning_rate": 4.688115039824648e-06, + "logits/chosen": 0.6803555488586426, + "logits/rejected": 1.7338272333145142, + "logps/chosen": -88.861328125, + "logps/rejected": -1561.404541015625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4565040171146393, + "rewards/margins": 14.439603805541992, + "rewards/rejected": -14.896108627319336, + "step": 980 + }, + { + "epoch": 0.2472373103577449, + "grad_norm": 5.424022674560547e-06, + "learning_rate": 4.677488564473535e-06, + "logits/chosen": 0.6470680832862854, + "logits/rejected": 1.818428635597229, + "logps/chosen": -92.59998321533203, + "logps/rejected": 
-1680.185791015625, + "loss": 0.0027, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4968494772911072, + "rewards/margins": 15.579751968383789, + "rewards/rejected": -16.076602935791016, + "step": 990 + }, + { + "epoch": 0.24973465692701505, + "grad_norm": 0.03173828125, + "learning_rate": 4.666696540964556e-06, + "logits/chosen": 0.7138900756835938, + "logits/rejected": 1.7771879434585571, + "logps/chosen": -99.16166687011719, + "logps/rejected": -1503.458984375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.551328182220459, + "rewards/margins": 13.802647590637207, + "rewards/rejected": -14.353976249694824, + "step": 1000 + }, + { + "epoch": 0.24973465692701505, + "eval_logits/chosen": 0.7451997995376587, + "eval_logits/rejected": 1.5489047765731812, + "eval_logps/chosen": -80.45184326171875, + "eval_logps/rejected": -830.6234130859375, + "eval_loss": 0.006477854214608669, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.36095961928367615, + "eval_rewards/margins": 7.334376335144043, + "eval_rewards/rejected": -7.695336818695068, + "eval_runtime": 0.619, + "eval_samples_per_second": 8.077, + "eval_steps_per_second": 8.077, + "step": 1000 + }, + { + "epoch": 0.2522320034962852, + "grad_norm": 0.0003871917724609375, + "learning_rate": 4.6557397897840454e-06, + "logits/chosen": 0.693498969078064, + "logits/rejected": 1.753259301185608, + "logps/chosen": -102.29890441894531, + "logps/rejected": -1543.0269775390625, + "loss": 0.005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.583904504776001, + "rewards/margins": 14.051568984985352, + "rewards/rejected": -14.635473251342773, + "step": 1010 + }, + { + "epoch": 0.25472935006555536, + "grad_norm": 0.0003032684326171875, + "learning_rate": 4.644619143942108e-06, + "logits/chosen": 0.5707821249961853, + "logits/rejected": 1.7276995182037354, + "logps/chosen": -88.48922729492188, + "logps/rejected": -1698.140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44621315598487854, + "rewards/margins": 15.652883529663086, + "rewards/rejected": -16.099096298217773, + "step": 1020 + }, + { + "epoch": 0.2572266966348255, + "grad_norm": 0.076171875, + "learning_rate": 4.633335448909284e-06, + "logits/chosen": 0.658003032207489, + "logits/rejected": 1.7504956722259521, + "logps/chosen": -89.60042572021484, + "logps/rejected": -1624.2664794921875, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4533475339412689, + "rewards/margins": 14.999029159545898, + "rewards/rejected": -15.452377319335938, + "step": 1030 + }, + { + "epoch": 0.25972404320409564, + "grad_norm": 0.0308837890625, + "learning_rate": 4.621889562552272e-06, + "logits/chosen": 0.6270695924758911, + "logits/rejected": 1.8431812524795532, + "logps/chosen": -105.17597961425781, + "logps/rejected": -1880.988037109375, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6098282933235168, + "rewards/margins": 17.456111907958984, + "rewards/rejected": -18.065940856933594, + "step": 1040 + }, + { + "epoch": 0.2622213897733658, + "grad_norm": 3.886222839355469e-05, + "learning_rate": 4.610282355068707e-06, + "logits/chosen": 0.577286422252655, + "logits/rejected": 1.723755121231079, + "logps/chosen": -109.06168365478516, + "logps/rejected": -1825.4671630859375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6522443890571594, + "rewards/margins": 16.784276962280273, + "rewards/rejected": -17.4365234375, + "step": 1050 + }, + { + "epoch": 0.26471873634263593, + "grad_norm": 
0.078125, + "learning_rate": 4.598514708921006e-06, + "logits/chosen": 0.6790138483047485, + "logits/rejected": 1.8542677164077759, + "logps/chosen": -101.43669128417969, + "logps/rejected": -1781.3316650390625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5860346555709839, + "rewards/margins": 16.492624282836914, + "rewards/rejected": -17.078659057617188, + "step": 1060 + }, + { + "epoch": 0.26721608291190607, + "grad_norm": 0.0002994537353515625, + "learning_rate": 4.5865875187692695e-06, + "logits/chosen": 0.6998518109321594, + "logits/rejected": 1.8515506982803345, + "logps/chosen": -90.31967163085938, + "logps/rejected": -1559.104736328125, + "loss": 0.0043, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46949252486228943, + "rewards/margins": 14.420549392700195, + "rewards/rejected": -14.890042304992676, + "step": 1070 + }, + { + "epoch": 0.26971342948117627, + "grad_norm": 0.04296875, + "learning_rate": 4.57450169140327e-06, + "logits/chosen": 0.6541129350662231, + "logits/rejected": 1.944573163986206, + "logps/chosen": -94.61952209472656, + "logps/rejected": -1980.4349365234375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5051703453063965, + "rewards/margins": 18.555133819580078, + "rewards/rejected": -19.060306549072266, + "step": 1080 + }, + { + "epoch": 0.2722107760504464, + "grad_norm": 0.0250244140625, + "learning_rate": 4.562258145673507e-06, + "logits/chosen": 0.624032199382782, + "logits/rejected": 1.8918708562850952, + "logps/chosen": -105.9957275390625, + "logps/rejected": -1974.2584228515625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6295837759971619, + "rewards/margins": 18.393098831176758, + "rewards/rejected": -19.022680282592773, + "step": 1090 + }, + { + "epoch": 0.27470812261971655, + "grad_norm": 0.0026092529296875, + "learning_rate": 4.549857812421353e-06, + "logits/chosen": 0.64922696352005, + "logits/rejected": 1.8475501537322998, + "logps/chosen": -89.56166076660156, + "logps/rejected": -1663.010986328125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46423882246017456, + "rewards/margins": 15.462870597839355, + "rewards/rejected": -15.92711067199707, + "step": 1100 + }, + { + "epoch": 0.2772054691889867, + "grad_norm": 0.0084228515625, + "learning_rate": 4.537301634408281e-06, + "logits/chosen": 0.6925086975097656, + "logits/rejected": 1.7738326787948608, + "logps/chosen": -85.5665512084961, + "logps/rejected": -1501.200439453125, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4182400703430176, + "rewards/margins": 13.875646591186523, + "rewards/rejected": -14.293886184692383, + "step": 1110 + }, + { + "epoch": 0.27970281575825684, + "grad_norm": 0.0732421875, + "learning_rate": 4.52459056624419e-06, + "logits/chosen": 0.7249046564102173, + "logits/rejected": 1.795248031616211, + "logps/chosen": -102.11865997314453, + "logps/rejected": -1670.522216796875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5903103351593018, + "rewards/margins": 15.393072128295898, + "rewards/rejected": -15.983380317687988, + "step": 1120 + }, + { + "epoch": 0.282200162327527, + "grad_norm": 0.06494140625, + "learning_rate": 4.51172557431483e-06, + "logits/chosen": 0.6399365663528442, + "logits/rejected": 1.7282390594482422, + "logps/chosen": -105.56150817871094, + "logps/rejected": -1699.7261962890625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6241176724433899, + "rewards/margins": 15.525866508483887, 
+ "rewards/rejected": -16.14998435974121, + "step": 1130 + }, + { + "epoch": 0.2846975088967972, + "grad_norm": 0.054443359375, + "learning_rate": 4.49870763670833e-06, + "logits/chosen": 0.6207230687141418, + "logits/rejected": 1.878230333328247, + "logps/chosen": -95.18315124511719, + "logps/rejected": -1844.5869140625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5091260671615601, + "rewards/margins": 17.207239151000977, + "rewards/rejected": -17.71636390686035, + "step": 1140 + }, + { + "epoch": 0.2871948554660673, + "grad_norm": 0.06591796875, + "learning_rate": 4.4855377431408335e-06, + "logits/chosen": 0.6628460884094238, + "logits/rejected": 1.7432994842529297, + "logps/chosen": -109.8287124633789, + "logps/rejected": -1642.527099609375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.6535288095474243, + "rewards/margins": 14.924878120422363, + "rewards/rejected": -15.578405380249023, + "step": 1150 + }, + { + "epoch": 0.28969220203533746, + "grad_norm": 0.005096435546875, + "learning_rate": 4.472216894881261e-06, + "logits/chosen": 0.6672796010971069, + "logits/rejected": 1.738856315612793, + "logps/chosen": -89.06207275390625, + "logps/rejected": -1540.048583984375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46248659491539, + "rewards/margins": 14.235052108764648, + "rewards/rejected": -14.697538375854492, + "step": 1160 + }, + { + "epoch": 0.2921895486046076, + "grad_norm": 0.047607421875, + "learning_rate": 4.4587461046751815e-06, + "logits/chosen": 0.6774252653121948, + "logits/rejected": 1.7696081399917603, + "logps/chosen": -81.09745788574219, + "logps/rejected": -1620.4942626953125, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38098251819610596, + "rewards/margins": 15.126324653625488, + "rewards/rejected": -15.507307052612305, + "step": 1170 + }, + { + "epoch": 0.29468689517387775, + "grad_norm": 0.008544921875, + "learning_rate": 4.44512639666781e-06, + "logits/chosen": 0.6951876878738403, + "logits/rejected": 1.7908748388290405, + "logps/chosen": -76.070068359375, + "logps/rejected": -1503.576416015625, + "loss": 0.0032, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32203441858291626, + "rewards/margins": 14.029817581176758, + "rewards/rejected": -14.351852416992188, + "step": 1180 + }, + { + "epoch": 0.2971842417431479, + "grad_norm": 0.890625, + "learning_rate": 4.431358806326158e-06, + "logits/chosen": 0.5664582848548889, + "logits/rejected": 1.6636062860488892, + "logps/chosen": -77.29322814941406, + "logps/rejected": -1649.8541259765625, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32812389731407166, + "rewards/margins": 15.302160263061523, + "rewards/rejected": -15.630284309387207, + "step": 1190 + }, + { + "epoch": 0.29968158831241803, + "grad_norm": 0.765625, + "learning_rate": 4.4174443803603e-06, + "logits/chosen": 0.7006584405899048, + "logits/rejected": 1.8050626516342163, + "logps/chosen": -101.17628479003906, + "logps/rejected": -1638.1669921875, + "loss": 0.004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5764530301094055, + "rewards/margins": 15.088088989257812, + "rewards/rejected": -15.664541244506836, + "step": 1200 + }, + { + "epoch": 0.30217893488168823, + "grad_norm": 0.045166015625, + "learning_rate": 4.4033841766438e-06, + "logits/chosen": 0.5828436613082886, + "logits/rejected": 1.6060895919799805, + "logps/chosen": -79.80711364746094, + "logps/rejected": -1499.616943359375, + "loss": 0.0038, + 
"rewards/accuracies": 1.0, + "rewards/chosen": -0.35304346680641174, + "rewards/margins": 13.869488716125488, + "rewards/rejected": -14.222529411315918, + "step": 1210 + }, + { + "epoch": 0.3046762814509584, + "grad_norm": 0.0005035400390625, + "learning_rate": 4.389179264133281e-06, + "logits/chosen": 0.6191063523292542, + "logits/rejected": 1.7791473865509033, + "logps/chosen": -74.18888854980469, + "logps/rejected": -1617.736572265625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31279683113098145, + "rewards/margins": 15.179719924926758, + "rewards/rejected": -15.492517471313477, + "step": 1220 + }, + { + "epoch": 0.3071736280202285, + "grad_norm": 0.10205078125, + "learning_rate": 4.374830722787159e-06, + "logits/chosen": 0.5632847547531128, + "logits/rejected": 1.709839105606079, + "logps/chosen": -71.81803894042969, + "logps/rejected": -1728.1201171875, + "loss": 0.0035, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2758982479572296, + "rewards/margins": 16.288021087646484, + "rewards/rejected": -16.563919067382812, + "step": 1230 + }, + { + "epoch": 0.30967097458949866, + "grad_norm": 0.08984375, + "learning_rate": 4.360339643483533e-06, + "logits/chosen": 0.5613077878952026, + "logits/rejected": 1.6441980600357056, + "logps/chosen": -71.94982147216797, + "logps/rejected": -1693.345947265625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.27554553747177124, + "rewards/margins": 15.830523490905762, + "rewards/rejected": -16.106069564819336, + "step": 1240 + }, + { + "epoch": 0.3121683211587688, + "grad_norm": 0.00022411346435546875, + "learning_rate": 4.345707127937253e-06, + "logits/chosen": 0.5210096836090088, + "logits/rejected": 1.81972336769104, + "logps/chosen": -72.21741485595703, + "logps/rejected": -1985.244873046875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2866365611553192, + "rewards/margins": 18.848371505737305, + "rewards/rejected": -19.135007858276367, + "step": 1250 + }, + { + "epoch": 0.31466566772803894, + "grad_norm": 0.0087890625, + "learning_rate": 4.330934288616154e-06, + "logits/chosen": 0.6370071172714233, + "logits/rejected": 1.816506028175354, + "logps/chosen": -77.37321472167969, + "logps/rejected": -1700.324951171875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33770519495010376, + "rewards/margins": 15.96662712097168, + "rewards/rejected": -16.304332733154297, + "step": 1260 + }, + { + "epoch": 0.3171630142973091, + "grad_norm": 0.043701171875, + "learning_rate": 4.316022248656485e-06, + "logits/chosen": 0.5022410154342651, + "logits/rejected": 1.5646175146102905, + "logps/chosen": -71.90743255615234, + "logps/rejected": -1543.3675537109375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2772095799446106, + "rewards/margins": 14.190910339355469, + "rewards/rejected": -14.468118667602539, + "step": 1270 + }, + { + "epoch": 0.3196603608665793, + "grad_norm": 0.01177978515625, + "learning_rate": 4.3009721417775166e-06, + "logits/chosen": 0.5786353349685669, + "logits/rejected": 1.7490533590316772, + "logps/chosen": -76.0634536743164, + "logps/rejected": -1822.3128662109375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33201277256011963, + "rewards/margins": 17.15824317932129, + "rewards/rejected": -17.49025535583496, + "step": 1280 + }, + { + "epoch": 0.3221577074358494, + "grad_norm": 0.006378173828125, + "learning_rate": 4.285785112195346e-06, + "logits/chosen": 0.5005044341087341, + 
"logits/rejected": 1.6343857049942017, + "logps/chosen": -79.80322265625, + "logps/rejected": -1840.0863037109375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.360287070274353, + "rewards/margins": 17.2882080078125, + "rewards/rejected": -17.648494720458984, + "step": 1290 + }, + { + "epoch": 0.32465505400511957, + "grad_norm": 0.0218505859375, + "learning_rate": 4.27046231453591e-06, + "logits/chosen": 0.5454662442207336, + "logits/rejected": 1.7699216604232788, + "logps/chosen": -73.63867950439453, + "logps/rejected": -1785.0687255859375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3033478856086731, + "rewards/margins": 16.751049041748047, + "rewards/rejected": -17.054393768310547, + "step": 1300 + }, + { + "epoch": 0.3271524005743897, + "grad_norm": 3.4458935260772705e-08, + "learning_rate": 4.255004913747196e-06, + "logits/chosen": 0.5776439905166626, + "logits/rejected": 1.7879540920257568, + "logps/chosen": -73.9155502319336, + "logps/rejected": -1853.062744140625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3109303414821625, + "rewards/margins": 17.477283477783203, + "rewards/rejected": -17.78821563720703, + "step": 1310 + }, + { + "epoch": 0.32964974714365985, + "grad_norm": 0.0025787353515625, + "learning_rate": 4.2394140850106825e-06, + "logits/chosen": 0.5839067697525024, + "logits/rejected": 1.7082710266113281, + "logps/chosen": -79.85846710205078, + "logps/rejected": -1769.608154296875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36045509576797485, + "rewards/margins": 16.53636360168457, + "rewards/rejected": -16.896818161010742, + "step": 1320 + }, + { + "epoch": 0.33214709371293, + "grad_norm": 0.049072265625, + "learning_rate": 4.223691013651986e-06, + "logits/chosen": 0.4838981032371521, + "logits/rejected": 1.5925347805023193, + "logps/chosen": -78.32035827636719, + "logps/rejected": -1797.3795166015625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33012980222702026, + "rewards/margins": 16.690799713134766, + "rewards/rejected": -17.02092933654785, + "step": 1330 + }, + { + "epoch": 0.33464444028220014, + "grad_norm": 0.07421875, + "learning_rate": 4.207836895050748e-06, + "logits/chosen": 0.5183486342430115, + "logits/rejected": 1.813838243484497, + "logps/chosen": -78.78418731689453, + "logps/rejected": -2234.604248046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35720717906951904, + "rewards/margins": 21.229446411132812, + "rewards/rejected": -21.586654663085938, + "step": 1340 + }, + { + "epoch": 0.33714178685147034, + "grad_norm": 0.034912109375, + "learning_rate": 4.1918529345497525e-06, + "logits/chosen": 0.6919676661491394, + "logits/rejected": 1.7151187658309937, + "logps/chosen": -73.4660415649414, + "logps/rejected": -1405.526123046875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.2937282919883728, + "rewards/margins": 13.013958930969238, + "rewards/rejected": -13.307687759399414, + "step": 1350 + }, + { + "epoch": 0.3396391334207405, + "grad_norm": 0.0673828125, + "learning_rate": 4.175740347363289e-06, + "logits/chosen": 0.58757483959198, + "logits/rejected": 1.651389718055725, + "logps/chosen": -76.53401947021484, + "logps/rejected": -1477.0704345703125, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3171566426753998, + "rewards/margins": 13.620218276977539, + "rewards/rejected": -13.93737506866455, + "step": 1360 + }, + { + "epoch": 0.3421364799900106, + 
"grad_norm": 0.00014400482177734375, + "learning_rate": 4.159500358484759e-06, + "logits/chosen": 0.5347609519958496, + "logits/rejected": 1.775714635848999, + "logps/chosen": -80.72816467285156, + "logps/rejected": -2101.271240234375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37172675132751465, + "rewards/margins": 19.874296188354492, + "rewards/rejected": -20.246021270751953, + "step": 1370 + }, + { + "epoch": 0.34463382655928076, + "grad_norm": 0.08544921875, + "learning_rate": 4.143134202593549e-06, + "logits/chosen": 0.624781608581543, + "logits/rejected": 1.6769899129867554, + "logps/chosen": -73.6835708618164, + "logps/rejected": -1517.2562255859375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.28604602813720703, + "rewards/margins": 14.03973388671875, + "rewards/rejected": -14.325779914855957, + "step": 1380 + }, + { + "epoch": 0.3471311731285509, + "grad_norm": 0.000701904296875, + "learning_rate": 4.126643123961158e-06, + "logits/chosen": 0.5619300007820129, + "logits/rejected": 1.733264684677124, + "logps/chosen": -85.01399230957031, + "logps/rejected": -1977.3433837890625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4228588938713074, + "rewards/margins": 18.610515594482422, + "rewards/rejected": -19.033374786376953, + "step": 1390 + }, + { + "epoch": 0.34962851969782105, + "grad_norm": 0.054931640625, + "learning_rate": 4.110028376356599e-06, + "logits/chosen": 0.6396419405937195, + "logits/rejected": 1.709763765335083, + "logps/chosen": -78.08328247070312, + "logps/rejected": -1361.017822265625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32667404413223267, + "rewards/margins": 12.484647750854492, + "rewards/rejected": -12.811322212219238, + "step": 1400 + }, + { + "epoch": 0.3521258662670912, + "grad_norm": 0.0230712890625, + "learning_rate": 4.093291222951079e-06, + "logits/chosen": 0.59341961145401, + "logits/rejected": 1.863669991493225, + "logps/chosen": -88.14995574951172, + "logps/rejected": -1872.766845703125, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4488135874271393, + "rewards/margins": 17.484996795654297, + "rewards/rejected": -17.93381118774414, + "step": 1410 + }, + { + "epoch": 0.3546232128363614, + "grad_norm": 0.058837890625, + "learning_rate": 4.076432936221965e-06, + "logits/chosen": 0.7002652287483215, + "logits/rejected": 1.8483015298843384, + "logps/chosen": -82.04905700683594, + "logps/rejected": -1564.48388671875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38699063658714294, + "rewards/margins": 14.584707260131836, + "rewards/rejected": -14.971699714660645, + "step": 1420 + }, + { + "epoch": 0.35712055940563153, + "grad_norm": 0.0228271484375, + "learning_rate": 4.059454797856039e-06, + "logits/chosen": 0.6757210493087769, + "logits/rejected": 1.8517526388168335, + "logps/chosen": -79.04359436035156, + "logps/rejected": -1546.5145263671875, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36757320165634155, + "rewards/margins": 14.412260055541992, + "rewards/rejected": -14.77983570098877, + "step": 1430 + }, + { + "epoch": 0.3596179059749017, + "grad_norm": 2.86102294921875e-05, + "learning_rate": 4.042358098652057e-06, + "logits/chosen": 0.6149075627326965, + "logits/rejected": 1.7819246053695679, + "logps/chosen": -79.50230407714844, + "logps/rejected": -1636.815673828125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36546218395233154, + 
"rewards/margins": 15.302419662475586, + "rewards/rejected": -15.667881965637207, + "step": 1440 + }, + { + "epoch": 0.3621152525441718, + "grad_norm": 0.00115203857421875, + "learning_rate": 4.025144138422615e-06, + "logits/chosen": 0.6270621418952942, + "logits/rejected": 1.8290207386016846, + "logps/chosen": -94.74217224121094, + "logps/rejected": -1927.5875244140625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5115745663642883, + "rewards/margins": 18.033849716186523, + "rewards/rejected": -18.54542350769043, + "step": 1450 + }, + { + "epoch": 0.36461259911344196, + "grad_norm": 0.01483154296875, + "learning_rate": 4.007814225895321e-06, + "logits/chosen": 0.6495813131332397, + "logits/rejected": 1.905644178390503, + "logps/chosen": -77.37786865234375, + "logps/rejected": -1849.2958984375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35049691796302795, + "rewards/margins": 17.428863525390625, + "rewards/rejected": -17.779361724853516, + "step": 1460 + }, + { + "epoch": 0.3671099456827121, + "grad_norm": 0.030029296875, + "learning_rate": 3.990369678613303e-06, + "logits/chosen": 0.5495260953903198, + "logits/rejected": 1.7052236795425415, + "logps/chosen": -80.6564712524414, + "logps/rejected": -1763.9212646484375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3531542718410492, + "rewards/margins": 16.24311637878418, + "rewards/rejected": -16.596271514892578, + "step": 1470 + }, + { + "epoch": 0.36960729225198224, + "grad_norm": 0.005859375, + "learning_rate": 3.97281182283504e-06, + "logits/chosen": 0.625984251499176, + "logits/rejected": 1.8606735467910767, + "logps/chosen": -79.91282653808594, + "logps/rejected": -2002.6253662109375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37286117672920227, + "rewards/margins": 18.913448333740234, + "rewards/rejected": -19.28631019592285, + "step": 1480 + }, + { + "epoch": 0.37210463882125244, + "grad_norm": 0.0966796875, + "learning_rate": 3.955141993433526e-06, + "logits/chosen": 0.6155849695205688, + "logits/rejected": 1.8059221506118774, + "logps/chosen": -83.35047912597656, + "logps/rejected": -1705.9954833984375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40526407957077026, + "rewards/margins": 15.947067260742188, + "rewards/rejected": -16.352331161499023, + "step": 1490 + }, + { + "epoch": 0.3746019853905226, + "grad_norm": 0.028076171875, + "learning_rate": 3.937361533794784e-06, + "logits/chosen": 0.6340751647949219, + "logits/rejected": 1.7404279708862305, + "logps/chosen": -88.5922622680664, + "logps/rejected": -1672.023193359375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4479256570339203, + "rewards/margins": 15.48670768737793, + "rewards/rejected": -15.934633255004883, + "step": 1500 + }, + { + "epoch": 0.3770993319597927, + "grad_norm": 0.005706787109375, + "learning_rate": 3.919471795715738e-06, + "logits/chosen": 0.6204045414924622, + "logits/rejected": 1.7921810150146484, + "logps/chosen": -76.07710266113281, + "logps/rejected": -1581.0501708984375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3369273245334625, + "rewards/margins": 14.815042495727539, + "rewards/rejected": -15.151969909667969, + "step": 1510 + }, + { + "epoch": 0.37959667852906287, + "grad_norm": 0.033203125, + "learning_rate": 3.901474139301433e-06, + "logits/chosen": 0.5973562002182007, + "logits/rejected": 1.781730055809021, + "logps/chosen": -83.27436065673828, + 
"logps/rejected": -1706.674560546875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3964506983757019, + "rewards/margins": 15.901094436645508, + "rewards/rejected": -16.297544479370117, + "step": 1520 + }, + { + "epoch": 0.382094025098333, + "grad_norm": 0.020263671875, + "learning_rate": 3.883369932861634e-06, + "logits/chosen": 0.66780024766922, + "logits/rejected": 1.8160803318023682, + "logps/chosen": -88.08844757080078, + "logps/rejected": -1613.07373046875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44633254408836365, + "rewards/margins": 15.01531982421875, + "rewards/rejected": -15.461652755737305, + "step": 1530 + }, + { + "epoch": 0.38459137166760315, + "grad_norm": 5.245208740234375e-05, + "learning_rate": 3.865160552806796e-06, + "logits/chosen": 0.6651610136032104, + "logits/rejected": 1.8406972885131836, + "logps/chosen": -79.48463439941406, + "logps/rejected": -1628.993896484375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36581045389175415, + "rewards/margins": 15.247782707214355, + "rewards/rejected": -15.613592147827148, + "step": 1540 + }, + { + "epoch": 0.3870887182368733, + "grad_norm": 1.7404556274414062e-05, + "learning_rate": 3.84684738354342e-06, + "logits/chosen": 0.6299249529838562, + "logits/rejected": 1.8017246723175049, + "logps/chosen": -78.25436401367188, + "logps/rejected": -1699.4664306640625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34191855788230896, + "rewards/margins": 15.943153381347656, + "rewards/rejected": -16.285072326660156, + "step": 1550 + }, + { + "epoch": 0.3895860648061435, + "grad_norm": 0.043212890625, + "learning_rate": 3.828431817368798e-06, + "logits/chosen": 0.6114810705184937, + "logits/rejected": 1.7776029109954834, + "logps/chosen": -83.11959075927734, + "logps/rejected": -1808.9710693359375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39526110887527466, + "rewards/margins": 16.893442153930664, + "rewards/rejected": -17.2887020111084, + "step": 1560 + }, + { + "epoch": 0.39208341137541364, + "grad_norm": 0.0859375, + "learning_rate": 3.8099152543651684e-06, + "logits/chosen": 0.5259889364242554, + "logits/rejected": 1.8491640090942383, + "logps/chosen": -76.72280883789062, + "logps/rejected": -1907.3863525390625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3375067710876465, + "rewards/margins": 18.016918182373047, + "rewards/rejected": -18.354427337646484, + "step": 1570 + }, + { + "epoch": 0.3945807579446838, + "grad_norm": 0.07177734375, + "learning_rate": 3.791299102293261e-06, + "logits/chosen": 0.5979338884353638, + "logits/rejected": 1.8080129623413086, + "logps/chosen": -85.99974060058594, + "logps/rejected": -1962.4896240234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4262419641017914, + "rewards/margins": 18.461589813232422, + "rewards/rejected": -18.88783073425293, + "step": 1580 + }, + { + "epoch": 0.3970781045139539, + "grad_norm": 0.0078125, + "learning_rate": 3.7725847764852774e-06, + "logits/chosen": 0.5477781891822815, + "logits/rejected": 1.7578785419464111, + "logps/chosen": -83.63060760498047, + "logps/rejected": -1990.638671875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39986157417297363, + "rewards/margins": 18.64605140686035, + "rewards/rejected": -19.045909881591797, + "step": 1590 + }, + { + "epoch": 0.39957545108322406, + "grad_norm": 0.0067138671875, + "learning_rate": 3.7537736997372833e-06, + 
"logits/chosen": 0.5983849167823792, + "logits/rejected": 1.6318690776824951, + "logps/chosen": -74.38432312011719, + "logps/rejected": -1474.78759765625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.30779850482940674, + "rewards/margins": 13.598607063293457, + "rewards/rejected": -13.906405448913574, + "step": 1600 + }, + { + "epoch": 0.4020727976524942, + "grad_norm": 0.0035247802734375, + "learning_rate": 3.734867302201038e-06, + "logits/chosen": 0.620630145072937, + "logits/rejected": 1.7149145603179932, + "logps/chosen": -75.28178405761719, + "logps/rejected": -1552.66650390625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32418856024742126, + "rewards/margins": 14.512880325317383, + "rewards/rejected": -14.837068557739258, + "step": 1610 + }, + { + "epoch": 0.40457014422176435, + "grad_norm": 0.04833984375, + "learning_rate": 3.7158670212752666e-06, + "logits/chosen": 0.609667181968689, + "logits/rejected": 1.8217569589614868, + "logps/chosen": -75.36375427246094, + "logps/rejected": -1846.893798828125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31798630952835083, + "rewards/margins": 17.42896842956543, + "rewards/rejected": -17.746957778930664, + "step": 1620 + }, + { + "epoch": 0.40706749079103455, + "grad_norm": 0.0034332275390625, + "learning_rate": 3.696774301496376e-06, + "logits/chosen": 0.6272271871566772, + "logits/rejected": 1.8513765335083008, + "logps/chosen": -76.99528503417969, + "logps/rejected": -1668.758056640625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33525028824806213, + "rewards/margins": 15.664929389953613, + "rewards/rejected": -16.000181198120117, + "step": 1630 + }, + { + "epoch": 0.4095648373603047, + "grad_norm": 0.0020599365234375, + "learning_rate": 3.677590594428629e-06, + "logits/chosen": 0.6275375485420227, + "logits/rejected": 1.746649980545044, + "logps/chosen": -82.9039535522461, + "logps/rejected": -1647.865478515625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.402643620967865, + "rewards/margins": 15.365156173706055, + "rewards/rejected": -15.767801284790039, + "step": 1640 + }, + { + "epoch": 0.41206218392957483, + "grad_norm": 0.0001926422119140625, + "learning_rate": 3.658317358553794e-06, + "logits/chosen": 0.6051415205001831, + "logits/rejected": 1.807227373123169, + "logps/chosen": -78.21363830566406, + "logps/rejected": -1698.2001953125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3519694209098816, + "rewards/margins": 15.862031936645508, + "rewards/rejected": -16.214000701904297, + "step": 1650 + }, + { + "epoch": 0.414559530498845, + "grad_norm": 0.05908203125, + "learning_rate": 3.638956059160252e-06, + "logits/chosen": 0.659566342830658, + "logits/rejected": 1.9395606517791748, + "logps/chosen": -79.38732147216797, + "logps/rejected": -1887.150390625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.369088351726532, + "rewards/margins": 17.81894874572754, + "rewards/rejected": -18.188034057617188, + "step": 1660 + }, + { + "epoch": 0.4170568770681151, + "grad_norm": 0.0064697265625, + "learning_rate": 3.6195081682315972e-06, + "logits/chosen": 0.6888834834098816, + "logits/rejected": 1.855298638343811, + "logps/chosen": -87.92467498779297, + "logps/rejected": -1717.685546875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45286068320274353, + "rewards/margins": 16.04346466064453, + "rewards/rejected": -16.496326446533203, + "step": 1670 + 
}, + { + "epoch": 0.41955422363738526, + "grad_norm": 0.026123046875, + "learning_rate": 3.5999751643347342e-06, + "logits/chosen": 0.5452974438667297, + "logits/rejected": 1.710627794265747, + "logps/chosen": -84.69573974609375, + "logps/rejected": -1964.759033203125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4223009943962097, + "rewards/margins": 18.474639892578125, + "rewards/rejected": -18.896940231323242, + "step": 1680 + }, + { + "epoch": 0.4220515702066554, + "grad_norm": 0.050537109375, + "learning_rate": 3.5803585325074536e-06, + "logits/chosen": 0.5890778303146362, + "logits/rejected": 1.8013776540756226, + "logps/chosen": -78.17984008789062, + "logps/rejected": -1845.5576171875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3491733968257904, + "rewards/margins": 17.39130210876465, + "rewards/rejected": -17.740474700927734, + "step": 1690 + }, + { + "epoch": 0.4245489167759256, + "grad_norm": 0.0849609375, + "learning_rate": 3.5606597641455387e-06, + "logits/chosen": 0.6665171384811401, + "logits/rejected": 1.7867825031280518, + "logps/chosen": -82.6786117553711, + "logps/rejected": -1745.9921875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4073053300380707, + "rewards/margins": 16.353519439697266, + "rewards/rejected": -16.760822296142578, + "step": 1700 + }, + { + "epoch": 0.42704626334519574, + "grad_norm": 0.054443359375, + "learning_rate": 3.540880356889376e-06, + "logits/chosen": 0.6666916012763977, + "logits/rejected": 1.773199439048767, + "logps/chosen": -83.08412170410156, + "logps/rejected": -1565.4173583984375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39527103304862976, + "rewards/margins": 14.483154296875, + "rewards/rejected": -14.878425598144531, + "step": 1710 + }, + { + "epoch": 0.4295436099144659, + "grad_norm": 0.036865234375, + "learning_rate": 3.5210218145100934e-06, + "logits/chosen": 0.6796804666519165, + "logits/rejected": 1.826575517654419, + "logps/chosen": -76.29814147949219, + "logps/rejected": -1558.442626953125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3252686560153961, + "rewards/margins": 14.555532455444336, + "rewards/rejected": -14.8808012008667, + "step": 1720 + }, + { + "epoch": 0.432040956483736, + "grad_norm": 0.037841796875, + "learning_rate": 3.5010856467952335e-06, + "logits/chosen": 0.6157870292663574, + "logits/rejected": 1.7099930047988892, + "logps/chosen": -81.55091094970703, + "logps/rejected": -1618.1546630859375, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3764324188232422, + "rewards/margins": 14.980290412902832, + "rewards/rejected": -15.356722831726074, + "step": 1730 + }, + { + "epoch": 0.43453830305300617, + "grad_norm": 0.1484375, + "learning_rate": 3.4810733694339687e-06, + "logits/chosen": 0.5888208150863647, + "logits/rejected": 1.7958781719207764, + "logps/chosen": -84.57051086425781, + "logps/rejected": -1871.361083984375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4017999768257141, + "rewards/margins": 17.520339965820312, + "rewards/rejected": -17.922138214111328, + "step": 1740 + }, + { + "epoch": 0.4370356496222763, + "grad_norm": 0.032958984375, + "learning_rate": 3.4609865039018676e-06, + "logits/chosen": 0.682072639465332, + "logits/rejected": 1.7670128345489502, + "logps/chosen": -83.45240783691406, + "logps/rejected": -1766.69140625, + "loss": 0.0031, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40751171112060547, + 
"rewards/margins": 16.532672882080078, + "rewards/rejected": -16.940181732177734, + "step": 1750 + }, + { + "epoch": 0.43953299619154645, + "grad_norm": 0.024658203125, + "learning_rate": 3.4408265773452226e-06, + "logits/chosen": 0.6357883214950562, + "logits/rejected": 1.7647396326065063, + "logps/chosen": -75.90863037109375, + "logps/rejected": -1793.2620849609375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32462552189826965, + "rewards/margins": 16.89077377319336, + "rewards/rejected": -17.21540069580078, + "step": 1760 + }, + { + "epoch": 0.44203034276081665, + "grad_norm": 0.00022411346435546875, + "learning_rate": 3.420595122464942e-06, + "logits/chosen": 0.5758832693099976, + "logits/rejected": 1.7814972400665283, + "logps/chosen": -79.77733612060547, + "logps/rejected": -1759.0218505859375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36440029740333557, + "rewards/margins": 16.545846939086914, + "rewards/rejected": -16.910245895385742, + "step": 1770 + }, + { + "epoch": 0.4445276893300868, + "grad_norm": 0.0712890625, + "learning_rate": 3.4002936774000284e-06, + "logits/chosen": 0.5318555235862732, + "logits/rejected": 1.8811362981796265, + "logps/chosen": -77.92293548583984, + "logps/rejected": -2195.729736328125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3443797826766968, + "rewards/margins": 20.8763370513916, + "rewards/rejected": -21.22071647644043, + "step": 1780 + }, + { + "epoch": 0.44702503589935694, + "grad_norm": 0.035888671875, + "learning_rate": 3.3799237856106348e-06, + "logits/chosen": 0.5407482385635376, + "logits/rejected": 1.725608229637146, + "logps/chosen": -77.46542358398438, + "logps/rejected": -1750.359619140625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34000295400619507, + "rewards/margins": 16.431018829345703, + "rewards/rejected": -16.77102279663086, + "step": 1790 + }, + { + "epoch": 0.4495223824686271, + "grad_norm": 0.0003185272216796875, + "learning_rate": 3.35948699576072e-06, + "logits/chosen": 0.5788090825080872, + "logits/rejected": 1.8735895156860352, + "logps/chosen": -83.07856750488281, + "logps/rejected": -2100.754638671875, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3939378559589386, + "rewards/margins": 19.880746841430664, + "rewards/rejected": -20.274681091308594, + "step": 1800 + }, + { + "epoch": 0.4520197290378972, + "grad_norm": 0.13671875, + "learning_rate": 3.3389848616003085e-06, + "logits/chosen": 0.5929907560348511, + "logits/rejected": 1.6974430084228516, + "logps/chosen": -79.56812286376953, + "logps/rejected": -1787.6068115234375, + "loss": 0.0033, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3681466281414032, + "rewards/margins": 16.794055938720703, + "rewards/rejected": -17.16220474243164, + "step": 1810 + }, + { + "epoch": 0.45451707560716736, + "grad_norm": 0.0390625, + "learning_rate": 3.3184189418473674e-06, + "logits/chosen": 0.5751794576644897, + "logits/rejected": 1.7812267541885376, + "logps/chosen": -77.59693908691406, + "logps/rejected": -1778.1439208984375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3408336043357849, + "rewards/margins": 16.739145278930664, + "rewards/rejected": -17.079978942871094, + "step": 1820 + }, + { + "epoch": 0.45701442217643756, + "grad_norm": 0.001373291015625, + "learning_rate": 3.2977908000692925e-06, + "logits/chosen": 0.5487096905708313, + "logits/rejected": 1.74447500705719, + "logps/chosen": -80.25045013427734, + 
"logps/rejected": -1946.0765380859375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3701193928718567, + "rewards/margins": 18.401947021484375, + "rewards/rejected": -18.77206802368164, + "step": 1830 + }, + { + "epoch": 0.4595117687457077, + "grad_norm": 0.048095703125, + "learning_rate": 3.2771020045640435e-06, + "logits/chosen": 0.6444208025932312, + "logits/rejected": 1.6972076892852783, + "logps/chosen": -78.00311279296875, + "logps/rejected": -1579.7557373046875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3551621735095978, + "rewards/margins": 14.743237495422363, + "rewards/rejected": -15.09839916229248, + "step": 1840 + }, + { + "epoch": 0.46200911531497785, + "grad_norm": 0.07177734375, + "learning_rate": 3.256354128240907e-06, + "logits/chosen": 0.6255194544792175, + "logits/rejected": 1.7124531269073486, + "logps/chosen": -85.12455749511719, + "logps/rejected": -1608.01171875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.401846319437027, + "rewards/margins": 14.84345531463623, + "rewards/rejected": -15.245302200317383, + "step": 1850 + }, + { + "epoch": 0.464506461884248, + "grad_norm": 1.191438059322536e-10, + "learning_rate": 3.235548748500914e-06, + "logits/chosen": 0.5620906352996826, + "logits/rejected": 1.8212181329727173, + "logps/chosen": -78.25764465332031, + "logps/rejected": -1836.809326171875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3476276397705078, + "rewards/margins": 17.330211639404297, + "rewards/rejected": -17.677841186523438, + "step": 1860 + }, + { + "epoch": 0.46700380845351813, + "grad_norm": 0.038330078125, + "learning_rate": 3.214687447116913e-06, + "logits/chosen": 0.5774132609367371, + "logits/rejected": 1.7261114120483398, + "logps/chosen": -76.27984619140625, + "logps/rejected": -1707.5220947265625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3342064321041107, + "rewards/margins": 15.912832260131836, + "rewards/rejected": -16.247039794921875, + "step": 1870 + }, + { + "epoch": 0.4695011550227883, + "grad_norm": 0.002532958984375, + "learning_rate": 3.193771810113313e-06, + "logits/chosen": 0.5532559752464294, + "logits/rejected": 1.8629133701324463, + "logps/chosen": -79.43685150146484, + "logps/rejected": -2138.56884765625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3567715287208557, + "rewards/margins": 20.29428482055664, + "rewards/rejected": -20.65105628967285, + "step": 1880 + }, + { + "epoch": 0.4719985015920584, + "grad_norm": 0.018310546875, + "learning_rate": 3.1728034276455032e-06, + "logits/chosen": 0.6407243609428406, + "logits/rejected": 1.7773427963256836, + "logps/chosen": -75.46717834472656, + "logps/rejected": -1624.9876708984375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3326609432697296, + "rewards/margins": 15.156428337097168, + "rewards/rejected": -15.489087104797363, + "step": 1890 + }, + { + "epoch": 0.4744958481613286, + "grad_norm": 0.001556396484375, + "learning_rate": 3.1517838938789597e-06, + "logits/chosen": 0.5151150822639465, + "logits/rejected": 1.6990512609481812, + "logps/chosen": -79.35011291503906, + "logps/rejected": -1993.761962890625, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3504992127418518, + "rewards/margins": 18.6396427154541, + "rewards/rejected": -18.990140914916992, + "step": 1900 + }, + { + "epoch": 0.47699319473059876, + "grad_norm": 0.036376953125, + "learning_rate": 3.130714806868041e-06, + 
"logits/chosen": 0.5437807440757751, + "logits/rejected": 1.6498979330062866, + "logps/chosen": -77.74958801269531, + "logps/rejected": -1746.6015625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3441064655780792, + "rewards/margins": 16.398990631103516, + "rewards/rejected": -16.74309730529785, + "step": 1910 + }, + { + "epoch": 0.4794905412998689, + "grad_norm": 0.0269775390625, + "learning_rate": 3.1095977684344976e-06, + "logits/chosen": 0.6197426319122314, + "logits/rejected": 1.865501046180725, + "logps/chosen": -83.05316162109375, + "logps/rejected": -1912.490234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40501928329467773, + "rewards/margins": 18.013139724731445, + "rewards/rejected": -18.41815757751465, + "step": 1920 + }, + { + "epoch": 0.48198788786913904, + "grad_norm": 0.004058837890625, + "learning_rate": 3.0884343840456874e-06, + "logits/chosen": 0.5581328868865967, + "logits/rejected": 1.818427324295044, + "logps/chosen": -82.58245849609375, + "logps/rejected": -2075.47509765625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39019304513931274, + "rewards/margins": 19.626462936401367, + "rewards/rejected": -20.016658782958984, + "step": 1930 + }, + { + "epoch": 0.4844852344384092, + "grad_norm": 5.0067901611328125e-06, + "learning_rate": 3.0672262626925174e-06, + "logits/chosen": 0.49209919571876526, + "logits/rejected": 1.718467354774475, + "logps/chosen": -82.48509216308594, + "logps/rejected": -1921.3333740234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35821837186813354, + "rewards/margins": 18.051937103271484, + "rewards/rejected": -18.410158157348633, + "step": 1940 + }, + { + "epoch": 0.4869825810076793, + "grad_norm": 0.0255126953125, + "learning_rate": 3.0459750167671147e-06, + "logits/chosen": 0.4969088137149811, + "logits/rejected": 1.7654139995574951, + "logps/chosen": -79.9859390258789, + "logps/rejected": -2075.444580078125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3472353518009186, + "rewards/margins": 19.543991088867188, + "rewards/rejected": -19.89122772216797, + "step": 1950 + }, + { + "epoch": 0.48947992757694947, + "grad_norm": 0.1552734375, + "learning_rate": 3.024682261940247e-06, + "logits/chosen": 0.5588921904563904, + "logits/rejected": 1.6852495670318604, + "logps/chosen": -83.98374938964844, + "logps/rejected": -1691.5599365234375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39910078048706055, + "rewards/margins": 15.737649917602539, + "rewards/rejected": -16.136751174926758, + "step": 1960 + }, + { + "epoch": 0.49197727414621967, + "grad_norm": 1.4185905456542969e-05, + "learning_rate": 3.0033496170384803e-06, + "logits/chosen": 0.6266374588012695, + "logits/rejected": 1.8179903030395508, + "logps/chosen": -77.69737243652344, + "logps/rejected": -1697.776123046875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3449680805206299, + "rewards/margins": 15.93467903137207, + "rewards/rejected": -16.279645919799805, + "step": 1970 + }, + { + "epoch": 0.4944746207154898, + "grad_norm": 0.0458984375, + "learning_rate": 2.9819787039211068e-06, + "logits/chosen": 0.5513324737548828, + "logits/rejected": 1.6900783777236938, + "logps/chosen": -76.17434692382812, + "logps/rejected": -1829.5364990234375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.31986111402511597, + "rewards/margins": 17.185441970825195, + "rewards/rejected": -17.50530242919922, + 
"step": 1980 + }, + { + "epoch": 0.49697196728475995, + "grad_norm": 0.002777099609375, + "learning_rate": 2.960571147356845e-06, + "logits/chosen": 0.5562096834182739, + "logits/rejected": 1.8595008850097656, + "logps/chosen": -83.13392639160156, + "logps/rejected": -2010.169921875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.395429790019989, + "rewards/margins": 18.9808292388916, + "rewards/rejected": -19.37626075744629, + "step": 1990 + }, + { + "epoch": 0.4994693138540301, + "grad_norm": 0.000293731689453125, + "learning_rate": 2.9391285749003046e-06, + "logits/chosen": 0.5312787294387817, + "logits/rejected": 1.7356303930282593, + "logps/chosen": -95.15312194824219, + "logps/rejected": -2160.408935546875, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5149081349372864, + "rewards/margins": 20.321773529052734, + "rewards/rejected": -20.836681365966797, + "step": 2000 + }, + { + "epoch": 0.4994693138540301, + "eval_logits/chosen": 0.6455119848251343, + "eval_logits/rejected": 1.5546293258666992, + "eval_logps/chosen": -82.33647155761719, + "eval_logps/rejected": -980.0130615234375, + "eval_loss": 0.0030529608484357595, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.37980592250823975, + "eval_rewards/margins": 8.809426307678223, + "eval_rewards/rejected": -9.189233779907227, + "eval_runtime": 0.6247, + "eval_samples_per_second": 8.004, + "eval_steps_per_second": 8.004, + "step": 2000 + }, + { + "epoch": 0.5019666604233003, + "grad_norm": 0.06494140625, + "learning_rate": 2.9176526167682543e-06, + "logits/chosen": 0.6404844522476196, + "logits/rejected": 1.8602796792984009, + "logps/chosen": -82.97758483886719, + "logps/rejected": -1836.1568603515625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4049321115016937, + "rewards/margins": 17.25887107849121, + "rewards/rejected": -17.663803100585938, + "step": 2010 + }, + { + "epoch": 0.5044640069925704, + "grad_norm": 0.0166015625, + "learning_rate": 2.8961449057156775e-06, + "logits/chosen": 0.5347205400466919, + "logits/rejected": 1.700486421585083, + "logps/chosen": -84.12736511230469, + "logps/rejected": -1874.214111328125, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3977189064025879, + "rewards/margins": 17.567378997802734, + "rewards/rejected": -17.965099334716797, + "step": 2020 + }, + { + "epoch": 0.5069613535618406, + "grad_norm": 0.0013427734375, + "learning_rate": 2.874607076911642e-06, + "logits/chosen": 0.5987354516983032, + "logits/rejected": 1.7991135120391846, + "logps/chosen": -81.83995819091797, + "logps/rejected": -1828.4136962890625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38373640179634094, + "rewards/margins": 17.181564331054688, + "rewards/rejected": -17.5653018951416, + "step": 2030 + }, + { + "epoch": 0.5094587001311107, + "grad_norm": 0.000850677490234375, + "learning_rate": 2.8530407678149806e-06, + "logits/chosen": 0.6027461886405945, + "logits/rejected": 1.730103850364685, + "logps/chosen": -81.79703521728516, + "logps/rejected": -1646.1201171875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37951746582984924, + "rewards/margins": 15.314178466796875, + "rewards/rejected": -15.693696975708008, + "step": 2040 + }, + { + "epoch": 0.5119560467003809, + "grad_norm": 0.00055694580078125, + "learning_rate": 2.8314476180498003e-06, + "logits/chosen": 0.6401151418685913, + "logits/rejected": 1.7924197912216187, + "logps/chosen": -85.15666198730469, + 
"logps/rejected": -1746.1314697265625, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.421166330575943, + "rewards/margins": 16.335796356201172, + "rewards/rejected": -16.756961822509766, + "step": 2050 + }, + { + "epoch": 0.514453393269651, + "grad_norm": 0.032958984375, + "learning_rate": 2.8098292692808253e-06, + "logits/chosen": 0.6529192328453064, + "logits/rejected": 1.7113368511199951, + "logps/chosen": -83.26264953613281, + "logps/rejected": -1448.375732421875, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3913651704788208, + "rewards/margins": 13.431114196777344, + "rewards/rejected": -13.822479248046875, + "step": 2060 + }, + { + "epoch": 0.5169507398389211, + "grad_norm": 0.035400390625, + "learning_rate": 2.7881873650885904e-06, + "logits/chosen": 0.6235641241073608, + "logits/rejected": 1.7722196578979492, + "logps/chosen": -85.73550415039062, + "logps/rejected": -1683.2877197265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42536306381225586, + "rewards/margins": 15.712361335754395, + "rewards/rejected": -16.13772201538086, + "step": 2070 + }, + { + "epoch": 0.5194480864081913, + "grad_norm": 0.07470703125, + "learning_rate": 2.7665235508444772e-06, + "logits/chosen": 0.5478901267051697, + "logits/rejected": 1.8091357946395874, + "logps/chosen": -79.84373474121094, + "logps/rejected": -1996.998779296875, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.368811696767807, + "rewards/margins": 18.87316131591797, + "rewards/rejected": -19.241975784301758, + "step": 2080 + }, + { + "epoch": 0.5219454329774614, + "grad_norm": 0.000518798828125, + "learning_rate": 2.7448394735856275e-06, + "logits/chosen": 0.5016141533851624, + "logits/rejected": 1.7318010330200195, + "logps/chosen": -88.0289306640625, + "logps/rejected": -2137.91552734375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43785446882247925, + "rewards/margins": 20.17669105529785, + "rewards/rejected": -20.614543914794922, + "step": 2090 + }, + { + "epoch": 0.5244427795467316, + "grad_norm": 0.041748046875, + "learning_rate": 2.723136781889722e-06, + "logits/chosen": 0.6009372472763062, + "logits/rejected": 1.8194977045059204, + "logps/chosen": -82.27381896972656, + "logps/rejected": -1805.30859375, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38621821999549866, + "rewards/margins": 16.96898651123047, + "rewards/rejected": -17.355205535888672, + "step": 2100 + }, + { + "epoch": 0.5269401261160017, + "grad_norm": 0.0242919921875, + "learning_rate": 2.7014171257496414e-06, + "logits/chosen": 0.5697668790817261, + "logits/rejected": 1.7131723165512085, + "logps/chosen": -84.2120132446289, + "logps/rejected": -1700.6185302734375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3910997211933136, + "rewards/margins": 15.752738952636719, + "rewards/rejected": -16.14383888244629, + "step": 2110 + }, + { + "epoch": 0.5294374726852719, + "grad_norm": 0.052978515625, + "learning_rate": 2.6796821564480237e-06, + "logits/chosen": 0.601753294467926, + "logits/rejected": 1.729688048362732, + "logps/chosen": -76.5484390258789, + "logps/rejected": -1640.7828369140625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32075661420822144, + "rewards/margins": 15.291044235229492, + "rewards/rejected": -15.611801147460938, + "step": 2120 + }, + { + "epoch": 0.531934819254542, + "grad_norm": 0.0012664794921875, + "learning_rate": 2.6579335264317253e-06, + 
"logits/chosen": 0.5816048383712769, + "logits/rejected": 1.7906360626220703, + "logps/chosen": -85.20054626464844, + "logps/rejected": -1883.8203125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40961331129074097, + "rewards/margins": 17.597332000732422, + "rewards/rejected": -18.006946563720703, + "step": 2130 + }, + { + "epoch": 0.5344321658238121, + "grad_norm": 0.00101470947265625, + "learning_rate": 2.6361728891861843e-06, + "logits/chosen": 0.5752017498016357, + "logits/rejected": 1.6957403421401978, + "logps/chosen": -86.33828735351562, + "logps/rejected": -1814.455322265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4084859788417816, + "rewards/margins": 16.83043670654297, + "rewards/rejected": -17.238922119140625, + "step": 2140 + }, + { + "epoch": 0.5369295123930824, + "grad_norm": 0.009521484375, + "learning_rate": 2.614401899109716e-06, + "logits/chosen": 0.5796340703964233, + "logits/rejected": 1.7896589040756226, + "logps/chosen": -78.46866607666016, + "logps/rejected": -1804.115234375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3526671528816223, + "rewards/margins": 16.97123908996582, + "rewards/rejected": -17.32390785217285, + "step": 2150 + }, + { + "epoch": 0.5394268589623525, + "grad_norm": 0.0263671875, + "learning_rate": 2.5926222113877282e-06, + "logits/chosen": 0.5532792806625366, + "logits/rejected": 1.7575023174285889, + "logps/chosen": -86.9544906616211, + "logps/rejected": -1865.7669677734375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4337650239467621, + "rewards/margins": 17.272930145263672, + "rewards/rejected": -17.706693649291992, + "step": 2160 + }, + { + "epoch": 0.5419242055316227, + "grad_norm": 0.0419921875, + "learning_rate": 2.570835481866889e-06, + "logits/chosen": 0.6227487921714783, + "logits/rejected": 1.7569319009780884, + "logps/chosen": -83.56333923339844, + "logps/rejected": -1739.373046875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4034454822540283, + "rewards/margins": 16.28359031677246, + "rewards/rejected": -16.687036514282227, + "step": 2170 + }, + { + "epoch": 0.5444215521008928, + "grad_norm": 0.0240478515625, + "learning_rate": 2.5490433669292337e-06, + "logits/chosen": 0.5318483114242554, + "logits/rejected": 1.7802917957305908, + "logps/chosen": -83.16242218017578, + "logps/rejected": -2065.672119140625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3906503915786743, + "rewards/margins": 19.537628173828125, + "rewards/rejected": -19.92827796936035, + "step": 2180 + }, + { + "epoch": 0.546918898670163, + "grad_norm": 0.00081634521484375, + "learning_rate": 2.527247523366232e-06, + "logits/chosen": 0.55711829662323, + "logits/rejected": 1.7834659814834595, + "logps/chosen": -89.28174591064453, + "logps/rejected": -1952.4468994140625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45531249046325684, + "rewards/margins": 18.357410430908203, + "rewards/rejected": -18.812725067138672, + "step": 2190 + }, + { + "epoch": 0.5494162452394331, + "grad_norm": 0.0498046875, + "learning_rate": 2.5054496082528336e-06, + "logits/chosen": 0.6078628897666931, + "logits/rejected": 1.8645546436309814, + "logps/chosen": -78.72160339355469, + "logps/rejected": -1901.3984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3558036983013153, + "rewards/margins": 17.979564666748047, + "rewards/rejected": -18.335365295410156, + "step": 2200 + }, + { + 
"epoch": 0.5519135918087033, + "grad_norm": 0.030029296875, + "learning_rate": 2.483651278821481e-06, + "logits/chosen": 0.6357477903366089, + "logits/rejected": 1.8168609142303467, + "logps/chosen": -86.10234069824219, + "logps/rejected": -1748.644775390625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43243059515953064, + "rewards/margins": 16.351675033569336, + "rewards/rejected": -16.78410530090332, + "step": 2210 + }, + { + "epoch": 0.5544109383779734, + "grad_norm": 0.01177978515625, + "learning_rate": 2.4618541923361166e-06, + "logits/chosen": 0.6292850971221924, + "logits/rejected": 1.7243268489837646, + "logps/chosen": -83.60933685302734, + "logps/rejected": -1522.750244140625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39073851704597473, + "rewards/margins": 14.038189888000488, + "rewards/rejected": -14.428926467895508, + "step": 2220 + }, + { + "epoch": 0.5569082849472435, + "grad_norm": 0.00070953369140625, + "learning_rate": 2.4400600059661836e-06, + "logits/chosen": 0.5282065868377686, + "logits/rejected": 1.8849893808364868, + "logps/chosen": -86.94523620605469, + "logps/rejected": -2129.426025390625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42179185152053833, + "rewards/margins": 20.111438751220703, + "rewards/rejected": -20.53322982788086, + "step": 2230 + }, + { + "epoch": 0.5594056315165137, + "grad_norm": 0.008544921875, + "learning_rate": 2.41827037666064e-06, + "logits/chosen": 0.6786268353462219, + "logits/rejected": 1.8249973058700562, + "logps/chosen": -76.79341888427734, + "logps/rejected": -1632.072265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33309558033943176, + "rewards/margins": 15.309873580932617, + "rewards/rejected": -15.642970085144043, + "step": 2240 + }, + { + "epoch": 0.5619029780857838, + "grad_norm": 0.018798828125, + "learning_rate": 2.396486961021983e-06, + "logits/chosen": 0.617296576499939, + "logits/rejected": 1.860708236694336, + "logps/chosen": -89.55140686035156, + "logps/rejected": -1905.1920166015625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4716108441352844, + "rewards/margins": 17.87620735168457, + "rewards/rejected": -18.347820281982422, + "step": 2250 + }, + { + "epoch": 0.564400324655054, + "grad_norm": 0.00726318359375, + "learning_rate": 2.3747114151802993e-06, + "logits/chosen": 0.6001085638999939, + "logits/rejected": 1.8411632776260376, + "logps/chosen": -79.05329895019531, + "logps/rejected": -1769.6693115234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3584744334220886, + "rewards/margins": 16.644119262695312, + "rewards/rejected": -17.002593994140625, + "step": 2260 + }, + { + "epoch": 0.5668976712243241, + "grad_norm": 0.038330078125, + "learning_rate": 2.352945394667363e-06, + "logits/chosen": 0.5482415556907654, + "logits/rejected": 1.7739299535751343, + "logps/chosen": -88.25926971435547, + "logps/rejected": -2113.56884765625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43419378995895386, + "rewards/margins": 19.882293701171875, + "rewards/rejected": -20.31648826599121, + "step": 2270 + }, + { + "epoch": 0.5693950177935944, + "grad_norm": 0.06494140625, + "learning_rate": 2.3311905542907627e-06, + "logits/chosen": 0.6261372566223145, + "logits/rejected": 1.787941336631775, + "logps/chosen": -80.4935302734375, + "logps/rejected": -1684.802001953125, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": 
-0.378140389919281, + "rewards/margins": 15.795267105102539, + "rewards/rejected": -16.17340660095215, + "step": 2280 + }, + { + "epoch": 0.5718923643628645, + "grad_norm": 0.033935546875, + "learning_rate": 2.30944854800809e-06, + "logits/chosen": 0.6286464929580688, + "logits/rejected": 1.8051517009735107, + "logps/chosen": -80.61420440673828, + "logps/rejected": -1804.312744140625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37235134840011597, + "rewards/margins": 16.959814071655273, + "rewards/rejected": -17.332164764404297, + "step": 2290 + }, + { + "epoch": 0.5743897109321346, + "grad_norm": 0.0019683837890625, + "learning_rate": 2.287721028801204e-06, + "logits/chosen": 0.5823894739151001, + "logits/rejected": 1.7553184032440186, + "logps/chosen": -89.6335678100586, + "logps/rejected": -1704.480712890625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.45537322759628296, + "rewards/margins": 15.833767890930176, + "rewards/rejected": -16.289142608642578, + "step": 2300 + }, + { + "epoch": 0.5768870575014048, + "grad_norm": 0.021728515625, + "learning_rate": 2.26600964855055e-06, + "logits/chosen": 0.6238933205604553, + "logits/rejected": 1.7979097366333008, + "logps/chosen": -79.57666778564453, + "logps/rejected": -1692.2340087890625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3568623960018158, + "rewards/margins": 15.8856201171875, + "rewards/rejected": -16.242483139038086, + "step": 2310 + }, + { + "epoch": 0.5793844040706749, + "grad_norm": 0.007476806640625, + "learning_rate": 2.244316057909573e-06, + "logits/chosen": 0.6190879344940186, + "logits/rejected": 1.7797908782958984, + "logps/chosen": -86.66450500488281, + "logps/rejected": -1799.299560546875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4307888448238373, + "rewards/margins": 16.876020431518555, + "rewards/rejected": -17.30681037902832, + "step": 2320 + }, + { + "epoch": 0.5818817506399451, + "grad_norm": 0.005279541015625, + "learning_rate": 2.2226419061792282e-06, + "logits/chosen": 0.5849915742874146, + "logits/rejected": 1.8162180185317993, + "logps/chosen": -85.55912017822266, + "logps/rejected": -1866.6416015625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42593854665756226, + "rewards/margins": 17.541982650756836, + "rewards/rejected": -17.96792221069336, + "step": 2330 + }, + { + "epoch": 0.5843790972092152, + "grad_norm": 0.002044677734375, + "learning_rate": 2.200988841182589e-06, + "logits/chosen": 0.6237704157829285, + "logits/rejected": 1.8964016437530518, + "logps/chosen": -95.87744140625, + "logps/rejected": -2077.869140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5327093005180359, + "rewards/margins": 19.543682098388672, + "rewards/rejected": -20.076391220092773, + "step": 2340 + }, + { + "epoch": 0.5868764437784854, + "grad_norm": 0.00469970703125, + "learning_rate": 2.179358509139559e-06, + "logits/chosen": 0.6188510060310364, + "logits/rejected": 1.7551387548446655, + "logps/chosen": -82.06452941894531, + "logps/rejected": -1564.6396484375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.359652578830719, + "rewards/margins": 14.480381965637207, + "rewards/rejected": -14.840034484863281, + "step": 2350 + }, + { + "epoch": 0.5893737903477555, + "grad_norm": 0.05322265625, + "learning_rate": 2.1577525545417254e-06, + "logits/chosen": 0.642662525177002, + "logits/rejected": 1.8487951755523682, + "logps/chosen": -85.49614715576172, 
+ "logps/rejected": -1861.0296630859375, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41231250762939453, + "rewards/margins": 17.494285583496094, + "rewards/rejected": -17.906597137451172, + "step": 2360 + }, + { + "epoch": 0.5918711369170256, + "grad_norm": 0.059326171875, + "learning_rate": 2.1361726200273293e-06, + "logits/chosen": 0.6013755202293396, + "logits/rejected": 1.8614768981933594, + "logps/chosen": -82.80476379394531, + "logps/rejected": -1880.697021484375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38074877858161926, + "rewards/margins": 17.66275978088379, + "rewards/rejected": -18.043506622314453, + "step": 2370 + }, + { + "epoch": 0.5943684834862958, + "grad_norm": 0.04638671875, + "learning_rate": 2.1146203462563773e-06, + "logits/chosen": 0.6672108769416809, + "logits/rejected": 1.8760160207748413, + "logps/chosen": -85.3284912109375, + "logps/rejected": -1663.7691650390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42937812209129333, + "rewards/margins": 15.528106689453125, + "rewards/rejected": -15.957483291625977, + "step": 2380 + }, + { + "epoch": 0.5968658300555659, + "grad_norm": 0.037109375, + "learning_rate": 2.0930973717859117e-06, + "logits/chosen": 0.5693127512931824, + "logits/rejected": 1.8059355020523071, + "logps/chosen": -86.84693145751953, + "logps/rejected": -1841.8323974609375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4406910836696625, + "rewards/margins": 17.286680221557617, + "rewards/rejected": -17.727371215820312, + "step": 2390 + }, + { + "epoch": 0.5993631766248361, + "grad_norm": 4.8160552978515625e-05, + "learning_rate": 2.0716053329454337e-06, + "logits/chosen": 0.633589506149292, + "logits/rejected": 1.8425014019012451, + "logps/chosen": -84.12596130371094, + "logps/rejected": -1987.7154541015625, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40977373719215393, + "rewards/margins": 18.74795913696289, + "rewards/rejected": -19.157733917236328, + "step": 2400 + }, + { + "epoch": 0.6018605231941062, + "grad_norm": 0.020751953125, + "learning_rate": 2.0501458637124963e-06, + "logits/chosen": 0.6005128026008606, + "logits/rejected": 1.9649635553359985, + "logps/chosen": -89.92936706542969, + "logps/rejected": -2303.25341796875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46929341554641724, + "rewards/margins": 21.84885597229004, + "rewards/rejected": -22.31814956665039, + "step": 2410 + }, + { + "epoch": 0.6043578697633765, + "grad_norm": 0.0059814453125, + "learning_rate": 2.0287205955884812e-06, + "logits/chosen": 0.5659859776496887, + "logits/rejected": 1.7156604528427124, + "logps/chosen": -82.71956634521484, + "logps/rejected": -1716.356689453125, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37312421202659607, + "rewards/margins": 15.841397285461426, + "rewards/rejected": -16.214521408081055, + "step": 2420 + }, + { + "epoch": 0.6068552163326466, + "grad_norm": 0.031494140625, + "learning_rate": 2.0073311574745583e-06, + "logits/chosen": 0.615592896938324, + "logits/rejected": 1.8998111486434937, + "logps/chosen": -83.0928726196289, + "logps/rejected": -2058.5458984375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3958067297935486, + "rewards/margins": 19.46796226501465, + "rewards/rejected": -19.863767623901367, + "step": 2430 + }, + { + "epoch": 0.6093525629019167, + "grad_norm": 0.0108642578125, + "learning_rate": 
1.9859791755478453e-06, + "logits/chosen": 0.612500786781311, + "logits/rejected": 1.7525627613067627, + "logps/chosen": -79.78617858886719, + "logps/rejected": -1609.921142578125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3665863871574402, + "rewards/margins": 15.052395820617676, + "rewards/rejected": -15.418981552124023, + "step": 2440 + }, + { + "epoch": 0.6118499094711869, + "grad_norm": 0.05419921875, + "learning_rate": 1.9646662731377737e-06, + "logits/chosen": 0.6692113876342773, + "logits/rejected": 1.816349983215332, + "logps/chosen": -84.40538787841797, + "logps/rejected": -1675.842041015625, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40326443314552307, + "rewards/margins": 15.622156143188477, + "rewards/rejected": -16.025419235229492, + "step": 2450 + }, + { + "epoch": 0.614347256040457, + "grad_norm": 0.00130462646484375, + "learning_rate": 1.9433940706026743e-06, + "logits/chosen": 0.5840574502944946, + "logits/rejected": 1.8281028270721436, + "logps/chosen": -86.7694320678711, + "logps/rejected": -2069.530517578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4303639531135559, + "rewards/margins": 19.5331974029541, + "rewards/rejected": -19.963563919067383, + "step": 2460 + }, + { + "epoch": 0.6168446026097272, + "grad_norm": 0.00012493133544921875, + "learning_rate": 1.9221641852065807e-06, + "logits/chosen": 0.6755739450454712, + "logits/rejected": 1.845654845237732, + "logps/chosen": -88.67765045166016, + "logps/rejected": -1714.5185546875, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.46009987592697144, + "rewards/margins": 15.997251510620117, + "rewards/rejected": -16.457351684570312, + "step": 2470 + }, + { + "epoch": 0.6193419491789973, + "grad_norm": 0.039794921875, + "learning_rate": 1.9009782309962805e-06, + "logits/chosen": 0.5677890181541443, + "logits/rejected": 1.8127906322479248, + "logps/chosen": -76.20381164550781, + "logps/rejected": -1766.592529296875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3160540759563446, + "rewards/margins": 16.549949645996094, + "rewards/rejected": -16.866003036499023, + "step": 2480 + }, + { + "epoch": 0.6218392957482675, + "grad_norm": 0.037841796875, + "learning_rate": 1.8798378186785979e-06, + "logits/chosen": 0.6165963411331177, + "logits/rejected": 1.7844451665878296, + "logps/chosen": -80.53484344482422, + "logps/rejected": -1799.0015869140625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37462836503982544, + "rewards/margins": 16.907718658447266, + "rewards/rejected": -17.282346725463867, + "step": 2490 + }, + { + "epoch": 0.6243366423175376, + "grad_norm": 0.01007080078125, + "learning_rate": 1.8587445554979404e-06, + "logits/chosen": 0.6141692399978638, + "logits/rejected": 1.8811423778533936, + "logps/chosen": -87.24476623535156, + "logps/rejected": -2009.0035400390625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4570063054561615, + "rewards/margins": 18.914798736572266, + "rewards/rejected": -19.371807098388672, + "step": 2500 + }, + { + "epoch": 0.6268339888868077, + "grad_norm": 0.00011968612670898438, + "learning_rate": 1.8377000451141013e-06, + "logits/chosen": 0.6391327977180481, + "logits/rejected": 1.9281005859375, + "logps/chosen": -86.39270782470703, + "logps/rejected": -1954.652099609375, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4386569857597351, + "rewards/margins": 18.369953155517578, + 
"rewards/rejected": -18.808609008789062, + "step": 2510 + }, + { + "epoch": 0.6293313354560779, + "grad_norm": 0.0277099609375, + "learning_rate": 1.8167058874803405e-06, + "logits/chosen": 0.5556064248085022, + "logits/rejected": 1.718269944190979, + "logps/chosen": -86.69864654541016, + "logps/rejected": -1939.4332275390625, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4233566224575043, + "rewards/margins": 18.154285430908203, + "rewards/rejected": -18.5776424407959, + "step": 2520 + }, + { + "epoch": 0.631828682025348, + "grad_norm": 0.0771484375, + "learning_rate": 1.7957636787217451e-06, + "logits/chosen": 0.5710119009017944, + "logits/rejected": 1.7915983200073242, + "logps/chosen": -79.21806335449219, + "logps/rejected": -1942.8629150390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3619151711463928, + "rewards/margins": 18.352802276611328, + "rewards/rejected": -18.71471405029297, + "step": 2530 + }, + { + "epoch": 0.6343260285946182, + "grad_norm": 0.0155029296875, + "learning_rate": 1.7748750110138768e-06, + "logits/chosen": 0.5197774171829224, + "logits/rejected": 1.737969160079956, + "logps/chosen": -88.46708679199219, + "logps/rejected": -2104.29296875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42627063393592834, + "rewards/margins": 19.79252052307129, + "rewards/rejected": -20.21879005432129, + "step": 2540 + }, + { + "epoch": 0.6368233751638883, + "grad_norm": 0.021484375, + "learning_rate": 1.7540414724617282e-06, + "logits/chosen": 0.5759893655776978, + "logits/rejected": 1.7303228378295898, + "logps/chosen": -76.38877868652344, + "logps/rejected": -1828.7796630859375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.32031676173210144, + "rewards/margins": 17.18129539489746, + "rewards/rejected": -17.50161361694336, + "step": 2550 + }, + { + "epoch": 0.6393207217331586, + "grad_norm": 0.0196533203125, + "learning_rate": 1.7332646469789827e-06, + "logits/chosen": 0.6225037574768066, + "logits/rejected": 1.779985785484314, + "logps/chosen": -85.09439086914062, + "logps/rejected": -1516.5433349609375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4056459069252014, + "rewards/margins": 14.090934753417969, + "rewards/rejected": -14.496580123901367, + "step": 2560 + }, + { + "epoch": 0.6418180683024287, + "grad_norm": 0.036376953125, + "learning_rate": 1.7125461141675881e-06, + "logits/chosen": 0.643700122833252, + "logits/rejected": 1.8465179204940796, + "logps/chosen": -80.20843505859375, + "logps/rejected": -1810.455078125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3701106905937195, + "rewards/margins": 16.961139678955078, + "rewards/rejected": -17.331249237060547, + "step": 2570 + }, + { + "epoch": 0.6443154148716989, + "grad_norm": 0.0289306640625, + "learning_rate": 1.6918874491976744e-06, + "logits/chosen": 0.5704053640365601, + "logits/rejected": 1.6976230144500732, + "logps/chosen": -84.80411529541016, + "logps/rejected": -1765.397216796875, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4006038308143616, + "rewards/margins": 16.490291595458984, + "rewards/rejected": -16.89089584350586, + "step": 2580 + }, + { + "epoch": 0.646812761440969, + "grad_norm": 0.000751495361328125, + "learning_rate": 1.6712902226877917e-06, + "logits/chosen": 0.6289549469947815, + "logits/rejected": 1.850502610206604, + "logps/chosen": -88.18513488769531, + "logps/rejected": -1988.833984375, + "loss": 0.0012, + 
"rewards/accuracies": 1.0, + "rewards/chosen": -0.4497924745082855, + "rewards/margins": 18.730804443359375, + "rewards/rejected": -19.180593490600586, + "step": 2590 + }, + { + "epoch": 0.6493101080102391, + "grad_norm": 0.0012664794921875, + "learning_rate": 1.6507560005854977e-06, + "logits/chosen": 0.5509642362594604, + "logits/rejected": 1.7071406841278076, + "logps/chosen": -83.9954833984375, + "logps/rejected": -1830.5101318359375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3968578577041626, + "rewards/margins": 17.073257446289062, + "rewards/rejected": -17.470115661621094, + "step": 2600 + }, + { + "epoch": 0.6518074545795093, + "grad_norm": 0.008544921875, + "learning_rate": 1.6302863440483121e-06, + "logits/chosen": 0.5004338026046753, + "logits/rejected": 1.7150554656982422, + "logps/chosen": -82.43248748779297, + "logps/rejected": -1880.423828125, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36883944272994995, + "rewards/margins": 17.624969482421875, + "rewards/rejected": -17.99380874633789, + "step": 2610 + }, + { + "epoch": 0.6543048011487794, + "grad_norm": 0.0341796875, + "learning_rate": 1.6098828093250203e-06, + "logits/chosen": 0.5160735845565796, + "logits/rejected": 1.7385032176971436, + "logps/chosen": -79.65149688720703, + "logps/rejected": -2075.0966796875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3529607653617859, + "rewards/margins": 19.472675323486328, + "rewards/rejected": -19.825634002685547, + "step": 2620 + }, + { + "epoch": 0.6568021477180496, + "grad_norm": 0.027099609375, + "learning_rate": 1.5895469476373545e-06, + "logits/chosen": 0.5671316385269165, + "logits/rejected": 1.722346305847168, + "logps/chosen": -81.55863189697266, + "logps/rejected": -1681.589111328125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3755113184452057, + "rewards/margins": 15.65942096710205, + "rewards/rejected": -16.03493309020996, + "step": 2630 + }, + { + "epoch": 0.6592994942873197, + "grad_norm": 0.091796875, + "learning_rate": 1.5692803050620642e-06, + "logits/chosen": 0.6067586541175842, + "logits/rejected": 1.7199970483779907, + "logps/chosen": -83.18370056152344, + "logps/rejected": -1680.830322265625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38230761885643005, + "rewards/margins": 15.612162590026855, + "rewards/rejected": -15.994470596313477, + "step": 2640 + }, + { + "epoch": 0.6617968408565899, + "grad_norm": 0.007720947265625, + "learning_rate": 1.5490844224133717e-06, + "logits/chosen": 0.6019744873046875, + "logits/rejected": 1.8558555841445923, + "logps/chosen": -80.30780792236328, + "logps/rejected": -1929.556640625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37255150079727173, + "rewards/margins": 18.213796615600586, + "rewards/rejected": -18.586347579956055, + "step": 2650 + }, + { + "epoch": 0.66429418742586, + "grad_norm": 0.050537109375, + "learning_rate": 1.528960835125822e-06, + "logits/chosen": 0.6771095991134644, + "logits/rejected": 1.8372013568878174, + "logps/chosen": -80.95626068115234, + "logps/rejected": -1653.2200927734375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3823738992214203, + "rewards/margins": 15.478759765625, + "rewards/rejected": -15.86113452911377, + "step": 2660 + }, + { + "epoch": 0.6667915339951301, + "grad_norm": 0.0147705078125, + "learning_rate": 1.5089110731375568e-06, + "logits/chosen": 0.581728994846344, + "logits/rejected": 
1.7974720001220703, + "logps/chosen": -79.06498718261719, + "logps/rejected": -1824.8765869140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35732579231262207, + "rewards/margins": 17.178985595703125, + "rewards/rejected": -17.53631019592285, + "step": 2670 + }, + { + "epoch": 0.6692888805644003, + "grad_norm": 0.032470703125, + "learning_rate": 1.4889366607739925e-06, + "logits/chosen": 0.6092284917831421, + "logits/rejected": 1.6554501056671143, + "logps/chosen": -78.19693756103516, + "logps/rejected": -1454.85107421875, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3566651940345764, + "rewards/margins": 13.518289566040039, + "rewards/rejected": -13.874954223632812, + "step": 2680 + }, + { + "epoch": 0.6717862271336704, + "grad_norm": 0.017578125, + "learning_rate": 1.4690391166319307e-06, + "logits/chosen": 0.5935393571853638, + "logits/rejected": 1.761783242225647, + "logps/chosen": -85.10191345214844, + "logps/rejected": -1834.072021484375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4155469536781311, + "rewards/margins": 17.14066505432129, + "rewards/rejected": -17.556209564208984, + "step": 2690 + }, + { + "epoch": 0.6742835737029407, + "grad_norm": 0.036376953125, + "learning_rate": 1.4492199534641055e-06, + "logits/chosen": 0.6022766828536987, + "logits/rejected": 1.8546326160430908, + "logps/chosen": -84.67176055908203, + "logps/rejected": -1870.0103759765625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41111135482788086, + "rewards/margins": 17.593767166137695, + "rewards/rejected": -18.0048770904541, + "step": 2700 + }, + { + "epoch": 0.6767809202722108, + "grad_norm": 0.001617431640625, + "learning_rate": 1.429480678064174e-06, + "logits/chosen": 0.4885890483856201, + "logits/rejected": 1.7658706903457642, + "logps/chosen": -85.19550323486328, + "logps/rejected": -2298.07373046875, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4054490923881531, + "rewards/margins": 21.81135368347168, + "rewards/rejected": -22.2168025970459, + "step": 2710 + }, + { + "epoch": 0.679278266841481, + "grad_norm": 0.021484375, + "learning_rate": 1.4098227911521523e-06, + "logits/chosen": 0.6109157204627991, + "logits/rejected": 1.8104369640350342, + "logps/chosen": -92.25324249267578, + "logps/rejected": -1919.9534912109375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4862859845161438, + "rewards/margins": 18.005590438842773, + "rewards/rejected": -18.49187660217285, + "step": 2720 + }, + { + "epoch": 0.6817756134107511, + "grad_norm": 0.0732421875, + "learning_rate": 1.3902477872603295e-06, + "logits/chosen": 0.6768110990524292, + "logits/rejected": 1.7166074514389038, + "logps/chosen": -80.58265686035156, + "logps/rejected": -1554.531982421875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36700159311294556, + "rewards/margins": 14.375741958618164, + "rewards/rejected": -14.742744445800781, + "step": 2730 + }, + { + "epoch": 0.6842729599800212, + "grad_norm": 0.0478515625, + "learning_rate": 1.370757154619638e-06, + "logits/chosen": 0.5914765000343323, + "logits/rejected": 1.823325753211975, + "logps/chosen": -86.78914642333984, + "logps/rejected": -1906.5355224609375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42601895332336426, + "rewards/margins": 17.833972930908203, + "rewards/rejected": -18.259990692138672, + "step": 2740 + }, + { + "epoch": 0.6867703065492914, + "grad_norm": 
0.005523681640625, + "learning_rate": 1.3513523750465049e-06, + "logits/chosen": 0.5616365075111389, + "logits/rejected": 1.7267478704452515, + "logps/chosen": -81.25291442871094, + "logps/rejected": -1686.3486328125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36524245142936707, + "rewards/margins": 15.707315444946289, + "rewards/rejected": -16.07255744934082, + "step": 2750 + }, + { + "epoch": 0.6892676531185615, + "grad_norm": 0.00193023681640625, + "learning_rate": 1.332034923830199e-06, + "logits/chosen": 0.5695074200630188, + "logits/rejected": 1.8329308032989502, + "logps/chosen": -82.82709503173828, + "logps/rejected": -1805.4775390625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39727023243904114, + "rewards/margins": 16.94883918762207, + "rewards/rejected": -17.346107482910156, + "step": 2760 + }, + { + "epoch": 0.6917649996878317, + "grad_norm": 0.040771484375, + "learning_rate": 1.31280626962067e-06, + "logits/chosen": 0.6029590368270874, + "logits/rejected": 1.6939897537231445, + "logps/chosen": -86.7250747680664, + "logps/rejected": -1587.8193359375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42630377411842346, + "rewards/margins": 14.680148124694824, + "rewards/rejected": -15.106452941894531, + "step": 2770 + }, + { + "epoch": 0.6942623462571018, + "grad_norm": 0.0003566741943359375, + "learning_rate": 1.2936678743168813e-06, + "logits/chosen": 0.5795254707336426, + "logits/rejected": 1.7682584524154663, + "logps/chosen": -83.47227478027344, + "logps/rejected": -1894.138916015625, + "loss": 0.0021, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39181336760520935, + "rewards/margins": 17.82851791381836, + "rewards/rejected": -18.22032928466797, + "step": 2780 + }, + { + "epoch": 0.696759692826372, + "grad_norm": 0.01373291015625, + "learning_rate": 1.2746211929556777e-06, + "logits/chosen": 0.5124091506004333, + "logits/rejected": 2.0397300720214844, + "logps/chosen": -85.71356201171875, + "logps/rejected": -2490.38232421875, + "loss": 0.0004, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42116060853004456, + "rewards/margins": 23.745744705200195, + "rewards/rejected": -24.16690444946289, + "step": 2790 + }, + { + "epoch": 0.6992570393956421, + "grad_norm": 1.30385160446167e-08, + "learning_rate": 1.2556676736011558e-06, + "logits/chosen": 0.6134932637214661, + "logits/rejected": 1.816425085067749, + "logps/chosen": -85.68560791015625, + "logps/rejected": -1998.1363525390625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43286705017089844, + "rewards/margins": 18.802690505981445, + "rewards/rejected": -19.235559463500977, + "step": 2800 + }, + { + "epoch": 0.7017543859649122, + "grad_norm": 0.0089111328125, + "learning_rate": 1.2368087572345772e-06, + "logits/chosen": 0.6667296886444092, + "logits/rejected": 1.7410768270492554, + "logps/chosen": -84.29058837890625, + "logps/rejected": -1482.312255859375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4156969487667084, + "rewards/margins": 13.738250732421875, + "rewards/rejected": -14.153947830200195, + "step": 2810 + }, + { + "epoch": 0.7042517325341824, + "grad_norm": 0.0927734375, + "learning_rate": 1.2180458776448067e-06, + "logits/chosen": 0.5982272028923035, + "logits/rejected": 1.7856439352035522, + "logps/chosen": -89.98011016845703, + "logps/rejected": -1943.0396728515625, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4640469551086426, + "rewards/margins": 
18.235332489013672, + "rewards/rejected": -18.699377059936523, + "step": 2820 + }, + { + "epoch": 0.7067490791034525, + "grad_norm": 0.02392578125, + "learning_rate": 1.1993804613193158e-06, + "logits/chosen": 0.6234604120254517, + "logits/rejected": 1.765428900718689, + "logps/chosen": -87.09599304199219, + "logps/rejected": -1579.3837890625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.427617609500885, + "rewards/margins": 14.572957038879395, + "rewards/rejected": -15.000572204589844, + "step": 2830 + }, + { + "epoch": 0.7092464256727228, + "grad_norm": 3.7401914596557617e-06, + "learning_rate": 1.1808139273357232e-06, + "logits/chosen": 0.5544342398643494, + "logits/rejected": 1.7727603912353516, + "logps/chosen": -83.8676528930664, + "logps/rejected": -1906.515380859375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3836463987827301, + "rewards/margins": 17.823671340942383, + "rewards/rejected": -18.207317352294922, + "step": 2840 + }, + { + "epoch": 0.7117437722419929, + "grad_norm": 5.5789947509765625e-05, + "learning_rate": 1.1623476872539108e-06, + "logits/chosen": 0.5153034925460815, + "logits/rejected": 1.8462998867034912, + "logps/chosen": -94.67253112792969, + "logps/rejected": -2197.599609375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5202997326850891, + "rewards/margins": 20.727157592773438, + "rewards/rejected": -21.247455596923828, + "step": 2850 + }, + { + "epoch": 0.7142411188112631, + "grad_norm": 0.042724609375, + "learning_rate": 1.1439831450087032e-06, + "logits/chosen": 0.580392062664032, + "logits/rejected": 1.876275658607483, + "logps/chosen": -87.93695068359375, + "logps/rejected": -2129.860107421875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4418310225009918, + "rewards/margins": 20.121156692504883, + "rewards/rejected": -20.56298828125, + "step": 2860 + }, + { + "epoch": 0.7167384653805332, + "grad_norm": 0.0458984375, + "learning_rate": 1.1257216968031357e-06, + "logits/chosen": 0.6597843170166016, + "logits/rejected": 1.8998768329620361, + "logps/chosen": -80.04630279541016, + "logps/rejected": -1752.5791015625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37010782957077026, + "rewards/margins": 16.46237564086914, + "rewards/rejected": -16.832483291625977, + "step": 2870 + }, + { + "epoch": 0.7192358119498033, + "grad_norm": 0.00104522705078125, + "learning_rate": 1.1075647310022974e-06, + "logits/chosen": 0.634041965007782, + "logits/rejected": 1.8106848001480103, + "logps/chosen": -78.93439483642578, + "logps/rejected": -1525.8900146484375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36263298988342285, + "rewards/margins": 14.232080459594727, + "rewards/rejected": -14.594714164733887, + "step": 2880 + }, + { + "epoch": 0.7217331585190735, + "grad_norm": 0.002899169921875, + "learning_rate": 1.0895136280277863e-06, + "logits/chosen": 0.5515082478523254, + "logits/rejected": 1.7851063013076782, + "logps/chosen": -87.03413391113281, + "logps/rejected": -2093.734375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4326532781124115, + "rewards/margins": 19.687620162963867, + "rewards/rejected": -20.120275497436523, + "step": 2890 + }, + { + "epoch": 0.7242305050883436, + "grad_norm": 0.060546875, + "learning_rate": 1.0715697602527542e-06, + "logits/chosen": 0.5289216041564941, + "logits/rejected": 1.7743902206420898, + "logps/chosen": -85.22486114501953, + "logps/rejected": 
-1992.9351806640625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4111485481262207, + "rewards/margins": 18.597354888916016, + "rewards/rejected": -19.00850486755371, + "step": 2900 + }, + { + "epoch": 0.7267278516576138, + "grad_norm": 0.04248046875, + "learning_rate": 1.0537344918975708e-06, + "logits/chosen": 0.654784083366394, + "logits/rejected": 1.7333883047103882, + "logps/chosen": -85.55310821533203, + "logps/rejected": -1545.492919921875, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4073718190193176, + "rewards/margins": 14.238784790039062, + "rewards/rejected": -14.646156311035156, + "step": 2910 + }, + { + "epoch": 0.7292251982268839, + "grad_norm": 1.5079975128173828e-05, + "learning_rate": 1.036009178926107e-06, + "logits/chosen": 0.570530891418457, + "logits/rejected": 1.8232314586639404, + "logps/chosen": -87.81031799316406, + "logps/rejected": -1891.7252197265625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44701051712036133, + "rewards/margins": 17.766794204711914, + "rewards/rejected": -18.213804244995117, + "step": 2920 + }, + { + "epoch": 0.7317225447961541, + "grad_norm": 0.016357421875, + "learning_rate": 1.0183951689426438e-06, + "logits/chosen": 0.5162047147750854, + "logits/rejected": 1.80562424659729, + "logps/chosen": -78.40940856933594, + "logps/rejected": -2212.948974609375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33928531408309937, + "rewards/margins": 21.02674674987793, + "rewards/rejected": -21.366031646728516, + "step": 2930 + }, + { + "epoch": 0.7342198913654242, + "grad_norm": 0.01409912109375, + "learning_rate": 1.0008938010894156e-06, + "logits/chosen": 0.5077947974205017, + "logits/rejected": 1.8344638347625732, + "logps/chosen": -81.39566802978516, + "logps/rejected": -2270.706298828125, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3777496814727783, + "rewards/margins": 21.60778045654297, + "rewards/rejected": -21.98552894592285, + "step": 2940 + }, + { + "epoch": 0.7367172379346943, + "grad_norm": 0.0198974609375, + "learning_rate": 9.83506405944804e-07, + "logits/chosen": 0.5673650503158569, + "logits/rejected": 1.745111107826233, + "logps/chosen": -77.10914611816406, + "logps/rejected": -1838.6285400390625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34020406007766724, + "rewards/margins": 17.217037200927734, + "rewards/rejected": -17.55724334716797, + "step": 2950 + }, + { + "epoch": 0.7392145845039645, + "grad_norm": 0.00946044921875, + "learning_rate": 9.662343054221743e-07, + "logits/chosen": 0.5164293050765991, + "logits/rejected": 1.726947546005249, + "logps/chosen": -88.59376525878906, + "logps/rejected": -2064.08642578125, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4376547336578369, + "rewards/margins": 19.29999351501465, + "rewards/rejected": -19.73764991760254, + "step": 2960 + }, + { + "epoch": 0.7417119310732347, + "grad_norm": 0.0595703125, + "learning_rate": 9.490788126693754e-07, + "logits/chosen": 0.6247397661209106, + "logits/rejected": 1.8680105209350586, + "logps/chosen": -86.50829315185547, + "logps/rejected": -1925.3023681640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4314423203468323, + "rewards/margins": 17.977245330810547, + "rewards/rejected": -18.408687591552734, + "step": 2970 + }, + { + "epoch": 0.7442092776425049, + "grad_norm": 0.0030975341796875, + "learning_rate": 9.32041231968904e-07, + "logits/chosen": 
0.582064151763916, + "logits/rejected": 1.8307263851165771, + "logps/chosen": -87.89469909667969, + "logps/rejected": -2049.106689453125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4329482614994049, + "rewards/margins": 19.329082489013672, + "rewards/rejected": -19.762033462524414, + "step": 2980 + }, + { + "epoch": 0.746706624211775, + "grad_norm": 0.328125, + "learning_rate": 9.151228586387464e-07, + "logits/chosen": 0.6141242384910583, + "logits/rejected": 1.747831106185913, + "logps/chosen": -80.1594467163086, + "logps/rejected": -1789.0576171875, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36252784729003906, + "rewards/margins": 16.812475204467773, + "rewards/rejected": -17.175004959106445, + "step": 2990 + }, + { + "epoch": 0.7492039707810452, + "grad_norm": 0.06494140625, + "learning_rate": 8.983249789338941e-07, + "logits/chosen": 0.6428495645523071, + "logits/rejected": 1.7919700145721436, + "logps/chosen": -82.71188354492188, + "logps/rejected": -1678.5035400390625, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3992861807346344, + "rewards/margins": 15.69371509552002, + "rewards/rejected": -16.092998504638672, + "step": 3000 + }, + { + "epoch": 0.7492039707810452, + "eval_logits/chosen": 0.656849205493927, + "eval_logits/rejected": 1.5703133344650269, + "eval_logps/chosen": -84.02084350585938, + "eval_logps/rejected": -995.490234375, + "eval_loss": 0.0028192740865051746, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.39664965867996216, + "eval_rewards/margins": 8.947355270385742, + "eval_rewards/rejected": -9.34400463104248, + "eval_runtime": 0.621, + "eval_samples_per_second": 8.052, + "eval_steps_per_second": 8.052, + "step": 3000 + }, + { + "epoch": 0.7517013173503153, + "grad_norm": 0.0205078125, + "learning_rate": 8.816488699485593e-07, + "logits/chosen": 0.634880006313324, + "logits/rejected": 1.8458068370819092, + "logps/chosen": -89.79926300048828, + "logps/rejected": -1852.7974853515625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4646337628364563, + "rewards/margins": 17.353429794311523, + "rewards/rejected": -17.81806182861328, + "step": 3010 + }, + { + "epoch": 0.7541986639195855, + "grad_norm": 4.553794860839844e-05, + "learning_rate": 8.650957995190784e-07, + "logits/chosen": 0.5151122212409973, + "logits/rejected": 1.7481235265731812, + "logps/chosen": -79.8306884765625, + "logps/rejected": -2110.71923828125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3657647669315338, + "rewards/margins": 20.01117706298828, + "rewards/rejected": -20.37693977355957, + "step": 3020 + }, + { + "epoch": 0.7566960104888556, + "grad_norm": 0.045166015625, + "learning_rate": 8.486670261275193e-07, + "logits/chosen": 0.6202859878540039, + "logits/rejected": 1.8134170770645142, + "logps/chosen": -84.73997497558594, + "logps/rejected": -1783.783203125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42085719108581543, + "rewards/margins": 16.739408493041992, + "rewards/rejected": -17.160266876220703, + "step": 3030 + }, + { + "epoch": 0.7591933570581257, + "grad_norm": 0.0111083984375, + "learning_rate": 8.32363798806011e-07, + "logits/chosen": 0.5721080303192139, + "logits/rejected": 1.739031195640564, + "logps/chosen": -85.13414764404297, + "logps/rejected": -1891.197509765625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42727169394493103, + "rewards/margins": 17.766677856445312, + 
"rewards/rejected": -18.193950653076172, + "step": 3040 + }, + { + "epoch": 0.7616907036273959, + "grad_norm": 0.0208740234375, + "learning_rate": 8.161873570417742e-07, + "logits/chosen": 0.5966504812240601, + "logits/rejected": 1.8666013479232788, + "logps/chosen": -89.76878356933594, + "logps/rejected": -1943.291015625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4609198570251465, + "rewards/margins": 18.261484146118164, + "rewards/rejected": -18.722402572631836, + "step": 3050 + }, + { + "epoch": 0.764188050196666, + "grad_norm": 0.0098876953125, + "learning_rate": 8.001389306828897e-07, + "logits/chosen": 0.4526674151420593, + "logits/rejected": 1.7495372295379639, + "logps/chosen": -82.58873748779297, + "logps/rejected": -2175.00244140625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3694322407245636, + "rewards/margins": 20.39395523071289, + "rewards/rejected": -20.763385772705078, + "step": 3060 + }, + { + "epoch": 0.7666853967659362, + "grad_norm": 1.7508864402770996e-06, + "learning_rate": 7.842197398447993e-07, + "logits/chosen": 0.5828143358230591, + "logits/rejected": 1.8392832279205322, + "logps/chosen": -79.72120666503906, + "logps/rejected": -1959.4478759765625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3589129149913788, + "rewards/margins": 18.44549560546875, + "rewards/rejected": -18.804407119750977, + "step": 3070 + }, + { + "epoch": 0.7691827433352063, + "grad_norm": 0.00104522705078125, + "learning_rate": 7.684309948175414e-07, + "logits/chosen": 0.5747276544570923, + "logits/rejected": 1.7614377737045288, + "logps/chosen": -83.3620834350586, + "logps/rejected": -1877.7086181640625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4084179401397705, + "rewards/margins": 17.644577026367188, + "rewards/rejected": -18.052997589111328, + "step": 3080 + }, + { + "epoch": 0.7716800899044765, + "grad_norm": 0.00160980224609375, + "learning_rate": 7.527738959737371e-07, + "logits/chosen": 0.536163330078125, + "logits/rejected": 1.8368165493011475, + "logps/chosen": -81.3559799194336, + "logps/rejected": -1827.6383056640625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3685862720012665, + "rewards/margins": 17.127788543701172, + "rewards/rejected": -17.496374130249023, + "step": 3090 + }, + { + "epoch": 0.7741774364737466, + "grad_norm": 0.061279296875, + "learning_rate": 7.372496336773269e-07, + "logits/chosen": 0.6259430050849915, + "logits/rejected": 1.7605386972427368, + "logps/chosen": -82.03521728515625, + "logps/rejected": -1697.513916015625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3883189260959625, + "rewards/margins": 15.871994018554688, + "rewards/rejected": -16.260311126708984, + "step": 3100 + }, + { + "epoch": 0.7766747830430168, + "grad_norm": 0.0439453125, + "learning_rate": 7.218593881930744e-07, + "logits/chosen": 0.6127210259437561, + "logits/rejected": 1.7632982730865479, + "logps/chosen": -77.52657318115234, + "logps/rejected": -1801.5595703125, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3411308526992798, + "rewards/margins": 16.98233985900879, + "rewards/rejected": -17.323471069335938, + "step": 3110 + }, + { + "epoch": 0.779172129612287, + "grad_norm": 0.017333984375, + "learning_rate": 7.066043295968342e-07, + "logits/chosen": 0.5858328938484192, + "logits/rejected": 1.7214057445526123, + "logps/chosen": -82.28968048095703, + "logps/rejected": -1686.96875, + "loss": 0.0013, + 
"rewards/accuracies": 1.0, + "rewards/chosen": -0.377600759267807, + "rewards/margins": 15.575152397155762, + "rewards/rejected": -15.952753067016602, + "step": 3120 + }, + { + "epoch": 0.7816694761815571, + "grad_norm": 0.00028228759765625, + "learning_rate": 6.914856176865891e-07, + "logits/chosen": 0.5658802390098572, + "logits/rejected": 1.7670371532440186, + "logps/chosen": -78.00981140136719, + "logps/rejected": -1716.184814453125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34367048740386963, + "rewards/margins": 16.066198348999023, + "rewards/rejected": -16.409870147705078, + "step": 3130 + }, + { + "epoch": 0.7841668227508273, + "grad_norm": 6.198883056640625e-05, + "learning_rate": 6.765044018942804e-07, + "logits/chosen": 0.6243360042572021, + "logits/rejected": 1.8233163356781006, + "logps/chosen": -77.52008819580078, + "logps/rejected": -1712.4619140625, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3422839641571045, + "rewards/margins": 16.079269409179688, + "rewards/rejected": -16.421554565429688, + "step": 3140 + }, + { + "epoch": 0.7866641693200974, + "grad_norm": 0.134765625, + "learning_rate": 6.616618211984169e-07, + "logits/chosen": 0.617003321647644, + "logits/rejected": 1.855446219444275, + "logps/chosen": -81.59101104736328, + "logps/rejected": -1846.211669921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37706810235977173, + "rewards/margins": 17.38241958618164, + "rewards/rejected": -17.75948715209961, + "step": 3150 + }, + { + "epoch": 0.7891615158893676, + "grad_norm": 0.04833984375, + "learning_rate": 6.469590040374799e-07, + "logits/chosen": 0.5514385104179382, + "logits/rejected": 1.7037725448608398, + "logps/chosen": -92.5042724609375, + "logps/rejected": -1909.711181640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4816574156284332, + "rewards/margins": 17.822921752929688, + "rewards/rejected": -18.304576873779297, + "step": 3160 + }, + { + "epoch": 0.7916588624586377, + "grad_norm": 0.03173828125, + "learning_rate": 6.32397068224136e-07, + "logits/chosen": 0.528624415397644, + "logits/rejected": 1.6811710596084595, + "logps/chosen": -89.05570983886719, + "logps/rejected": -1862.1185302734375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44779324531555176, + "rewards/margins": 17.432941436767578, + "rewards/rejected": -17.880733489990234, + "step": 3170 + }, + { + "epoch": 0.7941562090279078, + "grad_norm": 0.04052734375, + "learning_rate": 6.17977120860249e-07, + "logits/chosen": 0.5938631296157837, + "logits/rejected": 1.7992160320281982, + "logps/chosen": -80.42100524902344, + "logps/rejected": -1828.5582275390625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37270471453666687, + "rewards/margins": 17.214147567749023, + "rewards/rejected": -17.586851119995117, + "step": 3180 + }, + { + "epoch": 0.796653555597178, + "grad_norm": 0.07177734375, + "learning_rate": 6.037002582527121e-07, + "logits/chosen": 0.6156030893325806, + "logits/rejected": 1.7690448760986328, + "logps/chosen": -83.84468078613281, + "logps/rejected": -1731.0732421875, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4016353189945221, + "rewards/margins": 16.079975128173828, + "rewards/rejected": -16.481611251831055, + "step": 3190 + }, + { + "epoch": 0.7991509021664481, + "grad_norm": 0.03759765625, + "learning_rate": 5.895675658300981e-07, + "logits/chosen": 0.6333300471305847, + "logits/rejected": 
1.8136202096939087, + "logps/chosen": -79.72032165527344, + "logps/rejected": -1559.860595703125, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37166255712509155, + "rewards/margins": 14.575735092163086, + "rewards/rejected": -14.947400093078613, + "step": 3200 + }, + { + "epoch": 0.8016482487357183, + "grad_norm": 0.032470703125, + "learning_rate": 5.755801180601381e-07, + "logits/chosen": 0.5778881907463074, + "logits/rejected": 1.754577875137329, + "logps/chosen": -85.21940612792969, + "logps/rejected": -1787.017333984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40619927644729614, + "rewards/margins": 16.748323440551758, + "rewards/rejected": -17.154521942138672, + "step": 3210 + }, + { + "epoch": 0.8041455953049884, + "grad_norm": 0.050537109375, + "learning_rate": 5.617389783680307e-07, + "logits/chosen": 0.5147963762283325, + "logits/rejected": 1.858233094215393, + "logps/chosen": -85.53825378417969, + "logps/rejected": -2189.0078125, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41369113326072693, + "rewards/margins": 20.725154876708984, + "rewards/rejected": -21.13884925842285, + "step": 3220 + }, + { + "epoch": 0.8066429418742586, + "grad_norm": 0.035888671875, + "learning_rate": 5.48045199055596e-07, + "logits/chosen": 0.6537925004959106, + "logits/rejected": 1.8616943359375, + "logps/chosen": -81.87962341308594, + "logps/rejected": -1831.1129150390625, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3843618333339691, + "rewards/margins": 17.22643280029297, + "rewards/rejected": -17.610795974731445, + "step": 3230 + }, + { + "epoch": 0.8091402884435287, + "grad_norm": 0.02197265625, + "learning_rate": 5.344998212212704e-07, + "logits/chosen": 0.5282970070838928, + "logits/rejected": 1.810681939125061, + "logps/chosen": -85.24520111083984, + "logps/rejected": -2183.88037109375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4112313389778137, + "rewards/margins": 20.611225128173828, + "rewards/rejected": -21.022457122802734, + "step": 3240 + }, + { + "epoch": 0.811637635012799, + "grad_norm": 0.0023956298828125, + "learning_rate": 5.211038746809551e-07, + "logits/chosen": 0.6539278030395508, + "logits/rejected": 1.8353042602539062, + "logps/chosen": -83.72335815429688, + "logps/rejected": -1798.7239990234375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4129239618778229, + "rewards/margins": 16.881694793701172, + "rewards/rejected": -17.294618606567383, + "step": 3250 + }, + { + "epoch": 0.8141349815820691, + "grad_norm": 0.0294189453125, + "learning_rate": 5.078583778897216e-07, + "logits/chosen": 0.6602455377578735, + "logits/rejected": 1.7690246105194092, + "logps/chosen": -93.29869079589844, + "logps/rejected": -1809.647216796875, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4940834641456604, + "rewards/margins": 16.90252113342285, + "rewards/rejected": -17.396602630615234, + "step": 3260 + }, + { + "epoch": 0.8166323281513392, + "grad_norm": 0.0830078125, + "learning_rate": 4.94764337864384e-07, + "logits/chosen": 0.6102297902107239, + "logits/rejected": 1.7652736902236938, + "logps/chosen": -81.92073059082031, + "logps/rejected": -1667.0634765625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38310879468917847, + "rewards/margins": 15.526937484741211, + "rewards/rejected": -15.910046577453613, + "step": 3270 + }, + { + "epoch": 0.8191296747206094, + "grad_norm": 0.04150390625, + 
"learning_rate": 4.818227501069328e-07, + "logits/chosen": 0.5220754742622375, + "logits/rejected": 1.9412825107574463, + "logps/chosen": -81.39790344238281, + "logps/rejected": -2294.091064453125, + "loss": 0.0003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3824032247066498, + "rewards/margins": 21.823511123657227, + "rewards/rejected": -22.205913543701172, + "step": 3280 + }, + { + "epoch": 0.8216270212898795, + "grad_norm": 0.025146484375, + "learning_rate": 4.690345985288572e-07, + "logits/chosen": 0.5984300971031189, + "logits/rejected": 1.787674903869629, + "logps/chosen": -85.31007385253906, + "logps/rejected": -1911.9349365234375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41559547185897827, + "rewards/margins": 17.892559051513672, + "rewards/rejected": -18.308155059814453, + "step": 3290 + }, + { + "epoch": 0.8241243678591497, + "grad_norm": 8.791685104370117e-07, + "learning_rate": 4.5640085537633633e-07, + "logits/chosen": 0.5342472791671753, + "logits/rejected": 1.8117504119873047, + "logps/chosen": -77.17405700683594, + "logps/rejected": -2120.84912109375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33397001028060913, + "rewards/margins": 20.135189056396484, + "rewards/rejected": -20.469158172607422, + "step": 3300 + }, + { + "epoch": 0.8266217144284198, + "grad_norm": 7.581710815429688e-05, + "learning_rate": 4.439224811563211e-07, + "logits/chosen": 0.5074091553688049, + "logits/rejected": 1.69931960105896, + "logps/chosen": -87.99649047851562, + "logps/rejected": -1984.584228515625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.442207008600235, + "rewards/margins": 18.59341049194336, + "rewards/rejected": -19.03561782836914, + "step": 3310 + }, + { + "epoch": 0.82911906099769, + "grad_norm": 0.00012111663818359375, + "learning_rate": 4.316004245635158e-07, + "logits/chosen": 0.533842921257019, + "logits/rejected": 1.7812063694000244, + "logps/chosen": -89.41615295410156, + "logps/rejected": -2132.177734375, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4604712426662445, + "rewards/margins": 20.115650177001953, + "rewards/rejected": -20.576122283935547, + "step": 3320 + }, + { + "epoch": 0.8316164075669601, + "grad_norm": 6.8247318267822266e-06, + "learning_rate": 4.194356224082455e-07, + "logits/chosen": 0.4998435378074646, + "logits/rejected": 1.818884253501892, + "logps/chosen": -89.97554779052734, + "logps/rejected": -2246.93017578125, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4607468247413635, + "rewards/margins": 21.1903133392334, + "rewards/rejected": -21.651060104370117, + "step": 3330 + }, + { + "epoch": 0.8341137541362302, + "grad_norm": 0.0008697509765625, + "learning_rate": 4.074289995452338e-07, + "logits/chosen": 0.663809597492218, + "logits/rejected": 1.8902143239974976, + "logps/chosen": -79.80634307861328, + "logps/rejected": -1863.5374755859375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3643765449523926, + "rewards/margins": 17.57371711730957, + "rewards/rejected": -17.938095092773438, + "step": 3340 + }, + { + "epoch": 0.8366111007055004, + "grad_norm": 9.715557098388672e-06, + "learning_rate": 3.9558146880329246e-07, + "logits/chosen": 0.5806099772453308, + "logits/rejected": 1.7180675268173218, + "logps/chosen": -88.5436782836914, + "logps/rejected": -1847.456298828125, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44766122102737427, + "rewards/margins": 
17.159259796142578, + "rewards/rejected": -17.606922149658203, + "step": 3350 + }, + { + "epoch": 0.8391084472747705, + "grad_norm": 0.011962890625, + "learning_rate": 3.838939309159187e-07, + "logits/chosen": 0.6112891435623169, + "logits/rejected": 1.7461353540420532, + "logps/chosen": -85.6560287475586, + "logps/rejected": -1812.320068359375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.42908936738967896, + "rewards/margins": 16.986312866210938, + "rewards/rejected": -17.415403366088867, + "step": 3360 + }, + { + "epoch": 0.8416057938440407, + "grad_norm": 0.00148773193359375, + "learning_rate": 3.723672744528162e-07, + "logits/chosen": 0.5621702671051025, + "logits/rejected": 1.8088220357894897, + "logps/chosen": -78.2458724975586, + "logps/rejected": -1943.078125, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34909874200820923, + "rewards/margins": 18.354211807250977, + "rewards/rejected": -18.703310012817383, + "step": 3370 + }, + { + "epoch": 0.8441031404133108, + "grad_norm": 0.00182342529296875, + "learning_rate": 3.6100237575233647e-07, + "logits/chosen": 0.6288230419158936, + "logits/rejected": 1.7829002141952515, + "logps/chosen": -84.52963256835938, + "logps/rejected": -1607.1246337890625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41369467973709106, + "rewards/margins": 15.001932144165039, + "rewards/rejected": -15.415626525878906, + "step": 3380 + }, + { + "epoch": 0.8466004869825811, + "grad_norm": 0.0247802734375, + "learning_rate": 3.4980009885486054e-07, + "logits/chosen": 0.6984633803367615, + "logits/rejected": 1.8022708892822266, + "logps/chosen": -77.57188415527344, + "logps/rejected": -1505.264892578125, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3510386645793915, + "rewards/margins": 14.059832572937012, + "rewards/rejected": -14.410870552062988, + "step": 3390 + }, + { + "epoch": 0.8490978335518512, + "grad_norm": 0.0277099609375, + "learning_rate": 3.3876129543710197e-07, + "logits/chosen": 0.5471528768539429, + "logits/rejected": 1.768148422241211, + "logps/chosen": -87.0343246459961, + "logps/rejected": -2079.26708984375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44154462218284607, + "rewards/margins": 19.6082706451416, + "rewards/rejected": -20.049814224243164, + "step": 3400 + }, + { + "epoch": 0.8515951801211213, + "grad_norm": 0.00250244140625, + "learning_rate": 3.2788680474735687e-07, + "logits/chosen": 0.5990682244300842, + "logits/rejected": 1.8559329509735107, + "logps/chosen": -80.894287109375, + "logps/rejected": -1869.932373046875, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3692544400691986, + "rewards/margins": 17.629741668701172, + "rewards/rejected": -17.99899673461914, + "step": 3410 + }, + { + "epoch": 0.8540925266903915, + "grad_norm": 0.00060272216796875, + "learning_rate": 3.1717745354170214e-07, + "logits/chosen": 0.550452470779419, + "logits/rejected": 1.9075467586517334, + "logps/chosen": -88.12527465820312, + "logps/rejected": -2150.641357421875, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4492851793766022, + "rewards/margins": 20.334678649902344, + "rewards/rejected": -20.78396224975586, + "step": 3420 + }, + { + "epoch": 0.8565898732596616, + "grad_norm": 0.000537872314453125, + "learning_rate": 3.0663405602113727e-07, + "logits/chosen": 0.5784090757369995, + "logits/rejected": 1.8440923690795898, + "logps/chosen": -77.76791381835938, + "logps/rejected": 
-1944.1751708984375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.34689050912857056, + "rewards/margins": 18.372732162475586, + "rewards/rejected": -18.719623565673828, + "step": 3430 + }, + { + "epoch": 0.8590872198289318, + "grad_norm": 0.01416015625, + "learning_rate": 2.9625741376968107e-07, + "logits/chosen": 0.5425665378570557, + "logits/rejected": 1.7546192407608032, + "logps/chosen": -84.84912109375, + "logps/rejected": -2032.4622802734375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40073734521865845, + "rewards/margins": 18.93886375427246, + "rewards/rejected": -19.339599609375, + "step": 3440 + }, + { + "epoch": 0.8615845663982019, + "grad_norm": 0.0206298828125, + "learning_rate": 2.8604831569343324e-07, + "logits/chosen": 0.5840142965316772, + "logits/rejected": 1.6774394512176514, + "logps/chosen": -87.08283996582031, + "logps/rejected": -1656.0628662109375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4280741214752197, + "rewards/margins": 15.362825393676758, + "rewards/rejected": -15.790898323059082, + "step": 3450 + }, + { + "epoch": 0.864081912967472, + "grad_norm": 0.00439453125, + "learning_rate": 2.760075379605942e-07, + "logits/chosen": 0.5762545466423035, + "logits/rejected": 1.788022756576538, + "logps/chosen": -84.1275634765625, + "logps/rejected": -1882.4407958984375, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40409308671951294, + "rewards/margins": 17.698461532592773, + "rewards/rejected": -18.102556228637695, + "step": 3460 + }, + { + "epoch": 0.8665792595367422, + "grad_norm": 0.042724609375, + "learning_rate": 2.661358439424552e-07, + "logits/chosen": 0.6203972697257996, + "logits/rejected": 1.7815377712249756, + "logps/chosen": -79.66865539550781, + "logps/rejected": -1772.3359375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36624962091445923, + "rewards/margins": 16.665634155273438, + "rewards/rejected": -17.031885147094727, + "step": 3470 + }, + { + "epoch": 0.8690766061060123, + "grad_norm": 0.0026397705078125, + "learning_rate": 2.564339841553615e-07, + "logits/chosen": 0.6519962549209595, + "logits/rejected": 1.8045860528945923, + "logps/chosen": -84.95247650146484, + "logps/rejected": -1720.0745849609375, + "loss": 0.0018, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4275601804256439, + "rewards/margins": 16.091691970825195, + "rewards/rejected": -16.51925277709961, + "step": 3480 + }, + { + "epoch": 0.8715739526752825, + "grad_norm": 0.1845703125, + "learning_rate": 2.469026962036539e-07, + "logits/chosen": 0.5797117352485657, + "logits/rejected": 1.6833524703979492, + "logps/chosen": -88.14852905273438, + "logps/rejected": -1696.3714599609375, + "loss": 0.003, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4267314374446869, + "rewards/margins": 15.646469116210938, + "rewards/rejected": -16.07320213317871, + "step": 3490 + }, + { + "epoch": 0.8740712992445526, + "grad_norm": 0.046142578125, + "learning_rate": 2.3754270472358786e-07, + "logits/chosen": 0.6232503652572632, + "logits/rejected": 1.6990222930908203, + "logps/chosen": -83.14488983154297, + "logps/rejected": -1672.453857421875, + "loss": 0.0022, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39773061871528625, + "rewards/margins": 15.525070190429688, + "rewards/rejected": -15.922798156738281, + "step": 3500 + }, + { + "epoch": 0.8765686458138228, + "grad_norm": 0.07373046875, + "learning_rate": 2.283547213282458e-07, + "logits/chosen": 
0.5654767155647278, + "logits/rejected": 1.7425930500030518, + "logps/chosen": -85.06395721435547, + "logps/rejected": -1774.28125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4012150168418884, + "rewards/margins": 16.552427291870117, + "rewards/rejected": -16.953643798828125, + "step": 3510 + }, + { + "epoch": 0.8790659923830929, + "grad_norm": 0.052734375, + "learning_rate": 2.1933944455343166e-07, + "logits/chosen": 0.5508383512496948, + "logits/rejected": 1.7986376285552979, + "logps/chosen": -81.19587707519531, + "logps/rejected": -2142.4736328125, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.374092161655426, + "rewards/margins": 20.307262420654297, + "rewards/rejected": -20.681354522705078, + "step": 3520 + }, + { + "epoch": 0.8815633389523632, + "grad_norm": 0.003265380859375, + "learning_rate": 2.104975598045647e-07, + "logits/chosen": 0.5937038660049438, + "logits/rejected": 1.7039823532104492, + "logps/chosen": -81.80549621582031, + "logps/rejected": -1617.2646484375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.38183221220970154, + "rewards/margins": 15.109405517578125, + "rewards/rejected": -15.491238594055176, + "step": 3530 + }, + { + "epoch": 0.8840606855216333, + "grad_norm": 0.032470703125, + "learning_rate": 2.018297393045701e-07, + "logits/chosen": 0.6291056871414185, + "logits/rejected": 1.7945873737335205, + "logps/chosen": -81.0144271850586, + "logps/rejected": -1751.724609375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3717043697834015, + "rewards/margins": 16.441532135009766, + "rewards/rejected": -16.813236236572266, + "step": 3540 + }, + { + "epoch": 0.8865580320909034, + "grad_norm": 0.039306640625, + "learning_rate": 1.9333664204277236e-07, + "logits/chosen": 0.5141820907592773, + "logits/rejected": 1.6924489736557007, + "logps/chosen": -83.84037780761719, + "logps/rejected": -2023.642822265625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3969913423061371, + "rewards/margins": 18.992799758911133, + "rewards/rejected": -19.38979148864746, + "step": 3550 + }, + { + "epoch": 0.8890553786601736, + "grad_norm": 2.4437904357910156e-06, + "learning_rate": 1.8501891372479124e-07, + "logits/chosen": 0.5262492895126343, + "logits/rejected": 1.801138162612915, + "logps/chosen": -82.75626373291016, + "logps/rejected": -1913.7564697265625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.37326088547706604, + "rewards/margins": 17.95614242553711, + "rewards/rejected": -18.329402923583984, + "step": 3560 + }, + { + "epoch": 0.8915527252294437, + "grad_norm": 0.0230712890625, + "learning_rate": 1.7687718672345533e-07, + "logits/chosen": 0.5257088541984558, + "logits/rejected": 1.7338473796844482, + "logps/chosen": -84.81585693359375, + "logps/rejected": -1994.5963134765625, + "loss": 0.0007, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4195898473262787, + "rewards/margins": 18.773387908935547, + "rewards/rejected": -19.19297981262207, + "step": 3570 + }, + { + "epoch": 0.8940500717987139, + "grad_norm": 0.032958984375, + "learning_rate": 1.689120800307212e-07, + "logits/chosen": 0.43529587984085083, + "logits/rejected": 1.6352293491363525, + "logps/chosen": -84.8902816772461, + "logps/rejected": -2162.602294921875, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4021669030189514, + "rewards/margins": 20.277816772460938, + "rewards/rejected": -20.679983139038086, + "step": 3580 + }, + { + "epoch": 
0.896547418367984, + "grad_norm": 0.09033203125, + "learning_rate": 1.6112419921061357e-07, + "logits/chosen": 0.6369230151176453, + "logits/rejected": 1.848402976989746, + "logps/chosen": -89.04129791259766, + "logps/rejected": -1788.65234375, + "loss": 0.0025, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4632226526737213, + "rewards/margins": 16.736385345458984, + "rewards/rejected": -17.199607849121094, + "step": 3590 + }, + { + "epoch": 0.8990447649372542, + "grad_norm": 0.039306640625, + "learning_rate": 1.5351413635318807e-07, + "logits/chosen": 0.5430204272270203, + "logits/rejected": 1.6954717636108398, + "logps/chosen": -80.37650299072266, + "logps/rejected": -1739.5556640625, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.36847007274627686, + "rewards/margins": 16.253259658813477, + "rewards/rejected": -16.621726989746094, + "step": 3600 + }, + { + "epoch": 0.9015421115065243, + "grad_norm": 0.00982666015625, + "learning_rate": 1.460824700295138e-07, + "logits/chosen": 0.5976991653442383, + "logits/rejected": 1.8408482074737549, + "logps/chosen": -83.05894470214844, + "logps/rejected": -1974.3404541015625, + "loss": 0.0016, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40586838126182556, + "rewards/margins": 18.630273818969727, + "rewards/rejected": -19.036144256591797, + "step": 3610 + }, + { + "epoch": 0.9040394580757944, + "grad_norm": 0.01531982421875, + "learning_rate": 1.3882976524768694e-07, + "logits/chosen": 0.6637327075004578, + "logits/rejected": 1.75222909450531, + "logps/chosen": -82.24571228027344, + "logps/rejected": -1613.786865234375, + "loss": 0.0023, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39647185802459717, + "rewards/margins": 15.058723449707031, + "rewards/rejected": -15.455195426940918, + "step": 3620 + }, + { + "epoch": 0.9065368046450646, + "grad_norm": 0.00054931640625, + "learning_rate": 1.3175657340987664e-07, + "logits/chosen": 0.6287505030632019, + "logits/rejected": 1.799709677696228, + "logps/chosen": -84.08810424804688, + "logps/rejected": -1803.7301025390625, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4151592254638672, + "rewards/margins": 16.90250015258789, + "rewards/rejected": -17.317657470703125, + "step": 3630 + }, + { + "epoch": 0.9090341512143347, + "grad_norm": 0.0010986328125, + "learning_rate": 1.2486343227040122e-07, + "logits/chosen": 0.5875022411346436, + "logits/rejected": 1.7384836673736572, + "logps/chosen": -87.7680435180664, + "logps/rejected": -1834.406982421875, + "loss": 0.0029, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4344062805175781, + "rewards/margins": 17.189672470092773, + "rewards/rejected": -17.62407875061035, + "step": 3640 + }, + { + "epoch": 0.9115314977836049, + "grad_norm": 0.0419921875, + "learning_rate": 1.181508658948452e-07, + "logits/chosen": 0.6155994534492493, + "logits/rejected": 1.7817541360855103, + "logps/chosen": -80.59324645996094, + "logps/rejected": -1767.1129150390625, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3722308278083801, + "rewards/margins": 16.59187889099121, + "rewards/rejected": -16.964111328125, + "step": 3650 + }, + { + "epoch": 0.9140288443528751, + "grad_norm": 0.0458984375, + "learning_rate": 1.1161938462021627e-07, + "logits/chosen": 0.6269813776016235, + "logits/rejected": 1.7340434789657593, + "logps/chosen": -86.62757110595703, + "logps/rejected": -1731.770751953125, + "loss": 0.0028, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4310578405857086, + 
"rewards/margins": 16.17133903503418, + "rewards/rejected": -16.602397918701172, + "step": 3660 + }, + { + "epoch": 0.9165261909221453, + "grad_norm": 0.1083984375, + "learning_rate": 1.0526948501614536e-07, + "logits/chosen": 0.5681526064872742, + "logits/rejected": 1.8455768823623657, + "logps/chosen": -88.42396545410156, + "logps/rejected": -1962.5306396484375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44573092460632324, + "rewards/margins": 18.391765594482422, + "rewards/rejected": -18.837499618530273, + "step": 3670 + }, + { + "epoch": 0.9190235374914154, + "grad_norm": 0.0791015625, + "learning_rate": 9.910164984713477e-08, + "logits/chosen": 0.5716847777366638, + "logits/rejected": 1.790804147720337, + "logps/chosen": -88.99705505371094, + "logps/rejected": -2003.9739990234375, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.457981675863266, + "rewards/margins": 18.8723087310791, + "rewards/rejected": -19.330291748046875, + "step": 3680 + }, + { + "epoch": 0.9215208840606856, + "grad_norm": 0.0830078125, + "learning_rate": 9.311634803585323e-08, + "logits/chosen": 0.5493127107620239, + "logits/rejected": 1.8056682348251343, + "logps/chosen": -80.04996490478516, + "logps/rejected": -2063.024169921875, + "loss": 0.0024, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3674994707107544, + "rewards/margins": 19.52413558959961, + "rewards/rejected": -19.891637802124023, + "step": 3690 + }, + { + "epoch": 0.9240182306299557, + "grad_norm": 0.000400543212890625, + "learning_rate": 8.7314034627487e-08, + "logits/chosen": 0.5750405192375183, + "logits/rejected": 1.8101009130477905, + "logps/chosen": -78.46788024902344, + "logps/rejected": -1988.633056640625, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35628947615623474, + "rewards/margins": 18.821575164794922, + "rewards/rejected": -19.1778621673584, + "step": 3700 + }, + { + "epoch": 0.9265155771992258, + "grad_norm": 0.03515625, + "learning_rate": 8.16951507551439e-08, + "logits/chosen": 0.6284887194633484, + "logits/rejected": 1.7544790506362915, + "logps/chosen": -78.0561752319336, + "logps/rejected": -1719.0638427734375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3490171432495117, + "rewards/margins": 16.02911949157715, + "rewards/rejected": -16.378137588500977, + "step": 3710 + }, + { + "epoch": 0.929012923768496, + "grad_norm": 6.4849853515625e-05, + "learning_rate": 7.626012360631291e-08, + "logits/chosen": 0.5767999887466431, + "logits/rejected": 1.8027598857879639, + "logps/chosen": -87.67066192626953, + "logps/rejected": -1751.861328125, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.44937849044799805, + "rewards/margins": 16.375062942504883, + "rewards/rejected": -16.82444190979004, + "step": 3720 + }, + { + "epoch": 0.9315102703377661, + "grad_norm": 0.0006256103515625, + "learning_rate": 7.100936639038936e-08, + "logits/chosen": 0.5324774384498596, + "logits/rejected": 1.9097219705581665, + "logps/chosen": -89.81242370605469, + "logps/rejected": -2373.812744140625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4651169776916504, + "rewards/margins": 22.532739639282227, + "rewards/rejected": -22.997854232788086, + "step": 3730 + }, + { + "epoch": 0.9340076169070363, + "grad_norm": 1.0356307029724121e-06, + "learning_rate": 6.594327830725916e-08, + "logits/chosen": 0.5782414674758911, + "logits/rejected": 1.906734824180603, + "logps/chosen": -80.02381896972656, + "logps/rejected": 
-1915.576416015625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3699369728565216, + "rewards/margins": 18.08548927307129, + "rewards/rejected": -18.455425262451172, + "step": 3740 + }, + { + "epoch": 0.9365049634763064, + "grad_norm": 0.039306640625, + "learning_rate": 6.106224451694592e-08, + "logits/chosen": 0.5905268788337708, + "logits/rejected": 1.7933048009872437, + "logps/chosen": -77.8465347290039, + "logps/rejected": -1931.9468994140625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3471309244632721, + "rewards/margins": 18.231327056884766, + "rewards/rejected": -18.57845687866211, + "step": 3750 + }, + { + "epoch": 0.9390023100455765, + "grad_norm": 0.07861328125, + "learning_rate": 5.636663611033266e-08, + "logits/chosen": 0.6453654170036316, + "logits/rejected": 1.9545791149139404, + "logps/chosen": -88.19773864746094, + "logps/rejected": -2014.8583984375, + "loss": 0.0013, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4550137519836426, + "rewards/margins": 18.988948822021484, + "rewards/rejected": -19.443960189819336, + "step": 3760 + }, + { + "epoch": 0.9414996566148467, + "grad_norm": 0.00152587890625, + "learning_rate": 5.185681008094579e-08, + "logits/chosen": 0.5483246445655823, + "logits/rejected": 1.754540205001831, + "logps/chosen": -81.78535461425781, + "logps/rejected": -1835.6578369140625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3718385696411133, + "rewards/margins": 17.2070369720459, + "rewards/rejected": -17.578876495361328, + "step": 3770 + }, + { + "epoch": 0.9439970031841168, + "grad_norm": 0.00086212158203125, + "learning_rate": 4.753310929781513e-08, + "logits/chosen": 0.6061893701553345, + "logits/rejected": 1.7712287902832031, + "logps/chosen": -84.43482971191406, + "logps/rejected": -1739.1148681640625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4173789620399475, + "rewards/margins": 16.286354064941406, + "rewards/rejected": -16.703731536865234, + "step": 3780 + }, + { + "epoch": 0.946494349753387, + "grad_norm": 1.2874603271484375e-05, + "learning_rate": 4.3395862479405914e-08, + "logits/chosen": 0.5530301928520203, + "logits/rejected": 1.790492296218872, + "logps/chosen": -98.83070373535156, + "logps/rejected": -1927.699462890625, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.5457450747489929, + "rewards/margins": 17.921890258789062, + "rewards/rejected": -18.467636108398438, + "step": 3790 + }, + { + "epoch": 0.9489916963226572, + "grad_norm": 0.0028228759765625, + "learning_rate": 3.9445384168628474e-08, + "logits/chosen": 0.5836749076843262, + "logits/rejected": 1.8246829509735107, + "logps/chosen": -80.40269470214844, + "logps/rejected": -1708.9019775390625, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3665417730808258, + "rewards/margins": 15.90452766418457, + "rewards/rejected": -16.271068572998047, + "step": 3800 + }, + { + "epoch": 0.9514890428919274, + "grad_norm": 0.017578125, + "learning_rate": 3.5681974708923484e-08, + "logits/chosen": 0.6176645159721375, + "logits/rejected": 1.7562223672866821, + "logps/chosen": -82.15937805175781, + "logps/rejected": -1655.8538818359375, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3880414068698883, + "rewards/margins": 15.394001960754395, + "rewards/rejected": -15.78204345703125, + "step": 3810 + }, + { + "epoch": 0.9539863894611975, + "grad_norm": 0.003021240234375, + "learning_rate": 3.210592022142717e-08, + 
"logits/chosen": 0.6430649161338806, + "logits/rejected": 1.7840299606323242, + "logps/chosen": -88.11245727539062, + "logps/rejected": -1835.171630859375, + "loss": 0.0009, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4654541611671448, + "rewards/margins": 17.16687774658203, + "rewards/rejected": -17.6323299407959, + "step": 3820 + }, + { + "epoch": 0.9564837360304677, + "grad_norm": 0.0260009765625, + "learning_rate": 2.8717492583220095e-08, + "logits/chosen": 0.6011831164360046, + "logits/rejected": 1.8058007955551147, + "logps/chosen": -83.06114959716797, + "logps/rejected": -1898.2939453125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39993369579315186, + "rewards/margins": 17.886281967163086, + "rewards/rejected": -18.28621482849121, + "step": 3830 + }, + { + "epoch": 0.9589810825997378, + "grad_norm": 0.109375, + "learning_rate": 2.551694940665539e-08, + "logits/chosen": 0.600081741809845, + "logits/rejected": 1.7849693298339844, + "logps/chosen": -82.53587341308594, + "logps/rejected": -1787.7008056640625, + "loss": 0.0015, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39744192361831665, + "rewards/margins": 16.777345657348633, + "rewards/rejected": -17.174787521362305, + "step": 3840 + }, + { + "epoch": 0.9614784291690079, + "grad_norm": 5.4836273193359375e-05, + "learning_rate": 2.2504534019774092e-08, + "logits/chosen": 0.712979257106781, + "logits/rejected": 1.8468831777572632, + "logps/chosen": -80.92863464355469, + "logps/rejected": -1629.561767578125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3758103847503662, + "rewards/margins": 15.236900329589844, + "rewards/rejected": -15.612710952758789, + "step": 3850 + }, + { + "epoch": 0.9639757757382781, + "grad_norm": 0.058349609375, + "learning_rate": 1.9680475447805826e-08, + "logits/chosen": 0.6279615759849548, + "logits/rejected": 1.7799345254898071, + "logps/chosen": -81.30252838134766, + "logps/rejected": -1729.193115234375, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3813311457633972, + "rewards/margins": 16.206878662109375, + "rewards/rejected": -16.58820915222168, + "step": 3860 + }, + { + "epoch": 0.9664731223075482, + "grad_norm": 0.000812530517578125, + "learning_rate": 1.70449883957563e-08, + "logits/chosen": 0.5945799350738525, + "logits/rejected": 1.7449548244476318, + "logps/chosen": -79.51522064208984, + "logps/rejected": -1772.429931640625, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3488074243068695, + "rewards/margins": 16.572925567626953, + "rewards/rejected": -16.921733856201172, + "step": 3870 + }, + { + "epoch": 0.9689704688768184, + "grad_norm": 0.03662109375, + "learning_rate": 1.4598273232083182e-08, + "logits/chosen": 0.5940654873847961, + "logits/rejected": 1.737255334854126, + "logps/chosen": -89.48096466064453, + "logps/rejected": -1763.216064453125, + "loss": 0.0026, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4539431631565094, + "rewards/margins": 16.4827880859375, + "rewards/rejected": -16.93673324584961, + "step": 3880 + }, + { + "epoch": 0.9714678154460885, + "grad_norm": 7.677078247070312e-05, + "learning_rate": 1.2340515973464917e-08, + "logits/chosen": 0.5371723771095276, + "logits/rejected": 1.6967008113861084, + "logps/chosen": -89.69468688964844, + "logps/rejected": -1855.769287109375, + "loss": 0.0005, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4602131247520447, + "rewards/margins": 17.378524780273438, + "rewards/rejected": -17.8387393951416, + "step": 
3890 + }, + { + "epoch": 0.9739651620153587, + "grad_norm": 0.01055908203125, + "learning_rate": 1.0271888270655118e-08, + "logits/chosen": 0.5918472409248352, + "logits/rejected": 1.6669203042984009, + "logps/chosen": -84.76191711425781, + "logps/rejected": -1762.7152099609375, + "loss": 0.0017, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.40269866585731506, + "rewards/margins": 16.396747589111328, + "rewards/rejected": -16.79944610595703, + "step": 3900 + }, + { + "epoch": 0.9764625085846288, + "grad_norm": 0.005584716796875, + "learning_rate": 8.392547395435769e-09, + "logits/chosen": 0.6482867002487183, + "logits/rejected": 1.7531925439834595, + "logps/chosen": -84.86217498779297, + "logps/rejected": -1594.6502685546875, + "loss": 0.0014, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4176376461982727, + "rewards/margins": 14.827906608581543, + "rewards/rejected": -15.245546340942383, + "step": 3910 + }, + { + "epoch": 0.9789598551538989, + "grad_norm": 1.0251998901367188e-05, + "learning_rate": 6.702636228657911e-09, + "logits/chosen": 0.6012560129165649, + "logits/rejected": 1.7790956497192383, + "logps/chosen": -85.35179138183594, + "logps/rejected": -1768.960693359375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.41833052039146423, + "rewards/margins": 16.56346321105957, + "rewards/rejected": -16.981792449951172, + "step": 3920 + }, + { + "epoch": 0.9814572017231691, + "grad_norm": 0.049560546875, + "learning_rate": 5.2022832493800465e-09, + "logits/chosen": 0.575610339641571, + "logits/rejected": 1.6641845703125, + "logps/chosen": -87.38862609863281, + "logps/rejected": -1609.995849609375, + "loss": 0.001, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.43747106194496155, + "rewards/margins": 15.003010749816895, + "rewards/rejected": -15.440483093261719, + "step": 3930 + }, + { + "epoch": 0.9839545482924393, + "grad_norm": 0.00927734375, + "learning_rate": 3.891602525100124e-09, + "logits/chosen": 0.5365520119667053, + "logits/rejected": 1.7841014862060547, + "logps/chosen": -77.71090698242188, + "logps/rejected": -1843.400634765625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.33150431513786316, + "rewards/margins": 17.311891555786133, + "rewards/rejected": -17.643396377563477, + "step": 3940 + }, + { + "epoch": 0.9864518948617095, + "grad_norm": 0.000896453857421875, + "learning_rate": 2.7706937030827495e-09, + "logits/chosen": 0.6269220113754272, + "logits/rejected": 1.821447730064392, + "logps/chosen": -79.47364807128906, + "logps/rejected": -1583.856689453125, + "loss": 0.0012, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35821038484573364, + "rewards/margins": 14.733774185180664, + "rewards/rejected": -15.091984748840332, + "step": 3950 + }, + { + "epoch": 0.9889492414309796, + "grad_norm": 0.021728515625, + "learning_rate": 1.839642002783859e-09, + "logits/chosen": 0.7017726302146912, + "logits/rejected": 1.8069097995758057, + "logps/chosen": -79.8754653930664, + "logps/rejected": -1646.0501708984375, + "loss": 0.002, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.3667237162590027, + "rewards/margins": 15.337608337402344, + "rewards/rejected": -15.704330444335938, + "step": 3960 + }, + { + "epoch": 0.9914465880002498, + "grad_norm": 0.021728515625, + "learning_rate": 1.0985182093714574e-09, + "logits/chosen": 0.6416125297546387, + "logits/rejected": 1.760498046875, + "logps/chosen": -85.49261474609375, + "logps/rejected": -1687.677978515625, + "loss": 0.0006, + "rewards/accuracies": 1.0, + 
"rewards/chosen": -0.42397341132164, + "rewards/margins": 15.762815475463867, + "rewards/rejected": -16.18678855895996, + "step": 3970 + }, + { + "epoch": 0.9939439345695199, + "grad_norm": 0.003143310546875, + "learning_rate": 5.473786683440896e-10, + "logits/chosen": 0.5962403416633606, + "logits/rejected": 1.8247146606445312, + "logps/chosen": -83.6251220703125, + "logps/rejected": -2010.3323974609375, + "loss": 0.0019, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.4074554443359375, + "rewards/margins": 18.965452194213867, + "rewards/rejected": -19.372909545898438, + "step": 3980 + }, + { + "epoch": 0.99644128113879, + "grad_norm": 0.049560546875, + "learning_rate": 1.862652812467669e-10, + "logits/chosen": 0.5162760615348816, + "logits/rejected": 1.6673294305801392, + "logps/chosen": -83.96633911132812, + "logps/rejected": -1847.5804443359375, + "loss": 0.0008, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.39398157596588135, + "rewards/margins": 17.087627410888672, + "rewards/rejected": -17.481609344482422, + "step": 3990 + }, + { + "epoch": 0.9989386277080602, + "grad_norm": 2.9802322387695312e-05, + "learning_rate": 1.5205502486292932e-11, + "logits/chosen": 0.582720935344696, + "logits/rejected": 1.8548141717910767, + "logps/chosen": -80.02593994140625, + "logps/rejected": -1968.4027099609375, + "loss": 0.0011, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.35573673248291016, + "rewards/margins": 18.611202239990234, + "rewards/rejected": -18.96693992614746, + "step": 4000 + }, + { + "epoch": 0.9989386277080602, + "eval_logits/chosen": 0.6540641784667969, + "eval_logits/rejected": 1.569779634475708, + "eval_logps/chosen": -84.51407623291016, + "eval_logps/rejected": -994.2071533203125, + "eval_loss": 0.00282670627348125, + "eval_rewards/accuracies": 1.0, + "eval_rewards/chosen": -0.40158194303512573, + "eval_rewards/margins": 8.929591178894043, + "eval_rewards/rejected": -9.33117389678955, + "eval_runtime": 0.6151, + "eval_samples_per_second": 8.128, + "eval_steps_per_second": 8.128, + "step": 4000 + }, + { + "epoch": 0.9999375663357682, + "step": 4004, + "total_flos": 0.0, + "train_loss": 0.05122942747121405, + "train_runtime": 6577.5594, + "train_samples_per_second": 2.435, + "train_steps_per_second": 0.609 + } + ], + "logging_steps": 10, + "max_steps": 4004, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 1000, + "total_flos": 0.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}