diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8488 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "dpo_losses": 0.6931471824645996, + "epoch": 0.0, + "grad_norm": 1.6611914425325234, + "learning_rate": 1.3054830287206268e-08, + "logits/chosen": -2.909182548522949, + "logits/rejected": -2.942319393157959, + "logps/chosen": -202.1656494140625, + "logps/rejected": -236.2765350341797, + "loss": 0.6931, + "positive_losses": 0.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "dpo_losses": 0.6931732296943665, + "epoch": 0.0, + "grad_norm": 31.969198661929852, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.8677401542663574, + "logits/rejected": -2.7863104343414307, + "logps/chosen": -300.2490234375, + "logps/rejected": -226.55227661132812, + "loss": 0.7007, + "positive_losses": 0.08082646876573563, + "rewards/accuracies": 0.4027777910232544, + "rewards/chosen": -0.00019778122077696025, + "rewards/margins": -5.036979200667702e-05, + "rewards/margins_max": 0.002247290452942252, + "rewards/margins_min": -0.0025868744123727083, + "rewards/margins_std": 0.0021279146894812584, + "rewards/rejected": -0.0001474114542361349, + "step": 10 + }, + { + "dpo_losses": 0.6931561231613159, + "epoch": 0.01, + "grad_norm": 14.306452777137075, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.8997511863708496, + "logits/rejected": -2.821716070175171, + "logps/chosen": -342.60882568359375, + "logps/rejected": -237.42715454101562, + "loss": 0.6975, + "positive_losses": 0.0433620922267437, + "rewards/accuracies": 0.5375000238418579, + "rewards/chosen": 0.0008173284004442394, + "rewards/margins": -1.608573984412942e-05, + "rewards/margins_max": 0.0027120746672153473, + "rewards/margins_min": -0.003053296823054552, + "rewards/margins_std": 0.0025677671656012535, + "rewards/rejected": 0.0008334142039529979, + "step": 20 + }, + { + "dpo_losses": 0.6928127408027649, + "epoch": 0.01, + "grad_norm": 6.76920412465167, + "learning_rate": 3.9164490861618804e-07, + "logits/chosen": -2.7987051010131836, + "logits/rejected": -2.8019251823425293, + "logps/chosen": -303.80767822265625, + "logps/rejected": -266.1310729980469, + "loss": 0.6945, + "positive_losses": 0.018559837713837624, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.004252296872437, + "rewards/margins": 0.0006736738723702729, + "rewards/margins_max": 0.004781021270900965, + "rewards/margins_min": -0.0029487106949090958, + "rewards/margins_std": 0.003470769850537181, + "rewards/rejected": 0.003578622592613101, + "step": 30 + }, + { + "dpo_losses": 0.692796528339386, + "epoch": 0.01, + "grad_norm": 5.604010862243276, + "learning_rate": 5.221932114882506e-07, + "logits/chosen": -2.813326120376587, + "logits/rejected": -2.8262887001037598, + "logps/chosen": -280.18328857421875, + "logps/rejected": -280.54071044921875, + "loss": 0.6934, + "positive_losses": 0.011556625366210938, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.008161318488419056, + "rewards/margins": 0.0007072348380461335, + "rewards/margins_max": 0.005385141354054213, + "rewards/margins_min": -0.0038401507772505283, + "rewards/margins_std": 0.004110351204872131, + "rewards/rejected": 0.007454083301126957, + "step": 40 + }, + { + "dpo_losses": 0.6927393674850464, + "epoch": 0.01, + "grad_norm": 2.04497375713942, + "learning_rate": 6.527415143603135e-07, + "logits/chosen": -2.886263608932495, + "logits/rejected": -2.8584060668945312, + "logps/chosen": -293.516845703125, + "logps/rejected": -290.35809326171875, + "loss": 0.693, + "positive_losses": 0.0017120360862463713, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.011908010579645634, + "rewards/margins": 0.0008212241227738559, + "rewards/margins_max": 0.005125211086124182, + "rewards/margins_min": -0.00381028326228261, + "rewards/margins_std": 0.004039672203361988, + "rewards/rejected": 0.011086787097156048, + "step": 50 + }, + { + "dpo_losses": 0.6924117207527161, + "epoch": 0.02, + "grad_norm": 1.9405895334341494, + "learning_rate": 7.832898172323761e-07, + "logits/chosen": -2.821598768234253, + "logits/rejected": -2.7612762451171875, + "logps/chosen": -284.66741943359375, + "logps/rejected": -250.3779296875, + "loss": 0.6928, + "positive_losses": 0.0, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.014016744680702686, + "rewards/margins": 0.0014791989233344793, + "rewards/margins_max": 0.0065080830827355385, + "rewards/margins_min": -0.003556284587830305, + "rewards/margins_std": 0.004505008924752474, + "rewards/rejected": 0.012537546455860138, + "step": 60 + }, + { + "dpo_losses": 0.692319929599762, + "epoch": 0.02, + "grad_norm": 1.6259721635638693, + "learning_rate": 9.138381201044387e-07, + "logits/chosen": -2.8565468788146973, + "logits/rejected": -2.826981782913208, + "logps/chosen": -247.64559936523438, + "logps/rejected": -229.2498779296875, + "loss": 0.6926, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.015192938968539238, + "rewards/margins": 0.001666490687057376, + "rewards/margins_max": 0.00816585123538971, + "rewards/margins_min": -0.005111562553793192, + "rewards/margins_std": 0.0060561723075807095, + "rewards/rejected": 0.013526448979973793, + "step": 70 + }, + { + "dpo_losses": 0.6917449235916138, + "epoch": 0.02, + "grad_norm": 2.4954140563057763, + "learning_rate": 1.0443864229765013e-06, + "logits/chosen": -2.8182671070098877, + "logits/rejected": -2.7784154415130615, + "logps/chosen": -275.432373046875, + "logps/rejected": -225.180419921875, + "loss": 0.6922, + "positive_losses": 0.0004261016729287803, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.016106154769659042, + "rewards/margins": 0.002822594018653035, + "rewards/margins_max": 0.011124899610877037, + "rewards/margins_min": -0.004864133894443512, + "rewards/margins_std": 0.007102040108293295, + "rewards/rejected": 0.01328356098383665, + "step": 80 + }, + { + "dpo_losses": 0.6901671886444092, + "epoch": 0.02, + "grad_norm": 11.82878483855531, + "learning_rate": 1.1749347258485642e-06, + "logits/chosen": -2.879368543624878, + "logits/rejected": -2.833066940307617, + "logps/chosen": -322.00164794921875, + "logps/rejected": -271.5631408691406, + "loss": 0.6924, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02144201099872589, + "rewards/margins": 0.0059935045428574085, + "rewards/margins_max": 0.01698596030473709, + "rewards/margins_min": -0.004721859935671091, + "rewards/margins_std": 0.009729361161589622, + "rewards/rejected": 0.015448507852852345, + "step": 90 + }, + { + "dpo_losses": 0.6903530359268188, + "epoch": 0.03, + "grad_norm": 2.1562864880999015, + "learning_rate": 1.305483028720627e-06, + "logits/chosen": -2.7181944847106934, + "logits/rejected": -2.6738967895507812, + "logps/chosen": -341.0799560546875, + "logps/rejected": -240.3539581298828, + "loss": 0.6911, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.02394462376832962, + "rewards/margins": 0.0056390054523944855, + "rewards/margins_max": 0.019726106896996498, + "rewards/margins_min": -0.006556331180036068, + "rewards/margins_std": 0.01177787035703659, + "rewards/rejected": 0.018305614590644836, + "step": 100 + }, + { + "epoch": 0.03, + "eval_dpo_losses": 0.690110445022583, + "eval_logits/chosen": -2.8053641319274902, + "eval_logits/rejected": -2.766242742538452, + "eval_logps/chosen": -282.0331115722656, + "eval_logps/rejected": -256.6321105957031, + "eval_loss": 0.6917737722396851, + "eval_positive_losses": 0.009668411687016487, + "eval_rewards/accuracies": 0.6669999957084656, + "eval_rewards/chosen": 0.025603098794817924, + "eval_rewards/margins": 0.006135226227343082, + "eval_rewards/margins_max": 0.02824905700981617, + "eval_rewards/margins_min": -0.013623973354697227, + "eval_rewards/margins_std": 0.013793708756566048, + "eval_rewards/rejected": 0.019467873498797417, + "eval_runtime": 428.7568, + "eval_samples_per_second": 4.665, + "eval_steps_per_second": 0.292, + "step": 100 + }, + { + "dpo_losses": 0.6918027400970459, + "epoch": 0.03, + "grad_norm": 2.156851422013261, + "learning_rate": 1.4360313315926894e-06, + "logits/chosen": -2.8136510848999023, + "logits/rejected": -2.823888063430786, + "logps/chosen": -253.9114532470703, + "logps/rejected": -245.44131469726562, + "loss": 0.6909, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.023961525410413742, + "rewards/margins": 0.002739057643339038, + "rewards/margins_max": 0.017869509756565094, + "rewards/margins_min": -0.011799002066254616, + "rewards/margins_std": 0.01312586385756731, + "rewards/rejected": 0.021222466602921486, + "step": 110 + }, + { + "dpo_losses": 0.6911963224411011, + "epoch": 0.03, + "grad_norm": 21.782948059706587, + "learning_rate": 1.5665796344647521e-06, + "logits/chosen": -2.802748203277588, + "logits/rejected": -2.7897446155548096, + "logps/chosen": -276.5313415527344, + "logps/rejected": -233.54483032226562, + "loss": 0.7017, + "positive_losses": 0.20288391411304474, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.027138402685523033, + "rewards/margins": 0.0040526604279875755, + "rewards/margins_max": 0.02045324817299843, + "rewards/margins_min": -0.015398552641272545, + "rewards/margins_std": 0.01642894372344017, + "rewards/rejected": 0.02308574505150318, + "step": 120 + }, + { + "dpo_losses": 0.6882020831108093, + "epoch": 0.03, + "grad_norm": 10.984128078327988, + "learning_rate": 1.6971279373368146e-06, + "logits/chosen": -2.8226253986358643, + "logits/rejected": -2.7765445709228516, + "logps/chosen": -260.9440612792969, + "logps/rejected": -306.1463317871094, + "loss": 0.6895, + "positive_losses": 0.0018714905017986894, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.029916221275925636, + "rewards/margins": 0.010501563549041748, + "rewards/margins_max": 0.046757232397794724, + "rewards/margins_min": -0.01352185569703579, + "rewards/margins_std": 0.027795681729912758, + "rewards/rejected": 0.01941465586423874, + "step": 130 + }, + { + "dpo_losses": 0.6875099539756775, + "epoch": 0.04, + "grad_norm": 1.8029906647620886, + "learning_rate": 1.8276762402088774e-06, + "logits/chosen": -2.8402276039123535, + "logits/rejected": -2.7861874103546143, + "logps/chosen": -259.905029296875, + "logps/rejected": -230.09567260742188, + "loss": 0.7114, + "positive_losses": 0.06240863725543022, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.03723273053765297, + "rewards/margins": 0.011453449726104736, + "rewards/margins_max": 0.03811323270201683, + "rewards/margins_min": -0.010656825266778469, + "rewards/margins_std": 0.021766941994428635, + "rewards/rejected": 0.025779282674193382, + "step": 140 + }, + { + "dpo_losses": 0.688108503818512, + "epoch": 0.04, + "grad_norm": 1.8550180630691293, + "learning_rate": 1.9582245430809403e-06, + "logits/chosen": -2.8100781440734863, + "logits/rejected": -2.791452646255493, + "logps/chosen": -238.95901489257812, + "logps/rejected": -254.30355834960938, + "loss": 0.6919, + "positive_losses": 0.016205215826630592, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0451044887304306, + "rewards/margins": 0.010226741433143616, + "rewards/margins_max": 0.03394794091582298, + "rewards/margins_min": -0.010631146840751171, + "rewards/margins_std": 0.02019248902797699, + "rewards/rejected": 0.03487774729728699, + "step": 150 + }, + { + "dpo_losses": 0.6860537528991699, + "epoch": 0.04, + "grad_norm": 9.356915744700173, + "learning_rate": 2.0887728459530026e-06, + "logits/chosen": -2.7489659786224365, + "logits/rejected": -2.7682833671569824, + "logps/chosen": -269.04193115234375, + "logps/rejected": -257.1709899902344, + "loss": 0.6854, + "positive_losses": 0.007275390438735485, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05685564875602722, + "rewards/margins": 0.014535295777022839, + "rewards/margins_max": 0.0533498153090477, + "rewards/margins_min": -0.021386949345469475, + "rewards/margins_std": 0.033080536872148514, + "rewards/rejected": 0.04232034832239151, + "step": 160 + }, + { + "dpo_losses": 0.6875606179237366, + "epoch": 0.04, + "grad_norm": 1.850999384368349, + "learning_rate": 2.2193211488250653e-06, + "logits/chosen": -2.7869582176208496, + "logits/rejected": -2.797764539718628, + "logps/chosen": -264.21929931640625, + "logps/rejected": -230.4424591064453, + "loss": 0.6905, + "positive_losses": 0.022145461291074753, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0651407465338707, + "rewards/margins": 0.011593434028327465, + "rewards/margins_max": 0.05123548582196236, + "rewards/margins_min": -0.02727467194199562, + "rewards/margins_std": 0.03492007032036781, + "rewards/rejected": 0.053547315299510956, + "step": 170 + }, + { + "dpo_losses": 0.6835399866104126, + "epoch": 0.05, + "grad_norm": 9.104754000529407, + "learning_rate": 2.3498694516971284e-06, + "logits/chosen": -2.8353443145751953, + "logits/rejected": -2.7759976387023926, + "logps/chosen": -318.8158874511719, + "logps/rejected": -278.764404296875, + "loss": 0.6944, + "positive_losses": 0.17639747262001038, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.07056266814470291, + "rewards/margins": 0.019875146448612213, + "rewards/margins_max": 0.06871607899665833, + "rewards/margins_min": -0.030139094218611717, + "rewards/margins_std": 0.04327835515141487, + "rewards/rejected": 0.050687529146671295, + "step": 180 + }, + { + "dpo_losses": 0.6838158369064331, + "epoch": 0.05, + "grad_norm": 9.269298320661703, + "learning_rate": 2.4804177545691907e-06, + "logits/chosen": -2.835176706314087, + "logits/rejected": -2.8169798851013184, + "logps/chosen": -227.8518524169922, + "logps/rejected": -219.61294555664062, + "loss": 0.6869, + "positive_losses": 0.031234169378876686, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.07945264130830765, + "rewards/margins": 0.019288327544927597, + "rewards/margins_max": 0.06636989861726761, + "rewards/margins_min": -0.024780582636594772, + "rewards/margins_std": 0.04111936315894127, + "rewards/rejected": 0.06016431376338005, + "step": 190 + }, + { + "dpo_losses": 0.6795304417610168, + "epoch": 0.05, + "grad_norm": 2.036685420364637, + "learning_rate": 2.610966057441254e-06, + "logits/chosen": -2.832998037338257, + "logits/rejected": -2.7580273151397705, + "logps/chosen": -308.475341796875, + "logps/rejected": -245.78466796875, + "loss": 0.6847, + "positive_losses": 0.009261703118681908, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.08847187459468842, + "rewards/margins": 0.027999553829431534, + "rewards/margins_max": 0.0806564912199974, + "rewards/margins_min": -0.012813249602913857, + "rewards/margins_std": 0.042584288865327835, + "rewards/rejected": 0.06047232076525688, + "step": 200 + }, + { + "epoch": 0.05, + "eval_dpo_losses": 0.6806074976921082, + "eval_logits/chosen": -2.798675775527954, + "eval_logits/rejected": -2.7591915130615234, + "eval_logps/chosen": -276.15399169921875, + "eval_logps/rejected": -252.74549865722656, + "eval_loss": 0.6918641924858093, + "eval_positive_losses": 0.031035613268613815, + "eval_rewards/accuracies": 0.6710000038146973, + "eval_rewards/chosen": 0.084394171833992, + "eval_rewards/margins": 0.026059743016958237, + "eval_rewards/margins_max": 0.1132497489452362, + "eval_rewards/margins_min": -0.05120939016342163, + "eval_rewards/margins_std": 0.05419847369194031, + "eval_rewards/rejected": 0.05833442509174347, + "eval_runtime": 428.0904, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 200 + }, + { + "dpo_losses": 0.6805652379989624, + "epoch": 0.05, + "grad_norm": 2.5355264778356994, + "learning_rate": 2.741514360313316e-06, + "logits/chosen": -2.778172731399536, + "logits/rejected": -2.7424604892730713, + "logps/chosen": -288.6905212402344, + "logps/rejected": -287.5398254394531, + "loss": 0.6883, + "positive_losses": 0.0, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0932592898607254, + "rewards/margins": 0.026345139369368553, + "rewards/margins_max": 0.10087727010250092, + "rewards/margins_min": -0.03499458357691765, + "rewards/margins_std": 0.05978749319911003, + "rewards/rejected": 0.0669141560792923, + "step": 210 + }, + { + "dpo_losses": 0.682792067527771, + "epoch": 0.06, + "grad_norm": 1.7829356332394, + "learning_rate": 2.872062663185379e-06, + "logits/chosen": -2.7870593070983887, + "logits/rejected": -2.750718116760254, + "logps/chosen": -261.4877624511719, + "logps/rejected": -268.4627990722656, + "loss": 0.6804, + "positive_losses": 0.0, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.10234057903289795, + "rewards/margins": 0.021648995578289032, + "rewards/margins_max": 0.07374467700719833, + "rewards/margins_min": -0.03438428044319153, + "rewards/margins_std": 0.04767972230911255, + "rewards/rejected": 0.08069159090518951, + "step": 220 + }, + { + "dpo_losses": 0.6789739727973938, + "epoch": 0.06, + "grad_norm": 1.8845999689384942, + "learning_rate": 3.0026109660574416e-06, + "logits/chosen": -2.8217036724090576, + "logits/rejected": -2.8107285499572754, + "logps/chosen": -275.61614990234375, + "logps/rejected": -252.70364379882812, + "loss": 0.6815, + "positive_losses": 0.05097904056310654, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.12186799943447113, + "rewards/margins": 0.029651399701833725, + "rewards/margins_max": 0.1046239361166954, + "rewards/margins_min": -0.04132336378097534, + "rewards/margins_std": 0.06577299535274506, + "rewards/rejected": 0.0922165960073471, + "step": 230 + }, + { + "dpo_losses": 0.6752057075500488, + "epoch": 0.06, + "grad_norm": 2.0298898691454608, + "learning_rate": 3.1331592689295043e-06, + "logits/chosen": -2.7409098148345947, + "logits/rejected": -2.6662404537200928, + "logps/chosen": -274.1945495605469, + "logps/rejected": -229.91552734375, + "loss": 0.6731, + "positive_losses": 0.002334022428840399, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10710887610912323, + "rewards/margins": 0.037525489926338196, + "rewards/margins_max": 0.11302550882101059, + "rewards/margins_min": -0.03477070480585098, + "rewards/margins_std": 0.06618161499500275, + "rewards/rejected": 0.06958337128162384, + "step": 240 + }, + { + "dpo_losses": 0.6733521223068237, + "epoch": 0.07, + "grad_norm": 7.746980632694252, + "learning_rate": 3.263707571801567e-06, + "logits/chosen": -2.8070075511932373, + "logits/rejected": -2.7885661125183105, + "logps/chosen": -259.63763427734375, + "logps/rejected": -228.86135864257812, + "loss": 0.6798, + "positive_losses": 0.03568840026855469, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.09187337756156921, + "rewards/margins": 0.0415494330227375, + "rewards/margins_max": 0.12277477979660034, + "rewards/margins_min": -0.03311099857091904, + "rewards/margins_std": 0.06869812309741974, + "rewards/rejected": 0.05032395198941231, + "step": 250 + }, + { + "dpo_losses": 0.6689683198928833, + "epoch": 0.07, + "grad_norm": 20.958700350005678, + "learning_rate": 3.3942558746736293e-06, + "logits/chosen": -2.81817889213562, + "logits/rejected": -2.772019863128662, + "logps/chosen": -312.8612060546875, + "logps/rejected": -309.1396484375, + "loss": 0.6777, + "positive_losses": 0.13976097106933594, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1171041950583458, + "rewards/margins": 0.05053260922431946, + "rewards/margins_max": 0.13087162375450134, + "rewards/margins_min": -0.023819979280233383, + "rewards/margins_std": 0.07033131271600723, + "rewards/rejected": 0.06657158583402634, + "step": 260 + }, + { + "dpo_losses": 0.673005223274231, + "epoch": 0.07, + "grad_norm": 8.312133780995351, + "learning_rate": 3.524804177545692e-06, + "logits/chosen": -2.809727907180786, + "logits/rejected": -2.7245917320251465, + "logps/chosen": -302.3597106933594, + "logps/rejected": -282.5569763183594, + "loss": 0.6822, + "positive_losses": 0.22031250596046448, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.12276358902454376, + "rewards/margins": 0.04253315180540085, + "rewards/margins_max": 0.13749028742313385, + "rewards/margins_min": -0.0451325885951519, + "rewards/margins_std": 0.08130183815956116, + "rewards/rejected": 0.08023042976856232, + "step": 270 + }, + { + "dpo_losses": 0.6772290468215942, + "epoch": 0.07, + "grad_norm": 9.803365355327683, + "learning_rate": 3.6553524804177547e-06, + "logits/chosen": -2.886641502380371, + "logits/rejected": -2.8426270484924316, + "logps/chosen": -279.29400634765625, + "logps/rejected": -256.7546691894531, + "loss": 0.6867, + "positive_losses": 0.09031333774328232, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14377516508102417, + "rewards/margins": 0.03387909010052681, + "rewards/margins_max": 0.1268281638622284, + "rewards/margins_min": -0.05632222443819046, + "rewards/margins_std": 0.08138807117938995, + "rewards/rejected": 0.10989607870578766, + "step": 280 + }, + { + "dpo_losses": 0.6649600863456726, + "epoch": 0.08, + "grad_norm": 11.360849822588373, + "learning_rate": 3.7859007832898174e-06, + "logits/chosen": -2.850799322128296, + "logits/rejected": -2.8138887882232666, + "logps/chosen": -282.0655212402344, + "logps/rejected": -245.19259643554688, + "loss": 0.6748, + "positive_losses": 0.07300148159265518, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.15921849012374878, + "rewards/margins": 0.05898389220237732, + "rewards/margins_max": 0.1448855698108673, + "rewards/margins_min": -0.019111331552267075, + "rewards/margins_std": 0.07298921048641205, + "rewards/rejected": 0.10023460537195206, + "step": 290 + }, + { + "dpo_losses": 0.6765052080154419, + "epoch": 0.08, + "grad_norm": 1.9811501318373503, + "learning_rate": 3.9164490861618806e-06, + "logits/chosen": -2.855132579803467, + "logits/rejected": -2.780561923980713, + "logps/chosen": -268.2986145019531, + "logps/rejected": -228.91799926757812, + "loss": 0.686, + "positive_losses": 0.11273155361413956, + "rewards/accuracies": 0.5625, + "rewards/chosen": 0.12802688777446747, + "rewards/margins": 0.03576163575053215, + "rewards/margins_max": 0.15934522449970245, + "rewards/margins_min": -0.04905729740858078, + "rewards/margins_std": 0.09489398449659348, + "rewards/rejected": 0.09226525574922562, + "step": 300 + }, + { + "epoch": 0.08, + "eval_dpo_losses": 0.6693459749221802, + "eval_logits/chosen": -2.7859046459198, + "eval_logits/rejected": -2.747396945953369, + "eval_logps/chosen": -269.94671630859375, + "eval_logps/rejected": -249.0195770263672, + "eval_loss": 0.6901127696037292, + "eval_positive_losses": 0.08413992077112198, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": 0.14646685123443604, + "eval_rewards/margins": 0.05087343230843544, + "eval_rewards/margins_max": 0.20712482929229736, + "eval_rewards/margins_min": -0.09147688001394272, + "eval_rewards/margins_std": 0.0988682359457016, + "eval_rewards/rejected": 0.0955934152007103, + "eval_runtime": 428.161, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 300 + }, + { + "dpo_losses": 0.6752734184265137, + "epoch": 0.08, + "grad_norm": 2.2957750658047917, + "learning_rate": 4.046997389033943e-06, + "logits/chosen": -2.8415491580963135, + "logits/rejected": -2.8036723136901855, + "logps/chosen": -237.29660034179688, + "logps/rejected": -239.52114868164062, + "loss": 0.6832, + "positive_losses": 0.04194068908691406, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.13866667449474335, + "rewards/margins": 0.03833303973078728, + "rewards/margins_max": 0.1446342021226883, + "rewards/margins_min": -0.04681024327874184, + "rewards/margins_std": 0.08503605425357819, + "rewards/rejected": 0.10033364593982697, + "step": 310 + }, + { + "dpo_losses": 0.6736447215080261, + "epoch": 0.08, + "grad_norm": 4.724465451959904, + "learning_rate": 4.177545691906005e-06, + "logits/chosen": -2.8330652713775635, + "logits/rejected": -2.797053813934326, + "logps/chosen": -267.61407470703125, + "logps/rejected": -243.7859344482422, + "loss": 0.6854, + "positive_losses": 0.23391112685203552, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.15120115876197815, + "rewards/margins": 0.04250707849860191, + "rewards/margins_max": 0.1621485948562622, + "rewards/margins_min": -0.06767591834068298, + "rewards/margins_std": 0.10243155807256699, + "rewards/rejected": 0.10869406163692474, + "step": 320 + }, + { + "dpo_losses": 0.6691862344741821, + "epoch": 0.09, + "grad_norm": 5.16089323962061, + "learning_rate": 4.308093994778068e-06, + "logits/chosen": -2.8560335636138916, + "logits/rejected": -2.7866241931915283, + "logps/chosen": -273.2575378417969, + "logps/rejected": -278.06524658203125, + "loss": 0.6763, + "positive_losses": 0.0, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1741090714931488, + "rewards/margins": 0.05146823078393936, + "rewards/margins_max": 0.16819505393505096, + "rewards/margins_min": -0.050195444375276566, + "rewards/margins_std": 0.09703671187162399, + "rewards/rejected": 0.12264084815979004, + "step": 330 + }, + { + "dpo_losses": 0.6649759411811829, + "epoch": 0.09, + "grad_norm": 6.243975978729356, + "learning_rate": 4.4386422976501306e-06, + "logits/chosen": -2.8070483207702637, + "logits/rejected": -2.795396089553833, + "logps/chosen": -281.8221130371094, + "logps/rejected": -258.61187744140625, + "loss": 0.6762, + "positive_losses": 0.09027175605297089, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15640690922737122, + "rewards/margins": 0.06027814745903015, + "rewards/margins_max": 0.18680498003959656, + "rewards/margins_min": -0.036148302257061005, + "rewards/margins_std": 0.10083751380443573, + "rewards/rejected": 0.09612873941659927, + "step": 340 + }, + { + "dpo_losses": 0.6607569456100464, + "epoch": 0.09, + "grad_norm": 13.972627856764554, + "learning_rate": 4.569190600522193e-06, + "logits/chosen": -2.6934661865234375, + "logits/rejected": -2.6470305919647217, + "logps/chosen": -265.8113098144531, + "logps/rejected": -233.8306121826172, + "loss": 0.7053, + "positive_losses": 0.37513604760169983, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13258785009384155, + "rewards/margins": 0.07012196630239487, + "rewards/margins_max": 0.2063256800174713, + "rewards/margins_min": -0.06749741733074188, + "rewards/margins_std": 0.1223006621003151, + "rewards/rejected": 0.06246587634086609, + "step": 350 + }, + { + "dpo_losses": 0.6704517006874084, + "epoch": 0.09, + "grad_norm": 8.96812984435849, + "learning_rate": 4.699738903394257e-06, + "logits/chosen": -2.8276827335357666, + "logits/rejected": -2.8534798622131348, + "logps/chosen": -274.4246520996094, + "logps/rejected": -240.2241668701172, + "loss": 0.6834, + "positive_losses": 0.19836406409740448, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.14870551228523254, + "rewards/margins": 0.049779582768678665, + "rewards/margins_max": 0.1779724657535553, + "rewards/margins_min": -0.07502292096614838, + "rewards/margins_std": 0.11457856744527817, + "rewards/rejected": 0.09892591834068298, + "step": 360 + }, + { + "dpo_losses": 0.676064670085907, + "epoch": 0.1, + "grad_norm": 1.9760803116991252, + "learning_rate": 4.8302872062663196e-06, + "logits/chosen": -2.7008068561553955, + "logits/rejected": -2.6861443519592285, + "logps/chosen": -259.9791564941406, + "logps/rejected": -247.25051879882812, + "loss": 0.6935, + "positive_losses": 0.02337036095559597, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.16256669163703918, + "rewards/margins": 0.03676387667655945, + "rewards/margins_max": 0.13351775705814362, + "rewards/margins_min": -0.05251342058181763, + "rewards/margins_std": 0.08308423310518265, + "rewards/rejected": 0.12580282986164093, + "step": 370 + }, + { + "dpo_losses": 0.6824192404747009, + "epoch": 0.1, + "grad_norm": 1.9243637346295395, + "learning_rate": 4.9608355091383814e-06, + "logits/chosen": -2.746785879135132, + "logits/rejected": -2.6604361534118652, + "logps/chosen": -239.18319702148438, + "logps/rejected": -252.88662719726562, + "loss": 0.6804, + "positive_losses": 0.0, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15879032015800476, + "rewards/margins": 0.025484349578619003, + "rewards/margins_max": 0.1668674200773239, + "rewards/margins_min": -0.09540258347988129, + "rewards/margins_std": 0.11624189466238022, + "rewards/rejected": 0.13330596685409546, + "step": 380 + }, + { + "dpo_losses": 0.6571207046508789, + "epoch": 0.1, + "grad_norm": 2.1382667192572367, + "learning_rate": 4.9999488562447675e-06, + "logits/chosen": -2.8185324668884277, + "logits/rejected": -2.790360927581787, + "logps/chosen": -274.35955810546875, + "logps/rejected": -251.66928100585938, + "loss": 0.7061, + "positive_losses": 0.5538875460624695, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18151499330997467, + "rewards/margins": 0.07630706578493118, + "rewards/margins_max": 0.19101569056510925, + "rewards/margins_min": -0.030439352616667747, + "rewards/margins_std": 0.09843975305557251, + "rewards/rejected": 0.10520792007446289, + "step": 390 + }, + { + "dpo_losses": 0.6789859533309937, + "epoch": 0.1, + "grad_norm": 7.956872563004055, + "learning_rate": 4.999698361256577e-06, + "logits/chosen": -2.7469451427459717, + "logits/rejected": -2.71948504447937, + "logps/chosen": -238.6767578125, + "logps/rejected": -236.68798828125, + "loss": 0.6944, + "positive_losses": 0.21751323342323303, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.13562782108783722, + "rewards/margins": 0.033287692815065384, + "rewards/margins_max": 0.17778214812278748, + "rewards/margins_min": -0.09985264390707016, + "rewards/margins_std": 0.1263827383518219, + "rewards/rejected": 0.10234012454748154, + "step": 400 + }, + { + "epoch": 0.1, + "eval_dpo_losses": 0.663064181804657, + "eval_logits/chosen": -2.7503912448883057, + "eval_logits/rejected": -2.711548328399658, + "eval_logps/chosen": -268.7827453613281, + "eval_logps/rejected": -249.2730255126953, + "eval_loss": 0.6911394596099854, + "eval_positive_losses": 0.1509626805782318, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": 0.15810656547546387, + "eval_rewards/margins": 0.06504742801189423, + "eval_rewards/margins_max": 0.24901820719242096, + "eval_rewards/margins_min": -0.11125880479812622, + "eval_rewards/margins_std": 0.11945176124572754, + "eval_rewards/rejected": 0.09305915236473083, + "eval_runtime": 428.2463, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 400 + }, + { + "dpo_losses": 0.664069414138794, + "epoch": 0.11, + "grad_norm": 1.9598413572530216, + "learning_rate": 4.999239142174581e-06, + "logits/chosen": -2.775360345840454, + "logits/rejected": -2.691457509994507, + "logps/chosen": -267.9887390136719, + "logps/rejected": -250.80062866210938, + "loss": 0.7028, + "positive_losses": 0.5063844919204712, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.14695055782794952, + "rewards/margins": 0.062417663633823395, + "rewards/margins_max": 0.18425968289375305, + "rewards/margins_min": -0.06093385070562363, + "rewards/margins_std": 0.11020394414663315, + "rewards/rejected": 0.08453289419412613, + "step": 410 + }, + { + "dpo_losses": 0.653849720954895, + "epoch": 0.11, + "grad_norm": 4.945979117109157, + "learning_rate": 4.99857123734344e-06, + "logits/chosen": -2.789283275604248, + "logits/rejected": -2.7467398643493652, + "logps/chosen": -277.5484313964844, + "logps/rejected": -231.8037567138672, + "loss": 0.6648, + "positive_losses": 0.02366619184613228, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.1615566909313202, + "rewards/margins": 0.08374477177858353, + "rewards/margins_max": 0.2054201066493988, + "rewards/margins_min": -0.03128126636147499, + "rewards/margins_std": 0.10419468581676483, + "rewards/rejected": 0.07781191915273666, + "step": 420 + }, + { + "dpo_losses": 0.6573296785354614, + "epoch": 0.11, + "grad_norm": 13.690638352217581, + "learning_rate": 4.997694702533016e-06, + "logits/chosen": -2.7743239402770996, + "logits/rejected": -2.7178587913513184, + "logps/chosen": -336.22369384765625, + "logps/rejected": -252.02197265625, + "loss": 0.6747, + "positive_losses": 0.1470256745815277, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.16634371876716614, + "rewards/margins": 0.07746388018131256, + "rewards/margins_max": 0.22486260533332825, + "rewards/margins_min": -0.044031620025634766, + "rewards/margins_std": 0.11824808269739151, + "rewards/rejected": 0.08887985348701477, + "step": 430 + }, + { + "dpo_losses": 0.6622677445411682, + "epoch": 0.12, + "grad_norm": 1.8875260610233917, + "learning_rate": 4.996609610933713e-06, + "logits/chosen": -2.7872633934020996, + "logits/rejected": -2.776425838470459, + "logps/chosen": -258.63134765625, + "logps/rejected": -246.9169921875, + "loss": 0.6676, + "positive_losses": 0.15967722237110138, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1545584499835968, + "rewards/margins": 0.06730343401432037, + "rewards/margins_max": 0.21810702979564667, + "rewards/margins_min": -0.07433603703975677, + "rewards/margins_std": 0.13059645891189575, + "rewards/rejected": 0.08725499361753464, + "step": 440 + }, + { + "dpo_losses": 0.660641074180603, + "epoch": 0.12, + "grad_norm": 10.268091007199386, + "learning_rate": 4.995316053150366e-06, + "logits/chosen": -2.822967052459717, + "logits/rejected": -2.783705234527588, + "logps/chosen": -284.12713623046875, + "logps/rejected": -260.09515380859375, + "loss": 0.6735, + "positive_losses": 0.07706870883703232, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.17116017639636993, + "rewards/margins": 0.06977041810750961, + "rewards/margins_max": 0.20091423392295837, + "rewards/margins_min": -0.05889366939663887, + "rewards/margins_std": 0.11523435264825821, + "rewards/rejected": 0.10138972848653793, + "step": 450 + }, + { + "dpo_losses": 0.6438701748847961, + "epoch": 0.12, + "grad_norm": 5.215093736693054, + "learning_rate": 4.9938141371946815e-06, + "logits/chosen": -2.7379660606384277, + "logits/rejected": -2.7176480293273926, + "logps/chosen": -261.96063232421875, + "logps/rejected": -243.8579864501953, + "loss": 0.6708, + "positive_losses": 0.4775467813014984, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1769762635231018, + "rewards/margins": 0.10664983838796616, + "rewards/margins_max": 0.2506755292415619, + "rewards/margins_min": -0.03088602051138878, + "rewards/margins_std": 0.12461646646261215, + "rewards/rejected": 0.07032643258571625, + "step": 460 + }, + { + "dpo_losses": 0.6677502393722534, + "epoch": 0.12, + "grad_norm": 1.4687140440090638, + "learning_rate": 4.992103988476206e-06, + "logits/chosen": -2.6899399757385254, + "logits/rejected": -2.72920560836792, + "logps/chosen": -234.1622772216797, + "logps/rejected": -235.1865234375, + "loss": 0.6852, + "positive_losses": 0.19793835282325745, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.15648001432418823, + "rewards/margins": 0.05565309524536133, + "rewards/margins_max": 0.17241524159908295, + "rewards/margins_min": -0.06818656623363495, + "rewards/margins_std": 0.1089044064283371, + "rewards/rejected": 0.1008269339799881, + "step": 470 + }, + { + "dpo_losses": 0.6689162254333496, + "epoch": 0.13, + "grad_norm": 6.083829188931725, + "learning_rate": 4.990185749791866e-06, + "logits/chosen": -2.7936508655548096, + "logits/rejected": -2.7398219108581543, + "logps/chosen": -275.41754150390625, + "logps/rejected": -241.33541870117188, + "loss": 0.6862, + "positive_losses": 0.22376975417137146, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.17008516192436218, + "rewards/margins": 0.05477757006883621, + "rewards/margins_max": 0.20395350456237793, + "rewards/margins_min": -0.09113912284374237, + "rewards/margins_std": 0.13186687231063843, + "rewards/rejected": 0.11530760675668716, + "step": 480 + }, + { + "dpo_losses": 0.6603975892066956, + "epoch": 0.13, + "grad_norm": 17.080156629500834, + "learning_rate": 4.9880595813140395e-06, + "logits/chosen": -2.729578733444214, + "logits/rejected": -2.7126169204711914, + "logps/chosen": -285.56719970703125, + "logps/rejected": -253.8076171875, + "loss": 0.6882, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1881304681301117, + "rewards/margins": 0.07103969156742096, + "rewards/margins_max": 0.21888594329357147, + "rewards/margins_min": -0.046334654092788696, + "rewards/margins_std": 0.11919162422418594, + "rewards/rejected": 0.11709077656269073, + "step": 490 + }, + { + "dpo_losses": 0.6637958288192749, + "epoch": 0.13, + "grad_norm": 1.8083399023941662, + "learning_rate": 4.985725660577184e-06, + "logits/chosen": -2.7672886848449707, + "logits/rejected": -2.735917568206787, + "logps/chosen": -288.93804931640625, + "logps/rejected": -259.6017150878906, + "loss": 0.6923, + "positive_losses": 0.08706741034984589, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.19365164637565613, + "rewards/margins": 0.06393040716648102, + "rewards/margins_max": 0.19910098612308502, + "rewards/margins_min": -0.0605672225356102, + "rewards/margins_std": 0.11656157672405243, + "rewards/rejected": 0.1297212392091751, + "step": 500 + }, + { + "epoch": 0.13, + "eval_dpo_losses": 0.6647040247917175, + "eval_logits/chosen": -2.724285125732422, + "eval_logits/rejected": -2.6843347549438477, + "eval_logps/chosen": -265.1090087890625, + "eval_logps/rejected": -245.26019287109375, + "eval_loss": 0.6787527799606323, + "eval_positive_losses": 0.05962928384542465, + "eval_rewards/accuracies": 0.6949999928474426, + "eval_rewards/chosen": 0.19484417140483856, + "eval_rewards/margins": 0.061656612902879715, + "eval_rewards/margins_max": 0.25129368901252747, + "eval_rewards/margins_min": -0.10771217942237854, + "eval_rewards/margins_std": 0.11904539167881012, + "eval_rewards/rejected": 0.13318756222724915, + "eval_runtime": 428.3102, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 500 + }, + { + "dpo_losses": 0.6538182497024536, + "epoch": 0.13, + "grad_norm": 1.8732684086967137, + "learning_rate": 4.983184182463009e-06, + "logits/chosen": -2.6488800048828125, + "logits/rejected": -2.6324260234832764, + "logps/chosen": -260.6108093261719, + "logps/rejected": -240.75369262695312, + "loss": 0.6678, + "positive_losses": 0.11122193187475204, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19986779987812042, + "rewards/margins": 0.08869300782680511, + "rewards/margins_max": 0.28801050782203674, + "rewards/margins_min": -0.06173459812998772, + "rewards/margins_std": 0.15817862749099731, + "rewards/rejected": 0.11117477715015411, + "step": 510 + }, + { + "dpo_losses": 0.6551252603530884, + "epoch": 0.14, + "grad_norm": 1.7478967016062716, + "learning_rate": 4.980435359184203e-06, + "logits/chosen": -2.7087385654449463, + "logits/rejected": -2.661221981048584, + "logps/chosen": -257.74530029296875, + "logps/rejected": -240.1739959716797, + "loss": 0.6886, + "positive_losses": 0.26389384269714355, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16229435801506042, + "rewards/margins": 0.0823858231306076, + "rewards/margins_max": 0.22145190834999084, + "rewards/margins_min": -0.06387770175933838, + "rewards/margins_std": 0.1273488700389862, + "rewards/rejected": 0.07990851253271103, + "step": 520 + }, + { + "dpo_losses": 0.6596701741218567, + "epoch": 0.14, + "grad_norm": 2.042964081913883, + "learning_rate": 4.9774794202667236e-06, + "logits/chosen": -2.783825397491455, + "logits/rejected": -2.7646143436431885, + "logps/chosen": -273.3974914550781, + "logps/rejected": -257.3733215332031, + "loss": 0.6823, + "positive_losses": 0.06744994968175888, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.1812540590763092, + "rewards/margins": 0.071528360247612, + "rewards/margins_max": 0.199395090341568, + "rewards/margins_min": -0.04514295980334282, + "rewards/margins_std": 0.10833747684955597, + "rewards/rejected": 0.10972567647695541, + "step": 530 + }, + { + "dpo_losses": 0.665927529335022, + "epoch": 0.14, + "grad_norm": 1.9756807916397077, + "learning_rate": 4.974316612530615e-06, + "logits/chosen": -2.728533983230591, + "logits/rejected": -2.7084076404571533, + "logps/chosen": -248.6405029296875, + "logps/rejected": -231.85354614257812, + "loss": 0.6775, + "positive_losses": 0.1546613723039627, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.190275639295578, + "rewards/margins": 0.06172444671392441, + "rewards/margins_max": 0.22966797649860382, + "rewards/margins_min": -0.09864543378353119, + "rewards/margins_std": 0.1446523368358612, + "rewards/rejected": 0.12855121493339539, + "step": 540 + }, + { + "dpo_losses": 0.6500524878501892, + "epoch": 0.14, + "grad_norm": 1.71028968008198, + "learning_rate": 4.970947200069416e-06, + "logits/chosen": -2.826908588409424, + "logits/rejected": -2.7796719074249268, + "logps/chosen": -292.29608154296875, + "logps/rejected": -250.68896484375, + "loss": 0.6916, + "positive_losses": 0.1571556031703949, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1823032796382904, + "rewards/margins": 0.09319052845239639, + "rewards/margins_max": 0.24011695384979248, + "rewards/margins_min": -0.050476692616939545, + "rewards/margins_std": 0.12862922251224518, + "rewards/rejected": 0.08911273628473282, + "step": 550 + }, + { + "dpo_losses": 0.6723580360412598, + "epoch": 0.15, + "grad_norm": 1.8415686424485658, + "learning_rate": 4.967371464228096e-06, + "logits/chosen": -2.7671542167663574, + "logits/rejected": -2.786092758178711, + "logps/chosen": -279.84710693359375, + "logps/rejected": -265.36932373046875, + "loss": 0.6746, + "positive_losses": 0.01470336876809597, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.19616767764091492, + "rewards/margins": 0.04653048887848854, + "rewards/margins_max": 0.19184985756874084, + "rewards/margins_min": -0.09195338934659958, + "rewards/margins_std": 0.12599508464336395, + "rewards/rejected": 0.14963720738887787, + "step": 560 + }, + { + "dpo_losses": 0.660744309425354, + "epoch": 0.15, + "grad_norm": 1.7197400102842324, + "learning_rate": 4.963589703579569e-06, + "logits/chosen": -2.7402005195617676, + "logits/rejected": -2.7011075019836426, + "logps/chosen": -236.1312713623047, + "logps/rejected": -220.7137451171875, + "loss": 0.6725, + "positive_losses": 0.0806148499250412, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.18730634450912476, + "rewards/margins": 0.07002018392086029, + "rewards/margins_max": 0.2193438708782196, + "rewards/margins_min": -0.06707224994897842, + "rewards/margins_std": 0.12361118942499161, + "rewards/rejected": 0.11728618294000626, + "step": 570 + }, + { + "dpo_losses": 0.6621208786964417, + "epoch": 0.15, + "grad_norm": 16.76326761443846, + "learning_rate": 4.9596022338997615e-06, + "logits/chosen": -2.7995400428771973, + "logits/rejected": -2.763375997543335, + "logps/chosen": -280.8608703613281, + "logps/rejected": -242.50106811523438, + "loss": 0.6797, + "positive_losses": 0.2067081481218338, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.20165982842445374, + "rewards/margins": 0.0678969994187355, + "rewards/margins_max": 0.19827218353748322, + "rewards/margins_min": -0.04111642390489578, + "rewards/margins_std": 0.10833221673965454, + "rewards/rejected": 0.13376283645629883, + "step": 580 + }, + { + "dpo_losses": 0.658281147480011, + "epoch": 0.15, + "grad_norm": 3.9625818575036393, + "learning_rate": 4.955409388141243e-06, + "logits/chosen": -2.760684013366699, + "logits/rejected": -2.710430383682251, + "logps/chosen": -261.4241638183594, + "logps/rejected": -240.5662078857422, + "loss": 0.6887, + "positive_losses": 0.19717493653297424, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19780950248241425, + "rewards/margins": 0.07692292332649231, + "rewards/margins_max": 0.2365892380475998, + "rewards/margins_min": -0.06682348251342773, + "rewards/margins_std": 0.13651010394096375, + "rewards/rejected": 0.12088658660650253, + "step": 590 + }, + { + "dpo_losses": 0.6623961329460144, + "epoch": 0.16, + "grad_norm": 1.5443859689987585, + "learning_rate": 4.951011516405429e-06, + "logits/chosen": -2.774395227432251, + "logits/rejected": -2.745779514312744, + "logps/chosen": -305.0125427246094, + "logps/rejected": -267.03594970703125, + "loss": 0.663, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.21630334854125977, + "rewards/margins": 0.06726609915494919, + "rewards/margins_max": 0.20849759876728058, + "rewards/margins_min": -0.08070281893014908, + "rewards/margins_std": 0.12883268296718597, + "rewards/rejected": 0.14903724193572998, + "step": 600 + }, + { + "epoch": 0.16, + "eval_dpo_losses": 0.6606666445732117, + "eval_logits/chosen": -2.7035694122314453, + "eval_logits/rejected": -2.665982484817505, + "eval_logps/chosen": -265.1740417480469, + "eval_logps/rejected": -246.3229522705078, + "eval_loss": 0.6892355680465698, + "eval_positive_losses": 0.14829835295677185, + "eval_rewards/accuracies": 0.6769999861717224, + "eval_rewards/chosen": 0.19419392943382263, + "eval_rewards/margins": 0.07163416594266891, + "eval_rewards/margins_max": 0.3007630705833435, + "eval_rewards/margins_min": -0.12855397164821625, + "eval_rewards/margins_std": 0.14202702045440674, + "eval_rewards/rejected": 0.12255976349115372, + "eval_runtime": 428.4408, + "eval_samples_per_second": 4.668, + "eval_steps_per_second": 0.292, + "step": 600 + }, + { + "dpo_losses": 0.6442610621452332, + "epoch": 0.16, + "grad_norm": 9.046805004316873, + "learning_rate": 4.946408985913344e-06, + "logits/chosen": -2.7550129890441895, + "logits/rejected": -2.678056478500366, + "logps/chosen": -345.64093017578125, + "logps/rejected": -259.5520324707031, + "loss": 0.6606, + "positive_losses": 0.055927276611328125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.23318998515605927, + "rewards/margins": 0.10763374716043472, + "rewards/margins_max": 0.2763409912586212, + "rewards/margins_min": -0.0661562830209732, + "rewards/margins_std": 0.15407855808734894, + "rewards/rejected": 0.12555620074272156, + "step": 610 + }, + { + "dpo_losses": 0.6608101725578308, + "epoch": 0.16, + "grad_norm": 2.8816620306621594, + "learning_rate": 4.941602180974958e-06, + "logits/chosen": -2.742089033126831, + "logits/rejected": -2.7148919105529785, + "logps/chosen": -278.1106872558594, + "logps/rejected": -254.6637725830078, + "loss": 0.6557, + "positive_losses": 0.0004772186221089214, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.18525430560112, + "rewards/margins": 0.07316222786903381, + "rewards/margins_max": 0.2310808151960373, + "rewards/margins_min": -0.09148738533258438, + "rewards/margins_std": 0.14664871990680695, + "rewards/rejected": 0.11209206283092499, + "step": 620 + }, + { + "dpo_losses": 0.649380624294281, + "epoch": 0.16, + "grad_norm": 1.9957136475335484, + "learning_rate": 4.936591502957101e-06, + "logits/chosen": -2.6965575218200684, + "logits/rejected": -2.675640821456909, + "logps/chosen": -249.79653930664062, + "logps/rejected": -261.4026794433594, + "loss": 0.7133, + "positive_losses": 0.27226218581199646, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1899053305387497, + "rewards/margins": 0.09622377902269363, + "rewards/margins_max": 0.27748948335647583, + "rewards/margins_min": -0.03962727636098862, + "rewards/margins_std": 0.14250265061855316, + "rewards/rejected": 0.09368153661489487, + "step": 630 + }, + { + "dpo_losses": 0.658961296081543, + "epoch": 0.17, + "grad_norm": 11.497970401956069, + "learning_rate": 4.931377370249946e-06, + "logits/chosen": -2.7044734954833984, + "logits/rejected": -2.6543304920196533, + "logps/chosen": -293.74298095703125, + "logps/rejected": -247.9723358154297, + "loss": 0.6902, + "positive_losses": 0.3072100281715393, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.20657439529895782, + "rewards/margins": 0.07905445247888565, + "rewards/margins_max": 0.2459186315536499, + "rewards/margins_min": -0.1058909073472023, + "rewards/margins_std": 0.1593790501356125, + "rewards/rejected": 0.12751996517181396, + "step": 640 + }, + { + "dpo_losses": 0.6644202470779419, + "epoch": 0.17, + "grad_norm": 9.998754805287007, + "learning_rate": 4.925960218232073e-06, + "logits/chosen": -2.684481143951416, + "logits/rejected": -2.6854796409606934, + "logps/chosen": -240.8351287841797, + "logps/rejected": -239.85079956054688, + "loss": 0.6814, + "positive_losses": 0.13215656578540802, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1786859631538391, + "rewards/margins": 0.06295709311962128, + "rewards/margins_max": 0.20083513855934143, + "rewards/margins_min": -0.07749038934707642, + "rewards/margins_std": 0.12378038465976715, + "rewards/rejected": 0.11572885513305664, + "step": 650 + }, + { + "dpo_losses": 0.6548426747322083, + "epoch": 0.17, + "grad_norm": 9.859053858923993, + "learning_rate": 4.920340499234116e-06, + "logits/chosen": -2.75348162651062, + "logits/rejected": -2.735724449157715, + "logps/chosen": -242.1502227783203, + "logps/rejected": -245.6394500732422, + "loss": 0.7021, + "positive_losses": 0.3593042492866516, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.21502117812633514, + "rewards/margins": 0.08600465953350067, + "rewards/margins_max": 0.2713359296321869, + "rewards/margins_min": -0.10064806044101715, + "rewards/margins_std": 0.16272717714309692, + "rewards/rejected": 0.12901651859283447, + "step": 660 + }, + { + "dpo_losses": 0.6737793684005737, + "epoch": 0.18, + "grad_norm": 1.7701974587295215, + "learning_rate": 4.914518682500995e-06, + "logits/chosen": -2.7391796112060547, + "logits/rejected": -2.6809182167053223, + "logps/chosen": -255.51742553710938, + "logps/rejected": -226.91268920898438, + "loss": 0.7042, + "positive_losses": 0.3297177255153656, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.17366547882556915, + "rewards/margins": 0.04646056145429611, + "rewards/margins_max": 0.21436409652233124, + "rewards/margins_min": -0.11843661963939667, + "rewards/margins_std": 0.14836609363555908, + "rewards/rejected": 0.12720489501953125, + "step": 670 + }, + { + "dpo_losses": 0.6623993515968323, + "epoch": 0.18, + "grad_norm": 1.7781013634691223, + "learning_rate": 4.9084952541527315e-06, + "logits/chosen": -2.5873992443084717, + "logits/rejected": -2.566702365875244, + "logps/chosen": -258.7884521484375, + "logps/rejected": -231.9339141845703, + "loss": 0.6795, + "positive_losses": 0.30301570892333984, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20401909947395325, + "rewards/margins": 0.06752722710371017, + "rewards/margins_max": 0.22269508242607117, + "rewards/margins_min": -0.07635542750358582, + "rewards/margins_std": 0.1323748379945755, + "rewards/rejected": 0.13649186491966248, + "step": 680 + }, + { + "dpo_losses": 0.662765622138977, + "epoch": 0.18, + "grad_norm": 1.7073095169049701, + "learning_rate": 4.902270717143858e-06, + "logits/chosen": -2.6526148319244385, + "logits/rejected": -2.629845142364502, + "logps/chosen": -268.54486083984375, + "logps/rejected": -229.29171752929688, + "loss": 0.654, + "positive_losses": 0.007523536682128906, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.18195727467536926, + "rewards/margins": 0.06656062602996826, + "rewards/margins_max": 0.20236501097679138, + "rewards/margins_min": -0.05715782567858696, + "rewards/margins_std": 0.11579607427120209, + "rewards/rejected": 0.1153966411948204, + "step": 690 + }, + { + "dpo_losses": 0.6593031287193298, + "epoch": 0.18, + "grad_norm": 10.299610197228036, + "learning_rate": 4.895845591221427e-06, + "logits/chosen": -2.670139789581299, + "logits/rejected": -2.638843059539795, + "logps/chosen": -255.5277862548828, + "logps/rejected": -218.6800994873047, + "loss": 0.6784, + "positive_losses": 0.43383750319480896, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.17841890454292297, + "rewards/margins": 0.07547406852245331, + "rewards/margins_max": 0.23104026913642883, + "rewards/margins_min": -0.07409827411174774, + "rewards/margins_std": 0.1367238461971283, + "rewards/rejected": 0.10294482856988907, + "step": 700 + }, + { + "epoch": 0.18, + "eval_dpo_losses": 0.654996395111084, + "eval_logits/chosen": -2.6624057292938232, + "eval_logits/rejected": -2.6229429244995117, + "eval_logps/chosen": -265.67559814453125, + "eval_logps/rejected": -248.0891571044922, + "eval_loss": 0.6934565901756287, + "eval_positive_losses": 0.21421319246292114, + "eval_rewards/accuracies": 0.6970000267028809, + "eval_rewards/chosen": 0.18917834758758545, + "eval_rewards/margins": 0.08428053557872772, + "eval_rewards/margins_max": 0.32754144072532654, + "eval_rewards/margins_min": -0.1273568570613861, + "eval_rewards/margins_std": 0.15159085392951965, + "eval_rewards/rejected": 0.10489779710769653, + "eval_runtime": 428.06, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 700 + }, + { + "dpo_losses": 0.6701689958572388, + "epoch": 0.19, + "grad_norm": 8.136188022777755, + "learning_rate": 4.8892204128816e-06, + "logits/chosen": -2.6619200706481934, + "logits/rejected": -2.6518778800964355, + "logps/chosen": -194.81417846679688, + "logps/rejected": -204.11599731445312, + "loss": 0.6688, + "positive_losses": 0.038634538650512695, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16319485008716583, + "rewards/margins": 0.04956919327378273, + "rewards/margins_max": 0.175912007689476, + "rewards/margins_min": -0.05440413951873779, + "rewards/margins_std": 0.1039264053106308, + "rewards/rejected": 0.1136256605386734, + "step": 710 + }, + { + "dpo_losses": 0.652790904045105, + "epoch": 0.19, + "grad_norm": 2.0698475441559903, + "learning_rate": 4.882395735324864e-06, + "logits/chosen": -2.6227266788482666, + "logits/rejected": -2.596557140350342, + "logps/chosen": -305.59320068359375, + "logps/rejected": -292.477783203125, + "loss": 0.6722, + "positive_losses": 0.32846182584762573, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.20921404659748077, + "rewards/margins": 0.09199421107769012, + "rewards/margins_max": 0.3147227168083191, + "rewards/margins_min": -0.09670272469520569, + "rewards/margins_std": 0.18117788434028625, + "rewards/rejected": 0.11721982061862946, + "step": 720 + }, + { + "dpo_losses": 0.6393214464187622, + "epoch": 0.19, + "grad_norm": 2.064984414732266, + "learning_rate": 4.87537212840983e-06, + "logits/chosen": -2.7411181926727295, + "logits/rejected": -2.6920838356018066, + "logps/chosen": -307.7519836425781, + "logps/rejected": -273.3503112792969, + "loss": 0.6824, + "positive_losses": 0.33143624663352966, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2093723714351654, + "rewards/margins": 0.11846580356359482, + "rewards/margins_max": 0.2905295789241791, + "rewards/margins_min": -0.041656799614429474, + "rewards/margins_std": 0.15608441829681396, + "rewards/rejected": 0.09090657532215118, + "step": 730 + }, + { + "dpo_losses": 0.6581794619560242, + "epoch": 0.19, + "grad_norm": 6.430319624056747, + "learning_rate": 4.8681501786056545e-06, + "logits/chosen": -2.698139190673828, + "logits/rejected": -2.7497072219848633, + "logps/chosen": -251.01333618164062, + "logps/rejected": -292.71453857421875, + "loss": 0.6999, + "positive_losses": 0.4462181031703949, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1730455905199051, + "rewards/margins": 0.07829009741544724, + "rewards/margins_max": 0.2488713562488556, + "rewards/margins_min": -0.09430457651615143, + "rewards/margins_std": 0.15166929364204407, + "rewards/rejected": 0.09475548565387726, + "step": 740 + }, + { + "dpo_losses": 0.656989336013794, + "epoch": 0.2, + "grad_norm": 2.0014309532421306, + "learning_rate": 4.860730488943068e-06, + "logits/chosen": -2.7123594284057617, + "logits/rejected": -2.727226972579956, + "logps/chosen": -281.8397521972656, + "logps/rejected": -279.3072204589844, + "loss": 0.6873, + "positive_losses": 0.23088416457176208, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20445775985717773, + "rewards/margins": 0.07950909435749054, + "rewards/margins_max": 0.2337539941072464, + "rewards/margins_min": -0.06165488809347153, + "rewards/margins_std": 0.13100259006023407, + "rewards/rejected": 0.1249486654996872, + "step": 750 + }, + { + "dpo_losses": 0.639510989189148, + "epoch": 0.2, + "grad_norm": 1.8298763786835541, + "learning_rate": 4.853113678964022e-06, + "logits/chosen": -2.767143487930298, + "logits/rejected": -2.704211711883545, + "logps/chosen": -256.5793151855469, + "logps/rejected": -230.35009765625, + "loss": 0.6468, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2196684181690216, + "rewards/margins": 0.11621483415365219, + "rewards/margins_max": 0.26115891337394714, + "rewards/margins_min": -0.009762251749634743, + "rewards/margins_std": 0.12192968279123306, + "rewards/rejected": 0.10345359891653061, + "step": 760 + }, + { + "dpo_losses": 0.6363869905471802, + "epoch": 0.2, + "grad_norm": 2.0467806495877507, + "learning_rate": 4.845300384669958e-06, + "logits/chosen": -2.7408857345581055, + "logits/rejected": -2.6877858638763428, + "logps/chosen": -298.6504821777344, + "logps/rejected": -261.07952880859375, + "loss": 0.6629, + "positive_losses": 0.1998986303806305, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.214046448469162, + "rewards/margins": 0.13136693835258484, + "rewards/margins_max": 0.37038570642471313, + "rewards/margins_min": -0.10493201017379761, + "rewards/margins_std": 0.21196743845939636, + "rewards/rejected": 0.08267951756715775, + "step": 770 + }, + { + "dpo_losses": 0.6489611864089966, + "epoch": 0.2, + "grad_norm": 5.9553260238661245, + "learning_rate": 4.837291258468701e-06, + "logits/chosen": -2.7537689208984375, + "logits/rejected": -2.7151620388031006, + "logps/chosen": -289.8448791503906, + "logps/rejected": -253.50607299804688, + "loss": 0.6994, + "positive_losses": 0.34298810362815857, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19952091574668884, + "rewards/margins": 0.09759555757045746, + "rewards/margins_max": 0.2846040427684784, + "rewards/margins_min": -0.08340279757976532, + "rewards/margins_std": 0.16337545216083527, + "rewards/rejected": 0.10192535072565079, + "step": 780 + }, + { + "dpo_losses": 0.6534120440483093, + "epoch": 0.21, + "grad_norm": 2.031892410809195, + "learning_rate": 4.829086969119984e-06, + "logits/chosen": -2.7431013584136963, + "logits/rejected": -2.7174015045166016, + "logps/chosen": -252.091552734375, + "logps/rejected": -253.53713989257812, + "loss": 0.7048, + "positive_losses": 0.26349717378616333, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.1834312379360199, + "rewards/margins": 0.08911575376987457, + "rewards/margins_max": 0.273346483707428, + "rewards/margins_min": -0.08002379536628723, + "rewards/margins_std": 0.15820558369159698, + "rewards/rejected": 0.09431548416614532, + "step": 790 + }, + { + "dpo_losses": 0.6413323879241943, + "epoch": 0.21, + "grad_norm": 9.333042094818406, + "learning_rate": 4.820688201679605e-06, + "logits/chosen": -2.733985185623169, + "logits/rejected": -2.703796863555908, + "logps/chosen": -329.62115478515625, + "logps/rejected": -263.99871826171875, + "loss": 0.661, + "positive_losses": 0.1969766616821289, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.22037820518016815, + "rewards/margins": 0.11573261022567749, + "rewards/margins_max": 0.3472324013710022, + "rewards/margins_min": -0.047318510711193085, + "rewards/margins_std": 0.1790885031223297, + "rewards/rejected": 0.10464560985565186, + "step": 800 + }, + { + "epoch": 0.21, + "eval_dpo_losses": 0.6538042426109314, + "eval_logits/chosen": -2.7244741916656494, + "eval_logits/rejected": -2.6850404739379883, + "eval_logps/chosen": -264.6508483886719, + "eval_logps/rejected": -247.35475158691406, + "eval_loss": 0.688547670841217, + "eval_positive_losses": 0.17699968814849854, + "eval_rewards/accuracies": 0.7020000219345093, + "eval_rewards/chosen": 0.19942589104175568, + "eval_rewards/margins": 0.08718385547399521, + "eval_rewards/margins_max": 0.33879923820495605, + "eval_rewards/margins_min": -0.12915974855422974, + "eval_rewards/margins_std": 0.15493494272232056, + "eval_rewards/rejected": 0.11224202811717987, + "eval_runtime": 428.1578, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 800 + }, + { + "dpo_losses": 0.6463054418563843, + "epoch": 0.21, + "grad_norm": 30.82180664153932, + "learning_rate": 4.8120956574422315e-06, + "logits/chosen": -2.770087957382202, + "logits/rejected": -2.721579074859619, + "logps/chosen": -267.48638916015625, + "logps/rejected": -268.20855712890625, + "loss": 0.6844, + "positive_losses": 0.3319166302680969, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20016130805015564, + "rewards/margins": 0.10353595018386841, + "rewards/margins_max": 0.31145358085632324, + "rewards/margins_min": -0.08192861080169678, + "rewards/margins_std": 0.17669261991977692, + "rewards/rejected": 0.09662538021802902, + "step": 810 + }, + { + "dpo_losses": 0.6368136405944824, + "epoch": 0.21, + "grad_norm": 2.0363461462731247, + "learning_rate": 4.803310053882831e-06, + "logits/chosen": -2.727980852127075, + "logits/rejected": -2.6585936546325684, + "logps/chosen": -241.8242645263672, + "logps/rejected": -211.6393585205078, + "loss": 0.7019, + "positive_losses": 0.810762882232666, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.19861382246017456, + "rewards/margins": 0.12479345500469208, + "rewards/margins_max": 0.3012546896934509, + "rewards/margins_min": -0.02588585577905178, + "rewards/margins_std": 0.15163125097751617, + "rewards/rejected": 0.07382034510374069, + "step": 820 + }, + { + "dpo_losses": 0.6578240394592285, + "epoch": 0.22, + "grad_norm": 1.949723111570459, + "learning_rate": 4.794332124596775e-06, + "logits/chosen": -2.742356538772583, + "logits/rejected": -2.704735279083252, + "logps/chosen": -309.84222412109375, + "logps/rejected": -289.576904296875, + "loss": 0.7202, + "positive_losses": 0.6529260873794556, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.19200512766838074, + "rewards/margins": 0.08338505029678345, + "rewards/margins_max": 0.2779453992843628, + "rewards/margins_min": -0.11795749515295029, + "rewards/margins_std": 0.1819847971200943, + "rewards/rejected": 0.10862010717391968, + "step": 830 + }, + { + "dpo_losses": 0.6642175912857056, + "epoch": 0.22, + "grad_norm": 9.685621904012176, + "learning_rate": 4.785162619238575e-06, + "logits/chosen": -2.7567312717437744, + "logits/rejected": -2.720520257949829, + "logps/chosen": -287.269775390625, + "logps/rejected": -249.7572479248047, + "loss": 0.6882, + "positive_losses": 0.4141426086425781, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.19045835733413696, + "rewards/margins": 0.06596094369888306, + "rewards/margins_max": 0.2540523409843445, + "rewards/margins_min": -0.08973310887813568, + "rewards/margins_std": 0.15320825576782227, + "rewards/rejected": 0.1244974136352539, + "step": 840 + }, + { + "dpo_losses": 0.6543776392936707, + "epoch": 0.22, + "grad_norm": 15.955758440669381, + "learning_rate": 4.775802303459288e-06, + "logits/chosen": -2.7536678314208984, + "logits/rejected": -2.708517551422119, + "logps/chosen": -278.27459716796875, + "logps/rejected": -261.2574157714844, + "loss": 0.6632, + "positive_losses": 0.0038405179511755705, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1954740583896637, + "rewards/margins": 0.08532064408063889, + "rewards/margins_max": 0.23452997207641602, + "rewards/margins_min": -0.0665128082036972, + "rewards/margins_std": 0.13811759650707245, + "rewards/rejected": 0.11015341430902481, + "step": 850 + }, + { + "dpo_losses": 0.6685608625411987, + "epoch": 0.23, + "grad_norm": 11.169216111697574, + "learning_rate": 4.766251958842589e-06, + "logits/chosen": -2.730212926864624, + "logits/rejected": -2.743756055831909, + "logps/chosen": -207.6484832763672, + "logps/rejected": -218.2662811279297, + "loss": 0.6885, + "positive_losses": 0.24169044196605682, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.1786372810602188, + "rewards/margins": 0.054984550923109055, + "rewards/margins_max": 0.2055453062057495, + "rewards/margins_min": -0.07690045982599258, + "rewards/margins_std": 0.1269712895154953, + "rewards/rejected": 0.12365271896123886, + "step": 860 + }, + { + "dpo_losses": 0.6498798131942749, + "epoch": 0.23, + "grad_norm": 2.056410382633826, + "learning_rate": 4.7565123828395066e-06, + "logits/chosen": -2.666647434234619, + "logits/rejected": -2.6648640632629395, + "logps/chosen": -301.1907958984375, + "logps/rejected": -234.89401245117188, + "loss": 0.6975, + "positive_losses": 0.7158435583114624, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22358696162700653, + "rewards/margins": 0.12362835556268692, + "rewards/margins_max": 0.463988721370697, + "rewards/margins_min": -0.09853404015302658, + "rewards/margins_std": 0.26094773411750793, + "rewards/rejected": 0.0999586284160614, + "step": 870 + }, + { + "dpo_losses": 0.6778437495231628, + "epoch": 0.23, + "grad_norm": 2.009156248724103, + "learning_rate": 4.746584388701831e-06, + "logits/chosen": -2.7310903072357178, + "logits/rejected": -2.728564500808716, + "logps/chosen": -251.1389617919922, + "logps/rejected": -289.5425720214844, + "loss": 0.6921, + "positive_losses": 0.04077606275677681, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.18281050026416779, + "rewards/margins": 0.03961994871497154, + "rewards/margins_max": 0.21845373511314392, + "rewards/margins_min": -0.16457554697990417, + "rewards/margins_std": 0.1732816994190216, + "rewards/rejected": 0.14319053292274475, + "step": 880 + }, + { + "dpo_losses": 0.6645732522010803, + "epoch": 0.23, + "grad_norm": 7.164405406869741, + "learning_rate": 4.736468805414218e-06, + "logits/chosen": -2.723555564880371, + "logits/rejected": -2.708336353302002, + "logps/chosen": -291.2392578125, + "logps/rejected": -265.4895935058594, + "loss": 0.6918, + "positive_losses": 0.4872266352176666, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.19292163848876953, + "rewards/margins": 0.06546325981616974, + "rewards/margins_max": 0.25236308574676514, + "rewards/margins_min": -0.0974319577217102, + "rewards/margins_std": 0.15806356072425842, + "rewards/rejected": 0.1274583637714386, + "step": 890 + }, + { + "dpo_losses": 0.6708654165267944, + "epoch": 0.24, + "grad_norm": 1.846600801668145, + "learning_rate": 4.7261664776249595e-06, + "logits/chosen": -2.7252776622772217, + "logits/rejected": -2.707298755645752, + "logps/chosen": -245.03970336914062, + "logps/rejected": -263.7660827636719, + "loss": 0.6736, + "positive_losses": 0.04109077528119087, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.19691479206085205, + "rewards/margins": 0.05294477939605713, + "rewards/margins_max": 0.24138686060905457, + "rewards/margins_min": -0.1393672525882721, + "rewards/margins_std": 0.16866567730903625, + "rewards/rejected": 0.14396999776363373, + "step": 900 + }, + { + "epoch": 0.24, + "eval_dpo_losses": 0.6556591987609863, + "eval_logits/chosen": -2.7201335430145264, + "eval_logits/rejected": -2.681368112564087, + "eval_logps/chosen": -264.33880615234375, + "eval_logps/rejected": -246.65931701660156, + "eval_loss": 0.6827124357223511, + "eval_positive_losses": 0.15759699046611786, + "eval_rewards/accuracies": 0.6940000057220459, + "eval_rewards/chosen": 0.20254585146903992, + "eval_rewards/margins": 0.08334962278604507, + "eval_rewards/margins_max": 0.33453845977783203, + "eval_rewards/margins_min": -0.13353128731250763, + "eval_rewards/margins_std": 0.15607841312885284, + "eval_rewards/rejected": 0.11919621378183365, + "eval_runtime": 428.1043, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 900 + }, + { + "dpo_losses": 0.6529034376144409, + "epoch": 0.24, + "grad_norm": 11.924980908904399, + "learning_rate": 4.715678265575463e-06, + "logits/chosen": -2.732459783554077, + "logits/rejected": -2.6982932090759277, + "logps/chosen": -246.77554321289062, + "logps/rejected": -207.98641967773438, + "loss": 0.6815, + "positive_losses": 0.10796146094799042, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.20487447082996368, + "rewards/margins": 0.08838620036840439, + "rewards/margins_max": 0.2575072646141052, + "rewards/margins_min": -0.06950771808624268, + "rewards/margins_std": 0.1464182436466217, + "rewards/rejected": 0.1164882630109787, + "step": 910 + }, + { + "dpo_losses": 0.6596372723579407, + "epoch": 0.24, + "grad_norm": 8.42476750174722, + "learning_rate": 4.705005045028415e-06, + "logits/chosen": -2.6776070594787598, + "logits/rejected": -2.6785919666290283, + "logps/chosen": -262.73541259765625, + "logps/rejected": -258.1957092285156, + "loss": 0.6866, + "positive_losses": 0.18081608414649963, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.19900774955749512, + "rewards/margins": 0.07932322472333908, + "rewards/margins_max": 0.27794817090034485, + "rewards/margins_min": -0.09649928659200668, + "rewards/margins_std": 0.16794349253177643, + "rewards/rejected": 0.11968453228473663, + "step": 920 + }, + { + "dpo_losses": 0.6416321992874146, + "epoch": 0.24, + "grad_norm": 9.065324435269773, + "learning_rate": 4.694147707194659e-06, + "logits/chosen": -2.7121102809906006, + "logits/rejected": -2.673405885696411, + "logps/chosen": -307.6800231933594, + "logps/rejected": -261.2682189941406, + "loss": 0.6748, + "positive_losses": 0.24066261947155, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.21907511353492737, + "rewards/margins": 0.11731608957052231, + "rewards/margins_max": 0.34327879548072815, + "rewards/margins_min": -0.11537656933069229, + "rewards/margins_std": 0.20834875106811523, + "rewards/rejected": 0.10175903886556625, + "step": 930 + }, + { + "dpo_losses": 0.6601050496101379, + "epoch": 0.25, + "grad_norm": 19.494077125341175, + "learning_rate": 4.683107158658782e-06, + "logits/chosen": -2.758807897567749, + "logits/rejected": -2.750800609588623, + "logps/chosen": -257.49993896484375, + "logps/rejected": -261.9332580566406, + "loss": 0.7437, + "positive_losses": 0.3766586184501648, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.19478636980056763, + "rewards/margins": 0.07299786806106567, + "rewards/margins_max": 0.23471426963806152, + "rewards/margins_min": -0.09077923744916916, + "rewards/margins_std": 0.14624808728694916, + "rewards/rejected": 0.12178850173950195, + "step": 940 + }, + { + "dpo_losses": 0.6490548849105835, + "epoch": 0.25, + "grad_norm": 2.033985819119182, + "learning_rate": 4.671884321303407e-06, + "logits/chosen": -2.7479376792907715, + "logits/rejected": -2.76665997505188, + "logps/chosen": -247.3947296142578, + "logps/rejected": -275.3429260253906, + "loss": 0.6816, + "positive_losses": 0.367349237203598, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.19129522144794464, + "rewards/margins": 0.09990672767162323, + "rewards/margins_max": 0.3187563121318817, + "rewards/margins_min": -0.10026586055755615, + "rewards/margins_std": 0.1873435080051422, + "rewards/rejected": 0.09138850122690201, + "step": 950 + }, + { + "dpo_losses": 0.6453216075897217, + "epoch": 0.25, + "grad_norm": 1.9967649641449399, + "learning_rate": 4.660480132232224e-06, + "logits/chosen": -2.734900951385498, + "logits/rejected": -2.645411252975464, + "logps/chosen": -311.1207580566406, + "logps/rejected": -248.730224609375, + "loss": 0.6846, + "positive_losses": 0.4737134873867035, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1866598129272461, + "rewards/margins": 0.10710735619068146, + "rewards/margins_max": 0.30022764205932617, + "rewards/margins_min": -0.10734431445598602, + "rewards/margins_std": 0.1846868395805359, + "rewards/rejected": 0.07955245673656464, + "step": 960 + }, + { + "dpo_losses": 0.6419434547424316, + "epoch": 0.25, + "grad_norm": 13.359467516127022, + "learning_rate": 4.6488955436917414e-06, + "logits/chosen": -2.7888317108154297, + "logits/rejected": -2.78301739692688, + "logps/chosen": -292.08740234375, + "logps/rejected": -263.8045349121094, + "loss": 0.6857, + "positive_losses": 0.4418838620185852, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18865352869033813, + "rewards/margins": 0.11351042985916138, + "rewards/margins_max": 0.2997412085533142, + "rewards/margins_min": -0.07673824578523636, + "rewards/margins_std": 0.16734425723552704, + "rewards/rejected": 0.07514312863349915, + "step": 970 + }, + { + "dpo_losses": 0.6530637145042419, + "epoch": 0.26, + "grad_norm": 6.4673183165262085, + "learning_rate": 4.6371315229917644e-06, + "logits/chosen": -2.809450626373291, + "logits/rejected": -2.80094313621521, + "logps/chosen": -261.11688232421875, + "logps/rejected": -223.17453002929688, + "loss": 0.6668, + "positive_losses": 0.17321071028709412, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.18977181613445282, + "rewards/margins": 0.08781514316797256, + "rewards/margins_max": 0.2462121695280075, + "rewards/margins_min": -0.07525520026683807, + "rewards/margins_std": 0.1442008763551712, + "rewards/rejected": 0.10195668041706085, + "step": 980 + }, + { + "dpo_losses": 0.6565328240394592, + "epoch": 0.26, + "grad_norm": 8.967947362413515, + "learning_rate": 4.625189052424638e-06, + "logits/chosen": -2.6529622077941895, + "logits/rejected": -2.6158525943756104, + "logps/chosen": -274.95526123046875, + "logps/rejected": -258.0525817871094, + "loss": 0.836, + "positive_losses": 3.846707582473755, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.16130439937114716, + "rewards/margins": 0.08392287790775299, + "rewards/margins_max": 0.26575109362602234, + "rewards/margins_min": -0.1301548182964325, + "rewards/margins_std": 0.18032459914684296, + "rewards/rejected": 0.07738152891397476, + "step": 990 + }, + { + "dpo_losses": 0.6670268774032593, + "epoch": 0.26, + "grad_norm": 11.870740909736584, + "learning_rate": 4.613069129183218e-06, + "logits/chosen": -2.7265543937683105, + "logits/rejected": -2.713296890258789, + "logps/chosen": -253.14404296875, + "logps/rejected": -242.2244110107422, + "loss": 0.6998, + "positive_losses": 0.3402255177497864, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.184846431016922, + "rewards/margins": 0.061308927834033966, + "rewards/margins_max": 0.2561453878879547, + "rewards/margins_min": -0.1173776239156723, + "rewards/margins_std": 0.16720959544181824, + "rewards/rejected": 0.12353750318288803, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_dpo_losses": 0.651651918888092, + "eval_logits/chosen": -2.7190160751342773, + "eval_logits/rejected": -2.683014392852783, + "eval_logps/chosen": -264.2192077636719, + "eval_logps/rejected": -247.42453002929688, + "eval_loss": 0.6805807948112488, + "eval_positive_losses": 0.21309128403663635, + "eval_rewards/accuracies": 0.7070000171661377, + "eval_rewards/chosen": 0.20374199748039246, + "eval_rewards/margins": 0.09219793230295181, + "eval_rewards/margins_max": 0.34986335039138794, + "eval_rewards/margins_min": -0.133478045463562, + "eval_rewards/margins_std": 0.16154462099075317, + "eval_rewards/rejected": 0.11154407262802124, + "eval_runtime": 428.3508, + "eval_samples_per_second": 4.669, + "eval_steps_per_second": 0.292, + "step": 1000 + }, + { + "dpo_losses": 0.6370615363121033, + "epoch": 0.26, + "grad_norm": 11.008320869094142, + "learning_rate": 4.600772765277607e-06, + "logits/chosen": -2.7868447303771973, + "logits/rejected": -2.746675729751587, + "logps/chosen": -266.67388916015625, + "logps/rejected": -269.6067810058594, + "loss": 0.7051, + "positive_losses": 0.28719252347946167, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.19845828413963318, + "rewards/margins": 0.202920600771904, + "rewards/margins_max": 0.6908767819404602, + "rewards/margins_min": -0.03486362472176552, + "rewards/margins_std": 0.3422602117061615, + "rewards/rejected": -0.0044622840359807014, + "step": 1010 + }, + { + "dpo_losses": 0.669637143611908, + "epoch": 0.27, + "grad_norm": 10.704096528671498, + "learning_rate": 4.588300987450652e-06, + "logits/chosen": -2.775867462158203, + "logits/rejected": -2.7486350536346436, + "logps/chosen": -265.61761474609375, + "logps/rejected": -266.26904296875, + "loss": 0.7002, + "positive_losses": 0.5554599761962891, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.18071797490119934, + "rewards/margins": 0.05442746728658676, + "rewards/margins_max": 0.21861937642097473, + "rewards/margins_min": -0.11456086486577988, + "rewards/margins_std": 0.1543532758951187, + "rewards/rejected": 0.12629050016403198, + "step": 1020 + }, + { + "dpo_losses": 0.6489515900611877, + "epoch": 0.27, + "grad_norm": 2.235231995457484, + "learning_rate": 4.5756548370922136e-06, + "logits/chosen": -2.7182843685150146, + "logits/rejected": -2.717958927154541, + "logps/chosen": -278.40966796875, + "logps/rejected": -283.611328125, + "loss": 0.6727, + "positive_losses": 0.07070960849523544, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21360652148723602, + "rewards/margins": 0.10068871825933456, + "rewards/margins_max": 0.30906373262405396, + "rewards/margins_min": -0.11298646777868271, + "rewards/margins_std": 0.1882849931716919, + "rewards/rejected": 0.11291780322790146, + "step": 1030 + }, + { + "dpo_losses": 0.640581488609314, + "epoch": 0.27, + "grad_norm": 1.838801840149293, + "learning_rate": 4.562835370152206e-06, + "logits/chosen": -2.682776927947998, + "logits/rejected": -2.652805805206299, + "logps/chosen": -254.6275634765625, + "logps/rejected": -233.42239379882812, + "loss": 0.6677, + "positive_losses": 0.03793792799115181, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.22730672359466553, + "rewards/margins": 0.11742307990789413, + "rewards/margins_max": 0.32565218210220337, + "rewards/margins_min": -0.05968831852078438, + "rewards/margins_std": 0.17563822865486145, + "rewards/rejected": 0.1098836287856102, + "step": 1040 + }, + { + "dpo_losses": 0.6529589891433716, + "epoch": 0.27, + "grad_norm": 19.3033959123777, + "learning_rate": 4.54984365705243e-06, + "logits/chosen": -2.706040859222412, + "logits/rejected": -2.6679813861846924, + "logps/chosen": -295.9115905761719, + "logps/rejected": -298.58062744140625, + "loss": 0.6748, + "positive_losses": 0.3477066159248352, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2416692078113556, + "rewards/margins": 0.0935961976647377, + "rewards/margins_max": 0.3274371325969696, + "rewards/margins_min": -0.13003945350646973, + "rewards/margins_std": 0.201436847448349, + "rewards/rejected": 0.14807303249835968, + "step": 1050 + }, + { + "dpo_losses": 0.627490222454071, + "epoch": 0.28, + "grad_norm": 1.928168930962888, + "learning_rate": 4.536680782597191e-06, + "logits/chosen": -2.7081384658813477, + "logits/rejected": -2.6851656436920166, + "logps/chosen": -305.4207458496094, + "logps/rejected": -286.4194641113281, + "loss": 0.6566, + "positive_losses": 0.12177524715662003, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22412219643592834, + "rewards/margins": 0.1465277522802353, + "rewards/margins_max": 0.371377557516098, + "rewards/margins_min": -0.0201462060213089, + "rewards/margins_std": 0.17978012561798096, + "rewards/rejected": 0.07759441435337067, + "step": 1060 + }, + { + "dpo_losses": 0.6365917921066284, + "epoch": 0.28, + "grad_norm": 8.350123567866095, + "learning_rate": 4.523347845882718e-06, + "logits/chosen": -2.75634503364563, + "logits/rejected": -2.7587296962738037, + "logps/chosen": -230.2615966796875, + "logps/rejected": -232.2123565673828, + "loss": 0.7026, + "positive_losses": 0.3845987319946289, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.18936094641685486, + "rewards/margins": 0.12512774765491486, + "rewards/margins_max": 0.33139970898628235, + "rewards/margins_min": -0.08519905805587769, + "rewards/margins_std": 0.1838373988866806, + "rewards/rejected": 0.06423317641019821, + "step": 1070 + }, + { + "dpo_losses": 0.655931830406189, + "epoch": 0.28, + "grad_norm": 1.8322517571395962, + "learning_rate": 4.50984596020539e-06, + "logits/chosen": -2.702406167984009, + "logits/rejected": -2.5898425579071045, + "logps/chosen": -267.80548095703125, + "logps/rejected": -243.29324340820312, + "loss": 0.7019, + "positive_losses": 0.3977195620536804, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21579650044441223, + "rewards/margins": 0.08627926558256149, + "rewards/margins_max": 0.27977439761161804, + "rewards/margins_min": -0.13393890857696533, + "rewards/margins_std": 0.1862020194530487, + "rewards/rejected": 0.12951722741127014, + "step": 1080 + }, + { + "dpo_losses": 0.6420449614524841, + "epoch": 0.29, + "grad_norm": 2.10960782496911, + "learning_rate": 4.4961762529687745e-06, + "logits/chosen": -2.7220356464385986, + "logits/rejected": -2.690786600112915, + "logps/chosen": -246.0760955810547, + "logps/rejected": -221.6387176513672, + "loss": 0.6464, + "positive_losses": 0.06222038343548775, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.21960726380348206, + "rewards/margins": 0.11451101303100586, + "rewards/margins_max": 0.30883947014808655, + "rewards/margins_min": -0.07992779463529587, + "rewards/margins_std": 0.1761944591999054, + "rewards/rejected": 0.105096235871315, + "step": 1090 + }, + { + "dpo_losses": 0.6517956852912903, + "epoch": 0.29, + "grad_norm": 6.732583641895375, + "learning_rate": 4.482339865589492e-06, + "logits/chosen": -2.6478145122528076, + "logits/rejected": -2.6480424404144287, + "logps/chosen": -278.41357421875, + "logps/rejected": -245.4984130859375, + "loss": 0.6943, + "positive_losses": 0.2920181155204773, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2183549404144287, + "rewards/margins": 0.09904515743255615, + "rewards/margins_max": 0.34376025199890137, + "rewards/margins_min": -0.08190792798995972, + "rewards/margins_std": 0.19108134508132935, + "rewards/rejected": 0.11930978298187256, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_dpo_losses": 0.6502726078033447, + "eval_logits/chosen": -2.6978962421417236, + "eval_logits/rejected": -2.6632769107818604, + "eval_logps/chosen": -263.5789489746094, + "eval_logps/rejected": -247.13441467285156, + "eval_loss": 0.6807675957679749, + "eval_positive_losses": 0.2124890238046646, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": 0.21014489233493805, + "eval_rewards/margins": 0.09569980204105377, + "eval_rewards/margins_max": 0.3629496395587921, + "eval_rewards/margins_min": -0.1371210664510727, + "eval_rewards/margins_std": 0.16737282276153564, + "eval_rewards/rejected": 0.11444510519504547, + "eval_runtime": 429.0088, + "eval_samples_per_second": 4.662, + "eval_steps_per_second": 0.291, + "step": 1100 + }, + { + "dpo_losses": 0.6413752436637878, + "epoch": 0.29, + "grad_norm": 2.1748893183465925, + "learning_rate": 4.468337953401909e-06, + "logits/chosen": -2.738788604736328, + "logits/rejected": -2.723175287246704, + "logps/chosen": -265.66070556640625, + "logps/rejected": -256.4477844238281, + "loss": 0.6548, + "positive_losses": 0.08787040412425995, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21674147248268127, + "rewards/margins": 0.1143731027841568, + "rewards/margins_max": 0.3031662106513977, + "rewards/margins_min": -0.05679730698466301, + "rewards/margins_std": 0.16097629070281982, + "rewards/rejected": 0.10236841440200806, + "step": 1110 + }, + { + "dpo_losses": 0.6477451920509338, + "epoch": 0.29, + "grad_norm": 14.056073835531809, + "learning_rate": 4.45417168556166e-06, + "logits/chosen": -2.738856792449951, + "logits/rejected": -2.695164918899536, + "logps/chosen": -268.19818115234375, + "logps/rejected": -223.90090942382812, + "loss": 0.6943, + "positive_losses": 0.43545445799827576, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2281276285648346, + "rewards/margins": 0.10189314186573029, + "rewards/margins_max": 0.29055875539779663, + "rewards/margins_min": -0.06951011717319489, + "rewards/margins_std": 0.16105321049690247, + "rewards/rejected": 0.1262345016002655, + "step": 1120 + }, + { + "dpo_losses": 0.6370912194252014, + "epoch": 0.3, + "grad_norm": 2.132578742663081, + "learning_rate": 4.439842244948036e-06, + "logits/chosen": -2.7594432830810547, + "logits/rejected": -2.716683864593506, + "logps/chosen": -284.3851318359375, + "logps/rejected": -248.4437255859375, + "loss": 0.6533, + "positive_losses": 0.10260801017284393, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.21433600783348083, + "rewards/margins": 0.1223423033952713, + "rewards/margins_max": 0.29776710271835327, + "rewards/margins_min": -0.0162980817258358, + "rewards/margins_std": 0.14208626747131348, + "rewards/rejected": 0.09199371933937073, + "step": 1130 + }, + { + "dpo_losses": 0.6386234760284424, + "epoch": 0.3, + "grad_norm": 16.81325418702227, + "learning_rate": 4.425350828065204e-06, + "logits/chosen": -2.7315094470977783, + "logits/rejected": -2.699362277984619, + "logps/chosen": -231.3442840576172, + "logps/rejected": -217.38253784179688, + "loss": 0.6649, + "positive_losses": 0.36649513244628906, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20520725846290588, + "rewards/margins": 0.12019307911396027, + "rewards/margins_max": 0.32323652505874634, + "rewards/margins_min": -0.058698803186416626, + "rewards/margins_std": 0.16871722042560577, + "rewards/rejected": 0.08501417934894562, + "step": 1140 + }, + { + "dpo_losses": 0.6301491856575012, + "epoch": 0.3, + "grad_norm": 2.1136794640555174, + "learning_rate": 4.410698644942303e-06, + "logits/chosen": -2.7129852771759033, + "logits/rejected": -2.7118496894836426, + "logps/chosen": -285.33026123046875, + "logps/rejected": -243.83712768554688, + "loss": 0.6547, + "positive_losses": 0.145775705575943, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.23448574542999268, + "rewards/margins": 0.14068496227264404, + "rewards/margins_max": 0.3842105567455292, + "rewards/margins_min": -0.058680903166532516, + "rewards/margins_std": 0.19865915179252625, + "rewards/rejected": 0.09380079060792923, + "step": 1150 + }, + { + "dpo_losses": 0.644921600818634, + "epoch": 0.3, + "grad_norm": 2.066305637614414, + "learning_rate": 4.395886919032406e-06, + "logits/chosen": -2.648340940475464, + "logits/rejected": -2.6257126331329346, + "logps/chosen": -215.16940307617188, + "logps/rejected": -204.62840270996094, + "loss": 0.6784, + "positive_losses": 0.661210298538208, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.18664805591106415, + "rewards/margins": 0.10691721737384796, + "rewards/margins_max": 0.3014773428440094, + "rewards/margins_min": -0.06276218593120575, + "rewards/margins_std": 0.1590488702058792, + "rewards/rejected": 0.07973084598779678, + "step": 1160 + }, + { + "dpo_losses": 0.6426645517349243, + "epoch": 0.31, + "grad_norm": 2.084542368699148, + "learning_rate": 4.380916887110366e-06, + "logits/chosen": -2.7288904190063477, + "logits/rejected": -2.689663887023926, + "logps/chosen": -243.3949737548828, + "logps/rejected": -254.04248046875, + "loss": 0.6843, + "positive_losses": 0.3210752606391907, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19764408469200134, + "rewards/margins": 0.11319931596517563, + "rewards/margins_max": 0.3080732226371765, + "rewards/margins_min": -0.07896386086940765, + "rewards/margins_std": 0.17322476208209991, + "rewards/rejected": 0.08444477617740631, + "step": 1170 + }, + { + "dpo_losses": 0.6467557549476624, + "epoch": 0.31, + "grad_norm": 1.906405208084926, + "learning_rate": 4.365789799169539e-06, + "logits/chosen": -2.699143171310425, + "logits/rejected": -2.661818027496338, + "logps/chosen": -244.04592895507812, + "logps/rejected": -226.03610229492188, + "loss": 0.6639, + "positive_losses": 0.2675541937351227, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.20552949607372284, + "rewards/margins": 0.10467071831226349, + "rewards/margins_max": 0.30532926321029663, + "rewards/margins_min": -0.07091771066188812, + "rewards/margins_std": 0.16842308640480042, + "rewards/rejected": 0.10085882246494293, + "step": 1180 + }, + { + "dpo_losses": 0.6682693958282471, + "epoch": 0.31, + "grad_norm": 15.821492047076443, + "learning_rate": 4.350506918317416e-06, + "logits/chosen": -2.6909573078155518, + "logits/rejected": -2.681933641433716, + "logps/chosen": -250.02426147460938, + "logps/rejected": -231.6263885498047, + "loss": 0.7676, + "positive_losses": 1.3114748001098633, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20274274051189423, + "rewards/margins": 0.059476565569639206, + "rewards/margins_max": 0.23862798511981964, + "rewards/margins_min": -0.12899455428123474, + "rewards/margins_std": 0.16008736193180084, + "rewards/rejected": 0.14326617121696472, + "step": 1190 + }, + { + "dpo_losses": 0.6668694615364075, + "epoch": 0.31, + "grad_norm": 1.8757632291628046, + "learning_rate": 4.335069520670149e-06, + "logits/chosen": -2.693537473678589, + "logits/rejected": -2.671903133392334, + "logps/chosen": -259.87274169921875, + "logps/rejected": -235.5923614501953, + "loss": 0.6761, + "positive_losses": 0.11368007957935333, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22806064784526825, + "rewards/margins": 0.06635666638612747, + "rewards/margins_max": 0.28212258219718933, + "rewards/margins_min": -0.13651351630687714, + "rewards/margins_std": 0.18843720853328705, + "rewards/rejected": 0.16170397400856018, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_dpo_losses": 0.6510934233665466, + "eval_logits/chosen": -2.6915640830993652, + "eval_logits/rejected": -2.6573286056518555, + "eval_logps/chosen": -263.0201110839844, + "eval_logps/rejected": -246.42547607421875, + "eval_loss": 0.6793044209480286, + "eval_positive_losses": 0.18975259363651276, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": 0.21573299169540405, + "eval_rewards/margins": 0.09419818967580795, + "eval_rewards/margins_max": 0.3704459071159363, + "eval_rewards/margins_min": -0.13656333088874817, + "eval_rewards/margins_std": 0.16917268931865692, + "eval_rewards/rejected": 0.1215347945690155, + "eval_runtime": 428.1461, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 1200 + }, + { + "dpo_losses": 0.6602927446365356, + "epoch": 0.32, + "grad_norm": 1.772652069799707, + "learning_rate": 4.319478895246e-06, + "logits/chosen": -2.7384231090545654, + "logits/rejected": -2.7090110778808594, + "logps/chosen": -262.98516845703125, + "logps/rejected": -229.734130859375, + "loss": 0.6552, + "positive_losses": 0.04911189153790474, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.23368725180625916, + "rewards/margins": 0.0766754299402237, + "rewards/margins_max": 0.3011917471885681, + "rewards/margins_min": -0.13092352449893951, + "rewards/margins_std": 0.19443733990192413, + "rewards/rejected": 0.15701182186603546, + "step": 1210 + }, + { + "dpo_losses": 0.6473007202148438, + "epoch": 0.32, + "grad_norm": 1.963837565042628, + "learning_rate": 4.303736343857704e-06, + "logits/chosen": -2.7466750144958496, + "logits/rejected": -2.720334529876709, + "logps/chosen": -275.8536071777344, + "logps/rejected": -256.60882568359375, + "loss": 0.7014, + "positive_losses": 0.2650478482246399, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21415309607982635, + "rewards/margins": 0.10407302528619766, + "rewards/margins_max": 0.32783380150794983, + "rewards/margins_min": -0.06663999706506729, + "rewards/margins_std": 0.17963366210460663, + "rewards/rejected": 0.1100800633430481, + "step": 1220 + }, + { + "dpo_losses": 0.6583267450332642, + "epoch": 0.32, + "grad_norm": 1.7006431484971423, + "learning_rate": 4.287843181003772e-06, + "logits/chosen": -2.664750337600708, + "logits/rejected": -2.6802029609680176, + "logps/chosen": -206.2310028076172, + "logps/rejected": -220.775390625, + "loss": 0.6694, + "positive_losses": 0.01693267747759819, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.19193141162395477, + "rewards/margins": 0.07765965163707733, + "rewards/margins_max": 0.2502481937408447, + "rewards/margins_min": -0.0685039535164833, + "rewards/margins_std": 0.14312848448753357, + "rewards/rejected": 0.11427175998687744, + "step": 1230 + }, + { + "dpo_losses": 0.6376180052757263, + "epoch": 0.32, + "grad_norm": 1.6513813363342713, + "learning_rate": 4.27180073375873e-06, + "logits/chosen": -2.627671003341675, + "logits/rejected": -2.6573538780212402, + "logps/chosen": -227.34194946289062, + "logps/rejected": -215.28567504882812, + "loss": 0.6641, + "positive_losses": 0.25857123732566833, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.21966569125652313, + "rewards/margins": 0.12344787269830704, + "rewards/margins_max": 0.318206787109375, + "rewards/margins_min": -0.02818525768816471, + "rewards/margins_std": 0.15635992586612701, + "rewards/rejected": 0.0962178111076355, + "step": 1240 + }, + { + "dpo_losses": 0.6496556401252747, + "epoch": 0.33, + "grad_norm": 2.4268975138989184, + "learning_rate": 4.255610341662304e-06, + "logits/chosen": -2.6240971088409424, + "logits/rejected": -2.648489475250244, + "logps/chosen": -251.15036010742188, + "logps/rejected": -266.73590087890625, + "loss": 0.7239, + "positive_losses": 1.152416467666626, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.19764620065689087, + "rewards/margins": 0.10158193111419678, + "rewards/margins_max": 0.31982916593551636, + "rewards/margins_min": -0.09628921747207642, + "rewards/margins_std": 0.18527303636074066, + "rewards/rejected": 0.09606426954269409, + "step": 1250 + }, + { + "dpo_losses": 0.6284223794937134, + "epoch": 0.33, + "grad_norm": 12.252363950000884, + "learning_rate": 4.2392733566075764e-06, + "logits/chosen": -2.7110676765441895, + "logits/rejected": -2.7120535373687744, + "logps/chosen": -227.1322021484375, + "logps/rejected": -213.12319946289062, + "loss": 0.7199, + "positive_losses": 0.46507692337036133, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20356687903404236, + "rewards/margins": 0.1450451761484146, + "rewards/margins_max": 0.3519541621208191, + "rewards/margins_min": -0.06976257264614105, + "rewards/margins_std": 0.1865970492362976, + "rewards/rejected": 0.05852172523736954, + "step": 1260 + }, + { + "dpo_losses": 0.6484035849571228, + "epoch": 0.33, + "grad_norm": 1.7312466700552855, + "learning_rate": 4.2227911427280975e-06, + "logits/chosen": -2.763777494430542, + "logits/rejected": -2.7636969089508057, + "logps/chosen": -290.2239074707031, + "logps/rejected": -288.4922180175781, + "loss": 0.6953, + "positive_losses": 0.015831470489501953, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.23205384612083435, + "rewards/margins": 0.1020529493689537, + "rewards/margins_max": 0.3355284333229065, + "rewards/margins_min": -0.08288715034723282, + "rewards/margins_std": 0.18504774570465088, + "rewards/rejected": 0.13000090420246124, + "step": 1270 + }, + { + "dpo_losses": 0.6495410203933716, + "epoch": 0.33, + "grad_norm": 10.634064521483323, + "learning_rate": 4.206165076283983e-06, + "logits/chosen": -2.647918701171875, + "logits/rejected": -2.6600279808044434, + "logps/chosen": -258.177978515625, + "logps/rejected": -247.9029541015625, + "loss": 0.6737, + "positive_losses": 0.13902759552001953, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21103759109973907, + "rewards/margins": 0.09863315522670746, + "rewards/margins_max": 0.28625327348709106, + "rewards/margins_min": -0.08542687445878983, + "rewards/margins_std": 0.16910138726234436, + "rewards/rejected": 0.11240440607070923, + "step": 1280 + }, + { + "dpo_losses": 0.6440567374229431, + "epoch": 0.34, + "grad_norm": 14.369690553098103, + "learning_rate": 4.189396545546995e-06, + "logits/chosen": -2.648498296737671, + "logits/rejected": -2.643078327178955, + "logps/chosen": -247.05038452148438, + "logps/rejected": -245.91970825195312, + "loss": 0.7193, + "positive_losses": 0.5839151740074158, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.2182537019252777, + "rewards/margins": 0.12008634954690933, + "rewards/margins_max": 0.3548484444618225, + "rewards/margins_min": -0.09168001264333725, + "rewards/margins_std": 0.20228728652000427, + "rewards/rejected": 0.09816733002662659, + "step": 1290 + }, + { + "dpo_losses": 0.6662293672561646, + "epoch": 0.34, + "grad_norm": 1.8886050949204087, + "learning_rate": 4.172486950684627e-06, + "logits/chosen": -2.685309648513794, + "logits/rejected": -2.6278042793273926, + "logps/chosen": -181.18350219726562, + "logps/rejected": -209.67837524414062, + "loss": 0.6976, + "positive_losses": 0.6295714378356934, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1815871298313141, + "rewards/margins": 0.06246403604745865, + "rewards/margins_max": 0.23650658130645752, + "rewards/margins_min": -0.12367131561040878, + "rewards/margins_std": 0.15985225141048431, + "rewards/rejected": 0.11912310123443604, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_dpo_losses": 0.6535477042198181, + "eval_logits/chosen": -2.664144277572632, + "eval_logits/rejected": -2.628232717514038, + "eval_logps/chosen": -262.81219482421875, + "eval_logps/rejected": -245.60546875, + "eval_loss": 0.6730425953865051, + "eval_positive_losses": 0.11943159997463226, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": 0.2178124189376831, + "eval_rewards/margins": 0.08807788044214249, + "eval_rewards/margins_max": 0.3433879613876343, + "eval_rewards/margins_min": -0.13216106593608856, + "eval_rewards/margins_std": 0.1594102680683136, + "eval_rewards/rejected": 0.12973454594612122, + "eval_runtime": 428.1365, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 1300 + }, + { + "dpo_losses": 0.6625665426254272, + "epoch": 0.34, + "grad_norm": 14.739595008369198, + "learning_rate": 4.155437703643182e-06, + "logits/chosen": -2.6630187034606934, + "logits/rejected": -2.617025852203369, + "logps/chosen": -251.9992218017578, + "logps/rejected": -239.4009246826172, + "loss": 0.6807, + "positive_losses": 0.28922000527381897, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20277170836925507, + "rewards/margins": 0.07075655460357666, + "rewards/margins_max": 0.2553870975971222, + "rewards/margins_min": -0.10119247436523438, + "rewards/margins_std": 0.16121159493923187, + "rewards/rejected": 0.1320151537656784, + "step": 1310 + }, + { + "dpo_losses": 0.6581068634986877, + "epoch": 0.35, + "grad_norm": 15.239631505000478, + "learning_rate": 4.138250228029882e-06, + "logits/chosen": -2.713186502456665, + "logits/rejected": -2.6854805946350098, + "logps/chosen": -265.56597900390625, + "logps/rejected": -242.75222778320312, + "loss": 0.6827, + "positive_losses": 0.1938123255968094, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22281205654144287, + "rewards/margins": 0.08136356621980667, + "rewards/margins_max": 0.2923230230808258, + "rewards/margins_min": -0.1347009688615799, + "rewards/margins_std": 0.1873144507408142, + "rewards/rejected": 0.1414484828710556, + "step": 1320 + }, + { + "dpo_losses": 0.6390641927719116, + "epoch": 0.35, + "grad_norm": 1.8551365605185532, + "learning_rate": 4.120925958993994e-06, + "logits/chosen": -2.6785247325897217, + "logits/rejected": -2.677243232727051, + "logps/chosen": -270.0191955566406, + "logps/rejected": -263.7103576660156, + "loss": 0.6983, + "positive_losses": 0.18178720772266388, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.23928770422935486, + "rewards/margins": 0.1227588877081871, + "rewards/margins_max": 0.32545411586761475, + "rewards/margins_min": -0.08336476981639862, + "rewards/margins_std": 0.18180139362812042, + "rewards/rejected": 0.11652884632349014, + "step": 1330 + }, + { + "dpo_losses": 0.6413576602935791, + "epoch": 0.35, + "grad_norm": 3.235333729057778, + "learning_rate": 4.103466343106999e-06, + "logits/chosen": -2.545454502105713, + "logits/rejected": -2.519524335861206, + "logps/chosen": -324.9889221191406, + "logps/rejected": -270.2016906738281, + "loss": 0.6622, + "positive_losses": 0.10343074798583984, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.242496058344841, + "rewards/margins": 0.1184769719839096, + "rewards/margins_max": 0.3343360722064972, + "rewards/margins_min": -0.08624882996082306, + "rewards/margins_std": 0.1914907842874527, + "rewards/rejected": 0.1240190863609314, + "step": 1340 + }, + { + "dpo_losses": 0.6509544849395752, + "epoch": 0.35, + "grad_norm": 8.931457611084193, + "learning_rate": 4.085872838241797e-06, + "logits/chosen": -2.7083029747009277, + "logits/rejected": -2.661639451980591, + "logps/chosen": -294.5940856933594, + "logps/rejected": -282.7342224121094, + "loss": 0.6798, + "positive_losses": 0.41289058327674866, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2159956395626068, + "rewards/margins": 0.09552866220474243, + "rewards/margins_max": 0.32265013456344604, + "rewards/margins_min": -0.10738413035869598, + "rewards/margins_std": 0.18789581954479218, + "rewards/rejected": 0.12046699225902557, + "step": 1350 + }, + { + "dpo_losses": 0.658233642578125, + "epoch": 0.36, + "grad_norm": 2.2726473664195037, + "learning_rate": 4.06814691345098e-06, + "logits/chosen": -2.6470699310302734, + "logits/rejected": -2.6591906547546387, + "logps/chosen": -254.8450469970703, + "logps/rejected": -259.53704833984375, + "loss": 0.6959, + "positive_losses": 0.18143853545188904, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.22105729579925537, + "rewards/margins": 0.07999229431152344, + "rewards/margins_max": 0.28359168767929077, + "rewards/margins_min": -0.0992380753159523, + "rewards/margins_std": 0.16937807202339172, + "rewards/rejected": 0.14106498658657074, + "step": 1360 + }, + { + "dpo_losses": 0.645255982875824, + "epoch": 0.36, + "grad_norm": 1.8297135713643442, + "learning_rate": 4.050290048844171e-06, + "logits/chosen": -2.7332587242126465, + "logits/rejected": -2.6860995292663574, + "logps/chosen": -276.13128662109375, + "logps/rejected": -242.00167846679688, + "loss": 0.6569, + "positive_losses": 0.025147819891572, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.2587778866291046, + "rewards/margins": 0.10525497049093246, + "rewards/margins_max": 0.2734186351299286, + "rewards/margins_min": -0.04697667807340622, + "rewards/margins_std": 0.14240820705890656, + "rewards/rejected": 0.15352290868759155, + "step": 1370 + }, + { + "dpo_losses": 0.6353882551193237, + "epoch": 0.36, + "grad_norm": 5.55038006142768, + "learning_rate": 4.032303735464422e-06, + "logits/chosen": -2.6713452339172363, + "logits/rejected": -2.6468327045440674, + "logps/chosen": -265.26678466796875, + "logps/rejected": -232.63088989257812, + "loss": 0.6595, + "positive_losses": 0.20324555039405823, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2500496804714203, + "rewards/margins": 0.12701056897640228, + "rewards/margins_max": 0.3361486792564392, + "rewards/margins_min": -0.03808742016553879, + "rewards/margins_std": 0.16679345071315765, + "rewards/rejected": 0.12303910404443741, + "step": 1380 + }, + { + "dpo_losses": 0.642040491104126, + "epoch": 0.36, + "grad_norm": 2.0037947655315964, + "learning_rate": 4.014189475163727e-06, + "logits/chosen": -2.737389326095581, + "logits/rejected": -2.705068588256836, + "logps/chosen": -235.0084991455078, + "logps/rejected": -231.0500946044922, + "loss": 0.6591, + "positive_losses": 0.02664165571331978, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.21402215957641602, + "rewards/margins": 0.11353866010904312, + "rewards/margins_max": 0.2948105037212372, + "rewards/margins_min": -0.08596575260162354, + "rewards/margins_std": 0.17296113073825836, + "rewards/rejected": 0.1004834994673729, + "step": 1390 + }, + { + "dpo_losses": 0.6551258563995361, + "epoch": 0.37, + "grad_norm": 6.662737769487926, + "learning_rate": 3.995948780477605e-06, + "logits/chosen": -2.690420150756836, + "logits/rejected": -2.6233978271484375, + "logps/chosen": -256.7580261230469, + "logps/rejected": -224.26107788085938, + "loss": 0.7536, + "positive_losses": 1.4171825647354126, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.19240829348564148, + "rewards/margins": 0.08824966102838516, + "rewards/margins_max": 0.2761348783969879, + "rewards/margins_min": -0.11071120202541351, + "rewards/margins_std": 0.1715453863143921, + "rewards/rejected": 0.10415863990783691, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_dpo_losses": 0.6471304297447205, + "eval_logits/chosen": -2.65718150138855, + "eval_logits/rejected": -2.621112108230591, + "eval_logps/chosen": -263.3833312988281, + "eval_logps/rejected": -247.75094604492188, + "eval_loss": 0.7005280256271362, + "eval_positive_losses": 0.31428229808807373, + "eval_rewards/accuracies": 0.703000009059906, + "eval_rewards/chosen": 0.2121007740497589, + "eval_rewards/margins": 0.10382122546434402, + "eval_rewards/margins_max": 0.3986285924911499, + "eval_rewards/margins_min": -0.1530025601387024, + "eval_rewards/margins_std": 0.1837586909532547, + "eval_rewards/rejected": 0.10827956348657608, + "eval_runtime": 427.9161, + "eval_samples_per_second": 4.674, + "eval_steps_per_second": 0.292, + "step": 1400 + }, + { + "dpo_losses": 0.6532198190689087, + "epoch": 0.37, + "grad_norm": 1.9163321603977244, + "learning_rate": 3.977583174498816e-06, + "logits/chosen": -2.616386890411377, + "logits/rejected": -2.594870090484619, + "logps/chosen": -212.79452514648438, + "logps/rejected": -201.47752380371094, + "loss": 0.6752, + "positive_losses": 0.1494629830121994, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.19361071288585663, + "rewards/margins": 0.08965893089771271, + "rewards/margins_max": 0.2625807225704193, + "rewards/margins_min": -0.09533650428056717, + "rewards/margins_std": 0.16135409474372864, + "rewards/rejected": 0.10395178943872452, + "step": 1410 + }, + { + "dpo_losses": 0.6550511121749878, + "epoch": 0.37, + "grad_norm": 9.910783277980881, + "learning_rate": 3.959094190750172e-06, + "logits/chosen": -2.6816649436950684, + "logits/rejected": -2.6005473136901855, + "logps/chosen": -220.52005004882812, + "logps/rejected": -170.9810791015625, + "loss": 0.6839, + "positive_losses": 0.1656380593776703, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20555934309959412, + "rewards/margins": 0.0838155522942543, + "rewards/margins_max": 0.25206801295280457, + "rewards/margins_min": -0.0647808387875557, + "rewards/margins_std": 0.14257046580314636, + "rewards/rejected": 0.1217438131570816, + "step": 1420 + }, + { + "dpo_losses": 0.65045166015625, + "epoch": 0.37, + "grad_norm": 2.470307932975548, + "learning_rate": 3.9404833730564975e-06, + "logits/chosen": -2.6928744316101074, + "logits/rejected": -2.687532663345337, + "logps/chosen": -249.73452758789062, + "logps/rejected": -229.1648406982422, + "loss": 0.6868, + "positive_losses": 0.3897302746772766, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18598373234272003, + "rewards/margins": 0.09566141664981842, + "rewards/margins_max": 0.2690790593624115, + "rewards/margins_min": -0.0811094120144844, + "rewards/margins_std": 0.15660539269447327, + "rewards/rejected": 0.09032230079174042, + "step": 1430 + }, + { + "dpo_losses": 0.6608055830001831, + "epoch": 0.38, + "grad_norm": 6.588009618051499, + "learning_rate": 3.921752275415712e-06, + "logits/chosen": -2.652783155441284, + "logits/rejected": -2.6326842308044434, + "logps/chosen": -299.107421875, + "logps/rejected": -297.75653076171875, + "loss": 0.6803, + "positive_losses": 0.592887818813324, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1909155696630478, + "rewards/margins": 0.07359861582517624, + "rewards/margins_max": 0.25505146384239197, + "rewards/margins_min": -0.11408748477697372, + "rewards/margins_std": 0.1656920462846756, + "rewards/rejected": 0.11731694638729095, + "step": 1440 + }, + { + "dpo_losses": 0.6433164477348328, + "epoch": 0.38, + "grad_norm": 8.355582886960034, + "learning_rate": 3.902902461869079e-06, + "logits/chosen": -2.6364712715148926, + "logits/rejected": -2.578061103820801, + "logps/chosen": -302.90423583984375, + "logps/rejected": -246.289306640625, + "loss": 0.681, + "positive_losses": 0.42677387595176697, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2319517880678177, + "rewards/margins": 0.11193177849054337, + "rewards/margins_max": 0.332444965839386, + "rewards/margins_min": -0.07925325632095337, + "rewards/margins_std": 0.18473513424396515, + "rewards/rejected": 0.12001999467611313, + "step": 1450 + }, + { + "dpo_losses": 0.6398014426231384, + "epoch": 0.38, + "grad_norm": 1.871566171277863, + "learning_rate": 3.883935506370605e-06, + "logits/chosen": -2.661167621612549, + "logits/rejected": -2.5897514820098877, + "logps/chosen": -277.9273376464844, + "logps/rejected": -250.48477172851562, + "loss": 0.651, + "positive_losses": 0.045377541333436966, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22294461727142334, + "rewards/margins": 0.1170610636472702, + "rewards/margins_max": 0.2914854884147644, + "rewards/margins_min": -0.059104692190885544, + "rewards/margins_std": 0.1563470959663391, + "rewards/rejected": 0.10588352382183075, + "step": 1460 + }, + { + "dpo_losses": 0.6480587720870972, + "epoch": 0.38, + "grad_norm": 10.342228250269091, + "learning_rate": 3.864852992655617e-06, + "logits/chosen": -2.708836317062378, + "logits/rejected": -2.6821327209472656, + "logps/chosen": -267.07965087890625, + "logps/rejected": -222.532958984375, + "loss": 0.6606, + "positive_losses": 0.34216421842575073, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.20954665541648865, + "rewards/margins": 0.10420210659503937, + "rewards/margins_max": 0.3442058563232422, + "rewards/margins_min": -0.10880078375339508, + "rewards/margins_std": 0.20500075817108154, + "rewards/rejected": 0.10534457117319107, + "step": 1470 + }, + { + "dpo_losses": 0.6472574472427368, + "epoch": 0.39, + "grad_norm": 1.967146871691099, + "learning_rate": 3.845656514108516e-06, + "logits/chosen": -2.6916420459747314, + "logits/rejected": -2.6357762813568115, + "logps/chosen": -236.6306610107422, + "logps/rejected": -241.1412353515625, + "loss": 0.6832, + "positive_losses": 0.531735897064209, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.18670131266117096, + "rewards/margins": 0.10241687297821045, + "rewards/margins_max": 0.28860098123550415, + "rewards/margins_min": -0.07669075578451157, + "rewards/margins_std": 0.16355423629283905, + "rewards/rejected": 0.08428442478179932, + "step": 1480 + }, + { + "dpo_losses": 0.6345597505569458, + "epoch": 0.39, + "grad_norm": 9.877404415176946, + "learning_rate": 3.826347673629738e-06, + "logits/chosen": -2.63944411277771, + "logits/rejected": -2.6056394577026367, + "logps/chosen": -269.664794921875, + "logps/rejected": -252.0602569580078, + "loss": 0.7134, + "positive_losses": 0.5844457745552063, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2085011899471283, + "rewards/margins": 0.13039520382881165, + "rewards/margins_max": 0.3248598575592041, + "rewards/margins_min": -0.060199182480573654, + "rewards/margins_std": 0.17145588994026184, + "rewards/rejected": 0.07810600847005844, + "step": 1490 + }, + { + "dpo_losses": 0.6581373810768127, + "epoch": 0.39, + "grad_norm": 1.655496173827722, + "learning_rate": 3.8069280835019062e-06, + "logits/chosen": -2.6964640617370605, + "logits/rejected": -2.643976926803589, + "logps/chosen": -222.6151885986328, + "logps/rejected": -192.56661987304688, + "loss": 0.6711, + "positive_losses": 0.15156669914722443, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.1984512060880661, + "rewards/margins": 0.0814533680677414, + "rewards/margins_max": 0.32450541853904724, + "rewards/margins_min": -0.09028832614421844, + "rewards/margins_std": 0.18675634264945984, + "rewards/rejected": 0.1169978603720665, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_dpo_losses": 0.64888596534729, + "eval_logits/chosen": -2.6356375217437744, + "eval_logits/rejected": -2.5982918739318848, + "eval_logps/chosen": -262.6917419433594, + "eval_logps/rejected": -246.61277770996094, + "eval_loss": 0.6917663216590881, + "eval_positive_losses": 0.22132979333400726, + "eval_rewards/accuracies": 0.7039999961853027, + "eval_rewards/chosen": 0.21901659667491913, + "eval_rewards/margins": 0.09935507923364639, + "eval_rewards/margins_max": 0.3825626075267792, + "eval_rewards/margins_min": -0.14513574540615082, + "eval_rewards/margins_std": 0.1760016828775406, + "eval_rewards/rejected": 0.11966153234243393, + "eval_runtime": 428.4872, + "eval_samples_per_second": 4.668, + "eval_steps_per_second": 0.292, + "step": 1500 + }, + { + "dpo_losses": 0.6641441583633423, + "epoch": 0.4, + "grad_norm": 2.2737252489182502, + "learning_rate": 3.7873993652552077e-06, + "logits/chosen": -2.673140287399292, + "logits/rejected": -2.6399130821228027, + "logps/chosen": -282.4440002441406, + "logps/rejected": -251.7005157470703, + "loss": 0.6808, + "positive_losses": 0.2769942283630371, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.208525612950325, + "rewards/margins": 0.06747975200414658, + "rewards/margins_max": 0.2461291253566742, + "rewards/margins_min": -0.11353076994419098, + "rewards/margins_std": 0.16327622532844543, + "rewards/rejected": 0.14104586839675903, + "step": 1510 + }, + { + "dpo_losses": 0.6597987413406372, + "epoch": 0.4, + "grad_norm": 2.0541448420460746, + "learning_rate": 3.7677631495319953e-06, + "logits/chosen": -2.6400909423828125, + "logits/rejected": -2.607898235321045, + "logps/chosen": -241.0442657470703, + "logps/rejected": -222.66433715820312, + "loss": 0.6816, + "positive_losses": 0.32232433557510376, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.19642217457294464, + "rewards/margins": 0.07600647211074829, + "rewards/margins_max": 0.25835585594177246, + "rewards/margins_min": -0.11334244161844254, + "rewards/margins_std": 0.1659018099308014, + "rewards/rejected": 0.12041568756103516, + "step": 1520 + }, + { + "dpo_losses": 0.6496556997299194, + "epoch": 0.4, + "grad_norm": 1.6912819197772817, + "learning_rate": 3.748021075950633e-06, + "logits/chosen": -2.6623635292053223, + "logits/rejected": -2.6502346992492676, + "logps/chosen": -257.75347900390625, + "logps/rejected": -247.4079132080078, + "loss": 0.6791, + "positive_losses": 0.21904030442237854, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.21671731770038605, + "rewards/margins": 0.09616968035697937, + "rewards/margins_max": 0.26715341210365295, + "rewards/margins_min": -0.07714874297380447, + "rewards/margins_std": 0.1532919853925705, + "rewards/rejected": 0.12054765224456787, + "step": 1530 + }, + { + "dpo_losses": 0.6511906385421753, + "epoch": 0.4, + "grad_norm": 16.54138726575801, + "learning_rate": 3.7281747929685824e-06, + "logits/chosen": -2.678647041320801, + "logits/rejected": -2.6122212409973145, + "logps/chosen": -326.34210205078125, + "logps/rejected": -292.89617919921875, + "loss": 0.6907, + "positive_losses": 0.1314670592546463, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24261975288391113, + "rewards/margins": 0.09828249365091324, + "rewards/margins_max": 0.30110496282577515, + "rewards/margins_min": -0.09005574136972427, + "rewards/margins_std": 0.1740257441997528, + "rewards/rejected": 0.1443372666835785, + "step": 1540 + }, + { + "dpo_losses": 0.6388503909111023, + "epoch": 0.41, + "grad_norm": 7.9614962006478205, + "learning_rate": 3.7082259577447604e-06, + "logits/chosen": -2.6226963996887207, + "logits/rejected": -2.6585097312927246, + "logps/chosen": -260.5569152832031, + "logps/rejected": -260.112548828125, + "loss": 0.6693, + "positive_losses": 0.0032606124877929688, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.24550947546958923, + "rewards/margins": 0.12715184688568115, + "rewards/margins_max": 0.3580287992954254, + "rewards/margins_min": -0.05972598120570183, + "rewards/margins_std": 0.18930241465568542, + "rewards/rejected": 0.11835767328739166, + "step": 1550 + }, + { + "dpo_losses": 0.645304799079895, + "epoch": 0.41, + "grad_norm": 2.1291018894840246, + "learning_rate": 3.6881762360011688e-06, + "logits/chosen": -2.679312229156494, + "logits/rejected": -2.6295382976531982, + "logps/chosen": -253.4844207763672, + "logps/rejected": -228.0343475341797, + "loss": 0.694, + "positive_losses": 0.2775608003139496, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.25411921739578247, + "rewards/margins": 0.11943242698907852, + "rewards/margins_max": 0.4344254434108734, + "rewards/margins_min": -0.09446341544389725, + "rewards/margins_std": 0.23939839005470276, + "rewards/rejected": 0.13468676805496216, + "step": 1560 + }, + { + "dpo_losses": 0.6481470465660095, + "epoch": 0.41, + "grad_norm": 19.61157856144872, + "learning_rate": 3.668027301883802e-06, + "logits/chosen": -2.7096943855285645, + "logits/rejected": -2.7046380043029785, + "logps/chosen": -293.19866943359375, + "logps/rejected": -256.16204833984375, + "loss": 0.6653, + "positive_losses": 0.23131971061229706, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21611352264881134, + "rewards/margins": 0.10061807930469513, + "rewards/margins_max": 0.28255659341812134, + "rewards/margins_min": -0.09664733707904816, + "rewards/margins_std": 0.1659117192029953, + "rewards/rejected": 0.1154954582452774, + "step": 1570 + }, + { + "dpo_losses": 0.6671686172485352, + "epoch": 0.41, + "grad_norm": 8.99614463457168, + "learning_rate": 3.64778083782286e-06, + "logits/chosen": -2.6565983295440674, + "logits/rejected": -2.6668851375579834, + "logps/chosen": -275.6929626464844, + "logps/rejected": -244.76864624023438, + "loss": 0.6833, + "positive_losses": 0.2777779698371887, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.19542255997657776, + "rewards/margins": 0.06095566228032112, + "rewards/margins_max": 0.23151791095733643, + "rewards/margins_min": -0.08547428995370865, + "rewards/margins_std": 0.14268240332603455, + "rewards/rejected": 0.13446690142154694, + "step": 1580 + }, + { + "dpo_losses": 0.6683533191680908, + "epoch": 0.42, + "grad_norm": 14.015471492679415, + "learning_rate": 3.627438534392268e-06, + "logits/chosen": -2.6828346252441406, + "logits/rejected": -2.7137646675109863, + "logps/chosen": -266.16461181640625, + "logps/rejected": -254.8044891357422, + "loss": 0.6805, + "positive_losses": 0.3196195662021637, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.18736909329891205, + "rewards/margins": 0.06001616641879082, + "rewards/margins_max": 0.27491524815559387, + "rewards/margins_min": -0.14767390489578247, + "rewards/margins_std": 0.1888178288936615, + "rewards/rejected": 0.12735293805599213, + "step": 1590 + }, + { + "dpo_losses": 0.6401321291923523, + "epoch": 0.42, + "grad_norm": 1.5944175818008404, + "learning_rate": 3.607002090168506e-06, + "logits/chosen": -2.6533164978027344, + "logits/rejected": -2.6130764484405518, + "logps/chosen": -241.86148071289062, + "logps/rejected": -215.41580200195312, + "loss": 0.7428, + "positive_losses": 0.21368694305419922, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.23043449223041534, + "rewards/margins": 0.11721567064523697, + "rewards/margins_max": 0.3013015687465668, + "rewards/margins_min": -0.0614323690533638, + "rewards/margins_std": 0.16052599251270294, + "rewards/rejected": 0.11321882903575897, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_dpo_losses": 0.6501025557518005, + "eval_logits/chosen": -2.632763624191284, + "eval_logits/rejected": -2.597917079925537, + "eval_logps/chosen": -262.6611022949219, + "eval_logps/rejected": -246.29571533203125, + "eval_loss": 0.6866611242294312, + "eval_positive_losses": 0.16520653665065765, + "eval_rewards/accuracies": 0.7009999752044678, + "eval_rewards/chosen": 0.21932312846183777, + "eval_rewards/margins": 0.09649096429347992, + "eval_rewards/margins_max": 0.37302011251449585, + "eval_rewards/margins_min": -0.14475053548812866, + "eval_rewards/margins_std": 0.17299990355968475, + "eval_rewards/rejected": 0.12283217161893845, + "eval_runtime": 428.1614, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 1600 + }, + { + "dpo_losses": 0.6536516547203064, + "epoch": 0.42, + "grad_norm": 1.878714442624115, + "learning_rate": 3.586473211588787e-06, + "logits/chosen": -2.6485228538513184, + "logits/rejected": -2.6389007568359375, + "logps/chosen": -265.40814208984375, + "logps/rejected": -233.7576904296875, + "loss": 0.6639, + "positive_losses": 0.14214439690113068, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.207864448428154, + "rewards/margins": 0.09171368181705475, + "rewards/margins_max": 0.3362428545951843, + "rewards/margins_min": -0.09460137039422989, + "rewards/margins_std": 0.19438976049423218, + "rewards/rejected": 0.11615077406167984, + "step": 1610 + }, + { + "dpo_losses": 0.6428729891777039, + "epoch": 0.42, + "grad_norm": 2.2033140493675387, + "learning_rate": 3.5658536128085623e-06, + "logits/chosen": -2.584670305252075, + "logits/rejected": -2.5773091316223145, + "logps/chosen": -260.7598876953125, + "logps/rejected": -268.07696533203125, + "loss": 0.6476, + "positive_losses": 0.1323743760585785, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.2739534378051758, + "rewards/margins": 0.11782930046319962, + "rewards/margins_max": 0.36470386385917664, + "rewards/margins_min": -0.07272644340991974, + "rewards/margins_std": 0.1928330510854721, + "rewards/rejected": 0.15612414479255676, + "step": 1620 + }, + { + "dpo_losses": 0.6407750844955444, + "epoch": 0.43, + "grad_norm": 6.196434583225216, + "learning_rate": 3.545145015558399e-06, + "logits/chosen": -2.6906514167785645, + "logits/rejected": -2.6470489501953125, + "logps/chosen": -269.22802734375, + "logps/rejected": -256.2124938964844, + "loss": 0.6687, + "positive_losses": 0.28842735290527344, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.2171546220779419, + "rewards/margins": 0.11523237079381943, + "rewards/margins_max": 0.2809963822364807, + "rewards/margins_min": -0.055764008313417435, + "rewards/margins_std": 0.15428276360034943, + "rewards/rejected": 0.10192225128412247, + "step": 1630 + }, + { + "dpo_losses": 0.6404193043708801, + "epoch": 0.43, + "grad_norm": 2.1957397045270692, + "learning_rate": 3.5243491490002056e-06, + "logits/chosen": -2.7068564891815186, + "logits/rejected": -2.6719727516174316, + "logps/chosen": -272.58416748046875, + "logps/rejected": -227.26211547851562, + "loss": 0.6776, + "positive_losses": 0.1865355521440506, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2290363609790802, + "rewards/margins": 0.1175479143857956, + "rewards/margins_max": 0.3120426833629608, + "rewards/margins_min": -0.06566186249256134, + "rewards/margins_std": 0.1665848195552826, + "rewards/rejected": 0.1114884465932846, + "step": 1640 + }, + { + "dpo_losses": 0.6327579617500305, + "epoch": 0.43, + "grad_norm": 2.3248961927968943, + "learning_rate": 3.503467749582857e-06, + "logits/chosen": -2.653707981109619, + "logits/rejected": -2.5981452465057373, + "logps/chosen": -308.4657897949219, + "logps/rejected": -257.87567138671875, + "loss": 0.6781, + "positive_losses": 0.5005988478660583, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.22237035632133484, + "rewards/margins": 0.13833247125148773, + "rewards/margins_max": 0.3445996940135956, + "rewards/margins_min": -0.08490337431430817, + "rewards/margins_std": 0.19539986550807953, + "rewards/rejected": 0.0840378999710083, + "step": 1650 + }, + { + "dpo_losses": 0.6430984139442444, + "epoch": 0.43, + "grad_norm": 5.0279293119427955, + "learning_rate": 3.4825025608971947e-06, + "logits/chosen": -2.6000447273254395, + "logits/rejected": -2.57517671585083, + "logps/chosen": -235.9827117919922, + "logps/rejected": -258.79815673828125, + "loss": 0.6541, + "positive_losses": 0.03592414781451225, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2132348269224167, + "rewards/margins": 0.1130198985338211, + "rewards/margins_max": 0.3298777937889099, + "rewards/margins_min": -0.07961928844451904, + "rewards/margins_std": 0.17789717018604279, + "rewards/rejected": 0.10021491348743439, + "step": 1660 + }, + { + "dpo_losses": 0.6461843848228455, + "epoch": 0.44, + "grad_norm": 1.7008980006114085, + "learning_rate": 3.4614553335304407e-06, + "logits/chosen": -2.6518378257751465, + "logits/rejected": -2.6103954315185547, + "logps/chosen": -238.67007446289062, + "logps/rejected": -220.99642944335938, + "loss": 0.6762, + "positive_losses": 0.42069491744041443, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23298203945159912, + "rewards/margins": 0.10479624569416046, + "rewards/margins_max": 0.28432926535606384, + "rewards/margins_min": -0.09235044568777084, + "rewards/margins_std": 0.17162147164344788, + "rewards/rejected": 0.12818579375743866, + "step": 1670 + }, + { + "dpo_losses": 0.6569772362709045, + "epoch": 0.44, + "grad_norm": 14.248715526847992, + "learning_rate": 3.4403278249200222e-06, + "logits/chosen": -2.6170597076416016, + "logits/rejected": -2.594456672668457, + "logps/chosen": -244.982421875, + "logps/rejected": -260.5910339355469, + "loss": 0.6837, + "positive_losses": 0.6187906265258789, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20160908997058868, + "rewards/margins": 0.08317351341247559, + "rewards/margins_max": 0.2904582917690277, + "rewards/margins_min": -0.11839810758829117, + "rewards/margins_std": 0.1846894472837448, + "rewards/rejected": 0.1184355840086937, + "step": 1680 + }, + { + "dpo_losses": 0.6429563164710999, + "epoch": 0.44, + "grad_norm": 7.834677506500148, + "learning_rate": 3.4191217992068293e-06, + "logits/chosen": -2.5974044799804688, + "logits/rejected": -2.604295015335083, + "logps/chosen": -232.93344116210938, + "logps/rejected": -248.7979736328125, + "loss": 0.6826, + "positive_losses": 0.513172447681427, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.20246489346027374, + "rewards/margins": 0.11489985883235931, + "rewards/margins_max": 0.3566764295101166, + "rewards/margins_min": -0.07884009182453156, + "rewards/margins_std": 0.19784900546073914, + "rewards/rejected": 0.08756502717733383, + "step": 1690 + }, + { + "dpo_losses": 0.6462850570678711, + "epoch": 0.44, + "grad_norm": 13.119844132404689, + "learning_rate": 3.3978390270879056e-06, + "logits/chosen": -2.7017605304718018, + "logits/rejected": -2.7095248699188232, + "logps/chosen": -226.1155242919922, + "logps/rejected": -251.32919311523438, + "loss": 0.6593, + "positive_losses": 0.1556541472673416, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.20098766684532166, + "rewards/margins": 0.1058509349822998, + "rewards/margins_max": 0.2990228533744812, + "rewards/margins_min": -0.09451510012149811, + "rewards/margins_std": 0.17923924326896667, + "rewards/rejected": 0.09513673931360245, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_dpo_losses": 0.6466652154922485, + "eval_logits/chosen": -2.6614034175872803, + "eval_logits/rejected": -2.6261913776397705, + "eval_logps/chosen": -262.3858947753906, + "eval_logps/rejected": -246.8526153564453, + "eval_loss": 0.6785080432891846, + "eval_positive_losses": 0.22281299531459808, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": 0.22207526862621307, + "eval_rewards/margins": 0.10481205582618713, + "eval_rewards/margins_max": 0.39778730273246765, + "eval_rewards/margins_min": -0.1483326256275177, + "eval_rewards/margins_std": 0.18249405920505524, + "eval_rewards/rejected": 0.11726321280002594, + "eval_runtime": 427.9753, + "eval_samples_per_second": 4.673, + "eval_steps_per_second": 0.292, + "step": 1700 + }, + { + "dpo_losses": 0.6631157994270325, + "epoch": 0.45, + "grad_norm": 13.091006480923983, + "learning_rate": 3.3764812856685995e-06, + "logits/chosen": -2.752492904663086, + "logits/rejected": -2.7280375957489014, + "logps/chosen": -243.3994903564453, + "logps/rejected": -230.54507446289062, + "loss": 0.6786, + "positive_losses": 0.2162889540195465, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.22619624435901642, + "rewards/margins": 0.07061522454023361, + "rewards/margins_max": 0.2632225751876831, + "rewards/margins_min": -0.11669723689556122, + "rewards/margins_std": 0.16627001762390137, + "rewards/rejected": 0.1555810272693634, + "step": 1710 + }, + { + "dpo_losses": 0.6593276858329773, + "epoch": 0.45, + "grad_norm": 2.1784248660099768, + "learning_rate": 3.3550503583141726e-06, + "logits/chosen": -2.6774322986602783, + "logits/rejected": -2.670872211456299, + "logps/chosen": -237.01089477539062, + "logps/rejected": -249.7379913330078, + "loss": 0.6847, + "positive_losses": 0.5006786584854126, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2009124457836151, + "rewards/margins": 0.07849615067243576, + "rewards/margins_max": 0.31795603036880493, + "rewards/margins_min": -0.1156822070479393, + "rewards/margins_std": 0.19167271256446838, + "rewards/rejected": 0.12241628021001816, + "step": 1720 + }, + { + "dpo_losses": 0.6505664587020874, + "epoch": 0.45, + "grad_norm": 13.791308134245675, + "learning_rate": 3.3335480345008907e-06, + "logits/chosen": -2.6758742332458496, + "logits/rejected": -2.640214443206787, + "logps/chosen": -251.5399169921875, + "logps/rejected": -258.8959045410156, + "loss": 0.7027, + "positive_losses": 0.018910503014922142, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1981344223022461, + "rewards/margins": 0.0945214182138443, + "rewards/margins_max": 0.2809906601905823, + "rewards/margins_min": -0.058115411549806595, + "rewards/margins_std": 0.15180954337120056, + "rewards/rejected": 0.10361298173666, + "step": 1730 + }, + { + "dpo_losses": 0.6440737247467041, + "epoch": 0.46, + "grad_norm": 17.506469402688413, + "learning_rate": 3.3119761096666055e-06, + "logits/chosen": -2.607551097869873, + "logits/rejected": -2.5610995292663574, + "logps/chosen": -246.9075469970703, + "logps/rejected": -207.48495483398438, + "loss": 0.6828, + "positive_losses": 0.12918797135353088, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22817876935005188, + "rewards/margins": 0.11010940372943878, + "rewards/margins_max": 0.3282391130924225, + "rewards/margins_min": -0.06957204639911652, + "rewards/margins_std": 0.1807454377412796, + "rewards/rejected": 0.1180693507194519, + "step": 1740 + }, + { + "dpo_losses": 0.6257420182228088, + "epoch": 0.46, + "grad_norm": 5.253192475910705, + "learning_rate": 3.290336385060832e-06, + "logits/chosen": -2.7200748920440674, + "logits/rejected": -2.7011024951934814, + "logps/chosen": -293.35955810546875, + "logps/rejected": -290.0611877441406, + "loss": 0.6538, + "positive_losses": 0.06569008529186249, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23800742626190186, + "rewards/margins": 0.15084929764270782, + "rewards/margins_max": 0.3628466725349426, + "rewards/margins_min": -0.0599190779030323, + "rewards/margins_std": 0.18702241778373718, + "rewards/rejected": 0.08715813606977463, + "step": 1750 + }, + { + "dpo_losses": 0.6414699554443359, + "epoch": 0.46, + "grad_norm": 2.0931786284207314, + "learning_rate": 3.268630667594348e-06, + "logits/chosen": -2.6284737586975098, + "logits/rejected": -2.5958545207977295, + "logps/chosen": -246.00051879882812, + "logps/rejected": -238.9696807861328, + "loss": 0.6537, + "positive_losses": 0.22344326972961426, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23227672278881073, + "rewards/margins": 0.11926314979791641, + "rewards/margins_max": 0.36400899291038513, + "rewards/margins_min": -0.09165000170469284, + "rewards/margins_std": 0.2008214294910431, + "rewards/rejected": 0.11301358044147491, + "step": 1760 + }, + { + "dpo_losses": 0.6561053991317749, + "epoch": 0.46, + "grad_norm": 12.468139939453867, + "learning_rate": 3.2468607696883147e-06, + "logits/chosen": -2.6870360374450684, + "logits/rejected": -2.665567398071289, + "logps/chosen": -256.4646301269531, + "logps/rejected": -250.0025177001953, + "loss": 0.698, + "positive_losses": 0.25985726714134216, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.218532532453537, + "rewards/margins": 0.08526696264743805, + "rewards/margins_max": 0.265904039144516, + "rewards/margins_min": -0.08091627061367035, + "rewards/margins_std": 0.15936391055583954, + "rewards/rejected": 0.13326558470726013, + "step": 1770 + }, + { + "dpo_losses": 0.6368280649185181, + "epoch": 0.47, + "grad_norm": 11.775675540020119, + "learning_rate": 3.225028509122944e-06, + "logits/chosen": -2.6872940063476562, + "logits/rejected": -2.5880813598632812, + "logps/chosen": -300.2762756347656, + "logps/rejected": -226.71914672851562, + "loss": 0.6672, + "positive_losses": 0.22082766890525818, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.24134385585784912, + "rewards/margins": 0.12595918774604797, + "rewards/margins_max": 0.31853508949279785, + "rewards/margins_min": -0.07879041135311127, + "rewards/margins_std": 0.17602954804897308, + "rewards/rejected": 0.11538468301296234, + "step": 1780 + }, + { + "dpo_losses": 0.6583558320999146, + "epoch": 0.47, + "grad_norm": 2.0654924820537137, + "learning_rate": 3.2031357088857083e-06, + "logits/chosen": -2.714325189590454, + "logits/rejected": -2.6391549110412598, + "logps/chosen": -302.1825866699219, + "logps/rejected": -247.6293487548828, + "loss": 0.6977, + "positive_losses": 0.5240899920463562, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.2267806977033615, + "rewards/margins": 0.08057653903961182, + "rewards/margins_max": 0.28991439938545227, + "rewards/margins_min": -0.10471781343221664, + "rewards/margins_std": 0.17987985908985138, + "rewards/rejected": 0.1462041437625885, + "step": 1790 + }, + { + "dpo_losses": 0.6564275622367859, + "epoch": 0.47, + "grad_norm": 2.227088625134092, + "learning_rate": 3.181184197019127e-06, + "logits/chosen": -2.657984972000122, + "logits/rejected": -2.5912022590637207, + "logps/chosen": -291.07904052734375, + "logps/rejected": -244.7218780517578, + "loss": 0.6856, + "positive_losses": 0.2376052886247635, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.20932650566101074, + "rewards/margins": 0.08089903742074966, + "rewards/margins_max": 0.23547455668449402, + "rewards/margins_min": -0.06425313651561737, + "rewards/margins_std": 0.13408055901527405, + "rewards/rejected": 0.12842747569084167, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_dpo_losses": 0.6504243612289429, + "eval_logits/chosen": -2.6325595378875732, + "eval_logits/rejected": -2.5972495079040527, + "eval_logps/chosen": -261.41424560546875, + "eval_logps/rejected": -245.0161590576172, + "eval_loss": 0.6702442169189453, + "eval_positive_losses": 0.13426439464092255, + "eval_rewards/accuracies": 0.6980000138282776, + "eval_rewards/chosen": 0.2317916303873062, + "eval_rewards/margins": 0.09616386890411377, + "eval_rewards/margins_max": 0.37600037455558777, + "eval_rewards/margins_min": -0.14539237320423126, + "eval_rewards/margins_std": 0.17481529712677002, + "eval_rewards/rejected": 0.13562773168087006, + "eval_runtime": 428.5251, + "eval_samples_per_second": 4.667, + "eval_steps_per_second": 0.292, + "step": 1800 + }, + { + "dpo_losses": 0.648065447807312, + "epoch": 0.47, + "grad_norm": 1.9905249745676243, + "learning_rate": 3.159175806468126e-06, + "logits/chosen": -2.6667561531066895, + "logits/rejected": -2.604173183441162, + "logps/chosen": -298.1353454589844, + "logps/rejected": -280.3143615722656, + "loss": 0.6757, + "positive_losses": 0.18467631936073303, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22995993494987488, + "rewards/margins": 0.10116241872310638, + "rewards/margins_max": 0.3034105896949768, + "rewards/margins_min": -0.09748566150665283, + "rewards/margins_std": 0.17770439386367798, + "rewards/rejected": 0.1287975013256073, + "step": 1810 + }, + { + "dpo_losses": 0.6494191884994507, + "epoch": 0.48, + "grad_norm": 8.822187145087618, + "learning_rate": 3.1371123749269804e-06, + "logits/chosen": -2.6063125133514404, + "logits/rejected": -2.578479290008545, + "logps/chosen": -256.0450439453125, + "logps/rejected": -266.94189453125, + "loss": 0.6798, + "positive_losses": 0.39024466276168823, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.24025802314281464, + "rewards/margins": 0.10246167331933975, + "rewards/margins_max": 0.33869725465774536, + "rewards/margins_min": -0.07729412615299225, + "rewards/margins_std": 0.1838615983724594, + "rewards/rejected": 0.1377963423728943, + "step": 1820 + }, + { + "dpo_losses": 0.6494563817977905, + "epoch": 0.48, + "grad_norm": 1.8413912021959202, + "learning_rate": 3.114995744685877e-06, + "logits/chosen": -2.646260976791382, + "logits/rejected": -2.6059529781341553, + "logps/chosen": -284.9770812988281, + "logps/rejected": -316.1197204589844, + "loss": 0.6638, + "positive_losses": 0.3104667663574219, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24419507384300232, + "rewards/margins": 0.10210222005844116, + "rewards/margins_max": 0.34120216965675354, + "rewards/margins_min": -0.10375956445932388, + "rewards/margins_std": 0.20285239815711975, + "rewards/rejected": 0.14209285378456116, + "step": 1830 + }, + { + "dpo_losses": 0.647398829460144, + "epoch": 0.48, + "grad_norm": 12.69858491478049, + "learning_rate": 3.0928277624770743e-06, + "logits/chosen": -2.650028705596924, + "logits/rejected": -2.5910632610321045, + "logps/chosen": -240.1370391845703, + "logps/rejected": -220.29757690429688, + "loss": 0.6698, + "positive_losses": 0.24458742141723633, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21732616424560547, + "rewards/margins": 0.10281439870595932, + "rewards/margins_max": 0.3056146502494812, + "rewards/margins_min": -0.07628180831670761, + "rewards/margins_std": 0.17483511567115784, + "rewards/rejected": 0.11451175063848495, + "step": 1840 + }, + { + "dpo_losses": 0.6395186185836792, + "epoch": 0.48, + "grad_norm": 2.0071692716628116, + "learning_rate": 3.070610279320708e-06, + "logits/chosen": -2.639828681945801, + "logits/rejected": -2.5725197792053223, + "logps/chosen": -242.0826873779297, + "logps/rejected": -234.2345733642578, + "loss": 0.6526, + "positive_losses": 0.12014941871166229, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.20422932505607605, + "rewards/margins": 0.11995555460453033, + "rewards/margins_max": 0.28611624240875244, + "rewards/margins_min": -0.06965469568967819, + "rewards/margins_std": 0.1571904718875885, + "rewards/rejected": 0.08427377045154572, + "step": 1850 + }, + { + "dpo_losses": 0.6299269795417786, + "epoch": 0.49, + "grad_norm": 11.192479224306089, + "learning_rate": 3.0483451503702264e-06, + "logits/chosen": -2.6060972213745117, + "logits/rejected": -2.5461373329162598, + "logps/chosen": -283.3192443847656, + "logps/rejected": -275.7360534667969, + "loss": 0.6694, + "positive_losses": 0.40090227127075195, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.23160043358802795, + "rewards/margins": 0.15093091130256653, + "rewards/margins_max": 0.452861487865448, + "rewards/margins_min": -0.10485140979290009, + "rewards/margins_std": 0.247911736369133, + "rewards/rejected": 0.08066950738430023, + "step": 1860 + }, + { + "dpo_losses": 0.6297956705093384, + "epoch": 0.49, + "grad_norm": 1.9491342897024369, + "learning_rate": 3.0260342347574916e-06, + "logits/chosen": -2.612184524536133, + "logits/rejected": -2.5798373222351074, + "logps/chosen": -266.5758972167969, + "logps/rejected": -257.8975830078125, + "loss": 0.638, + "positive_losses": 0.0, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.22928202152252197, + "rewards/margins": 0.1414060890674591, + "rewards/margins_max": 0.3423546850681305, + "rewards/margins_min": -0.05438286066055298, + "rewards/margins_std": 0.1759631633758545, + "rewards/rejected": 0.08787593990564346, + "step": 1870 + }, + { + "dpo_losses": 0.633985698223114, + "epoch": 0.49, + "grad_norm": 25.29399641975962, + "learning_rate": 3.0036793954375358e-06, + "logits/chosen": -2.6048264503479004, + "logits/rejected": -2.570802688598633, + "logps/chosen": -242.4512176513672, + "logps/rejected": -228.9194793701172, + "loss": 0.6871, + "positive_losses": 0.2523138225078583, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2291102111339569, + "rewards/margins": 0.13221105933189392, + "rewards/margins_max": 0.33114004135131836, + "rewards/margins_min": -0.06668253242969513, + "rewards/margins_std": 0.18370743095874786, + "rewards/rejected": 0.09689915925264359, + "step": 1880 + }, + { + "dpo_losses": 0.6468071341514587, + "epoch": 0.49, + "grad_norm": 2.190082986905398, + "learning_rate": 2.981282499033009e-06, + "logits/chosen": -2.689648151397705, + "logits/rejected": -2.6369049549102783, + "logps/chosen": -289.0721740722656, + "logps/rejected": -275.77740478515625, + "loss": 0.6958, + "positive_losses": 0.37820395827293396, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2109154462814331, + "rewards/margins": 0.10500963032245636, + "rewards/margins_max": 0.3085101246833801, + "rewards/margins_min": -0.08139508962631226, + "rewards/margins_std": 0.17975106835365295, + "rewards/rejected": 0.10590583086013794, + "step": 1890 + }, + { + "dpo_losses": 0.6324842572212219, + "epoch": 0.5, + "grad_norm": 1.733292043939962, + "learning_rate": 2.9588454156783163e-06, + "logits/chosen": -2.688908338546753, + "logits/rejected": -2.6539764404296875, + "logps/chosen": -274.11370849609375, + "logps/rejected": -247.70663452148438, + "loss": 0.6552, + "positive_losses": 0.31979283690452576, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22242672741413116, + "rewards/margins": 0.1341570019721985, + "rewards/margins_max": 0.3386608958244324, + "rewards/margins_min": -0.04844938963651657, + "rewards/margins_std": 0.1713506430387497, + "rewards/rejected": 0.08826972544193268, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_dpo_losses": 0.6483587622642517, + "eval_logits/chosen": -2.6118340492248535, + "eval_logits/rejected": -2.5760679244995117, + "eval_logps/chosen": -261.8096008300781, + "eval_logps/rejected": -245.90634155273438, + "eval_loss": 0.6742714643478394, + "eval_positive_losses": 0.18548505008220673, + "eval_rewards/accuracies": 0.6990000009536743, + "eval_rewards/chosen": 0.22783830761909485, + "eval_rewards/margins": 0.10111244767904282, + "eval_rewards/margins_max": 0.39197680354118347, + "eval_rewards/margins_min": -0.14937348663806915, + "eval_rewards/margins_std": 0.1816234290599823, + "eval_rewards/rejected": 0.12672588229179382, + "eval_runtime": 428.2063, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 1900 + }, + { + "dpo_losses": 0.6267801523208618, + "epoch": 0.5, + "grad_norm": 15.330194456057852, + "learning_rate": 2.9363700188634597e-06, + "logits/chosen": -2.6190261840820312, + "logits/rejected": -2.625927209854126, + "logps/chosen": -255.4626007080078, + "logps/rejected": -255.8193359375, + "loss": 0.6525, + "positive_losses": 0.085240438580513, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.26029080152511597, + "rewards/margins": 0.14714768528938293, + "rewards/margins_max": 0.3583988547325134, + "rewards/margins_min": -0.02962956205010414, + "rewards/margins_std": 0.16915565729141235, + "rewards/rejected": 0.11314307153224945, + "step": 1910 + }, + { + "dpo_losses": 0.6608460545539856, + "epoch": 0.5, + "grad_norm": 2.001834623969936, + "learning_rate": 2.9138581852776053e-06, + "logits/chosen": -2.610356569290161, + "logits/rejected": -2.593571662902832, + "logps/chosen": -240.2068634033203, + "logps/rejected": -213.23709106445312, + "loss": 0.6832, + "positive_losses": 0.3589244782924652, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20095577836036682, + "rewards/margins": 0.07470414787530899, + "rewards/margins_max": 0.26924365758895874, + "rewards/margins_min": -0.11080431938171387, + "rewards/margins_std": 0.16561010479927063, + "rewards/rejected": 0.12625160813331604, + "step": 1920 + }, + { + "dpo_losses": 0.6245434880256653, + "epoch": 0.51, + "grad_norm": 11.289943061462033, + "learning_rate": 2.8913117946523805e-06, + "logits/chosen": -2.691692590713501, + "logits/rejected": -2.6207187175750732, + "logps/chosen": -280.5381164550781, + "logps/rejected": -236.1483154296875, + "loss": 0.6727, + "positive_losses": 0.04957924038171768, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24948105216026306, + "rewards/margins": 0.15194857120513916, + "rewards/margins_max": 0.365040123462677, + "rewards/margins_min": -0.04834876209497452, + "rewards/margins_std": 0.18084149062633514, + "rewards/rejected": 0.0975324809551239, + "step": 1930 + }, + { + "dpo_losses": 0.6420565843582153, + "epoch": 0.51, + "grad_norm": 22.284367111904366, + "learning_rate": 2.8687327296049126e-06, + "logits/chosen": -2.6661148071289062, + "logits/rejected": -2.6287002563476562, + "logps/chosen": -273.7840881347656, + "logps/rejected": -255.37850952148438, + "loss": 0.6836, + "positive_losses": 0.5944596529006958, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.20418283343315125, + "rewards/margins": 0.11511027812957764, + "rewards/margins_max": 0.30449509620666504, + "rewards/margins_min": -0.08485864102840424, + "rewards/margins_std": 0.1740976870059967, + "rewards/rejected": 0.0890725702047348, + "step": 1940 + }, + { + "dpo_losses": 0.6480592489242554, + "epoch": 0.51, + "grad_norm": 1.9088918795813237, + "learning_rate": 2.8461228754806376e-06, + "logits/chosen": -2.661724090576172, + "logits/rejected": -2.6638152599334717, + "logps/chosen": -252.10205078125, + "logps/rejected": -240.0064697265625, + "loss": 0.6628, + "positive_losses": 0.27016982436180115, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22642064094543457, + "rewards/margins": 0.10169925540685654, + "rewards/margins_max": 0.32218581438064575, + "rewards/margins_min": -0.07501425594091415, + "rewards/margins_std": 0.17813637852668762, + "rewards/rejected": 0.12472137063741684, + "step": 1950 + }, + { + "dpo_losses": 0.6339899301528931, + "epoch": 0.51, + "grad_norm": 2.0925300801985247, + "learning_rate": 2.823484120195865e-06, + "logits/chosen": -2.636782169342041, + "logits/rejected": -2.610898494720459, + "logps/chosen": -274.0904846191406, + "logps/rejected": -233.5660858154297, + "loss": 0.6656, + "positive_losses": 0.0016654968494549394, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2515600323677063, + "rewards/margins": 0.13343092799186707, + "rewards/margins_max": 0.36472180485725403, + "rewards/margins_min": -0.06343535333871841, + "rewards/margins_std": 0.1956871747970581, + "rewards/rejected": 0.11812911927700043, + "step": 1960 + }, + { + "dpo_losses": 0.6549708843231201, + "epoch": 0.52, + "grad_norm": 1.8122554482530602, + "learning_rate": 2.8008183540801486e-06, + "logits/chosen": -2.6718621253967285, + "logits/rejected": -2.6593704223632812, + "logps/chosen": -234.1223602294922, + "logps/rejected": -219.40792846679688, + "loss": 0.6895, + "positive_losses": 0.42588481307029724, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2129441797733307, + "rewards/margins": 0.08714379370212555, + "rewards/margins_max": 0.2743634283542633, + "rewards/margins_min": -0.10051698982715607, + "rewards/margins_std": 0.16739001870155334, + "rewards/rejected": 0.12580038607120514, + "step": 1970 + }, + { + "dpo_losses": 0.6716464161872864, + "epoch": 0.52, + "grad_norm": 11.150848876388851, + "learning_rate": 2.7781274697184353e-06, + "logits/chosen": -2.6749327182769775, + "logits/rejected": -2.625612258911133, + "logps/chosen": -221.8278045654297, + "logps/rejected": -226.98898315429688, + "loss": 0.6794, + "positive_losses": 0.11149444431066513, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.20755262672901154, + "rewards/margins": 0.050573475658893585, + "rewards/margins_max": 0.23641857504844666, + "rewards/margins_min": -0.10350849479436874, + "rewards/margins_std": 0.15354886651039124, + "rewards/rejected": 0.15697914361953735, + "step": 1980 + }, + { + "dpo_losses": 0.6682985424995422, + "epoch": 0.52, + "grad_norm": 1.8371998444460573, + "learning_rate": 2.7554133617930397e-06, + "logits/chosen": -2.6378262042999268, + "logits/rejected": -2.545085906982422, + "logps/chosen": -248.79019165039062, + "logps/rejected": -216.0754852294922, + "loss": 0.6744, + "positive_losses": 0.17018738389015198, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.2032647579908371, + "rewards/margins": 0.05912317708134651, + "rewards/margins_max": 0.236159086227417, + "rewards/margins_min": -0.11596567928791046, + "rewards/margins_std": 0.1587970107793808, + "rewards/rejected": 0.1441415697336197, + "step": 1990 + }, + { + "dpo_losses": 0.6558287143707275, + "epoch": 0.52, + "grad_norm": 5.998555645346825, + "learning_rate": 2.7326779269254363e-06, + "logits/chosen": -2.6352341175079346, + "logits/rejected": -2.6149191856384277, + "logps/chosen": -226.8389434814453, + "logps/rejected": -214.03012084960938, + "loss": 0.6577, + "positive_losses": 0.10618214309215546, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24029576778411865, + "rewards/margins": 0.08520184457302094, + "rewards/margins_max": 0.26697662472724915, + "rewards/margins_min": -0.11164041608572006, + "rewards/margins_std": 0.17001576721668243, + "rewards/rejected": 0.1550939381122589, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_dpo_losses": 0.6460915207862854, + "eval_logits/chosen": -2.624117136001587, + "eval_logits/rejected": -2.586920738220215, + "eval_logps/chosen": -261.489013671875, + "eval_logps/rejected": -246.10638427734375, + "eval_loss": 0.6748321652412415, + "eval_positive_losses": 0.20357099175453186, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": 0.23104406893253326, + "eval_rewards/margins": 0.10631891340017319, + "eval_rewards/margins_max": 0.4015742838382721, + "eval_rewards/margins_min": -0.152598038315773, + "eval_rewards/margins_std": 0.18525823950767517, + "eval_rewards/rejected": 0.12472515553236008, + "eval_runtime": 428.2412, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 2000 + }, + { + "dpo_losses": 0.6374010443687439, + "epoch": 0.53, + "grad_norm": 1.973224943165519, + "learning_rate": 2.7099230635178954e-06, + "logits/chosen": -2.5919179916381836, + "logits/rejected": -2.601430892944336, + "logps/chosen": -241.382568359375, + "logps/rejected": -238.89810180664062, + "loss": 0.6445, + "positive_losses": 0.02119159698486328, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.23006686568260193, + "rewards/margins": 0.12360439449548721, + "rewards/margins_max": 0.3254317045211792, + "rewards/margins_min": -0.03285626322031021, + "rewards/margins_std": 0.16100876033306122, + "rewards/rejected": 0.1064625009894371, + "step": 2010 + }, + { + "dpo_losses": 0.6473142504692078, + "epoch": 0.53, + "grad_norm": 15.41456339467445, + "learning_rate": 2.6871506715949608e-06, + "logits/chosen": -2.728001832962036, + "logits/rejected": -2.6899797916412354, + "logps/chosen": -294.582763671875, + "logps/rejected": -275.78729248046875, + "loss": 0.6803, + "positive_losses": 0.3745996356010437, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2238301932811737, + "rewards/margins": 0.10699672996997833, + "rewards/margins_max": 0.36772987246513367, + "rewards/margins_min": -0.10426433384418488, + "rewards/margins_std": 0.2051413357257843, + "rewards/rejected": 0.11683347076177597, + "step": 2020 + }, + { + "dpo_losses": 0.6469787359237671, + "epoch": 0.53, + "grad_norm": 2.038889559723446, + "learning_rate": 2.6643626526448063e-06, + "logits/chosen": -2.690023422241211, + "logits/rejected": -2.6673452854156494, + "logps/chosen": -250.1510772705078, + "logps/rejected": -255.42233276367188, + "loss": 0.6965, + "positive_losses": 0.2789936661720276, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.2083875685930252, + "rewards/margins": 0.10868855565786362, + "rewards/margins_max": 0.35883814096450806, + "rewards/margins_min": -0.13523946702480316, + "rewards/margins_std": 0.2176806479692459, + "rewards/rejected": 0.099699005484581, + "step": 2030 + }, + { + "dpo_losses": 0.627991259098053, + "epoch": 0.53, + "grad_norm": 2.0095529100790013, + "learning_rate": 2.6415609094604562e-06, + "logits/chosen": -2.543858051300049, + "logits/rejected": -2.564131021499634, + "logps/chosen": -284.96490478515625, + "logps/rejected": -213.75936889648438, + "loss": 0.6419, + "positive_losses": 0.08691177517175674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.235652893781662, + "rewards/margins": 0.1479492485523224, + "rewards/margins_max": 0.40268969535827637, + "rewards/margins_min": -0.03910567983984947, + "rewards/margins_std": 0.19945472478866577, + "rewards/rejected": 0.08770367503166199, + "step": 2040 + }, + { + "dpo_losses": 0.6487405896186829, + "epoch": 0.54, + "grad_norm": 27.932688868454377, + "learning_rate": 2.618747345980904e-06, + "logits/chosen": -2.665897846221924, + "logits/rejected": -2.6270029544830322, + "logps/chosen": -238.196044921875, + "logps/rejected": -246.7635498046875, + "loss": 0.7043, + "positive_losses": 0.5654325485229492, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.210309699177742, + "rewards/margins": 0.1032426729798317, + "rewards/margins_max": 0.32105982303619385, + "rewards/margins_min": -0.10647524893283844, + "rewards/margins_std": 0.19207589328289032, + "rewards/rejected": 0.10706702619791031, + "step": 2050 + }, + { + "dpo_losses": 0.6493858695030212, + "epoch": 0.54, + "grad_norm": 2.2409215669673306, + "learning_rate": 2.595923867132136e-06, + "logits/chosen": -2.702725887298584, + "logits/rejected": -2.6534149646759033, + "logps/chosen": -281.5009460449219, + "logps/rejected": -256.12164306640625, + "loss": 0.6666, + "positive_losses": 0.2502501606941223, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.26053109765052795, + "rewards/margins": 0.10209240764379501, + "rewards/margins_max": 0.3246293067932129, + "rewards/margins_min": -0.10611675679683685, + "rewards/margins_std": 0.19126805663108826, + "rewards/rejected": 0.15843868255615234, + "step": 2060 + }, + { + "dpo_losses": 0.6561940312385559, + "epoch": 0.54, + "grad_norm": 1.9395106557743642, + "learning_rate": 2.5730923786680672e-06, + "logits/chosen": -2.672586441040039, + "logits/rejected": -2.6671547889709473, + "logps/chosen": -239.53213500976562, + "logps/rejected": -263.83819580078125, + "loss": 0.6554, + "positive_losses": 0.08901993930339813, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22132229804992676, + "rewards/margins": 0.0845700055360794, + "rewards/margins_max": 0.28750890493392944, + "rewards/margins_min": -0.09480021893978119, + "rewards/margins_std": 0.17295077443122864, + "rewards/rejected": 0.13675229251384735, + "step": 2070 + }, + { + "dpo_losses": 0.648390531539917, + "epoch": 0.54, + "grad_norm": 2.1118077695813278, + "learning_rate": 2.5502547870114137e-06, + "logits/chosen": -2.692939519882202, + "logits/rejected": -2.6532864570617676, + "logps/chosen": -245.56301879882812, + "logps/rejected": -235.213623046875, + "loss": 0.6669, + "positive_losses": 0.24616627395153046, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2260979413986206, + "rewards/margins": 0.10583852231502533, + "rewards/margins_max": 0.3507748544216156, + "rewards/margins_min": -0.14606128633022308, + "rewards/margins_std": 0.22330446541309357, + "rewards/rejected": 0.12025941908359528, + "step": 2080 + }, + { + "dpo_losses": 0.6537029147148132, + "epoch": 0.55, + "grad_norm": 2.312745261718066, + "learning_rate": 2.527412999094507e-06, + "logits/chosen": -2.614720106124878, + "logits/rejected": -2.61167573928833, + "logps/chosen": -245.9591064453125, + "logps/rejected": -246.91757202148438, + "loss": 0.6898, + "positive_losses": 0.22916193306446075, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.19518707692623138, + "rewards/margins": 0.08987731486558914, + "rewards/margins_max": 0.2740749418735504, + "rewards/margins_min": -0.08669424802064896, + "rewards/margins_std": 0.15994782745838165, + "rewards/rejected": 0.10530976206064224, + "step": 2090 + }, + { + "dpo_losses": 0.6503337621688843, + "epoch": 0.55, + "grad_norm": 1.6356648623281171, + "learning_rate": 2.504568922200064e-06, + "logits/chosen": -2.6984810829162598, + "logits/rejected": -2.667903423309326, + "logps/chosen": -225.71798706054688, + "logps/rejected": -187.17330932617188, + "loss": 0.6695, + "positive_losses": 0.14211463928222656, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2089039832353592, + "rewards/margins": 0.09576747566461563, + "rewards/margins_max": 0.26251569390296936, + "rewards/margins_min": -0.05504865571856499, + "rewards/margins_std": 0.1408441960811615, + "rewards/rejected": 0.11313650757074356, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_dpo_losses": 0.6443433165550232, + "eval_logits/chosen": -2.6403861045837402, + "eval_logits/rejected": -2.6032521724700928, + "eval_logps/chosen": -262.2980041503906, + "eval_logps/rejected": -247.34201049804688, + "eval_loss": 0.6841303706169128, + "eval_positive_losses": 0.28419262170791626, + "eval_rewards/accuracies": 0.7099999785423279, + "eval_rewards/chosen": 0.22295419871807098, + "eval_rewards/margins": 0.11058513075113297, + "eval_rewards/margins_max": 0.42021989822387695, + "eval_rewards/margins_min": -0.153671532869339, + "eval_rewards/margins_std": 0.19153155386447906, + "eval_rewards/rejected": 0.11236906796693802, + "eval_runtime": 428.0985, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 2100 + }, + { + "dpo_losses": 0.6568988561630249, + "epoch": 0.55, + "grad_norm": 1.7868341286072185, + "learning_rate": 2.4817244638019333e-06, + "logits/chosen": -2.661259174346924, + "logits/rejected": -2.641444683074951, + "logps/chosen": -235.8902587890625, + "logps/rejected": -235.330322265625, + "loss": 0.7136, + "positive_losses": 0.02505035325884819, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.2192874401807785, + "rewards/margins": 0.08493863046169281, + "rewards/margins_max": 0.3254045844078064, + "rewards/margins_min": -0.10991616547107697, + "rewards/margins_std": 0.1919499933719635, + "rewards/rejected": 0.1343488246202469, + "step": 2110 + }, + { + "dpo_losses": 0.6377665996551514, + "epoch": 0.55, + "grad_norm": 21.985245351908034, + "learning_rate": 2.4588815314058155e-06, + "logits/chosen": -2.6296534538269043, + "logits/rejected": -2.590580463409424, + "logps/chosen": -226.78634643554688, + "logps/rejected": -234.3419952392578, + "loss": 0.6845, + "positive_losses": 0.016133880242705345, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22431540489196777, + "rewards/margins": 0.12120058387517929, + "rewards/margins_max": 0.29898110032081604, + "rewards/margins_min": -0.0451454222202301, + "rewards/margins_std": 0.15597811341285706, + "rewards/rejected": 0.10311480611562729, + "step": 2120 + }, + { + "dpo_losses": 0.6583911776542664, + "epoch": 0.56, + "grad_norm": 11.158834718963995, + "learning_rate": 2.4360420323899922e-06, + "logits/chosen": -2.6720199584960938, + "logits/rejected": -2.664806842803955, + "logps/chosen": -242.418701171875, + "logps/rejected": -251.580078125, + "loss": 0.6923, + "positive_losses": 0.3454399108886719, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.20674967765808105, + "rewards/margins": 0.07989230006933212, + "rewards/margins_max": 0.28146398067474365, + "rewards/margins_min": -0.11776062101125717, + "rewards/margins_std": 0.17787934839725494, + "rewards/rejected": 0.12685738503932953, + "step": 2130 + }, + { + "dpo_losses": 0.641891598701477, + "epoch": 0.56, + "grad_norm": 22.608503028278708, + "learning_rate": 2.4132078738460585e-06, + "logits/chosen": -2.6954100131988525, + "logits/rejected": -2.679792881011963, + "logps/chosen": -255.9194793701172, + "logps/rejected": -250.09262084960938, + "loss": 0.67, + "positive_losses": 0.2556828558444977, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.22262707352638245, + "rewards/margins": 0.1144232302904129, + "rewards/margins_max": 0.31138306856155396, + "rewards/margins_min": -0.07543648779392242, + "rewards/margins_std": 0.1741967797279358, + "rewards/rejected": 0.10820382833480835, + "step": 2140 + }, + { + "dpo_losses": 0.6457602381706238, + "epoch": 0.56, + "grad_norm": 8.879370244695442, + "learning_rate": 2.3903809624196826e-06, + "logits/chosen": -2.6508166790008545, + "logits/rejected": -2.6161789894104004, + "logps/chosen": -260.8797912597656, + "logps/rejected": -224.77035522460938, + "loss": 0.6811, + "positive_losses": 0.41312235593795776, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2238878309726715, + "rewards/margins": 0.10860620439052582, + "rewards/margins_max": 0.30983787775039673, + "rewards/margins_min": -0.11345354467630386, + "rewards/margins_std": 0.18995985388755798, + "rewards/rejected": 0.11528158187866211, + "step": 2150 + }, + { + "dpo_losses": 0.6489660143852234, + "epoch": 0.57, + "grad_norm": 23.894288265582425, + "learning_rate": 2.3675632041513978e-06, + "logits/chosen": -2.5794270038604736, + "logits/rejected": -2.571488618850708, + "logps/chosen": -247.68661499023438, + "logps/rejected": -241.1427001953125, + "loss": 0.6863, + "positive_losses": 0.3009759783744812, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.19268381595611572, + "rewards/margins": 0.0996984988451004, + "rewards/margins_max": 0.30270710587501526, + "rewards/margins_min": -0.09910142421722412, + "rewards/margins_std": 0.1765524446964264, + "rewards/rejected": 0.09298529475927353, + "step": 2160 + }, + { + "dpo_losses": 0.6239650249481201, + "epoch": 0.57, + "grad_norm": 14.773507733836224, + "learning_rate": 2.3447565043174533e-06, + "logits/chosen": -2.550816059112549, + "logits/rejected": -2.582885265350342, + "logps/chosen": -252.0146026611328, + "logps/rejected": -252.2535858154297, + "loss": 0.672, + "positive_losses": 0.3954521119594574, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2451392114162445, + "rewards/margins": 0.15671542286872864, + "rewards/margins_max": 0.39923062920570374, + "rewards/margins_min": -0.06341136246919632, + "rewards/margins_std": 0.2062772810459137, + "rewards/rejected": 0.08842380344867706, + "step": 2170 + }, + { + "dpo_losses": 0.6374545693397522, + "epoch": 0.57, + "grad_norm": 2.091495837749851, + "learning_rate": 2.321962767270724e-06, + "logits/chosen": -2.7370948791503906, + "logits/rejected": -2.689626693725586, + "logps/chosen": -287.5489501953125, + "logps/rejected": -249.8207244873047, + "loss": 0.6783, + "positive_losses": 0.3480583131313324, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2280876636505127, + "rewards/margins": 0.12556587159633636, + "rewards/margins_max": 0.34966373443603516, + "rewards/margins_min": -0.07942704856395721, + "rewards/margins_std": 0.18983253836631775, + "rewards/rejected": 0.10252177715301514, + "step": 2180 + }, + { + "dpo_losses": 0.6545939445495605, + "epoch": 0.57, + "grad_norm": 9.123808482513535, + "learning_rate": 2.299183896281692e-06, + "logits/chosen": -2.6965737342834473, + "logits/rejected": -2.6977882385253906, + "logps/chosen": -251.5290069580078, + "logps/rejected": -271.16448974609375, + "loss": 0.6629, + "positive_losses": 0.02122955396771431, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20595040917396545, + "rewards/margins": 0.08762118965387344, + "rewards/margins_max": 0.2856506109237671, + "rewards/margins_min": -0.11087393760681152, + "rewards/margins_std": 0.1784714311361313, + "rewards/rejected": 0.11832920461893082, + "step": 2190 + }, + { + "dpo_losses": 0.6504623889923096, + "epoch": 0.58, + "grad_norm": 2.1236901453664587, + "learning_rate": 2.2764217933795297e-06, + "logits/chosen": -2.6987946033477783, + "logits/rejected": -2.6358554363250732, + "logps/chosen": -271.6541442871094, + "logps/rejected": -237.68222045898438, + "loss": 0.6633, + "positive_losses": 0.3033943176269531, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22934004664421082, + "rewards/margins": 0.097927525639534, + "rewards/margins_max": 0.30080220103263855, + "rewards/margins_min": -0.12408483028411865, + "rewards/margins_std": 0.18676115572452545, + "rewards/rejected": 0.13141249120235443, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_dpo_losses": 0.6435040235519409, + "eval_logits/chosen": -2.6383254528045654, + "eval_logits/rejected": -2.6014163494110107, + "eval_logps/chosen": -261.85888671875, + "eval_logps/rejected": -247.10403442382812, + "eval_loss": 0.6799082159996033, + "eval_positive_losses": 0.2579602301120758, + "eval_rewards/accuracies": 0.7139999866485596, + "eval_rewards/chosen": 0.2273455262184143, + "eval_rewards/margins": 0.1125965267419815, + "eval_rewards/margins_max": 0.42541539669036865, + "eval_rewards/margins_min": -0.15488092601299286, + "eval_rewards/margins_std": 0.19319504499435425, + "eval_rewards/rejected": 0.114749014377594, + "eval_runtime": 428.0816, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 2200 + }, + { + "dpo_losses": 0.6446993350982666, + "epoch": 0.58, + "grad_norm": 14.01389386257104, + "learning_rate": 2.2536783591932786e-06, + "logits/chosen": -2.656202793121338, + "logits/rejected": -2.613140106201172, + "logps/chosen": -239.73202514648438, + "logps/rejected": -248.09237670898438, + "loss": 0.6649, + "positive_losses": 0.35535115003585815, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.21086589992046356, + "rewards/margins": 0.10729577392339706, + "rewards/margins_max": 0.28793585300445557, + "rewards/margins_min": -0.07084666192531586, + "rewards/margins_std": 0.16439056396484375, + "rewards/rejected": 0.1035701185464859, + "step": 2210 + }, + { + "dpo_losses": 0.6435825228691101, + "epoch": 0.58, + "grad_norm": 18.876870574218103, + "learning_rate": 2.230955492793149e-06, + "logits/chosen": -2.7256557941436768, + "logits/rejected": -2.7092697620391846, + "logps/chosen": -301.4788513183594, + "logps/rejected": -264.95849609375, + "loss": 0.6931, + "positive_losses": 0.31521472334861755, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.21002745628356934, + "rewards/margins": 0.11338339000940323, + "rewards/margins_max": 0.3429552912712097, + "rewards/margins_min": -0.11895330995321274, + "rewards/margins_std": 0.20564672350883484, + "rewards/rejected": 0.0966440886259079, + "step": 2220 + }, + { + "dpo_losses": 0.6391919851303101, + "epoch": 0.58, + "grad_norm": 20.86574326362134, + "learning_rate": 2.208255091531947e-06, + "logits/chosen": -2.610348701477051, + "logits/rejected": -2.6132984161376953, + "logps/chosen": -228.4042205810547, + "logps/rejected": -228.87814331054688, + "loss": 0.6615, + "positive_losses": 0.11838321387767792, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23576466739177704, + "rewards/margins": 0.12023010104894638, + "rewards/margins_max": 0.32014840841293335, + "rewards/margins_min": -0.06382576376199722, + "rewards/margins_std": 0.17094869911670685, + "rewards/rejected": 0.11553458124399185, + "step": 2230 + }, + { + "dpo_losses": 0.6561200618743896, + "epoch": 0.59, + "grad_norm": 2.1399648720540925, + "learning_rate": 2.1855790508866435e-06, + "logits/chosen": -2.5838656425476074, + "logits/rejected": -2.5281291007995605, + "logps/chosen": -262.9925231933594, + "logps/rejected": -244.24853515625, + "loss": 0.7209, + "positive_losses": 0.18396730720996857, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.25871050357818604, + "rewards/margins": 0.10102052986621857, + "rewards/margins_max": 0.3833063244819641, + "rewards/margins_min": -0.18261994421482086, + "rewards/margins_std": 0.25534966588020325, + "rewards/rejected": 0.15768997371196747, + "step": 2240 + }, + { + "dpo_losses": 0.6441120505332947, + "epoch": 0.59, + "grad_norm": 6.884931569365359, + "learning_rate": 2.162929264300107e-06, + "logits/chosen": -2.6860594749450684, + "logits/rejected": -2.6936264038085938, + "logps/chosen": -262.0154113769531, + "logps/rejected": -240.4822998046875, + "loss": 0.7164, + "positive_losses": 0.605613112449646, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.21775564551353455, + "rewards/margins": 0.11152400076389313, + "rewards/margins_max": 0.3520006239414215, + "rewards/margins_min": -0.0792192816734314, + "rewards/margins_std": 0.19621047377586365, + "rewards/rejected": 0.10623165220022202, + "step": 2250 + }, + { + "dpo_losses": 0.6494182348251343, + "epoch": 0.59, + "grad_norm": 11.585645778736357, + "learning_rate": 2.1403076230230006e-06, + "logits/chosen": -2.573509693145752, + "logits/rejected": -2.5338587760925293, + "logps/chosen": -246.6294403076172, + "logps/rejected": -238.9720001220703, + "loss": 0.6875, + "positive_losses": 0.24757306277751923, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21872088313102722, + "rewards/margins": 0.09721089154481888, + "rewards/margins_max": 0.2833651900291443, + "rewards/margins_min": -0.1053292527794838, + "rewards/margins_std": 0.1713966429233551, + "rewards/rejected": 0.12150999158620834, + "step": 2260 + }, + { + "dpo_losses": 0.6332554817199707, + "epoch": 0.59, + "grad_norm": 2.0882530361771363, + "learning_rate": 2.11771601595586e-06, + "logits/chosen": -2.669936418533325, + "logits/rejected": -2.6094093322753906, + "logps/chosen": -242.7032928466797, + "logps/rejected": -234.1455535888672, + "loss": 0.6378, + "positive_losses": 0.030338669195771217, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.26567015051841736, + "rewards/margins": 0.13651952147483826, + "rewards/margins_max": 0.3859143853187561, + "rewards/margins_min": -0.07704669237136841, + "rewards/margins_std": 0.20730972290039062, + "rewards/rejected": 0.1291506141424179, + "step": 2270 + }, + { + "dpo_losses": 0.6492759585380554, + "epoch": 0.6, + "grad_norm": 10.500555841174924, + "learning_rate": 2.0951563294913737e-06, + "logits/chosen": -2.61448335647583, + "logits/rejected": -2.598428249359131, + "logps/chosen": -261.6297912597656, + "logps/rejected": -271.35736083984375, + "loss": 0.6854, + "positive_losses": 0.5357402563095093, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23328149318695068, + "rewards/margins": 0.10041890293359756, + "rewards/margins_max": 0.30268600583076477, + "rewards/margins_min": -0.10914675891399384, + "rewards/margins_std": 0.185011625289917, + "rewards/rejected": 0.13286259770393372, + "step": 2280 + }, + { + "dpo_losses": 0.6415624618530273, + "epoch": 0.6, + "grad_norm": 1.641243663580953, + "learning_rate": 2.0726304473568693e-06, + "logits/chosen": -2.65415620803833, + "logits/rejected": -2.597093105316162, + "logps/chosen": -235.9225616455078, + "logps/rejected": -211.4453582763672, + "loss": 0.6927, + "positive_losses": 0.8288175463676453, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.23576009273529053, + "rewards/margins": 0.11631790548563004, + "rewards/margins_max": 0.34672680497169495, + "rewards/margins_min": -0.07067125290632248, + "rewards/margins_std": 0.18884898722171783, + "rewards/rejected": 0.1194421797990799, + "step": 2290 + }, + { + "dpo_losses": 0.65235835313797, + "epoch": 0.6, + "grad_norm": 1.9386519422396105, + "learning_rate": 2.050140250457023e-06, + "logits/chosen": -2.6757822036743164, + "logits/rejected": -2.6740939617156982, + "logps/chosen": -264.4683532714844, + "logps/rejected": -268.44500732421875, + "loss": 0.7136, + "positive_losses": 0.49965667724609375, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22599156200885773, + "rewards/margins": 0.0977611392736435, + "rewards/margins_max": 0.3241087794303894, + "rewards/margins_min": -0.10137069225311279, + "rewards/margins_std": 0.18976759910583496, + "rewards/rejected": 0.12823040783405304, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_dpo_losses": 0.6443361639976501, + "eval_logits/chosen": -2.6471359729766846, + "eval_logits/rejected": -2.611751079559326, + "eval_logps/chosen": -261.69073486328125, + "eval_logps/rejected": -246.74459838867188, + "eval_loss": 0.6781137585639954, + "eval_positive_losses": 0.23762869834899902, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": 0.2290269285440445, + "eval_rewards/margins": 0.11068341135978699, + "eval_rewards/margins_max": 0.4196871817111969, + "eval_rewards/margins_min": -0.15320247411727905, + "eval_rewards/margins_std": 0.19144602119922638, + "eval_rewards/rejected": 0.1183435395359993, + "eval_runtime": 428.0913, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 2300 + }, + { + "dpo_losses": 0.642924964427948, + "epoch": 0.6, + "grad_norm": 1.8356161932105675, + "learning_rate": 2.0276876167168042e-06, + "logits/chosen": -2.6548080444335938, + "logits/rejected": -2.614123821258545, + "logps/chosen": -290.8428649902344, + "logps/rejected": -231.98458862304688, + "loss": 0.6622, + "positive_losses": 0.28353041410446167, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22888973355293274, + "rewards/margins": 0.11493362486362457, + "rewards/margins_max": 0.37111008167266846, + "rewards/margins_min": -0.08703051507472992, + "rewards/margins_std": 0.20240816473960876, + "rewards/rejected": 0.11395610868930817, + "step": 2310 + }, + { + "dpo_losses": 0.6534973978996277, + "epoch": 0.61, + "grad_norm": 4.820293111159344, + "learning_rate": 2.0052744209246682e-06, + "logits/chosen": -2.6986007690429688, + "logits/rejected": -2.6542580127716064, + "logps/chosen": -239.6178741455078, + "logps/rejected": -219.450439453125, + "loss": 0.6904, + "positive_losses": 0.5139321088790894, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.23109444975852966, + "rewards/margins": 0.09262574464082718, + "rewards/margins_max": 0.323060542345047, + "rewards/margins_min": -0.1246475949883461, + "rewards/margins_std": 0.1977689117193222, + "rewards/rejected": 0.13846872746944427, + "step": 2320 + }, + { + "dpo_losses": 0.6327471137046814, + "epoch": 0.61, + "grad_norm": 7.510093509178269, + "learning_rate": 1.9829025345760127e-06, + "logits/chosen": -2.6793408393859863, + "logits/rejected": -2.6734347343444824, + "logps/chosen": -279.198974609375, + "logps/rejected": -287.8305969238281, + "loss": 0.6621, + "positive_losses": 0.3259936273097992, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.2557789087295532, + "rewards/margins": 0.13750512897968292, + "rewards/margins_max": 0.40703850984573364, + "rewards/margins_min": -0.06727553904056549, + "rewards/margins_std": 0.21355748176574707, + "rewards/rejected": 0.1182737797498703, + "step": 2330 + }, + { + "dpo_losses": 0.6526913046836853, + "epoch": 0.61, + "grad_norm": 4.227720768249378, + "learning_rate": 1.9605738257169115e-06, + "logits/chosen": -2.624246120452881, + "logits/rejected": -2.602616786956787, + "logps/chosen": -271.4979248046875, + "logps/rejected": -267.3983459472656, + "loss": 0.6765, + "positive_losses": 0.3113154470920563, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21783605217933655, + "rewards/margins": 0.0947721004486084, + "rewards/margins_max": 0.33382928371429443, + "rewards/margins_min": -0.13909652829170227, + "rewards/margins_std": 0.21136465668678284, + "rewards/rejected": 0.12306392192840576, + "step": 2340 + }, + { + "dpo_losses": 0.6590754389762878, + "epoch": 0.62, + "grad_norm": 4.172955662352638, + "learning_rate": 1.9382901587881275e-06, + "logits/chosen": -2.697080135345459, + "logits/rejected": -2.6767849922180176, + "logps/chosen": -266.29718017578125, + "logps/rejected": -235.399169921875, + "loss": 0.6694, + "positive_losses": 0.1714244782924652, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22708387672901154, + "rewards/margins": 0.08011853694915771, + "rewards/margins_max": 0.2996049225330353, + "rewards/margins_min": -0.13483865559101105, + "rewards/margins_std": 0.19062037765979767, + "rewards/rejected": 0.14696532487869263, + "step": 2350 + }, + { + "dpo_losses": 0.6451055407524109, + "epoch": 0.62, + "grad_norm": 7.03921180084103, + "learning_rate": 1.916053394469437e-06, + "logits/chosen": -2.6840641498565674, + "logits/rejected": -2.635782480239868, + "logps/chosen": -263.34197998046875, + "logps/rejected": -206.8403778076172, + "loss": 0.6896, + "positive_losses": 0.5735152959823608, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.21855726838111877, + "rewards/margins": 0.12016957998275757, + "rewards/margins_max": 0.3809802234172821, + "rewards/margins_min": -0.12460510432720184, + "rewards/margins_std": 0.22663649916648865, + "rewards/rejected": 0.09838766604661942, + "step": 2360 + }, + { + "dpo_losses": 0.6624797582626343, + "epoch": 0.62, + "grad_norm": 2.3493769935869393, + "learning_rate": 1.8938653895242604e-06, + "logits/chosen": -2.708848476409912, + "logits/rejected": -2.689735174179077, + "logps/chosen": -226.92910766601562, + "logps/rejected": -238.28622436523438, + "loss": 0.6737, + "positive_losses": 0.17334279417991638, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.21128730475902557, + "rewards/margins": 0.07327686995267868, + "rewards/margins_max": 0.24936611950397491, + "rewards/margins_min": -0.11990121752023697, + "rewards/margins_std": 0.16652920842170715, + "rewards/rejected": 0.1380104124546051, + "step": 2370 + }, + { + "dpo_losses": 0.6371510028839111, + "epoch": 0.62, + "grad_norm": 1.9093900988991668, + "learning_rate": 1.8717279966446267e-06, + "logits/chosen": -2.6955037117004395, + "logits/rejected": -2.635916233062744, + "logps/chosen": -276.50286865234375, + "logps/rejected": -231.66708374023438, + "loss": 0.6679, + "positive_losses": 0.250906378030777, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.23346546292304993, + "rewards/margins": 0.12501740455627441, + "rewards/margins_max": 0.32189956307411194, + "rewards/margins_min": -0.06842408329248428, + "rewards/margins_std": 0.17628346383571625, + "rewards/rejected": 0.10844806581735611, + "step": 2380 + }, + { + "dpo_losses": 0.6528540849685669, + "epoch": 0.63, + "grad_norm": 12.358461985279623, + "learning_rate": 1.8496430642964698e-06, + "logits/chosen": -2.687997341156006, + "logits/rejected": -2.675405979156494, + "logps/chosen": -267.6132507324219, + "logps/rejected": -262.12823486328125, + "loss": 0.6581, + "positive_losses": 0.11948716640472412, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2187456339597702, + "rewards/margins": 0.09295535832643509, + "rewards/margins_max": 0.318665087223053, + "rewards/margins_min": -0.13442297279834747, + "rewards/margins_std": 0.19662639498710632, + "rewards/rejected": 0.1257902830839157, + "step": 2390 + }, + { + "dpo_losses": 0.6597181558609009, + "epoch": 0.63, + "grad_norm": 1.8895929845909043, + "learning_rate": 1.827612436565286e-06, + "logits/chosen": -2.6727089881896973, + "logits/rejected": -2.613783359527588, + "logps/chosen": -218.83389282226562, + "logps/rejected": -250.94454956054688, + "loss": 0.6631, + "positive_losses": 0.18260526657104492, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22674863040447235, + "rewards/margins": 0.07946915179491043, + "rewards/margins_max": 0.2924432158470154, + "rewards/margins_min": -0.11948784440755844, + "rewards/margins_std": 0.1873387098312378, + "rewards/rejected": 0.1472795009613037, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_dpo_losses": 0.6449704170227051, + "eval_logits/chosen": -2.6430318355560303, + "eval_logits/rejected": -2.6072418689727783, + "eval_logps/chosen": -261.74786376953125, + "eval_logps/rejected": -246.63014221191406, + "eval_loss": 0.676893413066864, + "eval_positive_losses": 0.22887668013572693, + "eval_rewards/accuracies": 0.7080000042915344, + "eval_rewards/chosen": 0.22845546901226044, + "eval_rewards/margins": 0.10896759480237961, + "eval_rewards/margins_max": 0.41343066096305847, + "eval_rewards/margins_min": -0.1509072631597519, + "eval_rewards/margins_std": 0.18821609020233154, + "eval_rewards/rejected": 0.11948786675930023, + "eval_runtime": 428.2634, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 2400 + }, + { + "dpo_losses": 0.6545339822769165, + "epoch": 0.63, + "grad_norm": 7.772719030867673, + "learning_rate": 1.8056379530021492e-06, + "logits/chosen": -2.615975856781006, + "logits/rejected": -2.6049745082855225, + "logps/chosen": -228.251953125, + "logps/rejected": -227.32797241210938, + "loss": 0.6481, + "positive_losses": 0.014975356869399548, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.21992447972297668, + "rewards/margins": 0.08761879056692123, + "rewards/margins_max": 0.2946820557117462, + "rewards/margins_min": -0.10195348411798477, + "rewards/margins_std": 0.17538979649543762, + "rewards/rejected": 0.13230566680431366, + "step": 2410 + }, + { + "dpo_losses": 0.6384466886520386, + "epoch": 0.63, + "grad_norm": 8.770477746806417, + "learning_rate": 1.7837214484701154e-06, + "logits/chosen": -2.7008354663848877, + "logits/rejected": -2.6350302696228027, + "logps/chosen": -311.5066223144531, + "logps/rejected": -258.36822509765625, + "loss": 0.668, + "positive_losses": 0.29362478852272034, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.27750059962272644, + "rewards/margins": 0.12787646055221558, + "rewards/margins_max": 0.3754945397377014, + "rewards/margins_min": -0.09464956074953079, + "rewards/margins_std": 0.20763537287712097, + "rewards/rejected": 0.14962413907051086, + "step": 2420 + }, + { + "dpo_losses": 0.6517640948295593, + "epoch": 0.64, + "grad_norm": 2.1106932865174315, + "learning_rate": 1.7618647529910043e-06, + "logits/chosen": -2.703248977661133, + "logits/rejected": -2.6583917140960693, + "logps/chosen": -287.2205810546875, + "logps/rejected": -274.2737731933594, + "loss": 0.6766, + "positive_losses": 0.004874801728874445, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2177753895521164, + "rewards/margins": 0.09591395407915115, + "rewards/margins_max": 0.3515348732471466, + "rewards/margins_min": -0.12417320907115936, + "rewards/margins_std": 0.2077832669019699, + "rewards/rejected": 0.12186142057180405, + "step": 2430 + }, + { + "dpo_losses": 0.6442539691925049, + "epoch": 0.64, + "grad_norm": 9.177313522817762, + "learning_rate": 1.7400696915925996e-06, + "logits/chosen": -2.61651873588562, + "logits/rejected": -2.55776309967041, + "logps/chosen": -250.7611083984375, + "logps/rejected": -226.8053741455078, + "loss": 0.6548, + "positive_losses": 0.12011022865772247, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.23287931084632874, + "rewards/margins": 0.11137101799249649, + "rewards/margins_max": 0.33335360884666443, + "rewards/margins_min": -0.07477789372205734, + "rewards/margins_std": 0.18021732568740845, + "rewards/rejected": 0.12150830030441284, + "step": 2440 + }, + { + "dpo_losses": 0.6601482033729553, + "epoch": 0.64, + "grad_norm": 1.873232326137569, + "learning_rate": 1.718338084156254e-06, + "logits/chosen": -2.7020750045776367, + "logits/rejected": -2.7036221027374268, + "logps/chosen": -257.56304931640625, + "logps/rejected": -280.86236572265625, + "loss": 0.6887, + "positive_losses": 0.27468910813331604, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.21189913153648376, + "rewards/margins": 0.0761132761836052, + "rewards/margins_max": 0.24091093242168427, + "rewards/margins_min": -0.09531867504119873, + "rewards/margins_std": 0.14899347722530365, + "rewards/rejected": 0.13578586280345917, + "step": 2450 + }, + { + "dpo_losses": 0.6478050947189331, + "epoch": 0.64, + "grad_norm": 15.953977197794108, + "learning_rate": 1.6966717452649372e-06, + "logits/chosen": -2.688554286956787, + "logits/rejected": -2.678035020828247, + "logps/chosen": -252.1296844482422, + "logps/rejected": -239.04464721679688, + "loss": 0.683, + "positive_losses": 0.2908807396888733, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.21076655387878418, + "rewards/margins": 0.10334186255931854, + "rewards/margins_max": 0.32458725571632385, + "rewards/margins_min": -0.0979321151971817, + "rewards/margins_std": 0.19284126162528992, + "rewards/rejected": 0.10742469877004623, + "step": 2460 + }, + { + "dpo_losses": 0.6468202471733093, + "epoch": 0.65, + "grad_norm": 1.9430355856596895, + "learning_rate": 1.6750724840517103e-06, + "logits/chosen": -2.649327039718628, + "logits/rejected": -2.6505379676818848, + "logps/chosen": -279.24298095703125, + "logps/rejected": -266.89642333984375, + "loss": 0.7082, + "positive_losses": 0.6239467859268188, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.21040575206279755, + "rewards/margins": 0.10797951370477676, + "rewards/margins_max": 0.30911627411842346, + "rewards/margins_min": -0.08701352775096893, + "rewards/margins_std": 0.18224899470806122, + "rewards/rejected": 0.10242621600627899, + "step": 2470 + }, + { + "dpo_losses": 0.6477130651473999, + "epoch": 0.65, + "grad_norm": 15.76191478734966, + "learning_rate": 1.6535421040486686e-06, + "logits/chosen": -2.664402723312378, + "logits/rejected": -2.6564764976501465, + "logps/chosen": -260.77862548828125, + "logps/rejected": -248.49453735351562, + "loss": 0.677, + "positive_losses": 0.32372361421585083, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22065123915672302, + "rewards/margins": 0.1069595217704773, + "rewards/margins_max": 0.33060309290885925, + "rewards/margins_min": -0.11348260939121246, + "rewards/margins_std": 0.19639793038368225, + "rewards/rejected": 0.11369173228740692, + "step": 2480 + }, + { + "dpo_losses": 0.6175050139427185, + "epoch": 0.65, + "grad_norm": 17.532775703719967, + "learning_rate": 1.6320824030363458e-06, + "logits/chosen": -2.599806785583496, + "logits/rejected": -2.523268938064575, + "logps/chosen": -273.0903015136719, + "logps/rejected": -238.984375, + "loss": 0.6431, + "positive_losses": 0.190023735165596, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.2450869083404541, + "rewards/margins": 0.1699949949979782, + "rewards/margins_max": 0.40681809186935425, + "rewards/margins_min": -0.07576446980237961, + "rewards/margins_std": 0.21555539965629578, + "rewards/rejected": 0.07509191334247589, + "step": 2490 + }, + { + "dpo_losses": 0.6297341585159302, + "epoch": 0.65, + "grad_norm": 8.013988662471801, + "learning_rate": 1.6106951728936028e-06, + "logits/chosen": -2.6417384147644043, + "logits/rejected": -2.6619882583618164, + "logps/chosen": -231.43295288085938, + "logps/rejected": -252.7168426513672, + "loss": 0.6884, + "positive_losses": 0.8568565249443054, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22378802299499512, + "rewards/margins": 0.14048174023628235, + "rewards/margins_max": 0.3421075940132141, + "rewards/margins_min": -0.038745298981666565, + "rewards/margins_std": 0.17421843111515045, + "rewards/rejected": 0.08330627530813217, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_dpo_losses": 0.6403517127037048, + "eval_logits/chosen": -2.6412670612335205, + "eval_logits/rejected": -2.606358528137207, + "eval_logps/chosen": -262.11669921875, + "eval_logps/rejected": -248.11026000976562, + "eval_loss": 0.6854027509689331, + "eval_positive_losses": 0.3215247392654419, + "eval_rewards/accuracies": 0.7120000123977661, + "eval_rewards/chosen": 0.22476711869239807, + "eval_rewards/margins": 0.12008056044578552, + "eval_rewards/margins_max": 0.44079744815826416, + "eval_rewards/margins_min": -0.15831314027309418, + "eval_rewards/margins_std": 0.2000243216753006, + "eval_rewards/rejected": 0.10468658804893494, + "eval_runtime": 427.7686, + "eval_samples_per_second": 4.675, + "eval_steps_per_second": 0.292, + "step": 2500 + }, + { + "dpo_losses": 0.6387171745300293, + "epoch": 0.66, + "grad_norm": 1.9671122687318612, + "learning_rate": 1.5893821994479996e-06, + "logits/chosen": -2.6180126667022705, + "logits/rejected": -2.570876121520996, + "logps/chosen": -238.55923461914062, + "logps/rejected": -235.71737670898438, + "loss": 0.6459, + "positive_losses": 0.010045480914413929, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.22037789225578308, + "rewards/margins": 0.1271388828754425, + "rewards/margins_max": 0.3906629681587219, + "rewards/margins_min": -0.11404719203710556, + "rewards/margins_std": 0.22322943806648254, + "rewards/rejected": 0.09323902428150177, + "step": 2510 + }, + { + "dpo_losses": 0.6344737410545349, + "epoch": 0.66, + "grad_norm": 12.848256009868312, + "learning_rate": 1.5681452623266868e-06, + "logits/chosen": -2.694092035293579, + "logits/rejected": -2.657163143157959, + "logps/chosen": -237.0204620361328, + "logps/rejected": -257.2022399902344, + "loss": 0.6847, + "positive_losses": 0.2897499203681946, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.21192964911460876, + "rewards/margins": 0.13573183119297028, + "rewards/margins_max": 0.39697641134262085, + "rewards/margins_min": -0.08935762196779251, + "rewards/margins_std": 0.223657488822937, + "rewards/rejected": 0.07619784027338028, + "step": 2520 + }, + { + "dpo_losses": 0.6213490962982178, + "epoch": 0.66, + "grad_norm": 7.56286014928845, + "learning_rate": 1.5469861348078014e-06, + "logits/chosen": -2.6197333335876465, + "logits/rejected": -2.6297221183776855, + "logps/chosen": -255.9955291748047, + "logps/rejected": -268.439697265625, + "loss": 0.6403, + "positive_losses": 0.1759118139743805, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.25826987624168396, + "rewards/margins": 0.16589568555355072, + "rewards/margins_max": 0.4661618173122406, + "rewards/margins_min": -0.05714429169893265, + "rewards/margins_std": 0.2331564724445343, + "rewards/rejected": 0.09237419068813324, + "step": 2530 + }, + { + "dpo_losses": 0.651238739490509, + "epoch": 0.66, + "grad_norm": 17.91774946385872, + "learning_rate": 1.5259065836724035e-06, + "logits/chosen": -2.650686740875244, + "logits/rejected": -2.6421239376068115, + "logps/chosen": -231.24282836914062, + "logps/rejected": -279.4007263183594, + "loss": 0.6847, + "positive_losses": 0.3626817762851715, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18830692768096924, + "rewards/margins": 0.09621229022741318, + "rewards/margins_max": 0.31938856840133667, + "rewards/margins_min": -0.10441014915704727, + "rewards/margins_std": 0.19326581060886383, + "rewards/rejected": 0.09209464490413666, + "step": 2540 + }, + { + "dpo_losses": 0.640707790851593, + "epoch": 0.67, + "grad_norm": 9.327269446213379, + "learning_rate": 1.5049083690569456e-06, + "logits/chosen": -2.65415620803833, + "logits/rejected": -2.6038031578063965, + "logps/chosen": -226.2071533203125, + "logps/rejected": -233.241455078125, + "loss": 0.6716, + "positive_losses": 0.41562312841415405, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2270263433456421, + "rewards/margins": 0.1171361580491066, + "rewards/margins_max": 0.3306456208229065, + "rewards/margins_min": -0.0836900919675827, + "rewards/margins_std": 0.18275295197963715, + "rewards/rejected": 0.10989020019769669, + "step": 2550 + }, + { + "dpo_losses": 0.6367359161376953, + "epoch": 0.67, + "grad_norm": 2.0294225835552444, + "learning_rate": 1.4839932443063057e-06, + "logits/chosen": -2.576378345489502, + "logits/rejected": -2.5502843856811523, + "logps/chosen": -214.53225708007812, + "logps/rejected": -193.76736450195312, + "loss": 0.6614, + "positive_losses": 0.5105085372924805, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2333839386701584, + "rewards/margins": 0.12664873898029327, + "rewards/margins_max": 0.3154265582561493, + "rewards/margins_min": -0.07867036014795303, + "rewards/margins_std": 0.17050711810588837, + "rewards/rejected": 0.1067352145910263, + "step": 2560 + }, + { + "dpo_losses": 0.6389847993850708, + "epoch": 0.67, + "grad_norm": 12.980117265027586, + "learning_rate": 1.4631629558273803e-06, + "logits/chosen": -2.6802918910980225, + "logits/rejected": -2.6632590293884277, + "logps/chosen": -277.330810546875, + "logps/rejected": -259.49267578125, + "loss": 0.6689, + "positive_losses": 0.12875232100486755, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22042131423950195, + "rewards/margins": 0.1271112561225891, + "rewards/margins_max": 0.34655025601387024, + "rewards/margins_min": -0.1025533527135849, + "rewards/margins_std": 0.2004118412733078, + "rewards/rejected": 0.09331005066633224, + "step": 2570 + }, + { + "dpo_losses": 0.6375996470451355, + "epoch": 0.68, + "grad_norm": 1.9154082880451648, + "learning_rate": 1.4424192429432657e-06, + "logits/chosen": -2.6565616130828857, + "logits/rejected": -2.60559344291687, + "logps/chosen": -248.62783813476562, + "logps/rejected": -255.2563018798828, + "loss": 0.6649, + "positive_losses": 0.3865188658237457, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.233999565243721, + "rewards/margins": 0.12307735532522202, + "rewards/margins_max": 0.3148532807826996, + "rewards/margins_min": -0.07773791253566742, + "rewards/margins_std": 0.17624323070049286, + "rewards/rejected": 0.11092217266559601, + "step": 2580 + }, + { + "dpo_losses": 0.6365996599197388, + "epoch": 0.68, + "grad_norm": 4.128842962352004, + "learning_rate": 1.421763837748016e-06, + "logits/chosen": -2.681898593902588, + "logits/rejected": -2.651426315307617, + "logps/chosen": -265.7993469238281, + "logps/rejected": -227.25900268554688, + "loss": 0.7046, + "positive_losses": 0.6643952131271362, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22455474734306335, + "rewards/margins": 0.12930192053318024, + "rewards/margins_max": 0.36015385389328003, + "rewards/margins_min": -0.1021166443824768, + "rewards/margins_std": 0.21218717098236084, + "rewards/rejected": 0.09525284171104431, + "step": 2590 + }, + { + "dpo_losses": 0.6480204463005066, + "epoch": 0.68, + "grad_norm": 10.935780068452658, + "learning_rate": 1.401198464962021e-06, + "logits/chosen": -2.651763439178467, + "logits/rejected": -2.638746500015259, + "logps/chosen": -242.8926544189453, + "logps/rejected": -274.14361572265625, + "loss": 0.6701, + "positive_losses": 0.24560967087745667, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20024743676185608, + "rewards/margins": 0.10189042240381241, + "rewards/margins_max": 0.31531795859336853, + "rewards/margins_min": -0.08022803068161011, + "rewards/margins_std": 0.17858561873435974, + "rewards/rejected": 0.09835700690746307, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_dpo_losses": 0.6431987881660461, + "eval_logits/chosen": -2.645789861679077, + "eval_logits/rejected": -2.611599922180176, + "eval_logps/chosen": -261.6951904296875, + "eval_logps/rejected": -247.0383758544922, + "eval_loss": 0.6817047595977783, + "eval_positive_losses": 0.2660869061946869, + "eval_rewards/accuracies": 0.7239999771118164, + "eval_rewards/chosen": 0.22898218035697937, + "eval_rewards/margins": 0.113576740026474, + "eval_rewards/margins_max": 0.4344462454319, + "eval_rewards/margins_min": -0.15536274015903473, + "eval_rewards/margins_std": 0.19602753221988678, + "eval_rewards/rejected": 0.11540544778108597, + "eval_runtime": 427.7774, + "eval_samples_per_second": 4.675, + "eval_steps_per_second": 0.292, + "step": 2600 + }, + { + "dpo_losses": 0.637254536151886, + "epoch": 0.68, + "grad_norm": 12.47443858624476, + "learning_rate": 1.3807248417879896e-06, + "logits/chosen": -2.6484155654907227, + "logits/rejected": -2.5987777709960938, + "logps/chosen": -268.6302795410156, + "logps/rejected": -212.509765625, + "loss": 0.6782, + "positive_losses": 0.20167112350463867, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2548355460166931, + "rewards/margins": 0.12552940845489502, + "rewards/margins_max": 0.3718295991420746, + "rewards/margins_min": -0.07150840759277344, + "rewards/margins_std": 0.2004489004611969, + "rewards/rejected": 0.1293061524629593, + "step": 2610 + }, + { + "dpo_losses": 0.6588480472564697, + "epoch": 0.69, + "grad_norm": 2.203309288163017, + "learning_rate": 1.3603446777675665e-06, + "logits/chosen": -2.648198366165161, + "logits/rejected": -2.640773296356201, + "logps/chosen": -247.3656005859375, + "logps/rejected": -243.3533172607422, + "loss": 0.6802, + "positive_losses": 0.26356926560401917, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20906881988048553, + "rewards/margins": 0.0802852138876915, + "rewards/margins_max": 0.29386550188064575, + "rewards/margins_min": -0.11695988476276398, + "rewards/margins_std": 0.1891327053308487, + "rewards/rejected": 0.12878362834453583, + "step": 2620 + }, + { + "dpo_losses": 0.6410268545150757, + "epoch": 0.69, + "grad_norm": 35.622983427026256, + "learning_rate": 1.3400596746385817e-06, + "logits/chosen": -2.602980375289917, + "logits/rejected": -2.643425941467285, + "logps/chosen": -261.08648681640625, + "logps/rejected": -260.27093505859375, + "loss": 0.7158, + "positive_losses": 1.012854814529419, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.21067564189434052, + "rewards/margins": 0.12065211683511734, + "rewards/margins_max": 0.33560889959335327, + "rewards/margins_min": -0.10677622258663177, + "rewards/margins_std": 0.19985225796699524, + "rewards/rejected": 0.09002353996038437, + "step": 2630 + }, + { + "dpo_losses": 0.6518687009811401, + "epoch": 0.69, + "grad_norm": 6.458894012068232, + "learning_rate": 1.3198715261929587e-06, + "logits/chosen": -2.7320096492767334, + "logits/rejected": -2.717092514038086, + "logps/chosen": -294.92437744140625, + "logps/rejected": -248.4272003173828, + "loss": 0.7169, + "positive_losses": 0.7706745266914368, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.21491801738739014, + "rewards/margins": 0.09523527324199677, + "rewards/margins_max": 0.32567816972732544, + "rewards/margins_min": -0.10061510652303696, + "rewards/margins_std": 0.19867166876792908, + "rewards/rejected": 0.11968272924423218, + "step": 2640 + }, + { + "dpo_losses": 0.6246224641799927, + "epoch": 0.69, + "grad_norm": 1.8372310545408177, + "learning_rate": 1.2997819181352823e-06, + "logits/chosen": -2.6638991832733154, + "logits/rejected": -2.620840549468994, + "logps/chosen": -238.86465454101562, + "logps/rejected": -241.9066619873047, + "loss": 0.648, + "positive_losses": 0.20916476845741272, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.23800602555274963, + "rewards/margins": 0.15638919174671173, + "rewards/margins_max": 0.3884904980659485, + "rewards/margins_min": -0.024152684956789017, + "rewards/margins_std": 0.1865231841802597, + "rewards/rejected": 0.0816168338060379, + "step": 2650 + }, + { + "dpo_losses": 0.6479545831680298, + "epoch": 0.7, + "grad_norm": 10.419345440065516, + "learning_rate": 1.2797925279420454e-06, + "logits/chosen": -2.727102756500244, + "logits/rejected": -2.6972246170043945, + "logps/chosen": -291.5232238769531, + "logps/rejected": -232.34115600585938, + "loss": 0.6486, + "positive_losses": 0.0034275054931640625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.24116802215576172, + "rewards/margins": 0.10435731709003448, + "rewards/margins_max": 0.30152642726898193, + "rewards/margins_min": -0.08927350491285324, + "rewards/margins_std": 0.18111565709114075, + "rewards/rejected": 0.13681069016456604, + "step": 2660 + }, + { + "dpo_losses": 0.6520140171051025, + "epoch": 0.7, + "grad_norm": 10.71310623216262, + "learning_rate": 1.2599050247215764e-06, + "logits/chosen": -2.759894847869873, + "logits/rejected": -2.679072856903076, + "logps/chosen": -255.1886749267578, + "logps/rejected": -222.32943725585938, + "loss": 0.6957, + "positive_losses": 0.6127627491950989, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20489636063575745, + "rewards/margins": 0.09360338747501373, + "rewards/margins_max": 0.29037782549858093, + "rewards/margins_min": -0.08301670104265213, + "rewards/margins_std": 0.16996921598911285, + "rewards/rejected": 0.11129295825958252, + "step": 2670 + }, + { + "dpo_losses": 0.6375089883804321, + "epoch": 0.7, + "grad_norm": 10.434048201797872, + "learning_rate": 1.2401210690746705e-06, + "logits/chosen": -2.698019504547119, + "logits/rejected": -2.700742483139038, + "logps/chosen": -299.0583801269531, + "logps/rejected": -336.4329833984375, + "loss": 0.7219, + "positive_losses": 1.1617672443389893, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.21842971444129944, + "rewards/margins": 0.13291862607002258, + "rewards/margins_max": 0.41158396005630493, + "rewards/margins_min": -0.15379118919372559, + "rewards/margins_std": 0.2509996294975281, + "rewards/rejected": 0.08551109582185745, + "step": 2680 + }, + { + "dpo_losses": 0.6348380446434021, + "epoch": 0.7, + "grad_norm": 2.2058169104648453, + "learning_rate": 1.2204423129559306e-06, + "logits/chosen": -2.5682687759399414, + "logits/rejected": -2.5628485679626465, + "logps/chosen": -197.96792602539062, + "logps/rejected": -222.6014862060547, + "loss": 0.6793, + "positive_losses": 0.20842599868774414, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.21794691681861877, + "rewards/margins": 0.13202880322933197, + "rewards/margins_max": 0.34874072670936584, + "rewards/margins_min": -0.07386180013418198, + "rewards/margins_std": 0.1926882565021515, + "rewards/rejected": 0.08591810613870621, + "step": 2690 + }, + { + "dpo_losses": 0.6509995460510254, + "epoch": 0.71, + "grad_norm": 16.022086158582685, + "learning_rate": 1.20087039953583e-06, + "logits/chosen": -2.643242359161377, + "logits/rejected": -2.5987229347229004, + "logps/chosen": -239.15554809570312, + "logps/rejected": -214.74429321289062, + "loss": 0.668, + "positive_losses": 0.1304554045200348, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.22552835941314697, + "rewards/margins": 0.09569480270147324, + "rewards/margins_max": 0.2902173399925232, + "rewards/margins_min": -0.11860315501689911, + "rewards/margins_std": 0.18064549565315247, + "rewards/rejected": 0.12983354926109314, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_dpo_losses": 0.6440854072570801, + "eval_logits/chosen": -2.6533379554748535, + "eval_logits/rejected": -2.619645595550537, + "eval_logps/chosen": -261.2966003417969, + "eval_logps/rejected": -246.400390625, + "eval_loss": 0.6771283149719238, + "eval_positive_losses": 0.2208622395992279, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": 0.23296818137168884, + "eval_rewards/margins": 0.11118274182081223, + "eval_rewards/margins_max": 0.4213098883628845, + "eval_rewards/margins_min": -0.15251097083091736, + "eval_rewards/margins_std": 0.19105467200279236, + "eval_rewards/rejected": 0.12178544700145721, + "eval_runtime": 428.2103, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 2700 + }, + { + "dpo_losses": 0.6561421155929565, + "epoch": 0.71, + "grad_norm": 12.403169878981542, + "learning_rate": 1.181406963063507e-06, + "logits/chosen": -2.743208169937134, + "logits/rejected": -2.709289789199829, + "logps/chosen": -266.05523681640625, + "logps/rejected": -277.7951354980469, + "loss": 0.7108, + "positive_losses": 0.5917842984199524, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.22070738673210144, + "rewards/margins": 0.08396445214748383, + "rewards/margins_max": 0.2688206732273102, + "rewards/margins_min": -0.09836752712726593, + "rewards/margins_std": 0.1652737259864807, + "rewards/rejected": 0.13674293458461761, + "step": 2710 + }, + { + "dpo_losses": 0.646786630153656, + "epoch": 0.71, + "grad_norm": 2.6027516806031588, + "learning_rate": 1.1620536287303052e-06, + "logits/chosen": -2.6333985328674316, + "logits/rejected": -2.5747392177581787, + "logps/chosen": -228.45947265625, + "logps/rejected": -238.262939453125, + "loss": 0.6666, + "positive_losses": 0.027136946097016335, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.23899738490581512, + "rewards/margins": 0.10501845180988312, + "rewards/margins_max": 0.29407796263694763, + "rewards/margins_min": -0.09125413000583649, + "rewards/margins_std": 0.17288681864738464, + "rewards/rejected": 0.133978933095932, + "step": 2720 + }, + { + "dpo_losses": 0.646081268787384, + "epoch": 0.71, + "grad_norm": 16.276106770703127, + "learning_rate": 1.1428120125340717e-06, + "logits/chosen": -2.6631503105163574, + "logits/rejected": -2.6598093509674072, + "logps/chosen": -254.3302764892578, + "logps/rejected": -246.5015869140625, + "loss": 0.6626, + "positive_losses": 0.015617894940078259, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.27498504519462585, + "rewards/margins": 0.11424392461776733, + "rewards/margins_max": 0.40816107392311096, + "rewards/margins_min": -0.10722239315509796, + "rewards/margins_std": 0.22900240123271942, + "rewards/rejected": 0.16074110567569733, + "step": 2730 + }, + { + "dpo_losses": 0.6563581228256226, + "epoch": 0.72, + "grad_norm": 9.278159503609219, + "learning_rate": 1.123683721144223e-06, + "logits/chosen": -2.663508892059326, + "logits/rejected": -2.683802843093872, + "logps/chosen": -224.25607299804688, + "logps/rejected": -240.0520782470703, + "loss": 0.7136, + "positive_losses": 0.1441497802734375, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.22323274612426758, + "rewards/margins": 0.08508383482694626, + "rewards/margins_max": 0.3114696741104126, + "rewards/margins_min": -0.08833112567663193, + "rewards/margins_std": 0.18358775973320007, + "rewards/rejected": 0.13814887404441833, + "step": 2740 + }, + { + "dpo_losses": 0.6344123482704163, + "epoch": 0.72, + "grad_norm": 12.207456867580083, + "learning_rate": 1.1046703517675848e-06, + "logits/chosen": -2.725710868835449, + "logits/rejected": -2.6906607151031494, + "logps/chosen": -258.3102111816406, + "logps/rejected": -234.1375274658203, + "loss": 0.6738, + "positive_losses": 0.17371778190135956, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.25737282633781433, + "rewards/margins": 0.13390055298805237, + "rewards/margins_max": 0.34902340173721313, + "rewards/margins_min": -0.047223832458257675, + "rewards/margins_std": 0.1819126307964325, + "rewards/rejected": 0.12347228825092316, + "step": 2750 + }, + { + "dpo_losses": 0.6266162395477295, + "epoch": 0.72, + "grad_norm": 9.972333019022438, + "learning_rate": 1.085773492015028e-06, + "logits/chosen": -2.673086643218994, + "logits/rejected": -2.61374568939209, + "logps/chosen": -305.066162109375, + "logps/rejected": -278.8713684082031, + "loss": 0.6407, + "positive_losses": 0.03099961206316948, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2706281542778015, + "rewards/margins": 0.15120838582515717, + "rewards/margins_max": 0.39720866084098816, + "rewards/margins_min": -0.05999114364385605, + "rewards/margins_std": 0.20378157496452332, + "rewards/rejected": 0.11941979080438614, + "step": 2760 + }, + { + "dpo_losses": 0.6503585577011108, + "epoch": 0.72, + "grad_norm": 3.3210533123475434, + "learning_rate": 1.0669947197689034e-06, + "logits/chosen": -2.767747402191162, + "logits/rejected": -2.721989631652832, + "logps/chosen": -284.27728271484375, + "logps/rejected": -252.5501708984375, + "loss": 0.6769, + "positive_losses": 0.05525505542755127, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.2337963879108429, + "rewards/margins": 0.09842909872531891, + "rewards/margins_max": 0.3275780975818634, + "rewards/margins_min": -0.10053189098834991, + "rewards/margins_std": 0.1886182427406311, + "rewards/rejected": 0.13536730408668518, + "step": 2770 + }, + { + "dpo_losses": 0.6543646454811096, + "epoch": 0.73, + "grad_norm": 1.7555621174530065, + "learning_rate": 1.048335603051291e-06, + "logits/chosen": -2.6742210388183594, + "logits/rejected": -2.6266090869903564, + "logps/chosen": -299.0511169433594, + "logps/rejected": -228.11959838867188, + "loss": 0.6762, + "positive_losses": 0.36124539375305176, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21554644405841827, + "rewards/margins": 0.08905548602342606, + "rewards/margins_max": 0.3050028383731842, + "rewards/margins_min": -0.1116354912519455, + "rewards/margins_std": 0.1855059266090393, + "rewards/rejected": 0.126490980386734, + "step": 2780 + }, + { + "dpo_losses": 0.6355771422386169, + "epoch": 0.73, + "grad_norm": 1.5128031151064185, + "learning_rate": 1.0297976998930665e-06, + "logits/chosen": -2.7675232887268066, + "logits/rejected": -2.733736753463745, + "logps/chosen": -301.5973205566406, + "logps/rejected": -249.68814086914062, + "loss": 0.6533, + "positive_losses": 0.05606355518102646, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2681359052658081, + "rewards/margins": 0.1323312371969223, + "rewards/margins_max": 0.371121346950531, + "rewards/margins_min": -0.0732019692659378, + "rewards/margins_std": 0.19328762590885162, + "rewards/rejected": 0.1358046680688858, + "step": 2790 + }, + { + "dpo_losses": 0.6555207371711731, + "epoch": 0.73, + "grad_norm": 7.699491498926473, + "learning_rate": 1.0113825582038078e-06, + "logits/chosen": -2.7008633613586426, + "logits/rejected": -2.678898572921753, + "logps/chosen": -244.8804931640625, + "logps/rejected": -244.4199676513672, + "loss": 0.6851, + "positive_losses": 0.41886672377586365, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20459289848804474, + "rewards/margins": 0.08580772578716278, + "rewards/margins_max": 0.28134217858314514, + "rewards/margins_min": -0.10736802965402603, + "rewards/margins_std": 0.17606523633003235, + "rewards/rejected": 0.11878518760204315, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_dpo_losses": 0.6430361270904541, + "eval_logits/chosen": -2.661334991455078, + "eval_logits/rejected": -2.6278324127197266, + "eval_logps/chosen": -261.293701171875, + "eval_logps/rejected": -246.66207885742188, + "eval_loss": 0.677651584148407, + "eval_positive_losses": 0.22990721464157104, + "eval_rewards/accuracies": 0.7089999914169312, + "eval_rewards/chosen": 0.23299722373485565, + "eval_rewards/margins": 0.11382872611284256, + "eval_rewards/margins_max": 0.42740339040756226, + "eval_rewards/margins_min": -0.15496976673603058, + "eval_rewards/margins_std": 0.1945992261171341, + "eval_rewards/rejected": 0.1191684901714325, + "eval_runtime": 428.0328, + "eval_samples_per_second": 4.673, + "eval_steps_per_second": 0.292, + "step": 2800 + }, + { + "dpo_losses": 0.6165627837181091, + "epoch": 0.74, + "grad_norm": 2.109208390049169, + "learning_rate": 9.930917156425477e-07, + "logits/chosen": -2.6779825687408447, + "logits/rejected": -2.6695377826690674, + "logps/chosen": -245.42239379882812, + "logps/rejected": -220.6289520263672, + "loss": 0.6648, + "positive_losses": 0.18235059082508087, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.26049965620040894, + "rewards/margins": 0.17864897847175598, + "rewards/margins_max": 0.4070645272731781, + "rewards/margins_min": -0.03810154274106026, + "rewards/margins_std": 0.1952413022518158, + "rewards/rejected": 0.08185064792633057, + "step": 2810 + }, + { + "dpo_losses": 0.6527791023254395, + "epoch": 0.74, + "grad_norm": 2.362274409427705, + "learning_rate": 9.749266994893756e-07, + "logits/chosen": -2.6672139167785645, + "logits/rejected": -2.664942979812622, + "logps/chosen": -241.8899383544922, + "logps/rejected": -248.9877471923828, + "loss": 0.6564, + "positive_losses": 0.20571669936180115, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.2259671688079834, + "rewards/margins": 0.09094025939702988, + "rewards/margins_max": 0.27354854345321655, + "rewards/margins_min": -0.09114174544811249, + "rewards/margins_std": 0.16183331608772278, + "rewards/rejected": 0.13502691686153412, + "step": 2820 + }, + { + "dpo_losses": 0.6393716335296631, + "epoch": 0.74, + "grad_norm": 2.1587866637225996, + "learning_rate": 9.56889026517913e-07, + "logits/chosen": -2.6505894660949707, + "logits/rejected": -2.636881113052368, + "logps/chosen": -276.57061767578125, + "logps/rejected": -363.47723388671875, + "loss": 0.6445, + "positive_losses": 0.06818590313196182, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.23265020549297333, + "rewards/margins": 0.1187715083360672, + "rewards/margins_max": 0.31650876998901367, + "rewards/margins_min": -0.05719621106982231, + "rewards/margins_std": 0.16603873670101166, + "rewards/rejected": 0.11387868970632553, + "step": 2830 + }, + { + "dpo_losses": 0.6333634257316589, + "epoch": 0.74, + "grad_norm": 9.361244384340141, + "learning_rate": 9.389802028686617e-07, + "logits/chosen": -2.6403565406799316, + "logits/rejected": -2.6060609817504883, + "logps/chosen": -260.1311340332031, + "logps/rejected": -231.0727081298828, + "loss": 0.655, + "positive_losses": 0.023256683722138405, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.23338253796100616, + "rewards/margins": 0.13441750407218933, + "rewards/margins_max": 0.34804508090019226, + "rewards/margins_min": -0.060067594051361084, + "rewards/margins_std": 0.18399295210838318, + "rewards/rejected": 0.09896502643823624, + "step": 2840 + }, + { + "dpo_losses": 0.6408575177192688, + "epoch": 0.75, + "grad_norm": 2.6338957869957857, + "learning_rate": 9.212017239232427e-07, + "logits/chosen": -2.713360548019409, + "logits/rejected": -2.6990275382995605, + "logps/chosen": -261.1534729003906, + "logps/rejected": -248.7087860107422, + "loss": 0.6963, + "positive_losses": 0.2504243850708008, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22237715125083923, + "rewards/margins": 0.12060409784317017, + "rewards/margins_max": 0.37134578824043274, + "rewards/margins_min": -0.14265871047973633, + "rewards/margins_std": 0.2249237596988678, + "rewards/rejected": 0.10177306830883026, + "step": 2850 + }, + { + "dpo_losses": 0.641190230846405, + "epoch": 0.75, + "grad_norm": 1.977161723741076, + "learning_rate": 9.03555074179533e-07, + "logits/chosen": -2.6907799243927, + "logits/rejected": -2.655926465988159, + "logps/chosen": -227.423583984375, + "logps/rejected": -216.56454467773438, + "loss": 0.6444, + "positive_losses": 0.05986515432596207, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2274264544248581, + "rewards/margins": 0.11544140428304672, + "rewards/margins_max": 0.3044131100177765, + "rewards/margins_min": -0.0820235162973404, + "rewards/margins_std": 0.17438822984695435, + "rewards/rejected": 0.11198506504297256, + "step": 2860 + }, + { + "dpo_losses": 0.6329208612442017, + "epoch": 0.75, + "grad_norm": 13.625303492534464, + "learning_rate": 8.860417271277067e-07, + "logits/chosen": -2.6437621116638184, + "logits/rejected": -2.6126272678375244, + "logps/chosen": -286.8416442871094, + "logps/rejected": -271.61279296875, + "loss": 0.6515, + "positive_losses": 0.0965370163321495, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.24304378032684326, + "rewards/margins": 0.1386830061674118, + "rewards/margins_max": 0.36370256543159485, + "rewards/margins_min": -0.09242524206638336, + "rewards/margins_std": 0.20286257565021515, + "rewards/rejected": 0.10436077415943146, + "step": 2870 + }, + { + "dpo_losses": 0.6439909338951111, + "epoch": 0.75, + "grad_norm": 6.085725009327989, + "learning_rate": 8.686631451272029e-07, + "logits/chosen": -2.6746413707733154, + "logits/rejected": -2.5927882194519043, + "logps/chosen": -277.53564453125, + "logps/rejected": -248.0998992919922, + "loss": 0.6892, + "positive_losses": 0.506631076335907, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22289161384105682, + "rewards/margins": 0.1165764331817627, + "rewards/margins_max": 0.33958134055137634, + "rewards/margins_min": -0.12362408638000488, + "rewards/margins_std": 0.2088121473789215, + "rewards/rejected": 0.10631519556045532, + "step": 2880 + }, + { + "dpo_losses": 0.6548301577568054, + "epoch": 0.76, + "grad_norm": 1.9717958896927266, + "learning_rate": 8.514207792846168e-07, + "logits/chosen": -2.6367344856262207, + "logits/rejected": -2.5813980102539062, + "logps/chosen": -269.0919494628906, + "logps/rejected": -237.8687286376953, + "loss": 0.6582, + "positive_losses": 0.07601909339427948, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.24029597640037537, + "rewards/margins": 0.09323279559612274, + "rewards/margins_max": 0.33396726846694946, + "rewards/margins_min": -0.15189293026924133, + "rewards/margins_std": 0.21695995330810547, + "rewards/rejected": 0.147063210606575, + "step": 2890 + }, + { + "dpo_losses": 0.6318849325180054, + "epoch": 0.76, + "grad_norm": 14.571141376389916, + "learning_rate": 8.343160693325356e-07, + "logits/chosen": -2.6871140003204346, + "logits/rejected": -2.6749329566955566, + "logps/chosen": -278.9430847167969, + "logps/rejected": -235.34912109375, + "loss": 0.678, + "positive_losses": 0.3711848258972168, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.24064703285694122, + "rewards/margins": 0.1461915671825409, + "rewards/margins_max": 0.44787925481796265, + "rewards/margins_min": -0.07638730853796005, + "rewards/margins_std": 0.2327384501695633, + "rewards/rejected": 0.09445545822381973, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_dpo_losses": 0.6402330994606018, + "eval_logits/chosen": -2.6602489948272705, + "eval_logits/rejected": -2.626877546310425, + "eval_logps/chosen": -261.8085021972656, + "eval_logps/rejected": -247.86146545410156, + "eval_loss": 0.685627818107605, + "eval_positive_losses": 0.29970258474349976, + "eval_rewards/accuracies": 0.7110000252723694, + "eval_rewards/chosen": 0.22784921526908875, + "eval_rewards/margins": 0.12067471444606781, + "eval_rewards/margins_max": 0.4461641013622284, + "eval_rewards/margins_min": -0.1602570116519928, + "eval_rewards/margins_std": 0.20283648371696472, + "eval_rewards/rejected": 0.10717451572418213, + "eval_runtime": 427.9539, + "eval_samples_per_second": 4.673, + "eval_steps_per_second": 0.292, + "step": 2900 + }, + { + "dpo_losses": 0.6342518329620361, + "epoch": 0.76, + "grad_norm": 9.158026304874936, + "learning_rate": 8.173504435093174e-07, + "logits/chosen": -2.6617798805236816, + "logits/rejected": -2.6154887676239014, + "logps/chosen": -235.50564575195312, + "logps/rejected": -237.1990966796875, + "loss": 0.6705, + "positive_losses": 0.3194850981235504, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.22792664170265198, + "rewards/margins": 0.1314457356929779, + "rewards/margins_max": 0.3273685872554779, + "rewards/margins_min": -0.05593591183423996, + "rewards/margins_std": 0.17454983294010162, + "rewards/rejected": 0.09648089855909348, + "step": 2910 + }, + { + "dpo_losses": 0.636628270149231, + "epoch": 0.76, + "grad_norm": 22.19845680165628, + "learning_rate": 8.00525318439836e-07, + "logits/chosen": -2.675938367843628, + "logits/rejected": -2.6456830501556396, + "logps/chosen": -234.3569793701172, + "logps/rejected": -204.05677795410156, + "loss": 0.6814, + "positive_losses": 0.8898605108261108, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.23664262890815735, + "rewards/margins": 0.1313386708498001, + "rewards/margins_max": 0.37490588426589966, + "rewards/margins_min": -0.09426429122686386, + "rewards/margins_std": 0.20440106093883514, + "rewards/rejected": 0.10530395805835724, + "step": 2920 + }, + { + "dpo_losses": 0.6341038942337036, + "epoch": 0.77, + "grad_norm": 18.19028684887985, + "learning_rate": 7.838420990171927e-07, + "logits/chosen": -2.587294101715088, + "logits/rejected": -2.614553451538086, + "logps/chosen": -259.9834289550781, + "logps/rejected": -228.9385986328125, + "loss": 0.7121, + "positive_losses": 0.6251548528671265, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.24708354473114014, + "rewards/margins": 0.13857489824295044, + "rewards/margins_max": 0.3844526410102844, + "rewards/margins_min": -0.06038924306631088, + "rewards/margins_std": 0.19883593916893005, + "rewards/rejected": 0.10850866883993149, + "step": 2930 + }, + { + "dpo_losses": 0.6501592397689819, + "epoch": 0.77, + "grad_norm": 1.743798917679003, + "learning_rate": 7.673021782854084e-07, + "logits/chosen": -2.6355204582214355, + "logits/rejected": -2.6447384357452393, + "logps/chosen": -238.6925506591797, + "logps/rejected": -237.84347534179688, + "loss": 0.701, + "positive_losses": 0.6593742370605469, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23624686896800995, + "rewards/margins": 0.10006473958492279, + "rewards/margins_max": 0.33041948080062866, + "rewards/margins_min": -0.1267656683921814, + "rewards/margins_std": 0.20058086514472961, + "rewards/rejected": 0.13618211448192596, + "step": 2940 + }, + { + "dpo_losses": 0.637436032295227, + "epoch": 0.77, + "grad_norm": 13.226887824557881, + "learning_rate": 7.509069373231039e-07, + "logits/chosen": -2.7241337299346924, + "logits/rejected": -2.6747238636016846, + "logps/chosen": -249.4187469482422, + "logps/rejected": -271.2737731933594, + "loss": 0.6926, + "positive_losses": 1.0058292150497437, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22392499446868896, + "rewards/margins": 0.13012099266052246, + "rewards/margins_max": 0.3944118320941925, + "rewards/margins_min": -0.08947178721427917, + "rewards/margins_std": 0.2141483724117279, + "rewards/rejected": 0.09380398690700531, + "step": 2950 + }, + { + "dpo_losses": 0.647119402885437, + "epoch": 0.77, + "grad_norm": 1.8484562438375651, + "learning_rate": 7.346577451281822e-07, + "logits/chosen": -2.6706955432891846, + "logits/rejected": -2.662426471710205, + "logps/chosen": -258.4949951171875, + "logps/rejected": -227.0917510986328, + "loss": 0.6722, + "positive_losses": 0.6746174097061157, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23804013431072235, + "rewards/margins": 0.10887646675109863, + "rewards/margins_max": 0.3489915430545807, + "rewards/margins_min": -0.12331470102071762, + "rewards/margins_std": 0.20819251239299774, + "rewards/rejected": 0.12916366755962372, + "step": 2960 + }, + { + "dpo_losses": 0.6399433612823486, + "epoch": 0.78, + "grad_norm": 7.929735846912345, + "learning_rate": 7.185559585035138e-07, + "logits/chosen": -2.661951780319214, + "logits/rejected": -2.7017343044281006, + "logps/chosen": -248.2020721435547, + "logps/rejected": -241.7464599609375, + "loss": 0.6815, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24719658493995667, + "rewards/margins": 0.12132751941680908, + "rewards/margins_max": 0.3393659293651581, + "rewards/margins_min": -0.05899649113416672, + "rewards/margins_std": 0.1788053661584854, + "rewards/rejected": 0.12586906552314758, + "step": 2970 + }, + { + "dpo_losses": 0.6399902105331421, + "epoch": 0.78, + "grad_norm": 2.018031494961003, + "learning_rate": 7.026029219436504e-07, + "logits/chosen": -2.6252198219299316, + "logits/rejected": -2.644059419631958, + "logps/chosen": -279.79241943359375, + "logps/rejected": -288.64154052734375, + "loss": 0.6595, + "positive_losses": 0.3226403295993805, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23678548634052277, + "rewards/margins": 0.1288241147994995, + "rewards/margins_max": 0.40611523389816284, + "rewards/margins_min": -0.08896999061107635, + "rewards/margins_std": 0.21852633357048035, + "rewards/rejected": 0.10796137899160385, + "step": 2980 + }, + { + "dpo_losses": 0.6523637771606445, + "epoch": 0.78, + "grad_norm": 2.1797487530595916, + "learning_rate": 6.867999675225523e-07, + "logits/chosen": -2.6289401054382324, + "logits/rejected": -2.572408437728882, + "logps/chosen": -280.0498352050781, + "logps/rejected": -271.45501708984375, + "loss": 0.6729, + "positive_losses": 0.18320608139038086, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.23342570662498474, + "rewards/margins": 0.10582546144723892, + "rewards/margins_max": 0.3280499279499054, + "rewards/margins_min": -0.17500343918800354, + "rewards/margins_std": 0.22897744178771973, + "rewards/rejected": 0.12760025262832642, + "step": 2990 + }, + { + "dpo_losses": 0.6533046960830688, + "epoch": 0.79, + "grad_norm": 2.3628875656726143, + "learning_rate": 6.711484147823663e-07, + "logits/chosen": -2.6833419799804688, + "logits/rejected": -2.662086009979248, + "logps/chosen": -222.61648559570312, + "logps/rejected": -240.6640167236328, + "loss": 0.6605, + "positive_losses": 0.03348231315612793, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21097604930400848, + "rewards/margins": 0.09355119615793228, + "rewards/margins_max": 0.32012492418289185, + "rewards/margins_min": -0.07909585535526276, + "rewards/margins_std": 0.18243499100208282, + "rewards/rejected": 0.1174248605966568, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_dpo_losses": 0.6411991119384766, + "eval_logits/chosen": -2.6605002880096436, + "eval_logits/rejected": -2.627521514892578, + "eval_logps/chosen": -261.4324035644531, + "eval_logps/rejected": -247.2367401123047, + "eval_loss": 0.6807260513305664, + "eval_positive_losses": 0.24150457978248596, + "eval_rewards/accuracies": 0.7160000205039978, + "eval_rewards/chosen": 0.23161005973815918, + "eval_rewards/margins": 0.11818789690732956, + "eval_rewards/margins_max": 0.4379708766937256, + "eval_rewards/margins_min": -0.1546909064054489, + "eval_rewards/margins_std": 0.19860580563545227, + "eval_rewards/rejected": 0.11342217773199081, + "eval_runtime": 427.8032, + "eval_samples_per_second": 4.675, + "eval_steps_per_second": 0.292, + "step": 3000 + }, + { + "dpo_losses": 0.6328443288803101, + "epoch": 0.79, + "grad_norm": 2.0178014552248493, + "learning_rate": 6.556495706232413e-07, + "logits/chosen": -2.7266080379486084, + "logits/rejected": -2.7250070571899414, + "logps/chosen": -270.19879150390625, + "logps/rejected": -247.5888671875, + "loss": 0.6644, + "positive_losses": 0.2839382290840149, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.24732156097888947, + "rewards/margins": 0.1365831345319748, + "rewards/margins_max": 0.36095213890075684, + "rewards/margins_min": -0.06557613611221313, + "rewards/margins_std": 0.19293871521949768, + "rewards/rejected": 0.11073843389749527, + "step": 3010 + }, + { + "dpo_losses": 0.6344121098518372, + "epoch": 0.79, + "grad_norm": 19.014779215475556, + "learning_rate": 6.403047291942057e-07, + "logits/chosen": -2.691561698913574, + "logits/rejected": -2.63598370552063, + "logps/chosen": -241.51620483398438, + "logps/rejected": -209.51547241210938, + "loss": 0.6889, + "positive_losses": 0.2836257815361023, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.2331659495830536, + "rewards/margins": 0.13293863832950592, + "rewards/margins_max": 0.3479847311973572, + "rewards/margins_min": -0.08762186020612717, + "rewards/margins_std": 0.19680854678153992, + "rewards/rejected": 0.10022733360528946, + "step": 3020 + }, + { + "dpo_losses": 0.6390833854675293, + "epoch": 0.79, + "grad_norm": 1.9188208153793838, + "learning_rate": 6.251151717851023e-07, + "logits/chosen": -2.6485981941223145, + "logits/rejected": -2.6350791454315186, + "logps/chosen": -253.7571563720703, + "logps/rejected": -280.71746826171875, + "loss": 0.687, + "positive_losses": 0.36080265045166016, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.22402584552764893, + "rewards/margins": 0.12664227187633514, + "rewards/margins_max": 0.39223814010620117, + "rewards/margins_min": -0.07834620773792267, + "rewards/margins_std": 0.20787307620048523, + "rewards/rejected": 0.09738355875015259, + "step": 3030 + }, + { + "dpo_losses": 0.6260613203048706, + "epoch": 0.8, + "grad_norm": 2.513409699444311, + "learning_rate": 6.100821667196041e-07, + "logits/chosen": -2.678619384765625, + "logits/rejected": -2.6174044609069824, + "logps/chosen": -287.0162658691406, + "logps/rejected": -278.2513427734375, + "loss": 0.6457, + "positive_losses": 0.19703082740306854, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2569059133529663, + "rewards/margins": 0.15052184462547302, + "rewards/margins_max": 0.3674320578575134, + "rewards/margins_min": -0.055698297917842865, + "rewards/margins_std": 0.1886374056339264, + "rewards/rejected": 0.10638407617807388, + "step": 3040 + }, + { + "dpo_losses": 0.6396051645278931, + "epoch": 0.8, + "grad_norm": 66.39852528600504, + "learning_rate": 5.952069692493062e-07, + "logits/chosen": -2.661994218826294, + "logits/rejected": -2.6428751945495605, + "logps/chosen": -248.02285766601562, + "logps/rejected": -239.62319946289062, + "loss": 0.7019, + "positive_losses": 0.19167347252368927, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22153182327747345, + "rewards/margins": 0.1235644593834877, + "rewards/margins_max": 0.3312918543815613, + "rewards/margins_min": -0.09882686287164688, + "rewards/margins_std": 0.19070424139499664, + "rewards/rejected": 0.09796737134456635, + "step": 3050 + }, + { + "dpo_losses": 0.6472259163856506, + "epoch": 0.8, + "grad_norm": 13.873807288791474, + "learning_rate": 5.80490821448918e-07, + "logits/chosen": -2.6815364360809326, + "logits/rejected": -2.6650428771972656, + "logps/chosen": -283.110107421875, + "logps/rejected": -268.1537170410156, + "loss": 0.6617, + "positive_losses": 0.19559669494628906, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.24070200324058533, + "rewards/margins": 0.1051400676369667, + "rewards/margins_max": 0.3200732171535492, + "rewards/margins_min": -0.11534781754016876, + "rewards/margins_std": 0.1950106918811798, + "rewards/rejected": 0.13556192815303802, + "step": 3060 + }, + { + "dpo_losses": 0.6577891111373901, + "epoch": 0.8, + "grad_norm": 13.527340930009814, + "learning_rate": 5.659349521125459e-07, + "logits/chosen": -2.61413836479187, + "logits/rejected": -2.600532054901123, + "logps/chosen": -224.72509765625, + "logps/rejected": -247.92626953125, + "loss": 0.677, + "positive_losses": 0.5455880165100098, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.20303694903850555, + "rewards/margins": 0.08140838146209717, + "rewards/margins_max": 0.28243789076805115, + "rewards/margins_min": -0.09992311894893646, + "rewards/margins_std": 0.1716955602169037, + "rewards/rejected": 0.12162858247756958, + "step": 3070 + }, + { + "dpo_losses": 0.6692142486572266, + "epoch": 0.81, + "grad_norm": 2.043028733957474, + "learning_rate": 5.5154057665109e-07, + "logits/chosen": -2.6322388648986816, + "logits/rejected": -2.638415813446045, + "logps/chosen": -229.92355346679688, + "logps/rejected": -269.1875305175781, + "loss": 0.6743, + "positive_losses": 0.5757365226745605, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.2035880982875824, + "rewards/margins": 0.06119891256093979, + "rewards/margins_max": 0.285746693611145, + "rewards/margins_min": -0.15262167155742645, + "rewards/margins_std": 0.19626206159591675, + "rewards/rejected": 0.1423891931772232, + "step": 3080 + }, + { + "dpo_losses": 0.6530777215957642, + "epoch": 0.81, + "grad_norm": 6.519199772036796, + "learning_rate": 5.373088969907586e-07, + "logits/chosen": -2.6646833419799805, + "logits/rejected": -2.612794876098633, + "logps/chosen": -255.7550811767578, + "logps/rejected": -229.76272583007812, + "loss": 0.7051, + "positive_losses": 0.8610352277755737, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.20456835627555847, + "rewards/margins": 0.09238609671592712, + "rewards/margins_max": 0.30011457204818726, + "rewards/margins_min": -0.0869378000497818, + "rewards/margins_std": 0.17641900479793549, + "rewards/rejected": 0.11218225955963135, + "step": 3090 + }, + { + "dpo_losses": 0.6696368455886841, + "epoch": 0.81, + "grad_norm": 12.489859167790884, + "learning_rate": 5.23241101472709e-07, + "logits/chosen": -2.681021213531494, + "logits/rejected": -2.673013687133789, + "logps/chosen": -262.3744201660156, + "logps/rejected": -271.0479736328125, + "loss": 0.6874, + "positive_losses": 0.26185646653175354, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.23370583355426788, + "rewards/margins": 0.0600501112639904, + "rewards/margins_max": 0.2808021008968353, + "rewards/margins_min": -0.1530933827161789, + "rewards/margins_std": 0.19106140732765198, + "rewards/rejected": 0.17365573346614838, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_dpo_losses": 0.6425239443778992, + "eval_logits/chosen": -2.6493923664093018, + "eval_logits/rejected": -2.6151318550109863, + "eval_logps/chosen": -261.0995178222656, + "eval_logps/rejected": -246.58517456054688, + "eval_loss": 0.6752615571022034, + "eval_positive_losses": 0.2061479240655899, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": 0.23493869602680206, + "eval_rewards/margins": 0.11500106751918793, + "eval_rewards/margins_max": 0.42997851967811584, + "eval_rewards/margins_min": -0.15204735100269318, + "eval_rewards/margins_std": 0.19510281085968018, + "eval_rewards/rejected": 0.11993761360645294, + "eval_runtime": 428.1959, + "eval_samples_per_second": 4.671, + "eval_steps_per_second": 0.292, + "step": 3100 + }, + { + "dpo_losses": 0.6559592485427856, + "epoch": 0.81, + "grad_norm": 2.2951095958795493, + "learning_rate": 5.09338364753818e-07, + "logits/chosen": -2.70573353767395, + "logits/rejected": -2.6352028846740723, + "logps/chosen": -266.19921875, + "logps/rejected": -251.9840545654297, + "loss": 0.6587, + "positive_losses": 0.18008080124855042, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22073253989219666, + "rewards/margins": 0.08792804181575775, + "rewards/margins_max": 0.3271465301513672, + "rewards/margins_min": -0.1252015233039856, + "rewards/margins_std": 0.1997259557247162, + "rewards/rejected": 0.1328045129776001, + "step": 3110 + }, + { + "dpo_losses": 0.6467105746269226, + "epoch": 0.82, + "grad_norm": 1.9154281415492576, + "learning_rate": 4.956018477086005e-07, + "logits/chosen": -2.6191787719726562, + "logits/rejected": -2.5866799354553223, + "logps/chosen": -271.4346923828125, + "logps/rejected": -247.22811889648438, + "loss": 0.6679, + "positive_losses": 0.4603656232357025, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.23231330513954163, + "rewards/margins": 0.10724592208862305, + "rewards/margins_max": 0.3365776538848877, + "rewards/margins_min": -0.12125638872385025, + "rewards/margins_std": 0.20351378619670868, + "rewards/rejected": 0.1250673532485962, + "step": 3120 + }, + { + "dpo_losses": 0.6430977582931519, + "epoch": 0.82, + "grad_norm": 1.906469051094875, + "learning_rate": 4.820326973322764e-07, + "logits/chosen": -2.7109386920928955, + "logits/rejected": -2.670637845993042, + "logps/chosen": -241.487060546875, + "logps/rejected": -210.2299041748047, + "loss": 0.6443, + "positive_losses": 0.1338365525007248, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22416770458221436, + "rewards/margins": 0.11624391376972198, + "rewards/margins_max": 0.360629141330719, + "rewards/margins_min": -0.10840293020009995, + "rewards/margins_std": 0.211382657289505, + "rewards/rejected": 0.10792376846075058, + "step": 3130 + }, + { + "dpo_losses": 0.6356518864631653, + "epoch": 0.82, + "grad_norm": 1.9870086974369754, + "learning_rate": 4.686320466449981e-07, + "logits/chosen": -2.613452434539795, + "logits/rejected": -2.6398723125457764, + "logps/chosen": -216.00192260742188, + "logps/rejected": -240.72891235351562, + "loss": 0.6845, + "positive_losses": 0.26034507155418396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.23193475604057312, + "rewards/margins": 0.12595371901988983, + "rewards/margins_max": 0.2877231240272522, + "rewards/margins_min": -0.03262107074260712, + "rewards/margins_std": 0.14468979835510254, + "rewards/rejected": 0.10598105192184448, + "step": 3140 + }, + { + "dpo_losses": 0.6281145811080933, + "epoch": 0.82, + "grad_norm": 9.798790168042391, + "learning_rate": 4.554010145972418e-07, + "logits/chosen": -2.5790274143218994, + "logits/rejected": -2.563317060470581, + "logps/chosen": -278.03118896484375, + "logps/rejected": -295.5206298828125, + "loss": 0.6563, + "positive_losses": 0.2543970048427582, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.25272035598754883, + "rewards/margins": 0.15435662865638733, + "rewards/margins_max": 0.41291069984436035, + "rewards/margins_min": -0.06888072192668915, + "rewards/margins_std": 0.21114492416381836, + "rewards/rejected": 0.0983637273311615, + "step": 3150 + }, + { + "dpo_losses": 0.6204236149787903, + "epoch": 0.83, + "grad_norm": 6.473582945600099, + "learning_rate": 4.4234070597637455e-07, + "logits/chosen": -2.71343731880188, + "logits/rejected": -2.677605152130127, + "logps/chosen": -279.60089111328125, + "logps/rejected": -246.6046905517578, + "loss": 0.6658, + "positive_losses": 0.0, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.259508341550827, + "rewards/margins": 0.163357213139534, + "rewards/margins_max": 0.3819490373134613, + "rewards/margins_min": -0.04566248506307602, + "rewards/margins_std": 0.1943180114030838, + "rewards/rejected": 0.09615114331245422, + "step": 3160 + }, + { + "dpo_losses": 0.6419295072555542, + "epoch": 0.83, + "grad_norm": 11.911500749177504, + "learning_rate": 4.2945221131440783e-07, + "logits/chosen": -2.6954774856567383, + "logits/rejected": -2.675586223602295, + "logps/chosen": -273.2368469238281, + "logps/rejected": -255.10995483398438, + "loss": 0.6614, + "positive_losses": 0.35317736864089966, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2160724401473999, + "rewards/margins": 0.11563559621572495, + "rewards/margins_max": 0.3120267689228058, + "rewards/margins_min": -0.08002261817455292, + "rewards/margins_std": 0.17242729663848877, + "rewards/rejected": 0.10043685138225555, + "step": 3170 + }, + { + "dpo_losses": 0.6373859643936157, + "epoch": 0.83, + "grad_norm": 15.502172502424358, + "learning_rate": 4.167366067969381e-07, + "logits/chosen": -2.623873233795166, + "logits/rejected": -2.6007158756256104, + "logps/chosen": -269.0568542480469, + "logps/rejected": -220.5035858154297, + "loss": 0.6918, + "positive_losses": 0.3878262937068939, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.25423291325569153, + "rewards/margins": 0.12449681758880615, + "rewards/margins_max": 0.3036009967327118, + "rewards/margins_min": -0.08757523447275162, + "rewards/margins_std": 0.17467781901359558, + "rewards/rejected": 0.12973609566688538, + "step": 3180 + }, + { + "dpo_losses": 0.6364492774009705, + "epoch": 0.83, + "grad_norm": 1.986876525494331, + "learning_rate": 4.041949541732826e-07, + "logits/chosen": -2.6248703002929688, + "logits/rejected": -2.5871098041534424, + "logps/chosen": -259.7639465332031, + "logps/rejected": -237.5189666748047, + "loss": 0.6605, + "positive_losses": 0.2175983488559723, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.22980129718780518, + "rewards/margins": 0.12821714580059052, + "rewards/margins_max": 0.35198521614074707, + "rewards/margins_min": -0.09533867239952087, + "rewards/margins_std": 0.20564059913158417, + "rewards/rejected": 0.10158412158489227, + "step": 3190 + }, + { + "dpo_losses": 0.6411651372909546, + "epoch": 0.84, + "grad_norm": 1.6812550881461255, + "learning_rate": 3.9182830066782614e-07, + "logits/chosen": -2.6221518516540527, + "logits/rejected": -2.59946870803833, + "logps/chosen": -249.2165069580078, + "logps/rejected": -240.29556274414062, + "loss": 0.6516, + "positive_losses": 0.1823883056640625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.2199111431837082, + "rewards/margins": 0.11735031753778458, + "rewards/margins_max": 0.3530258536338806, + "rewards/margins_min": -0.0921543687582016, + "rewards/margins_std": 0.1945345401763916, + "rewards/rejected": 0.10256080329418182, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_dpo_losses": 0.6384575366973877, + "eval_logits/chosen": -2.649825096130371, + "eval_logits/rejected": -2.6158199310302734, + "eval_logps/chosen": -261.75390625, + "eval_logps/rejected": -248.2175750732422, + "eval_loss": 0.6827730536460876, + "eval_positive_losses": 0.3005758225917816, + "eval_rewards/accuracies": 0.7160000205039978, + "eval_rewards/chosen": 0.2283952236175537, + "eval_rewards/margins": 0.12478169053792953, + "eval_rewards/margins_max": 0.4527498781681061, + "eval_rewards/margins_min": -0.15858082473278046, + "eval_rewards/margins_std": 0.20521041750907898, + "eval_rewards/rejected": 0.10361352562904358, + "eval_runtime": 427.9924, + "eval_samples_per_second": 4.673, + "eval_steps_per_second": 0.292, + "step": 3200 + }, + { + "dpo_losses": 0.6365126371383667, + "epoch": 0.84, + "grad_norm": 2.1383141065978455, + "learning_rate": 3.796376788925771e-07, + "logits/chosen": -2.673523426055908, + "logits/rejected": -2.652254104614258, + "logps/chosen": -230.735595703125, + "logps/rejected": -223.2124481201172, + "loss": 0.6735, + "positive_losses": 0.21147899329662323, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.22603364288806915, + "rewards/margins": 0.12993502616882324, + "rewards/margins_max": 0.3829730153083801, + "rewards/margins_min": -0.08001724630594254, + "rewards/margins_std": 0.20853832364082336, + "rewards/rejected": 0.09609860926866531, + "step": 3210 + }, + { + "dpo_losses": 0.6381786465644836, + "epoch": 0.84, + "grad_norm": 2.3358172637716947, + "learning_rate": 3.676241067609465e-07, + "logits/chosen": -2.612231731414795, + "logits/rejected": -2.598639965057373, + "logps/chosen": -268.4934387207031, + "logps/rejected": -264.42913818359375, + "loss": 0.6846, + "positive_losses": 0.41847342252731323, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.22133584320545197, + "rewards/margins": 0.1236250028014183, + "rewards/margins_max": 0.3461146950721741, + "rewards/margins_min": -0.05468686297535896, + "rewards/margins_std": 0.17791306972503662, + "rewards/rejected": 0.09771083295345306, + "step": 3220 + }, + { + "dpo_losses": 0.66154545545578, + "epoch": 0.85, + "grad_norm": 2.3606999195058305, + "learning_rate": 3.5578858740274976e-07, + "logits/chosen": -2.627288341522217, + "logits/rejected": -2.6127145290374756, + "logps/chosen": -294.7670593261719, + "logps/rejected": -310.2293701171875, + "loss": 0.7285, + "positive_losses": 0.8174301981925964, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.21198442578315735, + "rewards/margins": 0.0844467282295227, + "rewards/margins_max": 0.4018055498600006, + "rewards/margins_min": -0.1648840606212616, + "rewards/margins_std": 0.2497369796037674, + "rewards/rejected": 0.12753772735595703, + "step": 3230 + }, + { + "dpo_losses": 0.6271528601646423, + "epoch": 0.85, + "grad_norm": 13.007862940968268, + "learning_rate": 3.44132109080447e-07, + "logits/chosen": -2.6953787803649902, + "logits/rejected": -2.646010637283325, + "logps/chosen": -231.9201202392578, + "logps/rejected": -249.96530151367188, + "loss": 0.6825, + "positive_losses": 0.1917347013950348, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.21758997440338135, + "rewards/margins": 0.15806236863136292, + "rewards/margins_max": 0.4027267098426819, + "rewards/margins_min": -0.05626615881919861, + "rewards/margins_std": 0.21179640293121338, + "rewards/rejected": 0.05952761694788933, + "step": 3240 + }, + { + "dpo_losses": 0.6535229086875916, + "epoch": 0.85, + "grad_norm": 11.488135810665579, + "learning_rate": 3.3265564510662344e-07, + "logits/chosen": -2.7149996757507324, + "logits/rejected": -2.6980862617492676, + "logps/chosen": -271.8984375, + "logps/rejected": -253.8206787109375, + "loss": 0.6967, + "positive_losses": 0.38341885805130005, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.21455137431621552, + "rewards/margins": 0.0899895653128624, + "rewards/margins_max": 0.2641471028327942, + "rewards/margins_min": -0.08759258687496185, + "rewards/margins_std": 0.15608647465705872, + "rewards/rejected": 0.12456182390451431, + "step": 3250 + }, + { + "dpo_losses": 0.6507695317268372, + "epoch": 0.85, + "grad_norm": 7.504993556538967, + "learning_rate": 3.213601537627195e-07, + "logits/chosen": -2.7060675621032715, + "logits/rejected": -2.717250347137451, + "logps/chosen": -244.5074005126953, + "logps/rejected": -284.1248474121094, + "loss": 0.6728, + "positive_losses": 0.2510760426521301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.19894081354141235, + "rewards/margins": 0.096902996301651, + "rewards/margins_max": 0.2957373857498169, + "rewards/margins_min": -0.10130006074905396, + "rewards/margins_std": 0.17616146802902222, + "rewards/rejected": 0.10203780978918076, + "step": 3260 + }, + { + "dpo_losses": 0.6447475552558899, + "epoch": 0.86, + "grad_norm": 11.609264594024141, + "learning_rate": 3.1024657821901063e-07, + "logits/chosen": -2.6828439235687256, + "logits/rejected": -2.631220579147339, + "logps/chosen": -236.44894409179688, + "logps/rejected": -255.2611541748047, + "loss": 0.6838, + "positive_losses": 0.306144654750824, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.21078269183635712, + "rewards/margins": 0.11331923305988312, + "rewards/margins_max": 0.3753519654273987, + "rewards/margins_min": -0.13783003389835358, + "rewards/margins_std": 0.2330664098262787, + "rewards/rejected": 0.097463458776474, + "step": 3270 + }, + { + "dpo_losses": 0.6534501314163208, + "epoch": 0.86, + "grad_norm": 11.688866846461075, + "learning_rate": 2.9931584645585654e-07, + "logits/chosen": -2.7720062732696533, + "logits/rejected": -2.679375171661377, + "logps/chosen": -235.0417938232422, + "logps/rejected": -235.50137329101562, + "loss": 0.672, + "positive_losses": 0.38392525911331177, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.2245892584323883, + "rewards/margins": 0.09525544196367264, + "rewards/margins_max": 0.34492596983909607, + "rewards/margins_min": -0.1276320219039917, + "rewards/margins_std": 0.20781132578849792, + "rewards/rejected": 0.12933377921581268, + "step": 3280 + }, + { + "dpo_losses": 0.616895318031311, + "epoch": 0.86, + "grad_norm": 2.188074962803304, + "learning_rate": 2.885688711862136e-07, + "logits/chosen": -2.6105411052703857, + "logits/rejected": -2.6143670082092285, + "logps/chosen": -268.9541931152344, + "logps/rejected": -265.6374816894531, + "loss": 0.6597, + "positive_losses": 0.35630494356155396, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.25052422285079956, + "rewards/margins": 0.17335107922554016, + "rewards/margins_max": 0.40427374839782715, + "rewards/margins_min": -0.08141206204891205, + "rewards/margins_std": 0.2202124148607254, + "rewards/rejected": 0.07717315107584, + "step": 3290 + }, + { + "dpo_losses": 0.6404051780700684, + "epoch": 0.86, + "grad_norm": 8.593080140796566, + "learning_rate": 2.7800654977942486e-07, + "logits/chosen": -2.6817626953125, + "logits/rejected": -2.6735479831695557, + "logps/chosen": -310.8262634277344, + "logps/rejected": -296.20501708984375, + "loss": 0.6627, + "positive_losses": 0.14009293913841248, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.24962861835956573, + "rewards/margins": 0.12139974534511566, + "rewards/margins_max": 0.3643108010292053, + "rewards/margins_min": -0.13667277991771698, + "rewards/margins_std": 0.21952347457408905, + "rewards/rejected": 0.12822887301445007, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_dpo_losses": 0.6403202414512634, + "eval_logits/chosen": -2.652597188949585, + "eval_logits/rejected": -2.6184253692626953, + "eval_logps/chosen": -261.3397521972656, + "eval_logps/rejected": -247.35203552246094, + "eval_loss": 0.6772990822792053, + "eval_positive_losses": 0.24060045182704926, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": 0.23253653943538666, + "eval_rewards/margins": 0.12026768177747726, + "eval_rewards/margins_max": 0.4419324994087219, + "eval_rewards/margins_min": -0.15448962152004242, + "eval_rewards/margins_std": 0.20027266442775726, + "eval_rewards/rejected": 0.1122688427567482, + "eval_runtime": 428.0713, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 3300 + }, + { + "dpo_losses": 0.6264361143112183, + "epoch": 0.87, + "grad_norm": 39.82948273970552, + "learning_rate": 2.6762976418628797e-07, + "logits/chosen": -2.705967664718628, + "logits/rejected": -2.658480405807495, + "logps/chosen": -251.9584503173828, + "logps/rejected": -254.8125, + "loss": 0.6839, + "positive_losses": 0.18140240013599396, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.22535867989063263, + "rewards/margins": 0.15404847264289856, + "rewards/margins_max": 0.436443567276001, + "rewards/margins_min": -0.07473914325237274, + "rewards/margins_std": 0.2287091761827469, + "rewards/rejected": 0.07131022214889526, + "step": 3310 + }, + { + "dpo_losses": 0.6395207047462463, + "epoch": 0.87, + "grad_norm": 2.9691071947943692, + "learning_rate": 2.5743938086541354e-07, + "logits/chosen": -2.612696409225464, + "logits/rejected": -2.59602689743042, + "logps/chosen": -278.0087890625, + "logps/rejected": -266.9355773925781, + "loss": 0.6748, + "positive_losses": 0.2753303647041321, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.26316890120506287, + "rewards/margins": 0.13156357407569885, + "rewards/margins_max": 0.4129224419593811, + "rewards/margins_min": -0.09041708707809448, + "rewards/margins_std": 0.23072955012321472, + "rewards/rejected": 0.1316053569316864, + "step": 3320 + }, + { + "dpo_losses": 0.6477687358856201, + "epoch": 0.87, + "grad_norm": 2.3269522356509023, + "learning_rate": 2.4743625071087574e-07, + "logits/chosen": -2.737248182296753, + "logits/rejected": -2.750629425048828, + "logps/chosen": -280.17864990234375, + "logps/rejected": -276.9759216308594, + "loss": 0.6466, + "positive_losses": 0.0, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.24916020035743713, + "rewards/margins": 0.10327957570552826, + "rewards/margins_max": 0.3029175102710724, + "rewards/margins_min": -0.09398408979177475, + "rewards/margins_std": 0.17593896389007568, + "rewards/rejected": 0.14588062465190887, + "step": 3330 + }, + { + "dpo_losses": 0.6357873678207397, + "epoch": 0.87, + "grad_norm": 1.9947758633325405, + "learning_rate": 2.3762120898116498e-07, + "logits/chosen": -2.656794548034668, + "logits/rejected": -2.672175168991089, + "logps/chosen": -243.54544067382812, + "logps/rejected": -295.7755126953125, + "loss": 0.65, + "positive_losses": 0.1419471800327301, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22034494578838348, + "rewards/margins": 0.13167402148246765, + "rewards/margins_max": 0.3637886047363281, + "rewards/margins_min": -0.0878521203994751, + "rewards/margins_std": 0.201420858502388, + "rewards/rejected": 0.08867089450359344, + "step": 3340 + }, + { + "dpo_losses": 0.6441482305526733, + "epoch": 0.88, + "grad_norm": 7.694467256086055, + "learning_rate": 2.2799507522944048e-07, + "logits/chosen": -2.7229669094085693, + "logits/rejected": -2.647151231765747, + "logps/chosen": -252.82571411132812, + "logps/rejected": -221.85879516601562, + "loss": 0.6769, + "positive_losses": 0.35150521993637085, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.21200843155384064, + "rewards/margins": 0.11605236679315567, + "rewards/margins_max": 0.3657509982585907, + "rewards/margins_min": -0.10902712494134903, + "rewards/margins_std": 0.21472206711769104, + "rewards/rejected": 0.09595610946416855, + "step": 3350 + }, + { + "dpo_losses": 0.6233208775520325, + "epoch": 0.88, + "grad_norm": 1.767260106599493, + "learning_rate": 2.1855865323510056e-07, + "logits/chosen": -2.6158571243286133, + "logits/rejected": -2.5889029502868652, + "logps/chosen": -270.18914794921875, + "logps/rejected": -277.93341064453125, + "loss": 0.6692, + "positive_losses": 0.1520233154296875, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.23553188145160675, + "rewards/margins": 0.16098877787590027, + "rewards/margins_max": 0.4127779006958008, + "rewards/margins_min": -0.05117069557309151, + "rewards/margins_std": 0.2057846337556839, + "rewards/rejected": 0.07454311102628708, + "step": 3360 + }, + { + "dpo_losses": 0.6466307044029236, + "epoch": 0.88, + "grad_norm": 2.0852991620424257, + "learning_rate": 2.0931273093666575e-07, + "logits/chosen": -2.713484287261963, + "logits/rejected": -2.7137503623962402, + "logps/chosen": -237.7594451904297, + "logps/rejected": -243.6479949951172, + "loss": 0.6619, + "positive_losses": 0.06381092220544815, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.22457580268383026, + "rewards/margins": 0.10744913667440414, + "rewards/margins_max": 0.3464711308479309, + "rewards/margins_min": -0.09933777153491974, + "rewards/margins_std": 0.20307926833629608, + "rewards/rejected": 0.1171267032623291, + "step": 3370 + }, + { + "dpo_losses": 0.6486043334007263, + "epoch": 0.88, + "grad_norm": 1.5986555560433542, + "learning_rate": 2.002580803659873e-07, + "logits/chosen": -2.7386975288391113, + "logits/rejected": -2.6531612873077393, + "logps/chosen": -231.8615264892578, + "logps/rejected": -220.4953155517578, + "loss": 0.7043, + "positive_losses": 0.03696479648351669, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.20933715999126434, + "rewards/margins": 0.10051582008600235, + "rewards/margins_max": 0.30396851897239685, + "rewards/margins_min": -0.06479805707931519, + "rewards/margins_std": 0.16805866360664368, + "rewards/rejected": 0.1088213324546814, + "step": 3380 + }, + { + "dpo_losses": 0.6446608304977417, + "epoch": 0.89, + "grad_norm": 1.9495041510241085, + "learning_rate": 1.913954575837826e-07, + "logits/chosen": -2.636171340942383, + "logits/rejected": -2.6109492778778076, + "logps/chosen": -255.878662109375, + "logps/rejected": -243.5431671142578, + "loss": 0.6607, + "positive_losses": 0.33525413274765015, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.21522343158721924, + "rewards/margins": 0.11724289506673813, + "rewards/margins_max": 0.403298944234848, + "rewards/margins_min": -0.14402872323989868, + "rewards/margins_std": 0.24462909996509552, + "rewards/rejected": 0.09798052161931992, + "step": 3390 + }, + { + "dpo_losses": 0.6406092047691345, + "epoch": 0.89, + "grad_norm": 1.846940206289947, + "learning_rate": 1.827256026165028e-07, + "logits/chosen": -2.641024112701416, + "logits/rejected": -2.635166883468628, + "logps/chosen": -215.92178344726562, + "logps/rejected": -237.233642578125, + "loss": 0.6517, + "positive_losses": 0.1393692046403885, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.21265530586242676, + "rewards/margins": 0.11898468434810638, + "rewards/margins_max": 0.3562791049480438, + "rewards/margins_min": -0.06635533273220062, + "rewards/margins_std": 0.1890067160129547, + "rewards/rejected": 0.09367059916257858, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_dpo_losses": 0.6386284828186035, + "eval_logits/chosen": -2.6550769805908203, + "eval_logits/rejected": -2.621270179748535, + "eval_logps/chosen": -261.5968017578125, + "eval_logps/rejected": -248.01812744140625, + "eval_loss": 0.6813686490058899, + "eval_positive_losses": 0.2865428030490875, + "eval_rewards/accuracies": 0.718999981880188, + "eval_rewards/chosen": 0.2299659103155136, + "eval_rewards/margins": 0.12435787916183472, + "eval_rewards/margins_max": 0.45185142755508423, + "eval_rewards/margins_min": -0.1569340080022812, + "eval_rewards/margins_std": 0.20452216267585754, + "eval_rewards/rejected": 0.10560804605484009, + "eval_runtime": 427.9107, + "eval_samples_per_second": 4.674, + "eval_steps_per_second": 0.292, + "step": 3400 + }, + { + "dpo_losses": 0.6420435905456543, + "epoch": 0.89, + "grad_norm": 1.6239890108112547, + "learning_rate": 1.7424923939454274e-07, + "logits/chosen": -2.5693392753601074, + "logits/rejected": -2.581394672393799, + "logps/chosen": -228.1303253173828, + "logps/rejected": -247.8132781982422, + "loss": 0.6825, + "positive_losses": 0.7139572501182556, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18297092616558075, + "rewards/margins": 0.11609435081481934, + "rewards/margins_max": 0.33712631464004517, + "rewards/margins_min": -0.06108611077070236, + "rewards/margins_std": 0.17885836958885193, + "rewards/rejected": 0.0668765977025032, + "step": 3410 + }, + { + "dpo_losses": 0.6098856329917908, + "epoch": 0.9, + "grad_norm": 9.593562320029054, + "learning_rate": 1.6596707569179304e-07, + "logits/chosen": -2.623298168182373, + "logits/rejected": -2.6018214225769043, + "logps/chosen": -285.25372314453125, + "logps/rejected": -255.9405517578125, + "loss": 0.6298, + "positive_losses": 0.10523166507482529, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2724429666996002, + "rewards/margins": 0.1895667314529419, + "rewards/margins_max": 0.4633703827857971, + "rewards/margins_min": -0.06719541549682617, + "rewards/margins_std": 0.23646345734596252, + "rewards/rejected": 0.08287624269723892, + "step": 3420 + }, + { + "dpo_losses": 0.6304863095283508, + "epoch": 0.9, + "grad_norm": 2.0355417541779164, + "learning_rate": 1.578798030665385e-07, + "logits/chosen": -2.647200345993042, + "logits/rejected": -2.6434273719787598, + "logps/chosen": -258.7545471191406, + "logps/rejected": -249.9468536376953, + "loss": 0.6541, + "positive_losses": 0.08031348884105682, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2140689641237259, + "rewards/margins": 0.139048233628273, + "rewards/margins_max": 0.3309935927391052, + "rewards/margins_min": -0.042954690754413605, + "rewards/margins_std": 0.1671784669160843, + "rewards/rejected": 0.07502072304487228, + "step": 3430 + }, + { + "dpo_losses": 0.641990065574646, + "epoch": 0.9, + "grad_norm": 1.828017663657493, + "learning_rate": 1.499880968037165e-07, + "logits/chosen": -2.6463229656219482, + "logits/rejected": -2.642021417617798, + "logps/chosen": -285.63006591796875, + "logps/rejected": -275.5422058105469, + "loss": 0.6635, + "positive_losses": 0.28176501393318176, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.24048736691474915, + "rewards/margins": 0.11896850913763046, + "rewards/margins_max": 0.3504177927970886, + "rewards/margins_min": -0.10457517206668854, + "rewards/margins_std": 0.2031029909849167, + "rewards/rejected": 0.12151883542537689, + "step": 3440 + }, + { + "dpo_losses": 0.6230959892272949, + "epoch": 0.9, + "grad_norm": 11.307529769679473, + "learning_rate": 1.4229261585852805e-07, + "logits/chosen": -2.69089674949646, + "logits/rejected": -2.675455093383789, + "logps/chosen": -284.59722900390625, + "logps/rejected": -256.7802734375, + "loss": 0.6745, + "positive_losses": 0.520799994468689, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.2580224871635437, + "rewards/margins": 0.15853682160377502, + "rewards/margins_max": 0.36704161763191223, + "rewards/margins_min": -0.04584568738937378, + "rewards/margins_std": 0.18185365200042725, + "rewards/rejected": 0.09948565810918808, + "step": 3450 + }, + { + "dpo_losses": 0.6113892197608948, + "epoch": 0.91, + "grad_norm": 2.303752127453406, + "learning_rate": 1.3479400280141886e-07, + "logits/chosen": -2.702341318130493, + "logits/rejected": -2.6269218921661377, + "logps/chosen": -282.23284912109375, + "logps/rejected": -220.28604125976562, + "loss": 0.645, + "positive_losses": 0.3007164001464844, + "rewards/accuracies": 0.8374999761581421, + "rewards/chosen": 0.26262277364730835, + "rewards/margins": 0.1838146150112152, + "rewards/margins_max": 0.42686158418655396, + "rewards/margins_min": -0.011207438074052334, + "rewards/margins_std": 0.19332796335220337, + "rewards/rejected": 0.07880813628435135, + "step": 3460 + }, + { + "dpo_losses": 0.6374378204345703, + "epoch": 0.91, + "grad_norm": 2.208238875570211, + "learning_rate": 1.2749288376442044e-07, + "logits/chosen": -2.6841673851013184, + "logits/rejected": -2.6038734912872314, + "logps/chosen": -267.94732666015625, + "logps/rejected": -256.0314025878906, + "loss": 0.6904, + "positive_losses": 0.3077290952205658, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.20196840167045593, + "rewards/margins": 0.12561708688735962, + "rewards/margins_max": 0.3410743772983551, + "rewards/margins_min": -0.06772379577159882, + "rewards/margins_std": 0.18634767830371857, + "rewards/rejected": 0.0763513371348381, + "step": 3470 + }, + { + "dpo_losses": 0.6477632522583008, + "epoch": 0.91, + "grad_norm": 2.0981624283802955, + "learning_rate": 1.203898683888713e-07, + "logits/chosen": -2.7335000038146973, + "logits/rejected": -2.6686019897460938, + "logps/chosen": -265.36053466796875, + "logps/rejected": -269.66693115234375, + "loss": 0.6875, + "positive_losses": 0.46651220321655273, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.22941572964191437, + "rewards/margins": 0.10551927983760834, + "rewards/margins_max": 0.316548228263855, + "rewards/margins_min": -0.10447671264410019, + "rewards/margins_std": 0.18978312611579895, + "rewards/rejected": 0.12389643490314484, + "step": 3480 + }, + { + "dpo_losses": 0.6597553491592407, + "epoch": 0.91, + "grad_norm": 9.665018489245684, + "learning_rate": 1.1348554977451132e-07, + "logits/chosen": -2.646718978881836, + "logits/rejected": -2.673459529876709, + "logps/chosen": -228.7762908935547, + "logps/rejected": -215.7211151123047, + "loss": 0.6829, + "positive_losses": 0.17726345360279083, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.21256911754608154, + "rewards/margins": 0.07763149589300156, + "rewards/margins_max": 0.27114394307136536, + "rewards/margins_min": -0.10825137794017792, + "rewards/margins_std": 0.16942939162254333, + "rewards/rejected": 0.1349376142024994, + "step": 3490 + }, + { + "dpo_losses": 0.6395228505134583, + "epoch": 0.92, + "grad_norm": 46.476112711094665, + "learning_rate": 1.0678050442995802e-07, + "logits/chosen": -2.725433111190796, + "logits/rejected": -2.701233386993408, + "logps/chosen": -287.6202087402344, + "logps/rejected": -235.7355194091797, + "loss": 0.7267, + "positive_losses": 0.15096637606620789, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.22031128406524658, + "rewards/margins": 0.12081418186426163, + "rewards/margins_max": 0.35709747672080994, + "rewards/margins_min": -0.06519372016191483, + "rewards/margins_std": 0.1876780092716217, + "rewards/rejected": 0.09949707239866257, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_dpo_losses": 0.6385395526885986, + "eval_logits/chosen": -2.655956983566284, + "eval_logits/rejected": -2.6222283840179443, + "eval_logps/chosen": -261.5743713378906, + "eval_logps/rejected": -248.02084350585938, + "eval_loss": 0.6810497045516968, + "eval_positive_losses": 0.2879987061023712, + "eval_rewards/accuracies": 0.7200000286102295, + "eval_rewards/chosen": 0.23019051551818848, + "eval_rewards/margins": 0.12460958957672119, + "eval_rewards/margins_max": 0.45358777046203613, + "eval_rewards/margins_min": -0.15690042078495026, + "eval_rewards/margins_std": 0.2050294280052185, + "eval_rewards/rejected": 0.10558092594146729, + "eval_runtime": 428.2463, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 3500 + }, + { + "dpo_losses": 0.6398534178733826, + "epoch": 0.92, + "grad_norm": 2.27603190148628, + "learning_rate": 1.0027529222456755e-07, + "logits/chosen": -2.6275250911712646, + "logits/rejected": -2.6174960136413574, + "logps/chosen": -267.97479248046875, + "logps/rejected": -248.0113067626953, + "loss": 0.7017, + "positive_losses": 0.6987945437431335, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2240476906299591, + "rewards/margins": 0.12655410170555115, + "rewards/margins_max": 0.37817585468292236, + "rewards/margins_min": -0.0808626115322113, + "rewards/margins_std": 0.21181420981884003, + "rewards/rejected": 0.09749359637498856, + "step": 3510 + }, + { + "dpo_losses": 0.6397808194160461, + "epoch": 0.92, + "grad_norm": 11.618890284798946, + "learning_rate": 9.397045634168766e-08, + "logits/chosen": -2.6689958572387695, + "logits/rejected": -2.6879935264587402, + "logps/chosen": -251.0983428955078, + "logps/rejected": -250.15884399414062, + "loss": 0.7027, + "positive_losses": 0.4100571572780609, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.18390432000160217, + "rewards/margins": 0.1212887167930603, + "rewards/margins_max": 0.36527150869369507, + "rewards/margins_min": -0.08835957199335098, + "rewards/margins_std": 0.1976158767938614, + "rewards/rejected": 0.06261558085680008, + "step": 3520 + }, + { + "dpo_losses": 0.6355238556861877, + "epoch": 0.92, + "grad_norm": 8.93058712249867, + "learning_rate": 8.78665232332998e-08, + "logits/chosen": -2.743079423904419, + "logits/rejected": -2.7208328247070312, + "logps/chosen": -273.70379638671875, + "logps/rejected": -265.4845886230469, + "loss": 0.6391, + "positive_losses": 0.03184204176068306, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.25056618452072144, + "rewards/margins": 0.1285247504711151, + "rewards/margins_max": 0.36041417717933655, + "rewards/margins_min": -0.034891076385974884, + "rewards/margins_std": 0.17605885863304138, + "rewards/rejected": 0.12204144150018692, + "step": 3530 + }, + { + "dpo_losses": 0.6398634910583496, + "epoch": 0.93, + "grad_norm": 14.409295875135173, + "learning_rate": 8.196400257606208e-08, + "logits/chosen": -2.6243643760681152, + "logits/rejected": -2.5534205436706543, + "logps/chosen": -280.9654846191406, + "logps/rejected": -299.6298522949219, + "loss": 0.7192, + "positive_losses": 0.7538820505142212, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2547672390937805, + "rewards/margins": 0.13398560881614685, + "rewards/margins_max": 0.46419715881347656, + "rewards/margins_min": -0.1198708787560463, + "rewards/margins_std": 0.2631421685218811, + "rewards/rejected": 0.12078163772821426, + "step": 3540 + }, + { + "dpo_losses": 0.6318740248680115, + "epoch": 0.93, + "grad_norm": 2.107180888315368, + "learning_rate": 7.626338722875076e-08, + "logits/chosen": -2.6094422340393066, + "logits/rejected": -2.6372439861297607, + "logps/chosen": -238.2021026611328, + "logps/rejected": -258.7787170410156, + "loss": 0.66, + "positive_losses": 0.06889379024505615, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2709522843360901, + "rewards/margins": 0.14883530139923096, + "rewards/margins_max": 0.41571635007858276, + "rewards/margins_min": -0.08988544344902039, + "rewards/margins_std": 0.2293560951948166, + "rewards/rejected": 0.12211696058511734, + "step": 3550 + }, + { + "dpo_losses": 0.6310317516326904, + "epoch": 0.93, + "grad_norm": 10.46521774597658, + "learning_rate": 7.076515319110688e-08, + "logits/chosen": -2.6502716541290283, + "logits/rejected": -2.582925319671631, + "logps/chosen": -290.3277282714844, + "logps/rejected": -261.40679931640625, + "loss": 0.6553, + "positive_losses": 0.20256996154785156, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.25210773944854736, + "rewards/margins": 0.1431921422481537, + "rewards/margins_max": 0.4087950587272644, + "rewards/margins_min": -0.09110721200704575, + "rewards/margins_std": 0.22172784805297852, + "rewards/rejected": 0.10891561210155487, + "step": 3560 + }, + { + "dpo_losses": 0.6263138651847839, + "epoch": 0.93, + "grad_norm": 9.912305455265015, + "learning_rate": 6.54697595640899e-08, + "logits/chosen": -2.6590230464935303, + "logits/rejected": -2.629793167114258, + "logps/chosen": -300.0892639160156, + "logps/rejected": -249.05899047851562, + "loss": 0.6875, + "positive_losses": 0.4945901930332184, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.221832275390625, + "rewards/margins": 0.15287736058235168, + "rewards/margins_max": 0.4320314824581146, + "rewards/margins_min": -0.10347136110067368, + "rewards/margins_std": 0.23440003395080566, + "rewards/rejected": 0.06895491480827332, + "step": 3570 + }, + { + "dpo_losses": 0.628657341003418, + "epoch": 0.94, + "grad_norm": 7.1994219547929195, + "learning_rate": 6.037764851154426e-08, + "logits/chosen": -2.7059946060180664, + "logits/rejected": -2.644545078277588, + "logps/chosen": -265.8240966796875, + "logps/rejected": -246.8155059814453, + "loss": 0.6898, + "positive_losses": 0.4449668824672699, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.22660474479198456, + "rewards/margins": 0.14894968271255493, + "rewards/margins_max": 0.3824352025985718, + "rewards/margins_min": -0.10199352353811264, + "rewards/margins_std": 0.21889081597328186, + "rewards/rejected": 0.07765506953001022, + "step": 3580 + }, + { + "dpo_losses": 0.6285854578018188, + "epoch": 0.94, + "grad_norm": 6.918410697963323, + "learning_rate": 5.548924522327748e-08, + "logits/chosen": -2.676370143890381, + "logits/rejected": -2.651085376739502, + "logps/chosen": -249.1566162109375, + "logps/rejected": -219.9067840576172, + "loss": 0.6905, + "positive_losses": 0.5990933179855347, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.23606470227241516, + "rewards/margins": 0.15003365278244019, + "rewards/margins_max": 0.4204083979129791, + "rewards/margins_min": -0.09411970525979996, + "rewards/margins_std": 0.22876787185668945, + "rewards/rejected": 0.08603102713823318, + "step": 3590 + }, + { + "dpo_losses": 0.6493958234786987, + "epoch": 0.94, + "grad_norm": 2.531047577475979, + "learning_rate": 5.0804957879556915e-08, + "logits/chosen": -2.6386544704437256, + "logits/rejected": -2.5956645011901855, + "logps/chosen": -237.57080078125, + "logps/rejected": -249.49801635742188, + "loss": 0.6563, + "positive_losses": 0.004039764404296875, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.21744480729103088, + "rewards/margins": 0.09995146840810776, + "rewards/margins_max": 0.296904981136322, + "rewards/margins_min": -0.12265890836715698, + "rewards/margins_std": 0.18629953265190125, + "rewards/rejected": 0.11749333143234253, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_dpo_losses": 0.6394082307815552, + "eval_logits/chosen": -2.6555135250091553, + "eval_logits/rejected": -2.621596097946167, + "eval_logps/chosen": -261.41357421875, + "eval_logps/rejected": -247.6491241455078, + "eval_loss": 0.679019033908844, + "eval_positive_losses": 0.2627328932285309, + "eval_rewards/accuracies": 0.722000002861023, + "eval_rewards/chosen": 0.23179861903190613, + "eval_rewards/margins": 0.12250068038702011, + "eval_rewards/margins_max": 0.4486891031265259, + "eval_rewards/margins_min": -0.1550326645374298, + "eval_rewards/margins_std": 0.20270268619060516, + "eval_rewards/rejected": 0.10929791629314423, + "eval_runtime": 428.0374, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 3600 + }, + { + "dpo_losses": 0.6413955688476562, + "epoch": 0.94, + "grad_norm": 9.51063303929373, + "learning_rate": 4.632517761702815e-08, + "logits/chosen": -2.661477565765381, + "logits/rejected": -2.654937982559204, + "logps/chosen": -305.40875244140625, + "logps/rejected": -247.637451171875, + "loss": 0.6674, + "positive_losses": 0.05891609191894531, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2322145253419876, + "rewards/margins": 0.11494660377502441, + "rewards/margins_max": 0.28751423954963684, + "rewards/margins_min": -0.06896142661571503, + "rewards/margins_std": 0.1589422971010208, + "rewards/rejected": 0.11726789176464081, + "step": 3610 + }, + { + "dpo_losses": 0.6433061361312866, + "epoch": 0.95, + "grad_norm": 2.5528407523443044, + "learning_rate": 4.205027849605359e-08, + "logits/chosen": -2.7029664516448975, + "logits/rejected": -2.6620147228240967, + "logps/chosen": -262.38421630859375, + "logps/rejected": -237.93930053710938, + "loss": 0.6516, + "positive_losses": 0.02631073072552681, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.2335212230682373, + "rewards/margins": 0.10934171825647354, + "rewards/margins_max": 0.30177029967308044, + "rewards/margins_min": -0.052206508815288544, + "rewards/margins_std": 0.15611205995082855, + "rewards/rejected": 0.12417948246002197, + "step": 3620 + }, + { + "dpo_losses": 0.6407767534255981, + "epoch": 0.95, + "grad_norm": 9.639408968460176, + "learning_rate": 3.798061746947995e-08, + "logits/chosen": -2.6105198860168457, + "logits/rejected": -2.558924436569214, + "logps/chosen": -228.0260772705078, + "logps/rejected": -273.5794677734375, + "loss": 0.6908, + "positive_losses": 0.8591312170028687, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.1914621740579605, + "rewards/margins": 0.11739379167556763, + "rewards/margins_max": 0.3272903561592102, + "rewards/margins_min": -0.0655929371714592, + "rewards/margins_std": 0.17901813983917236, + "rewards/rejected": 0.07406838238239288, + "step": 3630 + }, + { + "dpo_losses": 0.643804669380188, + "epoch": 0.95, + "grad_norm": 3.736702293568718, + "learning_rate": 3.411653435283158e-08, + "logits/chosen": -2.647799253463745, + "logits/rejected": -2.6162681579589844, + "logps/chosen": -234.08847045898438, + "logps/rejected": -210.5509796142578, + "loss": 0.6757, + "positive_losses": 0.05722751468420029, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.2295902520418167, + "rewards/margins": 0.11434964835643768, + "rewards/margins_max": 0.35972321033477783, + "rewards/margins_min": -0.10925455391407013, + "rewards/margins_std": 0.20940427482128143, + "rewards/rejected": 0.11524059623479843, + "step": 3640 + }, + { + "dpo_losses": 0.6531789898872375, + "epoch": 0.96, + "grad_norm": 1.915823488391317, + "learning_rate": 3.04583517959367e-08, + "logits/chosen": -2.6603918075561523, + "logits/rejected": -2.6616523265838623, + "logps/chosen": -273.0516052246094, + "logps/rejected": -283.12347412109375, + "loss": 0.6599, + "positive_losses": 0.3513060510158539, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.22649244964122772, + "rewards/margins": 0.09476588666439056, + "rewards/margins_max": 0.3478698134422302, + "rewards/margins_min": -0.12968704104423523, + "rewards/margins_std": 0.2150001972913742, + "rewards/rejected": 0.13172657787799835, + "step": 3650 + }, + { + "dpo_losses": 0.6662554740905762, + "epoch": 0.96, + "grad_norm": 2.2316362361908206, + "learning_rate": 2.7006375255985984e-08, + "logits/chosen": -2.8110129833221436, + "logits/rejected": -2.7716379165649414, + "logps/chosen": -262.69708251953125, + "logps/rejected": -261.35894775390625, + "loss": 0.7064, + "positive_losses": 0.4577966630458832, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.19929836690425873, + "rewards/margins": 0.06393896043300629, + "rewards/margins_max": 0.24750575423240662, + "rewards/margins_min": -0.1344698667526245, + "rewards/margins_std": 0.16754016280174255, + "rewards/rejected": 0.13535940647125244, + "step": 3660 + }, + { + "dpo_losses": 0.6305605173110962, + "epoch": 0.96, + "grad_norm": 1.8443417650134954, + "learning_rate": 2.3760892972027328e-08, + "logits/chosen": -2.6637253761291504, + "logits/rejected": -2.6332812309265137, + "logps/chosen": -275.3893127441406, + "logps/rejected": -284.32098388671875, + "loss": 0.6579, + "positive_losses": 0.18694505095481873, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.24530284106731415, + "rewards/margins": 0.14238665997982025, + "rewards/margins_max": 0.38903477787971497, + "rewards/margins_min": -0.068825364112854, + "rewards/margins_std": 0.20899328589439392, + "rewards/rejected": 0.1029161810874939, + "step": 3670 + }, + { + "dpo_losses": 0.6498867273330688, + "epoch": 0.96, + "grad_norm": 5.458971796720751, + "learning_rate": 2.072217594089765e-08, + "logits/chosen": -2.6662886142730713, + "logits/rejected": -2.630774974822998, + "logps/chosen": -265.15081787109375, + "logps/rejected": -256.91204833984375, + "loss": 0.6753, + "positive_losses": 0.18378598988056183, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.20927441120147705, + "rewards/margins": 0.09998832643032074, + "rewards/margins_max": 0.3337637484073639, + "rewards/margins_min": -0.13355764746665955, + "rewards/margins_std": 0.20748178660869598, + "rewards/rejected": 0.1092861071228981, + "step": 3680 + }, + { + "dpo_losses": 0.622007429599762, + "epoch": 0.97, + "grad_norm": 9.395134628492558, + "learning_rate": 1.789047789459375e-08, + "logits/chosen": -2.6741926670074463, + "logits/rejected": -2.6397705078125, + "logps/chosen": -325.18414306640625, + "logps/rejected": -261.67279052734375, + "loss": 0.6462, + "positive_losses": 0.0, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.27494171261787415, + "rewards/margins": 0.15975967049598694, + "rewards/margins_max": 0.35449787974357605, + "rewards/margins_min": -0.06289161741733551, + "rewards/margins_std": 0.18452905118465424, + "rewards/rejected": 0.11518202722072601, + "step": 3690 + }, + { + "dpo_losses": 0.6495968103408813, + "epoch": 0.97, + "grad_norm": 7.4822717311515134, + "learning_rate": 1.5266035279088708e-08, + "logits/chosen": -2.7605767250061035, + "logits/rejected": -2.7201406955718994, + "logps/chosen": -267.16839599609375, + "logps/rejected": -241.4385528564453, + "loss": 0.7039, + "positive_losses": 0.7736231088638306, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.23643679916858673, + "rewards/margins": 0.0986911952495575, + "rewards/margins_max": 0.29774361848831177, + "rewards/margins_min": -0.10798849165439606, + "rewards/margins_std": 0.17969079315662384, + "rewards/rejected": 0.13774561882019043, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_dpo_losses": 0.6395586729049683, + "eval_logits/chosen": -2.6559033393859863, + "eval_logits/rejected": -2.622006893157959, + "eval_logps/chosen": -261.3918151855469, + "eval_logps/rejected": -247.59274291992188, + "eval_loss": 0.6790311336517334, + "eval_positive_losses": 0.26341503858566284, + "eval_rewards/accuracies": 0.7229999899864197, + "eval_rewards/chosen": 0.23201590776443481, + "eval_rewards/margins": 0.12215426564216614, + "eval_rewards/margins_max": 0.44827672839164734, + "eval_rewards/margins_min": -0.15502171218395233, + "eval_rewards/margins_std": 0.20247788727283478, + "eval_rewards/rejected": 0.10986167192459106, + "eval_runtime": 428.2413, + "eval_samples_per_second": 4.67, + "eval_steps_per_second": 0.292, + "step": 3700 + }, + { + "dpo_losses": 0.6481695175170898, + "epoch": 0.97, + "grad_norm": 7.4982549544584405, + "learning_rate": 1.2849067234584623e-08, + "logits/chosen": -2.629378080368042, + "logits/rejected": -2.583587169647217, + "logps/chosen": -255.37991333007812, + "logps/rejected": -237.39132690429688, + "loss": 0.6597, + "positive_losses": 0.1582089364528656, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.23376032710075378, + "rewards/margins": 0.10453498363494873, + "rewards/margins_max": 0.3104201853275299, + "rewards/margins_min": -0.1114891991019249, + "rewards/margins_std": 0.19313430786132812, + "rewards/rejected": 0.12922534346580505, + "step": 3710 + }, + { + "dpo_losses": 0.6363897323608398, + "epoch": 0.97, + "grad_norm": 9.799345687509973, + "learning_rate": 1.0639775577218625e-08, + "logits/chosen": -2.66035532951355, + "logits/rejected": -2.659759998321533, + "logps/chosen": -239.5210418701172, + "logps/rejected": -238.7107391357422, + "loss": 0.6614, + "positive_losses": 0.48925361037254333, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.23430407047271729, + "rewards/margins": 0.1289234459400177, + "rewards/margins_max": 0.3533153533935547, + "rewards/margins_min": -0.09572507441043854, + "rewards/margins_std": 0.20401068031787872, + "rewards/rejected": 0.10538060963153839, + "step": 3720 + }, + { + "dpo_losses": 0.641473114490509, + "epoch": 0.98, + "grad_norm": 12.708607221476539, + "learning_rate": 8.638344782207486e-09, + "logits/chosen": -2.6768765449523926, + "logits/rejected": -2.6652908325195312, + "logps/chosen": -299.50384521484375, + "logps/rejected": -268.8082580566406, + "loss": 0.7508, + "positive_losses": 0.9163432121276855, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.24856901168823242, + "rewards/margins": 0.12247447669506073, + "rewards/margins_max": 0.41335535049438477, + "rewards/margins_min": -0.11577402055263519, + "rewards/margins_std": 0.23423083126544952, + "rewards/rejected": 0.1260945051908493, + "step": 3730 + }, + { + "dpo_losses": 0.6213124394416809, + "epoch": 0.98, + "grad_norm": 11.03160963471843, + "learning_rate": 6.84494196844715e-09, + "logits/chosen": -2.666469097137451, + "logits/rejected": -2.62202525138855, + "logps/chosen": -282.1092529296875, + "logps/rejected": -265.4429931640625, + "loss": 0.6529, + "positive_losses": 0.3889961242675781, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.25011515617370605, + "rewards/margins": 0.1748693734407425, + "rewards/margins_max": 0.45166462659835815, + "rewards/margins_min": -0.05564745515584946, + "rewards/margins_std": 0.22870846092700958, + "rewards/rejected": 0.07524577528238297, + "step": 3740 + }, + { + "dpo_losses": 0.6486204862594604, + "epoch": 0.98, + "grad_norm": 2.142027903370626, + "learning_rate": 5.259716884556121e-09, + "logits/chosen": -2.6220340728759766, + "logits/rejected": -2.6580042839050293, + "logps/chosen": -269.54071044921875, + "logps/rejected": -257.8643798828125, + "loss": 0.6949, + "positive_losses": 1.0363675355911255, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.23406870663166046, + "rewards/margins": 0.10313974320888519, + "rewards/margins_max": 0.3234516978263855, + "rewards/margins_min": -0.11554668843746185, + "rewards/margins_std": 0.19535276293754578, + "rewards/rejected": 0.13092896342277527, + "step": 3750 + }, + { + "dpo_losses": 0.6417810320854187, + "epoch": 0.98, + "grad_norm": 1.9153348469624756, + "learning_rate": 3.882801896372967e-09, + "logits/chosen": -2.6521694660186768, + "logits/rejected": -2.6701323986053467, + "logps/chosen": -233.39639282226562, + "logps/rejected": -241.3929443359375, + "loss": 0.6517, + "positive_losses": 0.23487205803394318, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2109147310256958, + "rewards/margins": 0.11781202256679535, + "rewards/margins_max": 0.3403708338737488, + "rewards/margins_min": -0.07822667807340622, + "rewards/margins_std": 0.18718595802783966, + "rewards/rejected": 0.09310269355773926, + "step": 3760 + }, + { + "dpo_losses": 0.6552250385284424, + "epoch": 0.99, + "grad_norm": 15.349581416384067, + "learning_rate": 2.7143119759026614e-09, + "logits/chosen": -2.6792030334472656, + "logits/rejected": -2.6174237728118896, + "logps/chosen": -284.4708251953125, + "logps/rejected": -291.76434326171875, + "loss": 0.6763, + "positive_losses": 0.43425750732421875, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.2588556110858917, + "rewards/margins": 0.09476854652166367, + "rewards/margins_max": 0.3151422142982483, + "rewards/margins_min": -0.13262644410133362, + "rewards/margins_std": 0.2025650441646576, + "rewards/rejected": 0.16408707201480865, + "step": 3770 + }, + { + "dpo_losses": 0.6605818867683411, + "epoch": 0.99, + "grad_norm": 2.0486200479718413, + "learning_rate": 1.754344691717591e-09, + "logits/chosen": -2.678449869155884, + "logits/rejected": -2.6325161457061768, + "logps/chosen": -249.8470458984375, + "logps/rejected": -256.65533447265625, + "loss": 0.6661, + "positive_losses": 0.13493213057518005, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.203067347407341, + "rewards/margins": 0.08785489946603775, + "rewards/margins_max": 0.3431921601295471, + "rewards/margins_min": -0.2343141734600067, + "rewards/margins_std": 0.2637536823749542, + "rewards/rejected": 0.11521244049072266, + "step": 3780 + }, + { + "dpo_losses": 0.6521404385566711, + "epoch": 0.99, + "grad_norm": 18.55025604976165, + "learning_rate": 1.0029802008096335e-09, + "logits/chosen": -2.6939713954925537, + "logits/rejected": -2.681117534637451, + "logps/chosen": -215.4471893310547, + "logps/rejected": -188.15237426757812, + "loss": 0.7364, + "positive_losses": 1.0411407947540283, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.19064785540103912, + "rewards/margins": 0.09285817295312881, + "rewards/margins_max": 0.2621470093727112, + "rewards/margins_min": -0.07667995244264603, + "rewards/margins_std": 0.14927421510219574, + "rewards/rejected": 0.0977896898984909, + "step": 3790 + }, + { + "dpo_losses": 0.635759711265564, + "epoch": 0.99, + "grad_norm": 10.457828666598141, + "learning_rate": 4.602812418974534e-10, + "logits/chosen": -2.60920786857605, + "logits/rejected": -2.6047613620758057, + "logps/chosen": -262.1203918457031, + "logps/rejected": -199.7135772705078, + "loss": 0.6622, + "positive_losses": 0.0, + "rewards/accuracies": 0.7875000238418579, + "rewards/chosen": 0.2570125162601471, + "rewards/margins": 0.13325463235378265, + "rewards/margins_max": 0.39995330572128296, + "rewards/margins_min": -0.04778273031115532, + "rewards/margins_std": 0.20286540687084198, + "rewards/rejected": 0.12375785410404205, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_dpo_losses": 0.6395210027694702, + "eval_logits/chosen": -2.6544384956359863, + "eval_logits/rejected": -2.620440721511841, + "eval_logps/chosen": -261.3937683105469, + "eval_logps/rejected": -247.60301208496094, + "eval_loss": 0.6789272427558899, + "eval_positive_losses": 0.26124632358551025, + "eval_rewards/accuracies": 0.722000002861023, + "eval_rewards/chosen": 0.23199646174907684, + "eval_rewards/margins": 0.12223710119724274, + "eval_rewards/margins_max": 0.4482419788837433, + "eval_rewards/margins_min": -0.15490815043449402, + "eval_rewards/margins_std": 0.20252487063407898, + "eval_rewards/rejected": 0.1097593605518341, + "eval_runtime": 428.0449, + "eval_samples_per_second": 4.672, + "eval_steps_per_second": 0.292, + "step": 3800 + }, + { + "dpo_losses": 0.6477853655815125, + "epoch": 1.0, + "grad_norm": 1.7682638113441669, + "learning_rate": 1.2629313018819312e-10, + "logits/chosen": -2.6831555366516113, + "logits/rejected": -2.651272773742676, + "logps/chosen": -259.72967529296875, + "logps/rejected": -233.97537231445312, + "loss": 0.6669, + "positive_losses": 0.11028347164392471, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.22155094146728516, + "rewards/margins": 0.10187198221683502, + "rewards/margins_max": 0.29284974932670593, + "rewards/margins_min": -0.08088265359401703, + "rewards/margins_std": 0.17286035418510437, + "rewards/rejected": 0.11967895179986954, + "step": 3810 + }, + { + "dpo_losses": 0.6358076333999634, + "epoch": 1.0, + "grad_norm": 7.1167203747809475, + "learning_rate": 1.0437535929996855e-12, + "logits/chosen": -2.622563600540161, + "logits/rejected": -2.609252452850342, + "logps/chosen": -307.4974670410156, + "logps/rejected": -206.5818328857422, + "loss": 0.6681, + "positive_losses": 0.5052299499511719, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.23733225464820862, + "rewards/margins": 0.13700851798057556, + "rewards/margins_max": 0.3782404065132141, + "rewards/margins_min": -0.08631386607885361, + "rewards/margins_std": 0.20959043502807617, + "rewards/rejected": 0.10032373666763306, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6795008113577053, + "train_runtime": 46141.1523, + "train_samples_per_second": 1.325, + "train_steps_per_second": 0.083 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}