diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8488 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 3821, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "dpo_losses": 0.6931471824645996, + "epoch": 0.0, + "grad_norm": 1.666406547592859, + "learning_rate": 1.3054830287206266e-09, + "logits/chosen": -2.7590973377227783, + "logits/rejected": -2.847461462020874, + "logps/chosen": -183.89276123046875, + "logps/rejected": -240.56399536132812, + "loss": 0.6931, + "positive_losses": 0.0, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "dpo_losses": 0.6931865215301514, + "epoch": 0.0, + "grad_norm": 30.19681129042229, + "learning_rate": 1.3054830287206264e-08, + "logits/chosen": -2.865060329437256, + "logits/rejected": -2.7412307262420654, + "logps/chosen": -287.0556640625, + "logps/rejected": -190.19590759277344, + "loss": 0.6958, + "positive_losses": 0.018089719116687775, + "rewards/accuracies": 0.0833333358168602, + "rewards/chosen": 5.116145621286705e-05, + "rewards/margins": -7.855256990296766e-05, + "rewards/margins_max": 0.0002732997527346015, + "rewards/margins_min": -0.00045305039384402335, + "rewards/margins_std": 0.00030465322197414935, + "rewards/rejected": 0.0001297140261158347, + "step": 10 + }, + { + "dpo_losses": 0.6932038068771362, + "epoch": 0.01, + "grad_norm": 26.185775226613973, + "learning_rate": 2.610966057441253e-08, + "logits/chosen": -2.9035611152648926, + "logits/rejected": -2.83616042137146, + "logps/chosen": -350.1943359375, + "logps/rejected": -269.9788818359375, + "loss": 0.7027, + "positive_losses": 0.07787647098302841, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.00026953319320455194, + "rewards/margins": -0.00011073592031607404, + "rewards/margins_max": 0.003399646608158946, + "rewards/margins_min": -0.0036606634967029095, + "rewards/margins_std": 0.0031352010555565357, + "rewards/rejected": 0.00038026898982934654, + "step": 20 + }, + { + "dpo_losses": 0.6929337978363037, + "epoch": 0.01, + "grad_norm": 32.03306397302637, + "learning_rate": 3.91644908616188e-08, + "logits/chosen": -2.8334875106811523, + "logits/rejected": -2.848536729812622, + "logps/chosen": -251.3134307861328, + "logps/rejected": -251.9360809326172, + "loss": 0.6993, + "positive_losses": 0.05228748172521591, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0004572083707898855, + "rewards/margins": 0.0004286346083972603, + "rewards/margins_max": 0.003322898643091321, + "rewards/margins_min": -0.00229655927978456, + "rewards/margins_std": 0.0024444316513836384, + "rewards/rejected": 2.8573758754646406e-05, + "step": 30 + }, + { + "dpo_losses": 0.6932997107505798, + "epoch": 0.01, + "grad_norm": 28.33388144767012, + "learning_rate": 5.221932114882506e-08, + "logits/chosen": -2.803170919418335, + "logits/rejected": -2.8049044609069824, + "logps/chosen": -225.2213592529297, + "logps/rejected": -243.0467071533203, + "loss": 0.7001, + "positive_losses": 0.05524027347564697, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.00021809982717968524, + "rewards/margins": -0.0003027426719199866, + "rewards/margins_max": 0.002072124509140849, + "rewards/margins_min": -0.004094444215297699, + "rewards/margins_std": 0.0028748027980327606, + "rewards/rejected": 0.0005208424990996718, + "step": 40 + }, + { + "dpo_losses": 0.69282066822052, + "epoch": 0.01, + "grad_norm": 8.755876584092501, + "learning_rate": 6.527415143603133e-08, + "logits/chosen": -2.954601764678955, + "logits/rejected": -2.915971279144287, + "logps/chosen": -341.208984375, + "logps/rejected": -307.0840148925781, + "loss": 0.6972, + "positive_losses": 0.0233170036226511, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0012528609950095415, + "rewards/margins": 0.0006549887475557625, + "rewards/margins_max": 0.0030162143521010876, + "rewards/margins_min": -0.0022026845254004, + "rewards/margins_std": 0.002286398783326149, + "rewards/rejected": 0.0005978723056614399, + "step": 50 + }, + { + "dpo_losses": 0.6929444074630737, + "epoch": 0.02, + "grad_norm": 16.87014271095211, + "learning_rate": 7.83289817232376e-08, + "logits/chosen": -2.739804744720459, + "logits/rejected": -2.6878609657287598, + "logps/chosen": -247.22518920898438, + "logps/rejected": -251.5140838623047, + "loss": 0.6955, + "positive_losses": 0.025938797742128372, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0018961990717798471, + "rewards/margins": 0.00040858075954020023, + "rewards/margins_max": 0.004767539910972118, + "rewards/margins_min": -0.002739040181040764, + "rewards/margins_std": 0.003281188430264592, + "rewards/rejected": 0.0014876185450702906, + "step": 60 + }, + { + "dpo_losses": 0.693077802658081, + "epoch": 0.02, + "grad_norm": 1.6246733703996752, + "learning_rate": 9.138381201044386e-08, + "logits/chosen": -2.8502087593078613, + "logits/rejected": -2.812725305557251, + "logps/chosen": -260.4454650878906, + "logps/rejected": -244.32363891601562, + "loss": 0.6952, + "positive_losses": 0.01544876117259264, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.00289685046300292, + "rewards/margins": 0.000141006734338589, + "rewards/margins_max": 0.0029947375878691673, + "rewards/margins_min": -0.00309961661696434, + "rewards/margins_std": 0.002640590537339449, + "rewards/rejected": 0.0027558435685932636, + "step": 70 + }, + { + "dpo_losses": 0.692978024482727, + "epoch": 0.02, + "grad_norm": 2.4895937824961045, + "learning_rate": 1.0443864229765012e-07, + "logits/chosen": -2.748595952987671, + "logits/rejected": -2.7783217430114746, + "logps/chosen": -296.84954833984375, + "logps/rejected": -234.31381225585938, + "loss": 0.6952, + "positive_losses": 0.01120681781321764, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.003352649509906769, + "rewards/margins": 0.00034075783332809806, + "rewards/margins_max": 0.003386855823919177, + "rewards/margins_min": -0.003298679366707802, + "rewards/margins_std": 0.002919359365478158, + "rewards/rejected": 0.003011892084032297, + "step": 80 + }, + { + "dpo_losses": 0.6929032206535339, + "epoch": 0.02, + "grad_norm": 16.15562665020882, + "learning_rate": 1.174934725848564e-07, + "logits/chosen": -2.96268892288208, + "logits/rejected": -2.9666476249694824, + "logps/chosen": -355.95611572265625, + "logps/rejected": -323.64129638671875, + "loss": 0.6945, + "positive_losses": 0.0033386230934411287, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0049633425660431385, + "rewards/margins": 0.0004909709095954895, + "rewards/margins_max": 0.004024089779704809, + "rewards/margins_min": -0.0036105443723499775, + "rewards/margins_std": 0.003409436671063304, + "rewards/rejected": 0.004472372122108936, + "step": 90 + }, + { + "dpo_losses": 0.6929186582565308, + "epoch": 0.03, + "grad_norm": 19.22387984128968, + "learning_rate": 1.3054830287206266e-07, + "logits/chosen": -2.705064535140991, + "logits/rejected": -2.6813759803771973, + "logps/chosen": -297.81158447265625, + "logps/rejected": -226.8807373046875, + "loss": 0.6943, + "positive_losses": 0.011927795596420765, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004185602534562349, + "rewards/margins": 0.00045963405864313245, + "rewards/margins_max": 0.003522497834637761, + "rewards/margins_min": -0.0030625914223492146, + "rewards/margins_std": 0.0029240392614156008, + "rewards/rejected": 0.0037259687669575214, + "step": 100 + }, + { + "epoch": 0.03, + "eval_dpo_losses": 0.6929485201835632, + "eval_logits/chosen": -2.8206725120544434, + "eval_logits/rejected": -2.782633066177368, + "eval_logps/chosen": -283.98443603515625, + "eval_logps/rejected": -261.6789855957031, + "eval_loss": 0.693632960319519, + "eval_positive_losses": 0.0069915228523314, + "eval_rewards/accuracies": 0.5555555820465088, + "eval_rewards/chosen": 0.005104683805257082, + "eval_rewards/margins": 0.0004005789814982563, + "eval_rewards/margins_max": 0.005383267533034086, + "eval_rewards/margins_min": -0.00492233969271183, + "eval_rewards/margins_std": 0.003428671509027481, + "eval_rewards/rejected": 0.004704104270786047, + "eval_runtime": 388.5975, + "eval_samples_per_second": 5.147, + "eval_steps_per_second": 0.162, + "step": 100 + }, + { + "dpo_losses": 0.6934686899185181, + "epoch": 0.03, + "grad_norm": 11.788573237859406, + "learning_rate": 1.4360313315926893e-07, + "logits/chosen": -2.8273611068725586, + "logits/rejected": -2.814751386642456, + "logps/chosen": -266.32379150390625, + "logps/rejected": -263.38446044921875, + "loss": 0.6939, + "positive_losses": 0.01450958289206028, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.004909469746053219, + "rewards/margins": -0.0006404476007446647, + "rewards/margins_max": 0.002480804454535246, + "rewards/margins_min": -0.003917304333299398, + "rewards/margins_std": 0.0027779233641922474, + "rewards/rejected": 0.005549917463213205, + "step": 110 + }, + { + "dpo_losses": 0.693248450756073, + "epoch": 0.03, + "grad_norm": 10.899515515680617, + "learning_rate": 1.566579634464752e-07, + "logits/chosen": -2.8180179595947266, + "logits/rejected": -2.7959036827087402, + "logps/chosen": -248.500244140625, + "logps/rejected": -239.03329467773438, + "loss": 0.6937, + "positive_losses": 0.02230529859662056, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.005369266960769892, + "rewards/margins": -0.00020054270862601697, + "rewards/margins_max": 0.0017862394452095032, + "rewards/margins_min": -0.0030471840873360634, + "rewards/margins_std": 0.0022475814912468195, + "rewards/rejected": 0.005569809582084417, + "step": 120 + }, + { + "dpo_losses": 0.6927030086517334, + "epoch": 0.03, + "grad_norm": 13.482247442238627, + "learning_rate": 1.6971279373368143e-07, + "logits/chosen": -2.7805299758911133, + "logits/rejected": -2.7365939617156982, + "logps/chosen": -278.9297790527344, + "logps/rejected": -392.56842041015625, + "loss": 0.6933, + "positive_losses": 0.0033142089378088713, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.005917480681091547, + "rewards/margins": 0.0008933775825425982, + "rewards/margins_max": 0.006129108369350433, + "rewards/margins_min": -0.00324707361869514, + "rewards/margins_std": 0.004221538081765175, + "rewards/rejected": 0.005024102982133627, + "step": 130 + }, + { + "dpo_losses": 0.6928731203079224, + "epoch": 0.04, + "grad_norm": 8.023760795580056, + "learning_rate": 1.8276762402088773e-07, + "logits/chosen": -2.7624192237854004, + "logits/rejected": -2.7280631065368652, + "logps/chosen": -233.57473754882812, + "logps/rejected": -216.6000518798828, + "loss": 0.6953, + "positive_losses": 0.006344604305922985, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.00621133903041482, + "rewards/margins": 0.0005498835816979408, + "rewards/margins_max": 0.0035002590157091618, + "rewards/margins_min": -0.0017805719980970025, + "rewards/margins_std": 0.00228001456707716, + "rewards/rejected": 0.005661455448716879, + "step": 140 + }, + { + "dpo_losses": 0.6931334137916565, + "epoch": 0.04, + "grad_norm": 1.8466653566091873, + "learning_rate": 1.95822454308094e-07, + "logits/chosen": -2.7768664360046387, + "logits/rejected": -2.756864070892334, + "logps/chosen": -218.4131622314453, + "logps/rejected": -248.4625244140625, + "loss": 0.6934, + "positive_losses": 0.0017547607421875, + "rewards/accuracies": 0.44999998807907104, + "rewards/chosen": 0.006885058246552944, + "rewards/margins": 3.0700430215802044e-05, + "rewards/margins_max": 0.004112632479518652, + "rewards/margins_min": -0.0041068727150559425, + "rewards/margins_std": 0.003603233490139246, + "rewards/rejected": 0.0068543581292033195, + "step": 150 + }, + { + "dpo_losses": 0.6929692029953003, + "epoch": 0.04, + "grad_norm": 10.628778481407727, + "learning_rate": 2.0887728459530023e-07, + "logits/chosen": -2.8364949226379395, + "logits/rejected": -2.828240156173706, + "logps/chosen": -263.7819519042969, + "logps/rejected": -227.7448272705078, + "loss": 0.6931, + "positive_losses": 0.00244903564453125, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.00650602113455534, + "rewards/margins": 0.00035825843224301934, + "rewards/margins_max": 0.0036457558162510395, + "rewards/margins_min": -0.0027374648489058018, + "rewards/margins_std": 0.0028280240949243307, + "rewards/rejected": 0.006147762760519981, + "step": 160 + }, + { + "dpo_losses": 0.6932560205459595, + "epoch": 0.04, + "grad_norm": 1.8417289130737209, + "learning_rate": 2.2193211488250652e-07, + "logits/chosen": -2.7584662437438965, + "logits/rejected": -2.7368972301483154, + "logps/chosen": -244.76736450195312, + "logps/rejected": -219.04421997070312, + "loss": 0.6933, + "positive_losses": 0.00555496197193861, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.006044226232916117, + "rewards/margins": -0.00021317150094546378, + "rewards/margins_max": 0.0035289092920720577, + "rewards/margins_min": -0.004762549884617329, + "rewards/margins_std": 0.003684843424707651, + "rewards/rejected": 0.006257397588342428, + "step": 170 + }, + { + "dpo_losses": 0.692129373550415, + "epoch": 0.05, + "grad_norm": 8.525853053395064, + "learning_rate": 2.349869451697128e-07, + "logits/chosen": -2.7833104133605957, + "logits/rejected": -2.7295079231262207, + "logps/chosen": -312.50042724609375, + "logps/rejected": -244.234375, + "loss": 0.6931, + "positive_losses": 0.012158965691924095, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.00696224719285965, + "rewards/margins": 0.0020412951707839966, + "rewards/margins_max": 0.006969262845814228, + "rewards/margins_min": -0.0024727259296923876, + "rewards/margins_std": 0.004140241537243128, + "rewards/rejected": 0.00492095248773694, + "step": 180 + }, + { + "dpo_losses": 0.6932461857795715, + "epoch": 0.05, + "grad_norm": 6.851936912963262, + "learning_rate": 2.4804177545691903e-07, + "logits/chosen": -2.8912341594696045, + "logits/rejected": -2.83351993560791, + "logps/chosen": -282.2035217285156, + "logps/rejected": -211.73965454101562, + "loss": 0.6933, + "positive_losses": 0.0056968689896166325, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.007540510501712561, + "rewards/margins": -0.0001929441059473902, + "rewards/margins_max": 0.004584602080285549, + "rewards/margins_min": -0.0037689092569053173, + "rewards/margins_std": 0.0036659010220319033, + "rewards/rejected": 0.007733455393463373, + "step": 190 + }, + { + "dpo_losses": 0.6919293999671936, + "epoch": 0.05, + "grad_norm": 8.245672767261967, + "learning_rate": 2.610966057441253e-07, + "logits/chosen": -2.8154959678649902, + "logits/rejected": -2.7628376483917236, + "logps/chosen": -322.11981201171875, + "logps/rejected": -254.35159301757812, + "loss": 0.6937, + "positive_losses": 0.0044952393509447575, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.009347244165837765, + "rewards/margins": 0.002443180652335286, + "rewards/margins_max": 0.00754980742931366, + "rewards/margins_min": -0.0015672739828005433, + "rewards/margins_std": 0.004176629241555929, + "rewards/rejected": 0.006904063280671835, + "step": 200 + }, + { + "epoch": 0.05, + "eval_dpo_losses": 0.6924603581428528, + "eval_logits/chosen": -2.8224804401397705, + "eval_logits/rejected": -2.784637928009033, + "eval_logps/chosen": -283.6239929199219, + "eval_logps/rejected": -261.4164733886719, + "eval_loss": 0.693173348903656, + "eval_positive_losses": 0.005795312114059925, + "eval_rewards/accuracies": 0.591269850730896, + "eval_rewards/chosen": 0.008708693087100983, + "eval_rewards/margins": 0.001379763358272612, + "eval_rewards/margins_max": 0.008318053558468819, + "eval_rewards/margins_min": -0.005318824201822281, + "eval_rewards/margins_std": 0.004544954281300306, + "eval_rewards/rejected": 0.00732893031090498, + "eval_runtime": 387.8762, + "eval_samples_per_second": 5.156, + "eval_steps_per_second": 0.162, + "step": 200 + }, + { + "dpo_losses": 0.6922547221183777, + "epoch": 0.05, + "grad_norm": 19.29159055932387, + "learning_rate": 2.7415143603133156e-07, + "logits/chosen": -2.798863649368286, + "logits/rejected": -2.7512946128845215, + "logps/chosen": -340.0872802734375, + "logps/rejected": -274.6262512207031, + "loss": 0.6926, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.010198279283940792, + "rewards/margins": 0.0018013219814747572, + "rewards/margins_max": 0.008373035117983818, + "rewards/margins_min": -0.002471528248861432, + "rewards/margins_std": 0.004959521349519491, + "rewards/rejected": 0.008396958000957966, + "step": 210 + }, + { + "dpo_losses": 0.692650318145752, + "epoch": 0.06, + "grad_norm": 1.7940467137152343, + "learning_rate": 2.8720626631853785e-07, + "logits/chosen": -2.7393550872802734, + "logits/rejected": -2.6828718185424805, + "logps/chosen": -265.1138000488281, + "logps/rejected": -295.94427490234375, + "loss": 0.6926, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.009583190083503723, + "rewards/margins": 0.0009993657004088163, + "rewards/margins_max": 0.00544703658670187, + "rewards/margins_min": -0.004009666386991739, + "rewards/margins_std": 0.004203102085739374, + "rewards/rejected": 0.008583825081586838, + "step": 220 + }, + { + "dpo_losses": 0.6927578449249268, + "epoch": 0.06, + "grad_norm": 1.8830337077895423, + "learning_rate": 3.002610966057441e-07, + "logits/chosen": -2.8563461303710938, + "logits/rejected": -2.8284122943878174, + "logps/chosen": -306.5498352050781, + "logps/rejected": -253.4873046875, + "loss": 0.6928, + "positive_losses": 0.018727874383330345, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.01034473441541195, + "rewards/margins": 0.0007884896476753056, + "rewards/margins_max": 0.008461997844278812, + "rewards/margins_min": -0.006560837384313345, + "rewards/margins_std": 0.006800378207117319, + "rewards/rejected": 0.009556243196129799, + "step": 230 + }, + { + "dpo_losses": 0.6917439699172974, + "epoch": 0.06, + "grad_norm": 2.1350279539542942, + "learning_rate": 3.133159268929504e-07, + "logits/chosen": -2.729341506958008, + "logits/rejected": -2.6087653636932373, + "logps/chosen": -274.9852600097656, + "logps/rejected": -224.34011840820312, + "loss": 0.6919, + "positive_losses": 0.0032783509232103825, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.012574483640491962, + "rewards/margins": 0.002816675463691354, + "rewards/margins_max": 0.009632418863475323, + "rewards/margins_min": -0.0025681552942842245, + "rewards/margins_std": 0.005478983279317617, + "rewards/rejected": 0.009757807478308678, + "step": 240 + }, + { + "dpo_losses": 0.6922317743301392, + "epoch": 0.07, + "grad_norm": 1.6556520642278625, + "learning_rate": 3.263707571801567e-07, + "logits/chosen": -2.7859044075012207, + "logits/rejected": -2.8035387992858887, + "logps/chosen": -284.1858215332031, + "logps/rejected": -252.52474975585938, + "loss": 0.6925, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01263434998691082, + "rewards/margins": 0.0018406830495223403, + "rewards/margins_max": 0.007658310234546661, + "rewards/margins_min": -0.0037867389619350433, + "rewards/margins_std": 0.0052383774891495705, + "rewards/rejected": 0.01079366635531187, + "step": 250 + }, + { + "dpo_losses": 0.6910029649734497, + "epoch": 0.07, + "grad_norm": 3.2893183737781766, + "learning_rate": 3.3942558746736286e-07, + "logits/chosen": -2.8234846591949463, + "logits/rejected": -2.789393901824951, + "logps/chosen": -349.8510437011719, + "logps/rejected": -324.61932373046875, + "loss": 0.6916, + "positive_losses": 0.0, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.012757892720401287, + "rewards/margins": 0.0043033878318965435, + "rewards/margins_max": 0.009838690981268883, + "rewards/margins_min": -0.002494791056960821, + "rewards/margins_std": 0.005606819875538349, + "rewards/rejected": 0.008454503491520882, + "step": 260 + }, + { + "dpo_losses": 0.6918529272079468, + "epoch": 0.07, + "grad_norm": 1.7905826690090754, + "learning_rate": 3.5248041775456916e-07, + "logits/chosen": -2.8090696334838867, + "logits/rejected": -2.7536022663116455, + "logps/chosen": -324.2267761230469, + "logps/rejected": -298.77349853515625, + "loss": 0.6916, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.015475763007998466, + "rewards/margins": 0.002604722511023283, + "rewards/margins_max": 0.011933142319321632, + "rewards/margins_min": -0.004363273270428181, + "rewards/margins_std": 0.007273535244166851, + "rewards/rejected": 0.012871041893959045, + "step": 270 + }, + { + "dpo_losses": 0.6923967003822327, + "epoch": 0.07, + "grad_norm": 9.155954193440616, + "learning_rate": 3.6553524804177545e-07, + "logits/chosen": -2.903094530105591, + "logits/rejected": -2.8605706691741943, + "logps/chosen": -276.69207763671875, + "logps/rejected": -249.9399871826172, + "loss": 0.6923, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.015179550275206566, + "rewards/margins": 0.001524943159893155, + "rewards/margins_max": 0.012009905651211739, + "rewards/margins_min": -0.010082701221108437, + "rewards/margins_std": 0.009722010232508183, + "rewards/rejected": 0.013654607348144054, + "step": 280 + }, + { + "dpo_losses": 0.6904204487800598, + "epoch": 0.08, + "grad_norm": 10.5627611116718, + "learning_rate": 3.785900783289817e-07, + "logits/chosen": -2.8307137489318848, + "logits/rejected": -2.7851150035858154, + "logps/chosen": -299.5377502441406, + "logps/rejected": -239.16635131835938, + "loss": 0.6916, + "positive_losses": 0.015967559069395065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.015545010566711426, + "rewards/margins": 0.005487080663442612, + "rewards/margins_max": 0.015951601788401604, + "rewards/margins_min": -0.004018872976303101, + "rewards/margins_std": 0.00920058973133564, + "rewards/rejected": 0.01005792897194624, + "step": 290 + }, + { + "dpo_losses": 0.6916751861572266, + "epoch": 0.08, + "grad_norm": 1.8403639655253654, + "learning_rate": 3.91644908616188e-07, + "logits/chosen": -2.8961424827575684, + "logits/rejected": -2.8574767112731934, + "logps/chosen": -295.5889587402344, + "logps/rejected": -239.2697296142578, + "loss": 0.6932, + "positive_losses": 0.01744537428021431, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01415463350713253, + "rewards/margins": 0.0029651448130607605, + "rewards/margins_max": 0.013198497705161572, + "rewards/margins_min": -0.0035127296578139067, + "rewards/margins_std": 0.007742941379547119, + "rewards/rejected": 0.01118948683142662, + "step": 300 + }, + { + "epoch": 0.08, + "eval_dpo_losses": 0.6907697319984436, + "eval_logits/chosen": -2.8172719478607178, + "eval_logits/rejected": -2.779116153717041, + "eval_logps/chosen": -282.73956298828125, + "eval_logps/rejected": -260.8730163574219, + "eval_loss": 0.6918479204177856, + "eval_positive_losses": 0.011128379963338375, + "eval_rewards/accuracies": 0.670634925365448, + "eval_rewards/chosen": 0.017553498968482018, + "eval_rewards/margins": 0.004790027160197496, + "eval_rewards/margins_max": 0.021332116797566414, + "eval_rewards/margins_min": -0.010061729699373245, + "eval_rewards/margins_std": 0.010371250100433826, + "eval_rewards/rejected": 0.012763473205268383, + "eval_runtime": 401.6615, + "eval_samples_per_second": 4.979, + "eval_steps_per_second": 0.157, + "step": 300 + }, + { + "dpo_losses": 0.6913976669311523, + "epoch": 0.08, + "grad_norm": 2.2550175140445052, + "learning_rate": 4.046997389033943e-07, + "logits/chosen": -2.865398406982422, + "logits/rejected": -2.8224949836730957, + "logps/chosen": -286.3303527832031, + "logps/rejected": -305.35626220703125, + "loss": 0.6913, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.0183410681784153, + "rewards/margins": 0.0035322513431310654, + "rewards/margins_max": 0.01414328534156084, + "rewards/margins_min": -0.0053473422303795815, + "rewards/margins_std": 0.00869253184646368, + "rewards/rejected": 0.014808815903961658, + "step": 310 + }, + { + "dpo_losses": 0.6903918981552124, + "epoch": 0.08, + "grad_norm": 9.72644405675876, + "learning_rate": 4.1775456919060046e-07, + "logits/chosen": -2.8780643939971924, + "logits/rejected": -2.8514111042022705, + "logps/chosen": -307.68963623046875, + "logps/rejected": -265.0461730957031, + "loss": 0.6915, + "positive_losses": 0.0016929625999182463, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.020931031554937363, + "rewards/margins": 0.0055774180218577385, + "rewards/margins_max": 0.02189483866095543, + "rewards/margins_min": -0.009504149667918682, + "rewards/margins_std": 0.014499841257929802, + "rewards/rejected": 0.015353617258369923, + "step": 320 + }, + { + "dpo_losses": 0.6896374225616455, + "epoch": 0.09, + "grad_norm": 2.2611511256087677, + "learning_rate": 4.3080939947780675e-07, + "logits/chosen": -2.8339321613311768, + "logits/rejected": -2.773601770401001, + "logps/chosen": -285.16424560546875, + "logps/rejected": -290.3405456542969, + "loss": 0.6906, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024154286831617355, + "rewards/margins": 0.007089032791554928, + "rewards/margins_max": 0.023823006078600883, + "rewards/margins_min": -0.0061348374001681805, + "rewards/margins_std": 0.013329845853149891, + "rewards/rejected": 0.017065253108739853, + "step": 330 + }, + { + "dpo_losses": 0.6893516778945923, + "epoch": 0.09, + "grad_norm": 5.815296443818852, + "learning_rate": 4.4386422976501305e-07, + "logits/chosen": -2.714614152908325, + "logits/rejected": -2.7669053077697754, + "logps/chosen": -321.5865173339844, + "logps/rejected": -296.74346923828125, + "loss": 0.6921, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.02406420186161995, + "rewards/margins": 0.0076649547554552555, + "rewards/margins_max": 0.025969142094254494, + "rewards/margins_min": -0.004413464106619358, + "rewards/margins_std": 0.013922092504799366, + "rewards/rejected": 0.016399245709180832, + "step": 340 + }, + { + "dpo_losses": 0.6887660026550293, + "epoch": 0.09, + "grad_norm": 3.686826870790532, + "learning_rate": 4.569190600522193e-07, + "logits/chosen": -2.757418632507324, + "logits/rejected": -2.725862979888916, + "logps/chosen": -306.6487731933594, + "logps/rejected": -232.5929718017578, + "loss": 0.6917, + "positive_losses": 0.03159179538488388, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.024827757850289345, + "rewards/margins": 0.008878981694579124, + "rewards/margins_max": 0.031129935756325722, + "rewards/margins_min": -0.011993775144219398, + "rewards/margins_std": 0.019443338736891747, + "rewards/rejected": 0.01594877615571022, + "step": 350 + }, + { + "dpo_losses": 0.6898253560066223, + "epoch": 0.09, + "grad_norm": 2.1252427803924783, + "learning_rate": 4.699738903394256e-07, + "logits/chosen": -2.947758197784424, + "logits/rejected": -2.923774480819702, + "logps/chosen": -374.69000244140625, + "logps/rejected": -284.81695556640625, + "loss": 0.6914, + "positive_losses": 0.04075012356042862, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.024526912719011307, + "rewards/margins": 0.006719623692333698, + "rewards/margins_max": 0.024043092504143715, + "rewards/margins_min": -0.010639860294759274, + "rewards/margins_std": 0.015631282702088356, + "rewards/rejected": 0.017807289958000183, + "step": 360 + }, + { + "dpo_losses": 0.6916292309761047, + "epoch": 0.1, + "grad_norm": 1.8193199496915609, + "learning_rate": 4.830287206266319e-07, + "logits/chosen": -2.6774754524230957, + "logits/rejected": -2.7066006660461426, + "logps/chosen": -211.9786376953125, + "logps/rejected": -250.2374725341797, + "loss": 0.6923, + "positive_losses": 0.0, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.02256864868104458, + "rewards/margins": 0.0030855671502649784, + "rewards/margins_max": 0.016740715131163597, + "rewards/margins_min": -0.009803814813494682, + "rewards/margins_std": 0.01225105207413435, + "rewards/rejected": 0.019483083859086037, + "step": 370 + }, + { + "dpo_losses": 0.6892152428627014, + "epoch": 0.1, + "grad_norm": 1.8605234141835845, + "learning_rate": 4.960835509138381e-07, + "logits/chosen": -2.7890875339508057, + "logits/rejected": -2.6685492992401123, + "logps/chosen": -278.82476806640625, + "logps/rejected": -283.8394470214844, + "loss": 0.691, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.02961309812963009, + "rewards/margins": 0.007972048595547676, + "rewards/margins_max": 0.029571373015642166, + "rewards/margins_min": -0.009612159803509712, + "rewards/margins_std": 0.018037427216768265, + "rewards/rejected": 0.021641049534082413, + "step": 380 + }, + { + "dpo_losses": 0.6870719194412231, + "epoch": 0.1, + "grad_norm": 2.048819630098869, + "learning_rate": 4.999948856244767e-07, + "logits/chosen": -2.91493558883667, + "logits/rejected": -2.873018741607666, + "logps/chosen": -285.544677734375, + "logps/rejected": -257.34625244140625, + "loss": 0.6894, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.03688435256481171, + "rewards/margins": 0.012254970148205757, + "rewards/margins_max": 0.031174445524811745, + "rewards/margins_min": -0.004109539091587067, + "rewards/margins_std": 0.015911713242530823, + "rewards/rejected": 0.0246293805539608, + "step": 390 + }, + { + "dpo_losses": 0.6913290023803711, + "epoch": 0.1, + "grad_norm": 1.6765440869631212, + "learning_rate": 4.999698361256577e-07, + "logits/chosen": -2.7324068546295166, + "logits/rejected": -2.728743076324463, + "logps/chosen": -252.0926971435547, + "logps/rejected": -232.7787628173828, + "loss": 0.6923, + "positive_losses": 0.08732833713293076, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.030203068628907204, + "rewards/margins": 0.0038094078190624714, + "rewards/margins_max": 0.032358523458242416, + "rewards/margins_min": -0.020671313628554344, + "rewards/margins_std": 0.023817040026187897, + "rewards/rejected": 0.02639366313815117, + "step": 400 + }, + { + "epoch": 0.1, + "eval_dpo_losses": 0.6882805824279785, + "eval_logits/chosen": -2.817486047744751, + "eval_logits/rejected": -2.7794790267944336, + "eval_logps/chosen": -280.82073974609375, + "eval_logps/rejected": -259.46270751953125, + "eval_loss": 0.6901962161064148, + "eval_positive_losses": 0.01547150406986475, + "eval_rewards/accuracies": 0.6785714030265808, + "eval_rewards/chosen": 0.03674148768186569, + "eval_rewards/margins": 0.009874720126390457, + "eval_rewards/margins_max": 0.042554907500743866, + "eval_rewards/margins_min": -0.020002564415335655, + "eval_rewards/margins_std": 0.0206435713917017, + "eval_rewards/rejected": 0.026866771280765533, + "eval_runtime": 399.8496, + "eval_samples_per_second": 5.002, + "eval_steps_per_second": 0.158, + "step": 400 + }, + { + "dpo_losses": 0.6877659559249878, + "epoch": 0.11, + "grad_norm": 1.8218651563532247, + "learning_rate": 4.99923914217458e-07, + "logits/chosen": -2.7876436710357666, + "logits/rejected": -2.679112434387207, + "logps/chosen": -315.75225830078125, + "logps/rejected": -291.5887145996094, + "loss": 0.6933, + "positive_losses": 0.06274566799402237, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0355767160654068, + "rewards/margins": 0.01087341457605362, + "rewards/margins_max": 0.030309131368994713, + "rewards/margins_min": -0.007646770682185888, + "rewards/margins_std": 0.016793150454759598, + "rewards/rejected": 0.02470330148935318, + "step": 410 + }, + { + "dpo_losses": 0.6863200664520264, + "epoch": 0.11, + "grad_norm": 4.46313080590029, + "learning_rate": 4.99857123734344e-07, + "logits/chosen": -2.7639195919036865, + "logits/rejected": -2.7428085803985596, + "logps/chosen": -267.6919860839844, + "logps/rejected": -230.65151977539062, + "loss": 0.6869, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.03915037587285042, + "rewards/margins": 0.013787394389510155, + "rewards/margins_max": 0.035141073167324066, + "rewards/margins_min": -0.005410642828792334, + "rewards/margins_std": 0.01859271712601185, + "rewards/rejected": 0.025362977758049965, + "step": 420 + }, + { + "dpo_losses": 0.6852012872695923, + "epoch": 0.11, + "grad_norm": 12.85730423598963, + "learning_rate": 4.997694702533016e-07, + "logits/chosen": -2.793182373046875, + "logits/rejected": -2.6905808448791504, + "logps/chosen": -296.16558837890625, + "logps/rejected": -218.5005645751953, + "loss": 0.6865, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.047698576003313065, + "rewards/margins": 0.01618335209786892, + "rewards/margins_max": 0.04767851531505585, + "rewards/margins_min": -0.011801841668784618, + "rewards/margins_std": 0.025887608528137207, + "rewards/rejected": 0.0315152183175087, + "step": 430 + }, + { + "dpo_losses": 0.6853216886520386, + "epoch": 0.12, + "grad_norm": 1.8741895831606603, + "learning_rate": 4.996609610933712e-07, + "logits/chosen": -2.8489298820495605, + "logits/rejected": -2.844210624694824, + "logps/chosen": -280.28546142578125, + "logps/rejected": -257.695068359375, + "loss": 0.687, + "positive_losses": 0.0074554444290697575, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.04551452770829201, + "rewards/margins": 0.01586158759891987, + "rewards/margins_max": 0.0459897443652153, + "rewards/margins_min": -0.002934148535132408, + "rewards/margins_std": 0.022323202341794968, + "rewards/rejected": 0.029652941972017288, + "step": 440 + }, + { + "dpo_losses": 0.6888092756271362, + "epoch": 0.12, + "grad_norm": 2.0146739993228095, + "learning_rate": 4.995316053150366e-07, + "logits/chosen": -2.8890292644500732, + "logits/rejected": -2.8318934440612793, + "logps/chosen": -281.5171203613281, + "logps/rejected": -230.30618286132812, + "loss": 0.6877, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.04681592434644699, + "rewards/margins": 0.008902020752429962, + "rewards/margins_max": 0.04170208051800728, + "rewards/margins_min": -0.020604267716407776, + "rewards/margins_std": 0.02728102169930935, + "rewards/rejected": 0.037913911044597626, + "step": 450 + }, + { + "dpo_losses": 0.6891074180603027, + "epoch": 0.12, + "grad_norm": 2.0343813894613922, + "learning_rate": 4.99381413719468e-07, + "logits/chosen": -2.7213335037231445, + "logits/rejected": -2.719630241394043, + "logps/chosen": -247.08349609375, + "logps/rejected": -261.12261962890625, + "loss": 0.687, + "positive_losses": 0.0772472396492958, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.04410042613744736, + "rewards/margins": 0.008285664021968842, + "rewards/margins_max": 0.04062087833881378, + "rewards/margins_min": -0.022265169769525528, + "rewards/margins_std": 0.027511686086654663, + "rewards/rejected": 0.03581475839018822, + "step": 460 + }, + { + "dpo_losses": 0.6866526007652283, + "epoch": 0.12, + "grad_norm": 1.3789397065423763, + "learning_rate": 4.992103988476205e-07, + "logits/chosen": -2.767130136489868, + "logits/rejected": -2.8326306343078613, + "logps/chosen": -282.37945556640625, + "logps/rejected": -272.17578125, + "loss": 0.6883, + "positive_losses": 0.0, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.05081998184323311, + "rewards/margins": 0.013341334648430347, + "rewards/margins_max": 0.0426035150885582, + "rewards/margins_min": -0.01956797018647194, + "rewards/margins_std": 0.028744056820869446, + "rewards/rejected": 0.037478648126125336, + "step": 470 + }, + { + "dpo_losses": 0.6860936880111694, + "epoch": 0.13, + "grad_norm": 1.4536725725069872, + "learning_rate": 4.990185749791864e-07, + "logits/chosen": -2.8672947883605957, + "logits/rejected": -2.805577516555786, + "logps/chosen": -270.2125549316406, + "logps/rejected": -237.41305541992188, + "loss": 0.6892, + "positive_losses": 0.008692169561982155, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.05292079970240593, + "rewards/margins": 0.014513748697936535, + "rewards/margins_max": 0.051043324172496796, + "rewards/margins_min": -0.026107680052518845, + "rewards/margins_std": 0.03386848792433739, + "rewards/rejected": 0.03840705007314682, + "step": 480 + }, + { + "dpo_losses": 0.6850025057792664, + "epoch": 0.13, + "grad_norm": 11.915723501018912, + "learning_rate": 4.988059581314039e-07, + "logits/chosen": -2.8255181312561035, + "logits/rejected": -2.832737445831299, + "logps/chosen": -317.77325439453125, + "logps/rejected": -313.11309814453125, + "loss": 0.6886, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.05682548135519028, + "rewards/margins": 0.01671528071165085, + "rewards/margins_max": 0.06309463828802109, + "rewards/margins_min": -0.02339771017432213, + "rewards/margins_std": 0.03948745131492615, + "rewards/rejected": 0.040110208094120026, + "step": 490 + }, + { + "dpo_losses": 0.6825217604637146, + "epoch": 0.13, + "grad_norm": 1.7951523193932593, + "learning_rate": 4.985725660577184e-07, + "logits/chosen": -2.7949891090393066, + "logits/rejected": -2.7831168174743652, + "logps/chosen": -293.61553955078125, + "logps/rejected": -242.59927368164062, + "loss": 0.6931, + "positive_losses": 0.17864075303077698, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.059095822274684906, + "rewards/margins": 0.021753598004579544, + "rewards/margins_max": 0.06915644556283951, + "rewards/margins_min": -0.015057327225804329, + "rewards/margins_std": 0.03711111471056938, + "rewards/rejected": 0.03734221309423447, + "step": 500 + }, + { + "epoch": 0.13, + "eval_dpo_losses": 0.6845101714134216, + "eval_logits/chosen": -2.8117895126342773, + "eval_logits/rejected": -2.773566722869873, + "eval_logps/chosen": -278.5678405761719, + "eval_logps/rejected": -257.99334716796875, + "eval_loss": 0.6883335113525391, + "eval_positive_losses": 0.026538720354437828, + "eval_rewards/accuracies": 0.6865079402923584, + "eval_rewards/chosen": 0.0592704601585865, + "eval_rewards/margins": 0.01771017536520958, + "eval_rewards/margins_max": 0.07632049918174744, + "eval_rewards/margins_min": -0.034054145216941833, + "eval_rewards/margins_std": 0.03624986857175827, + "eval_rewards/rejected": 0.04156028851866722, + "eval_runtime": 388.2124, + "eval_samples_per_second": 5.152, + "eval_steps_per_second": 0.162, + "step": 500 + }, + { + "dpo_losses": 0.6836234331130981, + "epoch": 0.13, + "grad_norm": 1.7674795218094803, + "learning_rate": 4.983184182463008e-07, + "logits/chosen": -2.680546522140503, + "logits/rejected": -2.707136392593384, + "logps/chosen": -286.67022705078125, + "logps/rejected": -249.7257080078125, + "loss": 0.6859, + "positive_losses": 0.023099135607481003, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.057285137474536896, + "rewards/margins": 0.01968751661479473, + "rewards/margins_max": 0.07216949760913849, + "rewards/margins_min": -0.025082409381866455, + "rewards/margins_std": 0.04353880137205124, + "rewards/rejected": 0.037597618997097015, + "step": 510 + }, + { + "dpo_losses": 0.6839173436164856, + "epoch": 0.14, + "grad_norm": 1.7000138450507565, + "learning_rate": 4.980435359184203e-07, + "logits/chosen": -2.7557244300842285, + "logits/rejected": -2.666841745376587, + "logps/chosen": -268.85308837890625, + "logps/rejected": -226.5156707763672, + "loss": 0.6881, + "positive_losses": 0.038689423352479935, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.05459153652191162, + "rewards/margins": 0.018840208649635315, + "rewards/margins_max": 0.05965876579284668, + "rewards/margins_min": -0.014995383098721504, + "rewards/margins_std": 0.03378991782665253, + "rewards/rejected": 0.035751327872276306, + "step": 520 + }, + { + "dpo_losses": 0.6827250719070435, + "epoch": 0.14, + "grad_norm": 2.027310115704259, + "learning_rate": 4.977479420266723e-07, + "logits/chosen": -2.8694539070129395, + "logits/rejected": -2.8462345600128174, + "logps/chosen": -320.22021484375, + "logps/rejected": -287.8216552734375, + "loss": 0.6863, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.06608148664236069, + "rewards/margins": 0.021329481154680252, + "rewards/margins_max": 0.06904635578393936, + "rewards/margins_min": -0.013772931881248951, + "rewards/margins_std": 0.038150329142808914, + "rewards/rejected": 0.044752009212970734, + "step": 530 + }, + { + "dpo_losses": 0.6873120069503784, + "epoch": 0.14, + "grad_norm": 1.9903594362387926, + "learning_rate": 4.974316612530614e-07, + "logits/chosen": -2.791822910308838, + "logits/rejected": -2.7837400436401367, + "logps/chosen": -282.8401794433594, + "logps/rejected": -246.26394653320312, + "loss": 0.6846, + "positive_losses": 0.026886368170380592, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.061132561415433884, + "rewards/margins": 0.012306076474487782, + "rewards/margins_max": 0.06637242436408997, + "rewards/margins_min": -0.038782063871622086, + "rewards/margins_std": 0.04558960720896721, + "rewards/rejected": 0.04882648214697838, + "step": 540 + }, + { + "dpo_losses": 0.6811521053314209, + "epoch": 0.14, + "grad_norm": 1.7338383849703332, + "learning_rate": 4.970947200069415e-07, + "logits/chosen": -2.896639347076416, + "logits/rejected": -2.8555636405944824, + "logps/chosen": -291.7834167480469, + "logps/rejected": -262.4410705566406, + "loss": 0.686, + "positive_losses": 0.01641998253762722, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.06366805732250214, + "rewards/margins": 0.024676566943526268, + "rewards/margins_max": 0.07537047564983368, + "rewards/margins_min": -0.02262812666594982, + "rewards/margins_std": 0.04408256709575653, + "rewards/rejected": 0.038991499692201614, + "step": 550 + }, + { + "dpo_losses": 0.6864355802536011, + "epoch": 0.15, + "grad_norm": 8.223314997365627, + "learning_rate": 4.967371464228095e-07, + "logits/chosen": -2.8287415504455566, + "logits/rejected": -2.8781867027282715, + "logps/chosen": -292.7107238769531, + "logps/rejected": -259.1818542480469, + "loss": 0.6862, + "positive_losses": 0.03968963772058487, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.07235224545001984, + "rewards/margins": 0.014133909717202187, + "rewards/margins_max": 0.06725303828716278, + "rewards/margins_min": -0.04378097131848335, + "rewards/margins_std": 0.04836827144026756, + "rewards/rejected": 0.0582183413207531, + "step": 560 + }, + { + "dpo_losses": 0.6787232756614685, + "epoch": 0.15, + "grad_norm": 1.704103031540377, + "learning_rate": 4.963589703579569e-07, + "logits/chosen": -2.806661367416382, + "logits/rejected": -2.766479969024658, + "logps/chosen": -265.540771484375, + "logps/rejected": -223.92953491210938, + "loss": 0.6854, + "positive_losses": 0.04793090745806694, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07533114403486252, + "rewards/margins": 0.0297311432659626, + "rewards/margins_max": 0.08528684079647064, + "rewards/margins_min": -0.022052111104130745, + "rewards/margins_std": 0.04731011018157005, + "rewards/rejected": 0.045600004494190216, + "step": 570 + }, + { + "dpo_losses": 0.673004686832428, + "epoch": 0.15, + "grad_norm": 9.335852918680665, + "learning_rate": 4.959602233899761e-07, + "logits/chosen": -2.9390029907226562, + "logits/rejected": -2.87630033493042, + "logps/chosen": -360.3851623535156, + "logps/rejected": -260.69146728515625, + "loss": 0.6863, + "positive_losses": 0.02658386155962944, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09925512969493866, + "rewards/margins": 0.04134751111268997, + "rewards/margins_max": 0.08122013509273529, + "rewards/margins_min": -0.0026921990793198347, + "rewards/margins_std": 0.03794340789318085, + "rewards/rejected": 0.05790762975811958, + "step": 580 + }, + { + "dpo_losses": 0.6868249177932739, + "epoch": 0.15, + "grad_norm": 1.5910712524849442, + "learning_rate": 4.955409388141243e-07, + "logits/chosen": -2.8337204456329346, + "logits/rejected": -2.7996432781219482, + "logps/chosen": -251.7097625732422, + "logps/rejected": -233.59188842773438, + "loss": 0.684, + "positive_losses": 0.04377937316894531, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.07259367406368256, + "rewards/margins": 0.013290634378790855, + "rewards/margins_max": 0.06030074879527092, + "rewards/margins_min": -0.041350338608026505, + "rewards/margins_std": 0.04475581645965576, + "rewards/rejected": 0.05930304527282715, + "step": 590 + }, + { + "dpo_losses": 0.6834608912467957, + "epoch": 0.16, + "grad_norm": 1.6366856100510174, + "learning_rate": 4.951011516405429e-07, + "logits/chosen": -2.904270648956299, + "logits/rejected": -2.8730576038360596, + "logps/chosen": -372.6429748535156, + "logps/rejected": -296.50006103515625, + "loss": 0.6831, + "positive_losses": 0.017871856689453125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08529319614171982, + "rewards/margins": 0.02043917588889599, + "rewards/margins_max": 0.08833781629800797, + "rewards/margins_min": -0.039192624390125275, + "rewards/margins_std": 0.057125210762023926, + "rewards/rejected": 0.06485401839017868, + "step": 600 + }, + { + "epoch": 0.16, + "eval_dpo_losses": 0.68129962682724, + "eval_logits/chosen": -2.8138515949249268, + "eval_logits/rejected": -2.7760884761810303, + "eval_logps/chosen": -276.13824462890625, + "eval_logps/rejected": -256.2458190917969, + "eval_loss": 0.6870078444480896, + "eval_positive_losses": 0.03946831822395325, + "eval_rewards/accuracies": 0.6964285969734192, + "eval_rewards/chosen": 0.08356665074825287, + "eval_rewards/margins": 0.024530887603759766, + "eval_rewards/margins_max": 0.10627257823944092, + "eval_rewards/margins_min": -0.047064412385225296, + "eval_rewards/margins_std": 0.05019204691052437, + "eval_rewards/rejected": 0.0590357705950737, + "eval_runtime": 388.6656, + "eval_samples_per_second": 5.146, + "eval_steps_per_second": 0.162, + "step": 600 + }, + { + "dpo_losses": 0.6777583360671997, + "epoch": 0.16, + "grad_norm": 8.536802615813908, + "learning_rate": 4.946408985913344e-07, + "logits/chosen": -2.8229687213897705, + "logits/rejected": -2.768874168395996, + "logps/chosen": -334.3592834472656, + "logps/rejected": -264.8375549316406, + "loss": 0.6821, + "positive_losses": 0.14571304619312286, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.09296802431344986, + "rewards/margins": 0.03191830962896347, + "rewards/margins_max": 0.0921122133731842, + "rewards/margins_min": -0.02777908369898796, + "rewards/margins_std": 0.056708864867687225, + "rewards/rejected": 0.06104971095919609, + "step": 610 + }, + { + "dpo_losses": 0.6855438947677612, + "epoch": 0.16, + "grad_norm": 7.472399980903684, + "learning_rate": 4.941602180974958e-07, + "logits/chosen": -2.8649790287017822, + "logits/rejected": -2.816356897354126, + "logps/chosen": -259.48626708984375, + "logps/rejected": -225.6002960205078, + "loss": 0.6815, + "positive_losses": 0.0016845703357830644, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.06873434782028198, + "rewards/margins": 0.016284704208374023, + "rewards/margins_max": 0.07119239866733551, + "rewards/margins_min": -0.036721087992191315, + "rewards/margins_std": 0.050042442977428436, + "rewards/rejected": 0.05244964361190796, + "step": 620 + }, + { + "dpo_losses": 0.6780611872673035, + "epoch": 0.16, + "grad_norm": 1.7025178999667026, + "learning_rate": 4.936591502957101e-07, + "logits/chosen": -2.820408582687378, + "logits/rejected": -2.810722589492798, + "logps/chosen": -285.3260498046875, + "logps/rejected": -270.40850830078125, + "loss": 0.6891, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.09266801178455353, + "rewards/margins": 0.03133053332567215, + "rewards/margins_max": 0.09321032464504242, + "rewards/margins_min": -0.029340893030166626, + "rewards/margins_std": 0.053192298859357834, + "rewards/rejected": 0.06133747845888138, + "step": 630 + }, + { + "dpo_losses": 0.6769616007804871, + "epoch": 0.17, + "grad_norm": 9.36334439838426, + "learning_rate": 4.931377370249945e-07, + "logits/chosen": -2.8390297889709473, + "logits/rejected": -2.7957053184509277, + "logps/chosen": -335.83233642578125, + "logps/rejected": -253.6592559814453, + "loss": 0.6861, + "positive_losses": 0.010548400692641735, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.09559468179941177, + "rewards/margins": 0.03336268663406372, + "rewards/margins_max": 0.08360464125871658, + "rewards/margins_min": -0.01670246385037899, + "rewards/margins_std": 0.045782435685396194, + "rewards/rejected": 0.06223199516534805, + "step": 640 + }, + { + "dpo_losses": 0.6821959614753723, + "epoch": 0.17, + "grad_norm": 2.141563710905807, + "learning_rate": 4.925960218232072e-07, + "logits/chosen": -2.853116035461426, + "logits/rejected": -2.851471424102783, + "logps/chosen": -270.49798583984375, + "logps/rejected": -272.83270263671875, + "loss": 0.683, + "positive_losses": 0.07119579613208771, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08185555040836334, + "rewards/margins": 0.022922957316040993, + "rewards/margins_max": 0.09071676433086395, + "rewards/margins_min": -0.03058004379272461, + "rewards/margins_std": 0.053156398236751556, + "rewards/rejected": 0.0589325949549675, + "step": 650 + }, + { + "dpo_losses": 0.6855509877204895, + "epoch": 0.17, + "grad_norm": 1.6887987578473747, + "learning_rate": 4.920340499234116e-07, + "logits/chosen": -2.834197998046875, + "logits/rejected": -2.8090615272521973, + "logps/chosen": -231.2488555908203, + "logps/rejected": -246.72470092773438, + "loss": 0.688, + "positive_losses": 0.10030250251293182, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.08967218548059464, + "rewards/margins": 0.016500327736139297, + "rewards/margins_max": 0.09120874851942062, + "rewards/margins_min": -0.06212595850229263, + "rewards/margins_std": 0.06792709976434708, + "rewards/rejected": 0.07317186146974564, + "step": 660 + }, + { + "dpo_losses": 0.6879802346229553, + "epoch": 0.18, + "grad_norm": 1.766555473439177, + "learning_rate": 4.914518682500995e-07, + "logits/chosen": -2.834306240081787, + "logits/rejected": -2.8038411140441895, + "logps/chosen": -239.792724609375, + "logps/rejected": -240.0399169921875, + "loss": 0.6927, + "positive_losses": 0.2406913787126541, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.07756872475147247, + "rewards/margins": 0.011351787485182285, + "rewards/margins_max": 0.0727124959230423, + "rewards/margins_min": -0.05722881481051445, + "rewards/margins_std": 0.05708152800798416, + "rewards/rejected": 0.06621693819761276, + "step": 670 + }, + { + "dpo_losses": 0.6810533404350281, + "epoch": 0.18, + "grad_norm": 1.791549885810375, + "learning_rate": 4.90849525415273e-07, + "logits/chosen": -2.761420488357544, + "logits/rejected": -2.7136807441711426, + "logps/chosen": -319.48455810546875, + "logps/rejected": -283.24859619140625, + "loss": 0.6814, + "positive_losses": 0.04459686204791069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.09917163103818893, + "rewards/margins": 0.02512495219707489, + "rewards/margins_max": 0.07780589163303375, + "rewards/margins_min": -0.030969727784395218, + "rewards/margins_std": 0.048123858869075775, + "rewards/rejected": 0.07404667884111404, + "step": 680 + }, + { + "dpo_losses": 0.6744126081466675, + "epoch": 0.18, + "grad_norm": 1.7544814358215872, + "learning_rate": 4.902270717143858e-07, + "logits/chosen": -2.786694288253784, + "logits/rejected": -2.731166362762451, + "logps/chosen": -276.16217041015625, + "logps/rejected": -235.80343627929688, + "loss": 0.6849, + "positive_losses": 0.00790252722799778, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.08214370906352997, + "rewards/margins": 0.03874465078115463, + "rewards/margins_max": 0.09999613463878632, + "rewards/margins_min": -0.015251509845256805, + "rewards/margins_std": 0.04996136948466301, + "rewards/rejected": 0.043399058282375336, + "step": 690 + }, + { + "dpo_losses": 0.6797881126403809, + "epoch": 0.18, + "grad_norm": 8.765541878644441, + "learning_rate": 4.895845591221426e-07, + "logits/chosen": -2.7817792892456055, + "logits/rejected": -2.749866008758545, + "logps/chosen": -264.08209228515625, + "logps/rejected": -254.13796997070312, + "loss": 0.6843, + "positive_losses": 0.13192901015281677, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.08249294757843018, + "rewards/margins": 0.028021205216646194, + "rewards/margins_max": 0.1065501719713211, + "rewards/margins_min": -0.030667319893836975, + "rewards/margins_std": 0.060979198664426804, + "rewards/rejected": 0.05447175353765488, + "step": 700 + }, + { + "epoch": 0.18, + "eval_dpo_losses": 0.6787125468254089, + "eval_logits/chosen": -2.8047168254852295, + "eval_logits/rejected": -2.766716480255127, + "eval_logps/chosen": -275.483642578125, + "eval_logps/rejected": -256.1453857421875, + "eval_loss": 0.6863225102424622, + "eval_positive_losses": 0.05308011174201965, + "eval_rewards/accuracies": 0.7083333134651184, + "eval_rewards/chosen": 0.09011287987232208, + "eval_rewards/margins": 0.030073018744587898, + "eval_rewards/margins_max": 0.1277952939271927, + "eval_rewards/margins_min": -0.05525188893079758, + "eval_rewards/margins_std": 0.059921521693468094, + "eval_rewards/rejected": 0.060039862990379333, + "eval_runtime": 389.1185, + "eval_samples_per_second": 5.14, + "eval_steps_per_second": 0.162, + "step": 700 + }, + { + "dpo_losses": 0.6902292966842651, + "epoch": 0.19, + "grad_norm": 1.685836395824075, + "learning_rate": 4.8892204128816e-07, + "logits/chosen": -2.836456775665283, + "logits/rejected": -2.8118834495544434, + "logps/chosen": -216.6274871826172, + "logps/rejected": -182.49378967285156, + "loss": 0.6842, + "positive_losses": 0.022132491692900658, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0716724544763565, + "rewards/margins": 0.006178057286888361, + "rewards/margins_max": 0.04306349158287048, + "rewards/margins_min": -0.03043294884264469, + "rewards/margins_std": 0.03296063840389252, + "rewards/rejected": 0.06549438089132309, + "step": 710 + }, + { + "dpo_losses": 0.6763695478439331, + "epoch": 0.19, + "grad_norm": 2.1240272539477205, + "learning_rate": 4.882395735324863e-07, + "logits/chosen": -2.751163959503174, + "logits/rejected": -2.6967310905456543, + "logps/chosen": -358.1401062011719, + "logps/rejected": -294.829833984375, + "loss": 0.6781, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11145804822444916, + "rewards/margins": 0.035284630954265594, + "rewards/margins_max": 0.11927734315395355, + "rewards/margins_min": -0.04303320124745369, + "rewards/margins_std": 0.07272578030824661, + "rewards/rejected": 0.07617342472076416, + "step": 720 + }, + { + "dpo_losses": 0.6721448302268982, + "epoch": 0.19, + "grad_norm": 2.0424881145453653, + "learning_rate": 4.875372128409829e-07, + "logits/chosen": -2.9102444648742676, + "logits/rejected": -2.827521562576294, + "logps/chosen": -305.36785888671875, + "logps/rejected": -248.75381469726562, + "loss": 0.679, + "positive_losses": 0.03110790252685547, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10629504919052124, + "rewards/margins": 0.043546758592128754, + "rewards/margins_max": 0.11803199350833893, + "rewards/margins_min": -0.011555962264537811, + "rewards/margins_std": 0.06165642663836479, + "rewards/rejected": 0.06274829059839249, + "step": 730 + }, + { + "dpo_losses": 0.6869141459465027, + "epoch": 0.19, + "grad_norm": 1.7550163579107814, + "learning_rate": 4.868150178605653e-07, + "logits/chosen": -2.800950050354004, + "logits/rejected": -2.857544422149658, + "logps/chosen": -259.14111328125, + "logps/rejected": -322.3183898925781, + "loss": 0.6882, + "positive_losses": 0.17758464813232422, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.09367353469133377, + "rewards/margins": 0.014143924228847027, + "rewards/margins_max": 0.10260222107172012, + "rewards/margins_min": -0.0663861557841301, + "rewards/margins_std": 0.07564298063516617, + "rewards/rejected": 0.07952960580587387, + "step": 740 + }, + { + "dpo_losses": 0.6848233938217163, + "epoch": 0.2, + "grad_norm": 2.134068604758163, + "learning_rate": 4.860730488943068e-07, + "logits/chosen": -2.7879323959350586, + "logits/rejected": -2.856818914413452, + "logps/chosen": -243.7280731201172, + "logps/rejected": -260.96710205078125, + "loss": 0.6914, + "positive_losses": 0.08630981296300888, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.09625248610973358, + "rewards/margins": 0.017914965748786926, + "rewards/margins_max": 0.09145281463861465, + "rewards/margins_min": -0.04039599746465683, + "rewards/margins_std": 0.058698005974292755, + "rewards/rejected": 0.07833750545978546, + "step": 750 + }, + { + "dpo_losses": 0.664521336555481, + "epoch": 0.2, + "grad_norm": 1.9969820856504785, + "learning_rate": 4.853113678964021e-07, + "logits/chosen": -2.883469820022583, + "logits/rejected": -2.8021795749664307, + "logps/chosen": -285.27667236328125, + "logps/rejected": -236.72909545898438, + "loss": 0.6755, + "positive_losses": 0.0, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.12383918464183807, + "rewards/margins": 0.058944981545209885, + "rewards/margins_max": 0.12066600471735, + "rewards/margins_min": 0.015928596258163452, + "rewards/margins_std": 0.04676266759634018, + "rewards/rejected": 0.06489420682191849, + "step": 760 + }, + { + "dpo_losses": 0.6735109686851501, + "epoch": 0.2, + "grad_norm": 1.87848136681826, + "learning_rate": 4.845300384669957e-07, + "logits/chosen": -2.84818172454834, + "logits/rejected": -2.7825305461883545, + "logps/chosen": -304.6402893066406, + "logps/rejected": -265.74615478515625, + "loss": 0.6755, + "positive_losses": 0.07650699466466904, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10168993473052979, + "rewards/margins": 0.041267722845077515, + "rewards/margins_max": 0.14881454408168793, + "rewards/margins_min": -0.037732165306806564, + "rewards/margins_std": 0.08322665095329285, + "rewards/rejected": 0.06042221933603287, + "step": 770 + }, + { + "dpo_losses": 0.6793437004089355, + "epoch": 0.2, + "grad_norm": 9.001723654254144, + "learning_rate": 4.8372912584687e-07, + "logits/chosen": -2.8889095783233643, + "logits/rejected": -2.825033664703369, + "logps/chosen": -301.7953796386719, + "logps/rejected": -273.88629150390625, + "loss": 0.6872, + "positive_losses": 0.2866264283657074, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10100637376308441, + "rewards/margins": 0.02894478105008602, + "rewards/margins_max": 0.10271352529525757, + "rewards/margins_min": -0.028590435162186623, + "rewards/margins_std": 0.06048471853137016, + "rewards/rejected": 0.07206159085035324, + "step": 780 + }, + { + "dpo_losses": 0.6845918893814087, + "epoch": 0.21, + "grad_norm": 1.9355510630788444, + "learning_rate": 4.829086969119983e-07, + "logits/chosen": -2.9074501991271973, + "logits/rejected": -2.8751301765441895, + "logps/chosen": -280.39056396484375, + "logps/rejected": -259.8931579589844, + "loss": 0.6946, + "positive_losses": 0.09826965630054474, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.08458300679922104, + "rewards/margins": 0.0185464508831501, + "rewards/margins_max": 0.0906822457909584, + "rewards/margins_min": -0.058589279651641846, + "rewards/margins_std": 0.06743675470352173, + "rewards/rejected": 0.06603654474020004, + "step": 790 + }, + { + "dpo_losses": 0.6688677072525024, + "epoch": 0.21, + "grad_norm": 8.99267152598806, + "learning_rate": 4.820688201679605e-07, + "logits/chosen": -2.7524070739746094, + "logits/rejected": -2.7194697856903076, + "logps/chosen": -343.477294921875, + "logps/rejected": -247.39419555664062, + "loss": 0.678, + "positive_losses": 0.03696594387292862, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.09179838001728058, + "rewards/margins": 0.05082429200410843, + "rewards/margins_max": 0.15096034109592438, + "rewards/margins_min": -0.020338425412774086, + "rewards/margins_std": 0.07840122282505035, + "rewards/rejected": 0.04097408801317215, + "step": 800 + }, + { + "epoch": 0.21, + "eval_dpo_losses": 0.6756108999252319, + "eval_logits/chosen": -2.8027398586273193, + "eval_logits/rejected": -2.7648611068725586, + "eval_logps/chosen": -274.7101745605469, + "eval_logps/rejected": -256.04681396484375, + "eval_loss": 0.6881770491600037, + "eval_positive_losses": 0.09072524309158325, + "eval_rewards/accuracies": 0.7003968358039856, + "eval_rewards/chosen": 0.09784739464521408, + "eval_rewards/margins": 0.03682180121541023, + "eval_rewards/margins_max": 0.15395967662334442, + "eval_rewards/margins_min": -0.06626705825328827, + "eval_rewards/margins_std": 0.07224141061306, + "eval_rewards/rejected": 0.06102558597922325, + "eval_runtime": 388.4571, + "eval_samples_per_second": 5.149, + "eval_steps_per_second": 0.162, + "step": 800 + }, + { + "dpo_losses": 0.6745853424072266, + "epoch": 0.21, + "grad_norm": 28.433617094113924, + "learning_rate": 4.812095657442231e-07, + "logits/chosen": -2.875488519668579, + "logits/rejected": -2.8277158737182617, + "logps/chosen": -298.0208435058594, + "logps/rejected": -262.80096435546875, + "loss": 0.688, + "positive_losses": 0.01885681226849556, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1029839739203453, + "rewards/margins": 0.039221733808517456, + "rewards/margins_max": 0.14284637570381165, + "rewards/margins_min": -0.045712970197200775, + "rewards/margins_std": 0.0840546116232872, + "rewards/rejected": 0.06376224756240845, + "step": 810 + }, + { + "dpo_losses": 0.6649328470230103, + "epoch": 0.21, + "grad_norm": 15.648486742464959, + "learning_rate": 4.803310053882831e-07, + "logits/chosen": -2.736679792404175, + "logits/rejected": -2.6685967445373535, + "logps/chosen": -259.08319091796875, + "logps/rejected": -206.3892364501953, + "loss": 0.6822, + "positive_losses": 0.01941223070025444, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.10811863094568253, + "rewards/margins": 0.05877862125635147, + "rewards/margins_max": 0.1449100822210312, + "rewards/margins_min": -0.013782364316284657, + "rewards/margins_std": 0.06897840648889542, + "rewards/rejected": 0.04934001341462135, + "step": 820 + }, + { + "dpo_losses": 0.6795674562454224, + "epoch": 0.22, + "grad_norm": 1.9645455287668387, + "learning_rate": 4.794332124596775e-07, + "logits/chosen": -2.768758535385132, + "logits/rejected": -2.727670669555664, + "logps/chosen": -312.9381103515625, + "logps/rejected": -294.339599609375, + "loss": 0.7017, + "positive_losses": 0.09621696174144745, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.09689287841320038, + "rewards/margins": 0.029259273782372475, + "rewards/margins_max": 0.11800308525562286, + "rewards/margins_min": -0.03915540874004364, + "rewards/margins_std": 0.07343783229589462, + "rewards/rejected": 0.06763359904289246, + "step": 830 + }, + { + "dpo_losses": 0.6785067319869995, + "epoch": 0.22, + "grad_norm": 18.788130174373894, + "learning_rate": 4.785162619238574e-07, + "logits/chosen": -2.8277535438537598, + "logits/rejected": -2.8106484413146973, + "logps/chosen": -246.59152221679688, + "logps/rejected": -203.9308624267578, + "loss": 0.6877, + "positive_losses": 0.08641128242015839, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.08607254922389984, + "rewards/margins": 0.030666109174489975, + "rewards/margins_max": 0.10051500797271729, + "rewards/margins_min": -0.037806764245033264, + "rewards/margins_std": 0.06067012995481491, + "rewards/rejected": 0.055406440049409866, + "step": 840 + }, + { + "dpo_losses": 0.6822186708450317, + "epoch": 0.22, + "grad_norm": 2.1377190683995804, + "learning_rate": 4.775802303459287e-07, + "logits/chosen": -2.753934144973755, + "logits/rejected": -2.715641736984253, + "logps/chosen": -223.06021118164062, + "logps/rejected": -232.00149536132812, + "loss": 0.6773, + "positive_losses": 0.04858856275677681, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.09215366095304489, + "rewards/margins": 0.023227885365486145, + "rewards/margins_max": 0.07749857753515244, + "rewards/margins_min": -0.0410555824637413, + "rewards/margins_std": 0.055815864354372025, + "rewards/rejected": 0.06892578303813934, + "step": 850 + }, + { + "dpo_losses": 0.6828041076660156, + "epoch": 0.23, + "grad_norm": 1.9617373594360756, + "learning_rate": 4.766251958842589e-07, + "logits/chosen": -2.727834463119507, + "logits/rejected": -2.806042194366455, + "logps/chosen": -139.188720703125, + "logps/rejected": -196.4252166748047, + "loss": 0.6854, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.09413941949605942, + "rewards/margins": 0.021806001663208008, + "rewards/margins_max": 0.08895751088857651, + "rewards/margins_min": -0.04246797785162926, + "rewards/margins_std": 0.05625462532043457, + "rewards/rejected": 0.07233341783285141, + "step": 860 + }, + { + "dpo_losses": 0.6727257966995239, + "epoch": 0.23, + "grad_norm": 1.9573024173567872, + "learning_rate": 4.756512382839506e-07, + "logits/chosen": -2.760953187942505, + "logits/rejected": -2.732851505279541, + "logps/chosen": -296.09368896484375, + "logps/rejected": -229.08847045898438, + "loss": 0.6927, + "positive_losses": 0.15652236342430115, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1098223552107811, + "rewards/margins": 0.0430893711745739, + "rewards/margins_max": 0.1367332488298416, + "rewards/margins_min": -0.034009821712970734, + "rewards/margins_std": 0.07827076315879822, + "rewards/rejected": 0.0667329877614975, + "step": 870 + }, + { + "dpo_losses": 0.6918343305587769, + "epoch": 0.23, + "grad_norm": 1.8900833615079249, + "learning_rate": 4.746584388701831e-07, + "logits/chosen": -2.8361616134643555, + "logits/rejected": -2.8230698108673096, + "logps/chosen": -255.2892608642578, + "logps/rejected": -299.9884033203125, + "loss": 0.6891, + "positive_losses": 0.1362869292497635, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.10028652846813202, + "rewards/margins": 0.003979234956204891, + "rewards/margins_max": 0.07634096592664719, + "rewards/margins_min": -0.07755633443593979, + "rewards/margins_std": 0.06962737441062927, + "rewards/rejected": 0.09630729258060455, + "step": 880 + }, + { + "dpo_losses": 0.6761296987533569, + "epoch": 0.23, + "grad_norm": 15.70216978593494, + "learning_rate": 4.736468805414218e-07, + "logits/chosen": -2.7853493690490723, + "logits/rejected": -2.737541675567627, + "logps/chosen": -294.12774658203125, + "logps/rejected": -244.80654907226562, + "loss": 0.6859, + "positive_losses": 0.10331420600414276, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.10274813324213028, + "rewards/margins": 0.036404069513082504, + "rewards/margins_max": 0.12679249048233032, + "rewards/margins_min": -0.040073007345199585, + "rewards/margins_std": 0.07672830671072006, + "rewards/rejected": 0.06634406000375748, + "step": 890 + }, + { + "dpo_losses": 0.6793249845504761, + "epoch": 0.24, + "grad_norm": 1.811255981772413, + "learning_rate": 4.7261664776249595e-07, + "logits/chosen": -2.782780885696411, + "logits/rejected": -2.7844455242156982, + "logps/chosen": -217.6700439453125, + "logps/rejected": -262.0546569824219, + "loss": 0.6788, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12033214420080185, + "rewards/margins": 0.029326725751161575, + "rewards/margins_max": 0.10667027533054352, + "rewards/margins_min": -0.06298742443323135, + "rewards/margins_std": 0.07723227888345718, + "rewards/rejected": 0.09100539982318878, + "step": 900 + }, + { + "epoch": 0.24, + "eval_dpo_losses": 0.6740882396697998, + "eval_logits/chosen": -2.8025319576263428, + "eval_logits/rejected": -2.7642123699188232, + "eval_logps/chosen": -272.8671875, + "eval_logps/rejected": -254.53573608398438, + "eval_loss": 0.6861064434051514, + "eval_positive_losses": 0.0828118622303009, + "eval_rewards/accuracies": 0.7123016119003296, + "eval_rewards/chosen": 0.11627738922834396, + "eval_rewards/margins": 0.04014097899198532, + "eval_rewards/margins_max": 0.16791382431983948, + "eval_rewards/margins_min": -0.06933987140655518, + "eval_rewards/margins_std": 0.07767920196056366, + "eval_rewards/rejected": 0.07613641023635864, + "eval_runtime": 388.6477, + "eval_samples_per_second": 5.146, + "eval_steps_per_second": 0.162, + "step": 900 + }, + { + "dpo_losses": 0.6753177046775818, + "epoch": 0.24, + "grad_norm": 11.841192366899621, + "learning_rate": 4.7156782655754624e-07, + "logits/chosen": -2.8079895973205566, + "logits/rejected": -2.7867820262908936, + "logps/chosen": -257.70550537109375, + "logps/rejected": -235.9842529296875, + "loss": 0.6895, + "positive_losses": 0.18343773484230042, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11322704702615738, + "rewards/margins": 0.03787517175078392, + "rewards/margins_max": 0.11973077058792114, + "rewards/margins_min": -0.05645657330751419, + "rewards/margins_std": 0.07862423360347748, + "rewards/rejected": 0.07535187900066376, + "step": 910 + }, + { + "dpo_losses": 0.6834079623222351, + "epoch": 0.24, + "grad_norm": 8.095979950408838, + "learning_rate": 4.705005045028414e-07, + "logits/chosen": -2.822742462158203, + "logits/rejected": -2.7415847778320312, + "logps/chosen": -278.8885498046875, + "logps/rejected": -243.7444305419922, + "loss": 0.6852, + "positive_losses": 0.06942252814769745, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11463338136672974, + "rewards/margins": 0.021800417453050613, + "rewards/margins_max": 0.1151091605424881, + "rewards/margins_min": -0.07252896577119827, + "rewards/margins_std": 0.08234294503927231, + "rewards/rejected": 0.09283297508955002, + "step": 920 + }, + { + "dpo_losses": 0.6700179576873779, + "epoch": 0.24, + "grad_norm": 6.0696545100674415, + "learning_rate": 4.694147707194659e-07, + "logits/chosen": -2.7774658203125, + "logits/rejected": -2.689136028289795, + "logps/chosen": -322.48638916015625, + "logps/rejected": -286.92474365234375, + "loss": 0.6783, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.12033887207508087, + "rewards/margins": 0.04955907538533211, + "rewards/margins_max": 0.16666612029075623, + "rewards/margins_min": -0.06129683926701546, + "rewards/margins_std": 0.10141287744045258, + "rewards/rejected": 0.07077980041503906, + "step": 930 + }, + { + "dpo_losses": 0.6771188974380493, + "epoch": 0.25, + "grad_norm": 10.970419210597651, + "learning_rate": 4.683107158658781e-07, + "logits/chosen": -2.7917165756225586, + "logits/rejected": -2.7873263359069824, + "logps/chosen": -296.0466003417969, + "logps/rejected": -269.75799560546875, + "loss": 0.6823, + "positive_losses": 0.11498375236988068, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11963772773742676, + "rewards/margins": 0.03364872187376022, + "rewards/margins_max": 0.1158483475446701, + "rewards/margins_min": -0.04820042848587036, + "rewards/margins_std": 0.07006116211414337, + "rewards/rejected": 0.08598899841308594, + "step": 940 + }, + { + "dpo_losses": 0.6775475740432739, + "epoch": 0.25, + "grad_norm": 8.962956381527691, + "learning_rate": 4.6718843213034066e-07, + "logits/chosen": -2.8136909008026123, + "logits/rejected": -2.8473093509674072, + "logps/chosen": -261.88470458984375, + "logps/rejected": -299.9488220214844, + "loss": 0.6828, + "positive_losses": 0.07727966457605362, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.11626646667718887, + "rewards/margins": 0.03341097757220268, + "rewards/margins_max": 0.12769392132759094, + "rewards/margins_min": -0.06785953044891357, + "rewards/margins_std": 0.08994203805923462, + "rewards/rejected": 0.08285548537969589, + "step": 950 + }, + { + "dpo_losses": 0.6749299764633179, + "epoch": 0.25, + "grad_norm": 1.979032277681068, + "learning_rate": 4.660480132232224e-07, + "logits/chosen": -2.8280081748962402, + "logits/rejected": -2.727102756500244, + "logps/chosen": -358.97918701171875, + "logps/rejected": -292.6557922363281, + "loss": 0.6805, + "positive_losses": 0.10472335666418076, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11151669174432755, + "rewards/margins": 0.038539350032806396, + "rewards/margins_max": 0.1373523324728012, + "rewards/margins_min": -0.047525554895401, + "rewards/margins_std": 0.08352877199649811, + "rewards/rejected": 0.07297734171152115, + "step": 960 + }, + { + "dpo_losses": 0.6668910980224609, + "epoch": 0.25, + "grad_norm": 1.888579531032461, + "learning_rate": 4.64889554369174e-07, + "logits/chosen": -2.7996866703033447, + "logits/rejected": -2.7919023036956787, + "logps/chosen": -318.7244567871094, + "logps/rejected": -272.59552001953125, + "loss": 0.6809, + "positive_losses": 0.05099544674158096, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.12181965261697769, + "rewards/margins": 0.054952751845121384, + "rewards/margins_max": 0.14387230575084686, + "rewards/margins_min": -0.03384255990386009, + "rewards/margins_std": 0.07864460349082947, + "rewards/rejected": 0.0668669119477272, + "step": 970 + }, + { + "dpo_losses": 0.6755861043930054, + "epoch": 0.26, + "grad_norm": 1.7811269884967806, + "learning_rate": 4.637131522991764e-07, + "logits/chosen": -2.8695130348205566, + "logits/rejected": -2.8582377433776855, + "logps/chosen": -295.46136474609375, + "logps/rejected": -235.12045288085938, + "loss": 0.6812, + "positive_losses": 0.07995452731847763, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.10505534708499908, + "rewards/margins": 0.03675197437405586, + "rewards/margins_max": 0.11415354907512665, + "rewards/margins_min": -0.03910742700099945, + "rewards/margins_std": 0.06744858622550964, + "rewards/rejected": 0.06830336898565292, + "step": 980 + }, + { + "dpo_losses": 0.6747775077819824, + "epoch": 0.26, + "grad_norm": 1.7332998156363515, + "learning_rate": 4.6251890524246375e-07, + "logits/chosen": -2.793673038482666, + "logits/rejected": -2.7048346996307373, + "logps/chosen": -328.4063415527344, + "logps/rejected": -283.97613525390625, + "loss": 0.6845, + "positive_losses": 0.3158671259880066, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11113989353179932, + "rewards/margins": 0.039355430752038956, + "rewards/margins_max": 0.12707474827766418, + "rewards/margins_min": -0.06679748743772507, + "rewards/margins_std": 0.08647724986076355, + "rewards/rejected": 0.07178448140621185, + "step": 990 + }, + { + "dpo_losses": 0.6791088581085205, + "epoch": 0.26, + "grad_norm": 11.379057108294523, + "learning_rate": 4.613069129183218e-07, + "logits/chosen": -2.796535015106201, + "logits/rejected": -2.753174304962158, + "logps/chosen": -227.5648193359375, + "logps/rejected": -206.048095703125, + "loss": 0.6883, + "positive_losses": 0.13388271629810333, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.09791069477796555, + "rewards/margins": 0.029800478368997574, + "rewards/margins_max": 0.11369460821151733, + "rewards/margins_min": -0.057328272610902786, + "rewards/margins_std": 0.07600894570350647, + "rewards/rejected": 0.06811021268367767, + "step": 1000 + }, + { + "epoch": 0.26, + "eval_dpo_losses": 0.6726115345954895, + "eval_logits/chosen": -2.803205966949463, + "eval_logits/rejected": -2.7647831439971924, + "eval_logps/chosen": -272.3450622558594, + "eval_logps/rejected": -254.33460998535156, + "eval_loss": 0.6859395503997803, + "eval_positive_losses": 0.09098844230175018, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": 0.12149831652641296, + "eval_rewards/margins": 0.04335065931081772, + "eval_rewards/margins_max": 0.17721711099147797, + "eval_rewards/margins_min": -0.0735059604048729, + "eval_rewards/margins_std": 0.0821399986743927, + "eval_rewards/rejected": 0.07814766466617584, + "eval_runtime": 388.4799, + "eval_samples_per_second": 5.148, + "eval_steps_per_second": 0.162, + "step": 1000 + }, + { + "dpo_losses": 0.6663374900817871, + "epoch": 0.26, + "grad_norm": 1.8741883110446367, + "learning_rate": 4.6007727652776065e-07, + "logits/chosen": -2.8542165756225586, + "logits/rejected": -2.7912869453430176, + "logps/chosen": -271.45343017578125, + "logps/rejected": -273.6263427734375, + "loss": 0.6792, + "positive_losses": 0.1299796998500824, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11328332126140594, + "rewards/margins": 0.05630182474851608, + "rewards/margins_max": 0.16808533668518066, + "rewards/margins_min": -0.02279646322131157, + "rewards/margins_std": 0.08499892055988312, + "rewards/rejected": 0.05698147416114807, + "step": 1010 + }, + { + "dpo_losses": 0.6849120855331421, + "epoch": 0.27, + "grad_norm": 1.6245219187621243, + "learning_rate": 4.588300987450652e-07, + "logits/chosen": -2.9056270122528076, + "logits/rejected": -2.8224825859069824, + "logps/chosen": -256.3388977050781, + "logps/rejected": -264.8789978027344, + "loss": 0.6981, + "positive_losses": 0.16140174865722656, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.10395157337188721, + "rewards/margins": 0.017809074372053146, + "rewards/margins_max": 0.08751221001148224, + "rewards/margins_min": -0.04618312045931816, + "rewards/margins_std": 0.062398601323366165, + "rewards/rejected": 0.08614251017570496, + "step": 1020 + }, + { + "dpo_losses": 0.6712228059768677, + "epoch": 0.27, + "grad_norm": 10.216083517021472, + "learning_rate": 4.5756548370922134e-07, + "logits/chosen": -2.810163736343384, + "logits/rejected": -2.8218376636505127, + "logps/chosen": -289.07733154296875, + "logps/rejected": -286.98638916015625, + "loss": 0.6867, + "positive_losses": 0.20693854987621307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11013605445623398, + "rewards/margins": 0.04705684632062912, + "rewards/margins_max": 0.1697276532649994, + "rewards/margins_min": -0.04310871288180351, + "rewards/margins_std": 0.09699669480323792, + "rewards/rejected": 0.06307922303676605, + "step": 1030 + }, + { + "dpo_losses": 0.6609120965003967, + "epoch": 0.27, + "grad_norm": 1.8241041911257145, + "learning_rate": 4.5628353701522047e-07, + "logits/chosen": -2.743584632873535, + "logits/rejected": -2.688563823699951, + "logps/chosen": -285.1751708984375, + "logps/rejected": -252.52377319335938, + "loss": 0.6819, + "positive_losses": 0.06223297119140625, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.12947912514209747, + "rewards/margins": 0.06809819489717484, + "rewards/margins_max": 0.18549703061580658, + "rewards/margins_min": -0.023058805614709854, + "rewards/margins_std": 0.09334772080183029, + "rewards/rejected": 0.06138092279434204, + "step": 1040 + }, + { + "dpo_losses": 0.6756216287612915, + "epoch": 0.27, + "grad_norm": 11.2175543952917, + "learning_rate": 4.549843657052429e-07, + "logits/chosen": -2.802422046661377, + "logits/rejected": -2.721710205078125, + "logps/chosen": -314.43414306640625, + "logps/rejected": -288.079345703125, + "loss": 0.6818, + "positive_losses": 0.06746216118335724, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12974092364311218, + "rewards/margins": 0.03823871165513992, + "rewards/margins_max": 0.15835562348365784, + "rewards/margins_min": -0.07311789691448212, + "rewards/margins_std": 0.10263626277446747, + "rewards/rejected": 0.09150221198797226, + "step": 1050 + }, + { + "dpo_losses": 0.6694084405899048, + "epoch": 0.28, + "grad_norm": 1.9713422270144096, + "learning_rate": 4.5366807825971907e-07, + "logits/chosen": -2.846350908279419, + "logits/rejected": -2.7868642807006836, + "logps/chosen": -340.27117919921875, + "logps/rejected": -330.462646484375, + "loss": 0.6793, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13603763282299042, + "rewards/margins": 0.050386250019073486, + "rewards/margins_max": 0.14835581183433533, + "rewards/margins_min": -0.02989931032061577, + "rewards/margins_std": 0.0805257111787796, + "rewards/rejected": 0.08565138280391693, + "step": 1060 + }, + { + "dpo_losses": 0.6645206212997437, + "epoch": 0.28, + "grad_norm": 5.586548330348038, + "learning_rate": 4.5233478458827176e-07, + "logits/chosen": -2.840010166168213, + "logits/rejected": -2.842350959777832, + "logps/chosen": -247.47793579101562, + "logps/rejected": -243.4279022216797, + "loss": 0.6846, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.11831434071063995, + "rewards/margins": 0.060487616807222366, + "rewards/margins_max": 0.1816258579492569, + "rewards/margins_min": -0.043870292603969574, + "rewards/margins_std": 0.09767617285251617, + "rewards/rejected": 0.05782672017812729, + "step": 1070 + }, + { + "dpo_losses": 0.6843565702438354, + "epoch": 0.28, + "grad_norm": 1.8777669835140915, + "learning_rate": 4.509845960205389e-07, + "logits/chosen": -2.8549046516418457, + "logits/rejected": -2.7536978721618652, + "logps/chosen": -310.20135498046875, + "logps/rejected": -312.7437744140625, + "loss": 0.6844, + "positive_losses": 0.03246307373046875, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.10435821115970612, + "rewards/margins": 0.020014088600873947, + "rewards/margins_max": 0.10428164899349213, + "rewards/margins_min": -0.08708689361810684, + "rewards/margins_std": 0.0864337682723999, + "rewards/rejected": 0.08434412628412247, + "step": 1080 + }, + { + "dpo_losses": 0.6706165075302124, + "epoch": 0.29, + "grad_norm": 2.032524530654526, + "learning_rate": 4.4961762529687736e-07, + "logits/chosen": -2.8304388523101807, + "logits/rejected": -2.756781578063965, + "logps/chosen": -243.0189666748047, + "logps/rejected": -204.32138061523438, + "loss": 0.6773, + "positive_losses": 0.16189880669116974, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11879110336303711, + "rewards/margins": 0.04820892959833145, + "rewards/margins_max": 0.13754215836524963, + "rewards/margins_min": -0.05152437090873718, + "rewards/margins_std": 0.08577823638916016, + "rewards/rejected": 0.07058216631412506, + "step": 1090 + }, + { + "dpo_losses": 0.6765921711921692, + "epoch": 0.29, + "grad_norm": 8.889417086908429, + "learning_rate": 4.482339865589492e-07, + "logits/chosen": -2.7313170433044434, + "logits/rejected": -2.742469549179077, + "logps/chosen": -277.88238525390625, + "logps/rejected": -260.5584411621094, + "loss": 0.692, + "positive_losses": 0.09104885905981064, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.09932636469602585, + "rewards/margins": 0.034714534878730774, + "rewards/margins_max": 0.10242130607366562, + "rewards/margins_min": -0.04449024051427841, + "rewards/margins_std": 0.06656105071306229, + "rewards/rejected": 0.06461183726787567, + "step": 1100 + }, + { + "epoch": 0.29, + "eval_dpo_losses": 0.6716243028640747, + "eval_logits/chosen": -2.808830738067627, + "eval_logits/rejected": -2.770286798477173, + "eval_logps/chosen": -271.9104919433594, + "eval_logps/rejected": -254.11595153808594, + "eval_loss": 0.685075581073761, + "eval_positive_losses": 0.09168452024459839, + "eval_rewards/accuracies": 0.7023809552192688, + "eval_rewards/chosen": 0.12584403157234192, + "eval_rewards/margins": 0.04550952836871147, + "eval_rewards/margins_max": 0.18452809751033783, + "eval_rewards/margins_min": -0.07608763873577118, + "eval_rewards/margins_std": 0.08527926355600357, + "eval_rewards/rejected": 0.08033448457717896, + "eval_runtime": 388.4713, + "eval_samples_per_second": 5.148, + "eval_steps_per_second": 0.162, + "step": 1100 + }, + { + "dpo_losses": 0.6707266569137573, + "epoch": 0.29, + "grad_norm": 2.1967816307227843, + "learning_rate": 4.4683379534019076e-07, + "logits/chosen": -2.8102149963378906, + "logits/rejected": -2.807123899459839, + "logps/chosen": -261.33843994140625, + "logps/rejected": -248.5167999267578, + "loss": 0.6784, + "positive_losses": 0.09397812187671661, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11437442153692245, + "rewards/margins": 0.04670627787709236, + "rewards/margins_max": 0.13146765530109406, + "rewards/margins_min": -0.022779863327741623, + "rewards/margins_std": 0.06781142950057983, + "rewards/rejected": 0.06766814738512039, + "step": 1110 + }, + { + "dpo_losses": 0.6730443835258484, + "epoch": 0.29, + "grad_norm": 14.967622527345275, + "learning_rate": 4.4541716855616593e-07, + "logits/chosen": -2.8452019691467285, + "logits/rejected": -2.7901253700256348, + "logps/chosen": -257.3014831542969, + "logps/rejected": -195.07559204101562, + "loss": 0.6867, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1417730152606964, + "rewards/margins": 0.04227545112371445, + "rewards/margins_max": 0.11548507213592529, + "rewards/margins_min": -0.0384410098195076, + "rewards/margins_std": 0.0674058347940445, + "rewards/rejected": 0.09949756413698196, + "step": 1120 + }, + { + "dpo_losses": 0.664470911026001, + "epoch": 0.3, + "grad_norm": 2.0627318743208862, + "learning_rate": 4.4398422449480357e-07, + "logits/chosen": -2.9047281742095947, + "logits/rejected": -2.886209726333618, + "logps/chosen": -281.00128173828125, + "logps/rejected": -247.7537384033203, + "loss": 0.6671, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13944607973098755, + "rewards/margins": 0.060615021735429764, + "rewards/margins_max": 0.17492035031318665, + "rewards/margins_min": -0.026431847363710403, + "rewards/margins_std": 0.08961961418390274, + "rewards/rejected": 0.07883106172084808, + "step": 1130 + }, + { + "dpo_losses": 0.6733167171478271, + "epoch": 0.3, + "grad_norm": 16.524324924455595, + "learning_rate": 4.4253508280652036e-07, + "logits/chosen": -2.821608066558838, + "logits/rejected": -2.7850451469421387, + "logps/chosen": -217.66879272460938, + "logps/rejected": -190.71035766601562, + "loss": 0.6828, + "positive_losses": 0.09988708794116974, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11857322603464127, + "rewards/margins": 0.04155484959483147, + "rewards/margins_max": 0.11579285562038422, + "rewards/margins_min": -0.034957364201545715, + "rewards/margins_std": 0.06700852513313293, + "rewards/rejected": 0.0770183727145195, + "step": 1140 + }, + { + "dpo_losses": 0.6570025682449341, + "epoch": 0.3, + "grad_norm": 2.147627244228236, + "learning_rate": 4.410698644942302e-07, + "logits/chosen": -2.8799514770507812, + "logits/rejected": -2.866712808609009, + "logps/chosen": -294.8155822753906, + "logps/rejected": -241.6029052734375, + "loss": 0.6726, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14851152896881104, + "rewards/margins": 0.07623986899852753, + "rewards/margins_max": 0.18512070178985596, + "rewards/margins_min": -0.04003595933318138, + "rewards/margins_std": 0.10335429012775421, + "rewards/rejected": 0.07227165997028351, + "step": 1150 + }, + { + "dpo_losses": 0.6709702610969543, + "epoch": 0.3, + "grad_norm": 2.018872151650389, + "learning_rate": 4.3958869190324057e-07, + "logits/chosen": -2.773653268814087, + "logits/rejected": -2.753624200820923, + "logps/chosen": -179.3298797607422, + "logps/rejected": -187.51380920410156, + "loss": 0.6789, + "positive_losses": 0.06843414157629013, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12052376568317413, + "rewards/margins": 0.04608723521232605, + "rewards/margins_max": 0.13753105700016022, + "rewards/margins_min": -0.018335824832320213, + "rewards/margins_std": 0.0684729665517807, + "rewards/rejected": 0.07443653792142868, + "step": 1160 + }, + { + "dpo_losses": 0.6723008155822754, + "epoch": 0.31, + "grad_norm": 1.9989585549130962, + "learning_rate": 4.380916887110365e-07, + "logits/chosen": -2.788508176803589, + "logits/rejected": -2.764782428741455, + "logps/chosen": -235.6918487548828, + "logps/rejected": -247.7445526123047, + "loss": 0.6773, + "positive_losses": 0.0934390053153038, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.12237273156642914, + "rewards/margins": 0.044223930686712265, + "rewards/margins_max": 0.12869183719158173, + "rewards/margins_min": -0.05642607808113098, + "rewards/margins_std": 0.08192186057567596, + "rewards/rejected": 0.07814880460500717, + "step": 1170 + }, + { + "dpo_losses": 0.6675256490707397, + "epoch": 0.31, + "grad_norm": 1.8728500756068767, + "learning_rate": 4.3657897991695394e-07, + "logits/chosen": -2.7991480827331543, + "logits/rejected": -2.7395639419555664, + "logps/chosen": -255.3263702392578, + "logps/rejected": -236.8482208251953, + "loss": 0.6748, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1421738862991333, + "rewards/margins": 0.053548168390989304, + "rewards/margins_max": 0.13603021204471588, + "rewards/margins_min": -0.020064514130353928, + "rewards/margins_std": 0.07148457318544388, + "rewards/rejected": 0.0886257067322731, + "step": 1180 + }, + { + "dpo_losses": 0.6752602458000183, + "epoch": 0.31, + "grad_norm": 4.44738270802729, + "learning_rate": 4.350506918317416e-07, + "logits/chosen": -2.8661818504333496, + "logits/rejected": -2.868072748184204, + "logps/chosen": -247.4238739013672, + "logps/rejected": -208.9613800048828, + "loss": 0.6928, + "positive_losses": 0.06978531181812286, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1355341225862503, + "rewards/margins": 0.03722946345806122, + "rewards/margins_max": 0.10634903609752655, + "rewards/margins_min": -0.03015676699578762, + "rewards/margins_std": 0.05882125347852707, + "rewards/rejected": 0.09830465167760849, + "step": 1190 + }, + { + "dpo_losses": 0.6878072619438171, + "epoch": 0.31, + "grad_norm": 1.8532475397208668, + "learning_rate": 4.335069520670149e-07, + "logits/chosen": -2.7378358840942383, + "logits/rejected": -2.694200277328491, + "logps/chosen": -217.06900024414062, + "logps/rejected": -233.5520477294922, + "loss": 0.6781, + "positive_losses": 0.0, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.1189335361123085, + "rewards/margins": 0.012583857402205467, + "rewards/margins_max": 0.1028449758887291, + "rewards/margins_min": -0.064508818089962, + "rewards/margins_std": 0.07415572553873062, + "rewards/rejected": 0.10634968429803848, + "step": 1200 + }, + { + "epoch": 0.31, + "eval_dpo_losses": 0.6704273819923401, + "eval_logits/chosen": -2.8056468963623047, + "eval_logits/rejected": -2.767200231552124, + "eval_logps/chosen": -271.125244140625, + "eval_logps/rejected": -253.5946807861328, + "eval_loss": 0.6848036050796509, + "eval_positive_losses": 0.08884982764720917, + "eval_rewards/accuracies": 0.716269850730896, + "eval_rewards/chosen": 0.133696511387825, + "eval_rewards/margins": 0.04814951494336128, + "eval_rewards/margins_max": 0.19328844547271729, + "eval_rewards/margins_min": -0.07867568731307983, + "eval_rewards/margins_std": 0.0892573893070221, + "eval_rewards/rejected": 0.08554700762033463, + "eval_runtime": 409.1152, + "eval_samples_per_second": 4.889, + "eval_steps_per_second": 0.154, + "step": 1200 + }, + { + "dpo_losses": 0.6781080961227417, + "epoch": 0.32, + "grad_norm": 1.9127627572464478, + "learning_rate": 4.319478895245999e-07, + "logits/chosen": -2.8690595626831055, + "logits/rejected": -2.8480398654937744, + "logps/chosen": -293.2466735839844, + "logps/rejected": -253.9585723876953, + "loss": 0.6756, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15057069063186646, + "rewards/margins": 0.033803313970565796, + "rewards/margins_max": 0.16536416113376617, + "rewards/margins_min": -0.08521204441785812, + "rewards/margins_std": 0.11450543254613876, + "rewards/rejected": 0.11676736921072006, + "step": 1210 + }, + { + "dpo_losses": 0.6695367693901062, + "epoch": 0.32, + "grad_norm": 1.971512566824635, + "learning_rate": 4.3037363438577036e-07, + "logits/chosen": -2.86470365524292, + "logits/rejected": -2.800983428955078, + "logps/chosen": -275.1996765136719, + "logps/rejected": -262.8089294433594, + "loss": 0.673, + "positive_losses": 0.01301498431712389, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.11730766296386719, + "rewards/margins": 0.05050047114491463, + "rewards/margins_max": 0.1559460461139679, + "rewards/margins_min": -0.055488090962171555, + "rewards/margins_std": 0.09744496643543243, + "rewards/rejected": 0.06680719554424286, + "step": 1220 + }, + { + "dpo_losses": 0.6828508377075195, + "epoch": 0.32, + "grad_norm": 7.802496036331749, + "learning_rate": 4.2878431810037716e-07, + "logits/chosen": -2.7981674671173096, + "logits/rejected": -2.8011233806610107, + "logps/chosen": -264.6150817871094, + "logps/rejected": -263.75469970703125, + "loss": 0.69, + "positive_losses": 0.25762253999710083, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.10469107329845428, + "rewards/margins": 0.022840503603219986, + "rewards/margins_max": 0.11655263602733612, + "rewards/margins_min": -0.06864559650421143, + "rewards/margins_std": 0.08273427188396454, + "rewards/rejected": 0.0818505734205246, + "step": 1230 + }, + { + "dpo_losses": 0.6616807579994202, + "epoch": 0.32, + "grad_norm": 8.05648555721981, + "learning_rate": 4.271800733758729e-07, + "logits/chosen": -2.6408848762512207, + "logits/rejected": -2.6748952865600586, + "logps/chosen": -241.6726531982422, + "logps/rejected": -206.65603637695312, + "loss": 0.6785, + "positive_losses": 0.004642486572265625, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.14186879992485046, + "rewards/margins": 0.06542383134365082, + "rewards/margins_max": 0.1544228345155716, + "rewards/margins_min": -0.00011541061394382268, + "rewards/margins_std": 0.0685013085603714, + "rewards/rejected": 0.07644496113061905, + "step": 1240 + }, + { + "dpo_losses": 0.6754915118217468, + "epoch": 0.33, + "grad_norm": 74.92735414011027, + "learning_rate": 4.255610341662304e-07, + "logits/chosen": -2.7187135219573975, + "logits/rejected": -2.7095625400543213, + "logps/chosen": -259.538818359375, + "logps/rejected": -245.74658203125, + "loss": 0.6953, + "positive_losses": 0.20729827880859375, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12701359391212463, + "rewards/margins": 0.037803538143634796, + "rewards/margins_max": 0.13912460207939148, + "rewards/margins_min": -0.06255535036325455, + "rewards/margins_std": 0.08858311921358109, + "rewards/rejected": 0.08921004831790924, + "step": 1250 + }, + { + "dpo_losses": 0.6648576259613037, + "epoch": 0.33, + "grad_norm": 11.821818879145804, + "learning_rate": 4.2392733566075757e-07, + "logits/chosen": -2.747102975845337, + "logits/rejected": -2.7102127075195312, + "logps/chosen": -221.44418334960938, + "logps/rejected": -193.30667114257812, + "loss": 0.6788, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1266448050737381, + "rewards/margins": 0.05905945971608162, + "rewards/margins_max": 0.14472545683383942, + "rewards/margins_min": -0.018324170261621475, + "rewards/margins_std": 0.07059869915246964, + "rewards/rejected": 0.06758534163236618, + "step": 1260 + }, + { + "dpo_losses": 0.6734832525253296, + "epoch": 0.33, + "grad_norm": 10.816641130304586, + "learning_rate": 4.2227911427280973e-07, + "logits/chosen": -2.790510416030884, + "logits/rejected": -2.817786455154419, + "logps/chosen": -277.05218505859375, + "logps/rejected": -285.94293212890625, + "loss": 0.6962, + "positive_losses": 0.008324814029037952, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1278315931558609, + "rewards/margins": 0.04247141629457474, + "rewards/margins_max": 0.1652711182832718, + "rewards/margins_min": -0.047576092183589935, + "rewards/margins_std": 0.09299333393573761, + "rewards/rejected": 0.08536018431186676, + "step": 1270 + }, + { + "dpo_losses": 0.6762028932571411, + "epoch": 0.33, + "grad_norm": 2.326050826477611, + "learning_rate": 4.206165076283982e-07, + "logits/chosen": -2.67604398727417, + "logits/rejected": -2.7359328269958496, + "logps/chosen": -212.44430541992188, + "logps/rejected": -236.13589477539062, + "loss": 0.6915, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12036889791488647, + "rewards/margins": 0.03659987077116966, + "rewards/margins_max": 0.12502220273017883, + "rewards/margins_min": -0.06223265454173088, + "rewards/margins_std": 0.0837571993470192, + "rewards/rejected": 0.08376900851726532, + "step": 1280 + }, + { + "dpo_losses": 0.6659013032913208, + "epoch": 0.34, + "grad_norm": 9.185758464565456, + "learning_rate": 4.1893965455469946e-07, + "logits/chosen": -2.783086061477661, + "logits/rejected": -2.7353854179382324, + "logps/chosen": -231.83056640625, + "logps/rejected": -233.4626922607422, + "loss": 0.6817, + "positive_losses": 0.008187675848603249, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13617418706417084, + "rewards/margins": 0.05796939134597778, + "rewards/margins_max": 0.16482461988925934, + "rewards/margins_min": -0.047472696751356125, + "rewards/margins_std": 0.0955493301153183, + "rewards/rejected": 0.07820478826761246, + "step": 1290 + }, + { + "dpo_losses": 0.6711560487747192, + "epoch": 0.34, + "grad_norm": 7.688879370025054, + "learning_rate": 4.172486950684626e-07, + "logits/chosen": -2.8060498237609863, + "logits/rejected": -2.763659954071045, + "logps/chosen": -196.8289794921875, + "logps/rejected": -220.77841186523438, + "loss": 0.6977, + "positive_losses": 0.6763796806335449, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.11435681581497192, + "rewards/margins": 0.04713314771652222, + "rewards/margins_max": 0.1410677134990692, + "rewards/margins_min": -0.06612564623355865, + "rewards/margins_std": 0.09420207142829895, + "rewards/rejected": 0.06722366809844971, + "step": 1300 + }, + { + "epoch": 0.34, + "eval_dpo_losses": 0.6696622967720032, + "eval_logits/chosen": -2.8049025535583496, + "eval_logits/rejected": -2.7666215896606445, + "eval_logps/chosen": -270.84783935546875, + "eval_logps/rejected": -253.48585510253906, + "eval_loss": 0.6843598484992981, + "eval_positive_losses": 0.09547553956508636, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.13647052645683289, + "eval_rewards/margins": 0.04983547329902649, + "eval_rewards/margins_max": 0.19835184514522552, + "eval_rewards/margins_min": -0.0814487487077713, + "eval_rewards/margins_std": 0.09169920533895493, + "eval_rewards/rejected": 0.0866350531578064, + "eval_runtime": 408.2798, + "eval_samples_per_second": 4.899, + "eval_steps_per_second": 0.154, + "step": 1300 + }, + { + "dpo_losses": 0.6705228686332703, + "epoch": 0.34, + "grad_norm": 9.831172209939382, + "learning_rate": 4.155437703643181e-07, + "logits/chosen": -2.797250747680664, + "logits/rejected": -2.759342908859253, + "logps/chosen": -281.2738952636719, + "logps/rejected": -258.2003479003906, + "loss": 0.6783, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14924822747707367, + "rewards/margins": 0.048123955726623535, + "rewards/margins_max": 0.15159112215042114, + "rewards/margins_min": -0.04315425828099251, + "rewards/margins_std": 0.08673207461833954, + "rewards/rejected": 0.10112428665161133, + "step": 1310 + }, + { + "dpo_losses": 0.6825979948043823, + "epoch": 0.35, + "grad_norm": 14.90326780801519, + "learning_rate": 4.138250228029881e-07, + "logits/chosen": -2.8126060962677, + "logits/rejected": -2.771669626235962, + "logps/chosen": -255.3660888671875, + "logps/rejected": -250.10989379882812, + "loss": 0.6879, + "positive_losses": 0.28322991728782654, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.12895464897155762, + "rewards/margins": 0.024239787831902504, + "rewards/margins_max": 0.16116644442081451, + "rewards/margins_min": -0.0756097361445427, + "rewards/margins_std": 0.1079140156507492, + "rewards/rejected": 0.10471485555171967, + "step": 1320 + }, + { + "dpo_losses": 0.6643841862678528, + "epoch": 0.35, + "grad_norm": 1.8995218829857292, + "learning_rate": 4.1209259589939935e-07, + "logits/chosen": -2.8548474311828613, + "logits/rejected": -2.847646474838257, + "logps/chosen": -260.09259033203125, + "logps/rejected": -255.16983032226562, + "loss": 0.6853, + "positive_losses": 0.0995582565665245, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15146958827972412, + "rewards/margins": 0.06084311753511429, + "rewards/margins_max": 0.166192427277565, + "rewards/margins_min": -0.032936133444309235, + "rewards/margins_std": 0.08817348629236221, + "rewards/rejected": 0.09062648564577103, + "step": 1330 + }, + { + "dpo_losses": 0.6773894429206848, + "epoch": 0.35, + "grad_norm": 1.9373582972729462, + "learning_rate": 4.103466343106998e-07, + "logits/chosen": -2.6664412021636963, + "logits/rejected": -2.6139297485351562, + "logps/chosen": -329.4361267089844, + "logps/rejected": -256.4240417480469, + "loss": 0.6795, + "positive_losses": 0.013278961181640625, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.11371631920337677, + "rewards/margins": 0.034226398915052414, + "rewards/margins_max": 0.12248452007770538, + "rewards/margins_min": -0.06852659583091736, + "rewards/margins_std": 0.09230764210224152, + "rewards/rejected": 0.07948991656303406, + "step": 1340 + }, + { + "dpo_losses": 0.6746512055397034, + "epoch": 0.35, + "grad_norm": 2.1153716573785166, + "learning_rate": 4.085872838241796e-07, + "logits/chosen": -2.8163838386535645, + "logits/rejected": -2.8192570209503174, + "logps/chosen": -283.8049621582031, + "logps/rejected": -240.8268585205078, + "loss": 0.6828, + "positive_losses": 0.1023712158203125, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12577365338802338, + "rewards/margins": 0.03993845731019974, + "rewards/margins_max": 0.15859736502170563, + "rewards/margins_min": -0.08044598996639252, + "rewards/margins_std": 0.1035895124077797, + "rewards/rejected": 0.08583520352840424, + "step": 1350 + }, + { + "dpo_losses": 0.6808849573135376, + "epoch": 0.36, + "grad_norm": 2.2392466328970273, + "learning_rate": 4.06814691345098e-07, + "logits/chosen": -2.7731029987335205, + "logits/rejected": -2.790194511413574, + "logps/chosen": -204.32423400878906, + "logps/rejected": -191.43460083007812, + "loss": 0.6798, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12921229004859924, + "rewards/margins": 0.026992127299308777, + "rewards/margins_max": 0.144235298037529, + "rewards/margins_min": -0.0724717378616333, + "rewards/margins_std": 0.09821444004774094, + "rewards/rejected": 0.10222016274929047, + "step": 1360 + }, + { + "dpo_losses": 0.6734897494316101, + "epoch": 0.36, + "grad_norm": 1.873548963474501, + "learning_rate": 4.0502900488441707e-07, + "logits/chosen": -2.881939172744751, + "logits/rejected": -2.820146322250366, + "logps/chosen": -267.6750793457031, + "logps/rejected": -236.7908172607422, + "loss": 0.6721, + "positive_losses": 0.13007812201976776, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13470585644245148, + "rewards/margins": 0.0420747809112072, + "rewards/margins_max": 0.1425054520368576, + "rewards/margins_min": -0.050276100635528564, + "rewards/margins_std": 0.08550667762756348, + "rewards/rejected": 0.09263106435537338, + "step": 1370 + }, + { + "dpo_losses": 0.6570364236831665, + "epoch": 0.36, + "grad_norm": 1.6063052780198595, + "learning_rate": 4.032303735464422e-07, + "logits/chosen": -2.7805044651031494, + "logits/rejected": -2.788447856903076, + "logps/chosen": -271.3067626953125, + "logps/rejected": -262.964599609375, + "loss": 0.6738, + "positive_losses": 0.008975982666015625, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16087505221366882, + "rewards/margins": 0.07718931138515472, + "rewards/margins_max": 0.21250836551189423, + "rewards/margins_min": -0.028435688465833664, + "rewards/margins_std": 0.10952192544937134, + "rewards/rejected": 0.0836857259273529, + "step": 1380 + }, + { + "dpo_losses": 0.666195273399353, + "epoch": 0.36, + "grad_norm": 1.9813531409531817, + "learning_rate": 4.014189475163726e-07, + "logits/chosen": -2.8767552375793457, + "logits/rejected": -2.779618978500366, + "logps/chosen": -256.12518310546875, + "logps/rejected": -206.1197967529297, + "loss": 0.6808, + "positive_losses": 0.04650726169347763, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13240960240364075, + "rewards/margins": 0.05797024443745613, + "rewards/margins_max": 0.17624790966510773, + "rewards/margins_min": -0.05525298789143562, + "rewards/margins_std": 0.10630662739276886, + "rewards/rejected": 0.07443936169147491, + "step": 1390 + }, + { + "dpo_losses": 0.6712676882743835, + "epoch": 0.37, + "grad_norm": 1.8304967900081062, + "learning_rate": 3.995948780477605e-07, + "logits/chosen": -2.7871243953704834, + "logits/rejected": -2.703411817550659, + "logps/chosen": -264.5346374511719, + "logps/rejected": -212.4336395263672, + "loss": 0.6773, + "positive_losses": 0.18939360976219177, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13132144510746002, + "rewards/margins": 0.047726646065711975, + "rewards/margins_max": 0.16912686824798584, + "rewards/margins_min": -0.05112838000059128, + "rewards/margins_std": 0.09666156768798828, + "rewards/rejected": 0.08359479904174805, + "step": 1400 + }, + { + "epoch": 0.37, + "eval_dpo_losses": 0.6683064699172974, + "eval_logits/chosen": -2.8007144927978516, + "eval_logits/rejected": -2.7625784873962402, + "eval_logps/chosen": -270.8923034667969, + "eval_logps/rejected": -253.83428955078125, + "eval_loss": 0.6851855516433716, + "eval_positive_losses": 0.1091412678360939, + "eval_rewards/accuracies": 0.716269850730896, + "eval_rewards/chosen": 0.1360260248184204, + "eval_rewards/margins": 0.0528750941157341, + "eval_rewards/margins_max": 0.20844675600528717, + "eval_rewards/margins_min": -0.0865757167339325, + "eval_rewards/margins_std": 0.0966721922159195, + "eval_rewards/rejected": 0.08315093070268631, + "eval_runtime": 388.262, + "eval_samples_per_second": 5.151, + "eval_steps_per_second": 0.162, + "step": 1400 + }, + { + "dpo_losses": 0.6750961542129517, + "epoch": 0.37, + "grad_norm": 10.10602364271861, + "learning_rate": 3.977583174498816e-07, + "logits/chosen": -2.7661843299865723, + "logits/rejected": -2.786423444747925, + "logps/chosen": -217.5006866455078, + "logps/rejected": -231.273681640625, + "loss": 0.6789, + "positive_losses": 0.026738548651337624, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12990109622478485, + "rewards/margins": 0.038937196135520935, + "rewards/margins_max": 0.1547955423593521, + "rewards/margins_min": -0.06859288364648819, + "rewards/margins_std": 0.09591875970363617, + "rewards/rejected": 0.09096390753984451, + "step": 1410 + }, + { + "dpo_losses": 0.6683656573295593, + "epoch": 0.37, + "grad_norm": 9.593516194728377, + "learning_rate": 3.9590941907501717e-07, + "logits/chosen": -2.807760715484619, + "logits/rejected": -2.7231407165527344, + "logps/chosen": -235.6064910888672, + "logps/rejected": -185.83670043945312, + "loss": 0.6906, + "positive_losses": 0.01294860802590847, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.13492132723331451, + "rewards/margins": 0.05229531601071358, + "rewards/margins_max": 0.15305258333683014, + "rewards/margins_min": -0.03562027961015701, + "rewards/margins_std": 0.08461041748523712, + "rewards/rejected": 0.08262600749731064, + "step": 1420 + }, + { + "dpo_losses": 0.6754311323165894, + "epoch": 0.37, + "grad_norm": 2.4033266615367173, + "learning_rate": 3.9404833730564974e-07, + "logits/chosen": -2.844654083251953, + "logits/rejected": -2.864405632019043, + "logps/chosen": -199.28477478027344, + "logps/rejected": -222.0977783203125, + "loss": 0.6963, + "positive_losses": 0.055442046374082565, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.10395960509777069, + "rewards/margins": 0.03751935809850693, + "rewards/margins_max": 0.12094000726938248, + "rewards/margins_min": -0.02232285775244236, + "rewards/margins_std": 0.06388449668884277, + "rewards/rejected": 0.06644026190042496, + "step": 1430 + }, + { + "dpo_losses": 0.6634265780448914, + "epoch": 0.38, + "grad_norm": 13.354282769438905, + "learning_rate": 3.9217522754157117e-07, + "logits/chosen": -2.815695285797119, + "logits/rejected": -2.7707958221435547, + "logps/chosen": -314.9673156738281, + "logps/rejected": -316.7956237792969, + "loss": 0.669, + "positive_losses": 0.024194717407226562, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1410738229751587, + "rewards/margins": 0.06209304928779602, + "rewards/margins_max": 0.15248478949069977, + "rewards/margins_min": -0.021471448242664337, + "rewards/margins_std": 0.07498336583375931, + "rewards/rejected": 0.07898075878620148, + "step": 1440 + }, + { + "dpo_losses": 0.657206654548645, + "epoch": 0.38, + "grad_norm": 8.045910705268138, + "learning_rate": 3.9029024618690785e-07, + "logits/chosen": -2.8342227935791016, + "logits/rejected": -2.754894495010376, + "logps/chosen": -305.3880920410156, + "logps/rejected": -259.7880859375, + "loss": 0.6751, + "positive_losses": 0.1556953489780426, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14140953123569489, + "rewards/margins": 0.07678183168172836, + "rewards/margins_max": 0.18914107978343964, + "rewards/margins_min": -0.04185623675584793, + "rewards/margins_std": 0.10669572651386261, + "rewards/rejected": 0.06462768465280533, + "step": 1450 + }, + { + "dpo_losses": 0.659722626209259, + "epoch": 0.38, + "grad_norm": 1.8479703027395027, + "learning_rate": 3.883935506370605e-07, + "logits/chosen": -2.7728967666625977, + "logits/rejected": -2.7116799354553223, + "logps/chosen": -286.15850830078125, + "logps/rejected": -233.5446014404297, + "loss": 0.6665, + "positive_losses": 0.06868667900562286, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.13742755353450775, + "rewards/margins": 0.07048223912715912, + "rewards/margins_max": 0.1735135018825531, + "rewards/margins_min": -0.046617552638053894, + "rewards/margins_std": 0.09807271510362625, + "rewards/rejected": 0.06694532930850983, + "step": 1460 + }, + { + "dpo_losses": 0.6679905652999878, + "epoch": 0.38, + "grad_norm": 9.303180306667144, + "learning_rate": 3.864852992655616e-07, + "logits/chosen": -2.8669474124908447, + "logits/rejected": -2.833524227142334, + "logps/chosen": -239.24234008789062, + "logps/rejected": -227.33242797851562, + "loss": 0.6741, + "positive_losses": 0.14037056267261505, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12945117056369781, + "rewards/margins": 0.05349854752421379, + "rewards/margins_max": 0.15357722342014313, + "rewards/margins_min": -0.04545611888170242, + "rewards/margins_std": 0.09062834084033966, + "rewards/rejected": 0.07595261186361313, + "step": 1470 + }, + { + "dpo_losses": 0.6731019616127014, + "epoch": 0.39, + "grad_norm": 1.9833142306372231, + "learning_rate": 3.845656514108515e-07, + "logits/chosen": -2.8072688579559326, + "logits/rejected": -2.7603797912597656, + "logps/chosen": -226.1439208984375, + "logps/rejected": -260.9153747558594, + "loss": 0.6837, + "positive_losses": 0.23621253669261932, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.11956068128347397, + "rewards/margins": 0.04162520542740822, + "rewards/margins_max": 0.1307816356420517, + "rewards/margins_min": -0.028078163042664528, + "rewards/margins_std": 0.06945054233074188, + "rewards/rejected": 0.07793547958135605, + "step": 1480 + }, + { + "dpo_losses": 0.6623013615608215, + "epoch": 0.39, + "grad_norm": 9.77740929612468, + "learning_rate": 3.8263476736297375e-07, + "logits/chosen": -2.6985790729522705, + "logits/rejected": -2.696704864501953, + "logps/chosen": -262.1617431640625, + "logps/rejected": -234.5415802001953, + "loss": 0.681, + "positive_losses": 0.22186526656150818, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.13082730770111084, + "rewards/margins": 0.06536930799484253, + "rewards/margins_max": 0.19230645895004272, + "rewards/margins_min": -0.032393865287303925, + "rewards/margins_std": 0.10202561318874359, + "rewards/rejected": 0.06545799970626831, + "step": 1490 + }, + { + "dpo_losses": 0.6603057980537415, + "epoch": 0.39, + "grad_norm": 1.674629580404987, + "learning_rate": 3.8069280835019055e-07, + "logits/chosen": -2.8541314601898193, + "logits/rejected": -2.781519651412964, + "logps/chosen": -225.85855102539062, + "logps/rejected": -193.4555206298828, + "loss": 0.6802, + "positive_losses": 0.39620399475097656, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.1380954384803772, + "rewards/margins": 0.07008825242519379, + "rewards/margins_max": 0.20899653434753418, + "rewards/margins_min": -0.04256419837474823, + "rewards/margins_std": 0.11214927583932877, + "rewards/rejected": 0.06800718605518341, + "step": 1500 + }, + { + "epoch": 0.39, + "eval_dpo_losses": 0.667341411113739, + "eval_logits/chosen": -2.7934176921844482, + "eval_logits/rejected": -2.7549002170562744, + "eval_logps/chosen": -270.5391540527344, + "eval_logps/rejected": -253.69781494140625, + "eval_loss": 0.6854449510574341, + "eval_positive_losses": 0.12425005435943604, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.13955748081207275, + "eval_rewards/margins": 0.0550418496131897, + "eval_rewards/margins_max": 0.21547779440879822, + "eval_rewards/margins_min": -0.08949919790029526, + "eval_rewards/margins_std": 0.10008959472179413, + "eval_rewards/rejected": 0.08451561629772186, + "eval_runtime": 388.5958, + "eval_samples_per_second": 5.147, + "eval_steps_per_second": 0.162, + "step": 1500 + }, + { + "dpo_losses": 0.6754101514816284, + "epoch": 0.4, + "grad_norm": 2.220484740457583, + "learning_rate": 3.7873993652552073e-07, + "logits/chosen": -2.8614954948425293, + "logits/rejected": -2.7796661853790283, + "logps/chosen": -292.0195617675781, + "logps/rejected": -251.2197265625, + "loss": 0.6926, + "positive_losses": 0.2932479977607727, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12026111036539078, + "rewards/margins": 0.03840740770101547, + "rewards/margins_max": 0.1505320966243744, + "rewards/margins_min": -0.07637099921703339, + "rewards/margins_std": 0.10024436563253403, + "rewards/rejected": 0.08185369521379471, + "step": 1510 + }, + { + "dpo_losses": 0.663870632648468, + "epoch": 0.4, + "grad_norm": 2.203422580398517, + "learning_rate": 3.767763149531995e-07, + "logits/chosen": -2.823636054992676, + "logits/rejected": -2.744633197784424, + "logps/chosen": -271.84796142578125, + "logps/rejected": -210.0604705810547, + "loss": 0.6816, + "positive_losses": 0.22603663802146912, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13827243447303772, + "rewards/margins": 0.06253242492675781, + "rewards/margins_max": 0.19688096642494202, + "rewards/margins_min": -0.054659806191921234, + "rewards/margins_std": 0.110364630818367, + "rewards/rejected": 0.07573998719453812, + "step": 1520 + }, + { + "dpo_losses": 0.6732046604156494, + "epoch": 0.4, + "grad_norm": 1.8093560015219587, + "learning_rate": 3.7480210759506326e-07, + "logits/chosen": -2.7630019187927246, + "logits/rejected": -2.7755062580108643, + "logps/chosen": -263.8840637207031, + "logps/rejected": -230.9619598388672, + "loss": 0.6767, + "positive_losses": 0.01628875732421875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13674938678741455, + "rewards/margins": 0.04170869663357735, + "rewards/margins_max": 0.14020602405071259, + "rewards/margins_min": -0.040444426238536835, + "rewards/margins_std": 0.07792656123638153, + "rewards/rejected": 0.0950406938791275, + "step": 1530 + }, + { + "dpo_losses": 0.6739233136177063, + "epoch": 0.4, + "grad_norm": 2.148534710158725, + "learning_rate": 3.728174792968582e-07, + "logits/chosen": -2.7825496196746826, + "logits/rejected": -2.7479727268218994, + "logps/chosen": -363.34619140625, + "logps/rejected": -367.6847839355469, + "loss": 0.678, + "positive_losses": 0.06572417914867401, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.14340153336524963, + "rewards/margins": 0.042062122374773026, + "rewards/margins_max": 0.15321552753448486, + "rewards/margins_min": -0.05941515043377876, + "rewards/margins_std": 0.0947646051645279, + "rewards/rejected": 0.1013394147157669, + "step": 1540 + }, + { + "dpo_losses": 0.6643571853637695, + "epoch": 0.41, + "grad_norm": 6.019472630615777, + "learning_rate": 3.70822595774476e-07, + "logits/chosen": -2.748858690261841, + "logits/rejected": -2.7964751720428467, + "logps/chosen": -291.16668701171875, + "logps/rejected": -272.07269287109375, + "loss": 0.6811, + "positive_losses": 0.009188842959702015, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16058219969272614, + "rewards/margins": 0.06413300335407257, + "rewards/margins_max": 0.2249596118927002, + "rewards/margins_min": -0.04774421080946922, + "rewards/margins_std": 0.12317200750112534, + "rewards/rejected": 0.09644921123981476, + "step": 1550 + }, + { + "dpo_losses": 0.6761940121650696, + "epoch": 0.41, + "grad_norm": 2.066512395371634, + "learning_rate": 3.688176236001168e-07, + "logits/chosen": -2.7642502784729004, + "logits/rejected": -2.744047164916992, + "logps/chosen": -256.73370361328125, + "logps/rejected": -243.156982421875, + "loss": 0.6854, + "positive_losses": 0.03718109056353569, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.13707491755485535, + "rewards/margins": 0.03892552852630615, + "rewards/margins_max": 0.19296100735664368, + "rewards/margins_min": -0.10505714267492294, + "rewards/margins_std": 0.13237911462783813, + "rewards/rejected": 0.098149374127388, + "step": 1560 + }, + { + "dpo_losses": 0.6572185754776001, + "epoch": 0.41, + "grad_norm": 8.177892274792171, + "learning_rate": 3.6680273018838016e-07, + "logits/chosen": -2.798093318939209, + "logits/rejected": -2.7817108631134033, + "logps/chosen": -345.5523986816406, + "logps/rejected": -253.31076049804688, + "loss": 0.6728, + "positive_losses": 0.03433532640337944, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1384911984205246, + "rewards/margins": 0.07636358588933945, + "rewards/margins_max": 0.19075119495391846, + "rewards/margins_min": -0.03996484354138374, + "rewards/margins_std": 0.10142095386981964, + "rewards/rejected": 0.06212761253118515, + "step": 1570 + }, + { + "dpo_losses": 0.6835813522338867, + "epoch": 0.41, + "grad_norm": 14.303723172028716, + "learning_rate": 3.6477808378228596e-07, + "logits/chosen": -2.8051185607910156, + "logits/rejected": -2.855712413787842, + "logps/chosen": -256.68109130859375, + "logps/rejected": -242.5233612060547, + "loss": 0.68, + "positive_losses": 0.11357422173023224, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.11277903616428375, + "rewards/margins": 0.021566368639469147, + "rewards/margins_max": 0.11864233016967773, + "rewards/margins_min": -0.06020314246416092, + "rewards/margins_std": 0.07883908599615097, + "rewards/rejected": 0.09121266007423401, + "step": 1580 + }, + { + "dpo_losses": 0.6786788105964661, + "epoch": 0.42, + "grad_norm": 1.940205807338258, + "learning_rate": 3.6274385343922674e-07, + "logits/chosen": -2.8355250358581543, + "logits/rejected": -2.888169765472412, + "logps/chosen": -294.1134338378906, + "logps/rejected": -279.9457092285156, + "loss": 0.6724, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13763108849525452, + "rewards/margins": 0.03273003175854683, + "rewards/margins_max": 0.1772867739200592, + "rewards/margins_min": -0.08706100285053253, + "rewards/margins_std": 0.11496637761592865, + "rewards/rejected": 0.1049010381102562, + "step": 1590 + }, + { + "dpo_losses": 0.6733390092849731, + "epoch": 0.42, + "grad_norm": 1.767002412393906, + "learning_rate": 3.6070020901685057e-07, + "logits/chosen": -2.7376842498779297, + "logits/rejected": -2.7593541145324707, + "logps/chosen": -239.187744140625, + "logps/rejected": -196.0511932373047, + "loss": 0.6816, + "positive_losses": 0.295419305562973, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1259082853794098, + "rewards/margins": 0.042100995779037476, + "rewards/margins_max": 0.14133401215076447, + "rewards/margins_min": -0.03318362310528755, + "rewards/margins_std": 0.07885169237852097, + "rewards/rejected": 0.08380730450153351, + "step": 1600 + }, + { + "epoch": 0.42, + "eval_dpo_losses": 0.6668837070465088, + "eval_logits/chosen": -2.795292377471924, + "eval_logits/rejected": -2.7573533058166504, + "eval_logps/chosen": -270.2237548828125, + "eval_logps/rejected": -253.4888458251953, + "eval_loss": 0.6848409175872803, + "eval_positive_losses": 0.12261239439249039, + "eval_rewards/accuracies": 0.726190447807312, + "eval_rewards/chosen": 0.14271163940429688, + "eval_rewards/margins": 0.05610635504126549, + "eval_rewards/margins_max": 0.21962520480155945, + "eval_rewards/margins_min": -0.0916348546743393, + "eval_rewards/margins_std": 0.10248645395040512, + "eval_rewards/rejected": 0.08660528808832169, + "eval_runtime": 389.7, + "eval_samples_per_second": 5.132, + "eval_steps_per_second": 0.162, + "step": 1600 + }, + { + "dpo_losses": 0.663489818572998, + "epoch": 0.42, + "grad_norm": 1.8293309594872051, + "learning_rate": 3.5864732115887863e-07, + "logits/chosen": -2.747380495071411, + "logits/rejected": -2.742300510406494, + "logps/chosen": -261.0233459472656, + "logps/rejected": -240.55416870117188, + "loss": 0.6719, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14141225814819336, + "rewards/margins": 0.0640452578663826, + "rewards/margins_max": 0.1955864131450653, + "rewards/margins_min": -0.028682339936494827, + "rewards/margins_std": 0.1018737331032753, + "rewards/rejected": 0.07736701518297195, + "step": 1610 + }, + { + "dpo_losses": 0.6576683521270752, + "epoch": 0.42, + "grad_norm": 2.427551115305182, + "learning_rate": 3.565853612808562e-07, + "logits/chosen": -2.799431562423706, + "logits/rejected": -2.7337801456451416, + "logps/chosen": -269.6554260253906, + "logps/rejected": -235.99038696289062, + "loss": 0.6778, + "positive_losses": 0.008718108758330345, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16144730150699615, + "rewards/margins": 0.07694243639707565, + "rewards/margins_max": 0.20366080105304718, + "rewards/margins_min": -0.026887202635407448, + "rewards/margins_std": 0.10323099046945572, + "rewards/rejected": 0.0845048725605011, + "step": 1620 + }, + { + "dpo_losses": 0.6518866419792175, + "epoch": 0.43, + "grad_norm": 10.793623306246538, + "learning_rate": 3.5451450155583984e-07, + "logits/chosen": -2.915398359298706, + "logits/rejected": -2.8122076988220215, + "logps/chosen": -282.4895324707031, + "logps/rejected": -249.7183074951172, + "loss": 0.685, + "positive_losses": 0.1295158416032791, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.15124383568763733, + "rewards/margins": 0.08674627542495728, + "rewards/margins_max": 0.17734436690807343, + "rewards/margins_min": -0.00693632522597909, + "rewards/margins_std": 0.08227060735225677, + "rewards/rejected": 0.06449756771326065, + "step": 1630 + }, + { + "dpo_losses": 0.6686679720878601, + "epoch": 0.43, + "grad_norm": 10.090910615538187, + "learning_rate": 3.5243491490002055e-07, + "logits/chosen": -2.876235008239746, + "logits/rejected": -2.8371047973632812, + "logps/chosen": -268.490966796875, + "logps/rejected": -227.34738159179688, + "loss": 0.6765, + "positive_losses": 0.04539031907916069, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14302578568458557, + "rewards/margins": 0.0534796416759491, + "rewards/margins_max": 0.19195103645324707, + "rewards/margins_min": -0.06843677908182144, + "rewards/margins_std": 0.11482664197683334, + "rewards/rejected": 0.08954615145921707, + "step": 1640 + }, + { + "dpo_losses": 0.6530863046646118, + "epoch": 0.43, + "grad_norm": 2.2831072628106055, + "learning_rate": 3.503467749582857e-07, + "logits/chosen": -2.8279144763946533, + "logits/rejected": -2.712979793548584, + "logps/chosen": -374.8084716796875, + "logps/rejected": -279.04193115234375, + "loss": 0.6833, + "positive_losses": 0.15686893463134766, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16030217707157135, + "rewards/margins": 0.08735756576061249, + "rewards/margins_max": 0.21465222537517548, + "rewards/margins_min": -0.06027594953775406, + "rewards/margins_std": 0.12725059688091278, + "rewards/rejected": 0.07294458150863647, + "step": 1650 + }, + { + "dpo_losses": 0.6651492118835449, + "epoch": 0.43, + "grad_norm": 8.591032063098105, + "learning_rate": 3.482502560897194e-07, + "logits/chosen": -2.7746264934539795, + "logits/rejected": -2.7501578330993652, + "logps/chosen": -237.156494140625, + "logps/rejected": -277.9545593261719, + "loss": 0.6731, + "positive_losses": 0.10025139153003693, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1401684582233429, + "rewards/margins": 0.060249678790569305, + "rewards/margins_max": 0.17184853553771973, + "rewards/margins_min": -0.05993221327662468, + "rewards/margins_std": 0.1048416867852211, + "rewards/rejected": 0.079918771982193, + "step": 1660 + }, + { + "dpo_losses": 0.6637318730354309, + "epoch": 0.44, + "grad_norm": 1.7493149233877625, + "learning_rate": 3.4614553335304403e-07, + "logits/chosen": -2.8647806644439697, + "logits/rejected": -2.821565628051758, + "logps/chosen": -248.33609008789062, + "logps/rejected": -221.92294311523438, + "loss": 0.679, + "positive_losses": 0.35750922560691833, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14442947506904602, + "rewards/margins": 0.0623578205704689, + "rewards/margins_max": 0.17120857536792755, + "rewards/margins_min": -0.0359518863260746, + "rewards/margins_std": 0.09716440737247467, + "rewards/rejected": 0.08207164704799652, + "step": 1670 + }, + { + "dpo_losses": 0.6731246709823608, + "epoch": 0.44, + "grad_norm": 8.473047046081899, + "learning_rate": 3.440327824920022e-07, + "logits/chosen": -2.7717459201812744, + "logits/rejected": -2.715242385864258, + "logps/chosen": -297.0379943847656, + "logps/rejected": -245.60391235351562, + "loss": 0.682, + "positive_losses": 0.05773162841796875, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13697683811187744, + "rewards/margins": 0.042325470596551895, + "rewards/margins_max": 0.14347949624061584, + "rewards/margins_min": -0.0518612377345562, + "rewards/margins_std": 0.08395689725875854, + "rewards/rejected": 0.09465137869119644, + "step": 1680 + }, + { + "dpo_losses": 0.6721340417861938, + "epoch": 0.44, + "grad_norm": 6.683373434782019, + "learning_rate": 3.4191217992068287e-07, + "logits/chosen": -2.6870052814483643, + "logits/rejected": -2.6862998008728027, + "logps/chosen": -227.3953094482422, + "logps/rejected": -246.79638671875, + "loss": 0.6696, + "positive_losses": 0.04251289367675781, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.13126316666603088, + "rewards/margins": 0.0451076440513134, + "rewards/margins_max": 0.1354115754365921, + "rewards/margins_min": -0.04070950672030449, + "rewards/margins_std": 0.07816879451274872, + "rewards/rejected": 0.08615552634000778, + "step": 1690 + }, + { + "dpo_losses": 0.6801950931549072, + "epoch": 0.44, + "grad_norm": 1.983917724306998, + "learning_rate": 3.3978390270879056e-07, + "logits/chosen": -2.7492804527282715, + "logits/rejected": -2.744694232940674, + "logps/chosen": -199.28369140625, + "logps/rejected": -252.99630737304688, + "loss": 0.6737, + "positive_losses": 0.18640442192554474, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.11901885271072388, + "rewards/margins": 0.029588323086500168, + "rewards/margins_max": 0.13933536410331726, + "rewards/margins_min": -0.08721192181110382, + "rewards/margins_std": 0.09942667186260223, + "rewards/rejected": 0.0894305482506752, + "step": 1700 + }, + { + "epoch": 0.44, + "eval_dpo_losses": 0.6653831005096436, + "eval_logits/chosen": -2.793133020401001, + "eval_logits/rejected": -2.7549686431884766, + "eval_logps/chosen": -270.14947509765625, + "eval_logps/rejected": -253.75079345703125, + "eval_loss": 0.6862542629241943, + "eval_positive_losses": 0.14278966188430786, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.1434543877840042, + "eval_rewards/margins": 0.059468500316143036, + "eval_rewards/margins_max": 0.23018646240234375, + "eval_rewards/margins_min": -0.0957309901714325, + "eval_rewards/margins_std": 0.10726842284202576, + "eval_rewards/rejected": 0.08398589491844177, + "eval_runtime": 389.1458, + "eval_samples_per_second": 5.139, + "eval_steps_per_second": 0.162, + "step": 1700 + }, + { + "dpo_losses": 0.6800428628921509, + "epoch": 0.45, + "grad_norm": 14.473184981063904, + "learning_rate": 3.376481285668599e-07, + "logits/chosen": -2.860978603363037, + "logits/rejected": -2.842038631439209, + "logps/chosen": -240.4024200439453, + "logps/rejected": -230.64913940429688, + "loss": 0.6958, + "positive_losses": 0.13513031601905823, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.14729043841362, + "rewards/margins": 0.02992558851838112, + "rewards/margins_max": 0.14201180636882782, + "rewards/margins_min": -0.07873591035604477, + "rewards/margins_std": 0.09927816689014435, + "rewards/rejected": 0.11736486107110977, + "step": 1710 + }, + { + "dpo_losses": 0.6766767501831055, + "epoch": 0.45, + "grad_norm": 2.140319802462128, + "learning_rate": 3.355050358314172e-07, + "logits/chosen": -2.800375461578369, + "logits/rejected": -2.7702994346618652, + "logps/chosen": -244.689697265625, + "logps/rejected": -273.8795471191406, + "loss": 0.6794, + "positive_losses": 0.05370616912841797, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14131440222263336, + "rewards/margins": 0.03704925253987312, + "rewards/margins_max": 0.16323471069335938, + "rewards/margins_min": -0.08994705975055695, + "rewards/margins_std": 0.11388404667377472, + "rewards/rejected": 0.10426516830921173, + "step": 1720 + }, + { + "dpo_losses": 0.6766036748886108, + "epoch": 0.45, + "grad_norm": 2.4323905799461247, + "learning_rate": 3.33354803450089e-07, + "logits/chosen": -2.851832866668701, + "logits/rejected": -2.7648284435272217, + "logps/chosen": -265.53814697265625, + "logps/rejected": -291.24462890625, + "loss": 0.6799, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14070968329906464, + "rewards/margins": 0.03625965490937233, + "rewards/margins_max": 0.1697661578655243, + "rewards/margins_min": -0.06259065866470337, + "rewards/margins_std": 0.10244093835353851, + "rewards/rejected": 0.1044500470161438, + "step": 1730 + }, + { + "dpo_losses": 0.654105544090271, + "epoch": 0.46, + "grad_norm": 10.980245712115805, + "learning_rate": 3.311976109666605e-07, + "logits/chosen": -2.7741200923919678, + "logits/rejected": -2.701322078704834, + "logps/chosen": -306.71246337890625, + "logps/rejected": -232.8634490966797, + "loss": 0.6806, + "positive_losses": 0.13347473740577698, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1625155210494995, + "rewards/margins": 0.08344938606023788, + "rewards/margins_max": 0.2080841064453125, + "rewards/margins_min": -0.047725483775138855, + "rewards/margins_std": 0.11776135861873627, + "rewards/rejected": 0.07906611263751984, + "step": 1740 + }, + { + "dpo_losses": 0.6599873900413513, + "epoch": 0.46, + "grad_norm": 12.333233875870272, + "learning_rate": 3.2903363850608317e-07, + "logits/chosen": -2.7754392623901367, + "logits/rejected": -2.762289047241211, + "logps/chosen": -258.095458984375, + "logps/rejected": -257.9560241699219, + "loss": 0.6767, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15608219802379608, + "rewards/margins": 0.07032088190317154, + "rewards/margins_max": 0.18100914359092712, + "rewards/margins_min": -0.04148901253938675, + "rewards/margins_std": 0.10034122318029404, + "rewards/rejected": 0.08576132357120514, + "step": 1750 + }, + { + "dpo_losses": 0.6527787446975708, + "epoch": 0.46, + "grad_norm": 12.099111073526487, + "learning_rate": 3.2686306675943477e-07, + "logits/chosen": -2.6830406188964844, + "logits/rejected": -2.641451597213745, + "logps/chosen": -256.3885803222656, + "logps/rejected": -233.4460906982422, + "loss": 0.6736, + "positive_losses": 0.0111083984375, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.18093226850032806, + "rewards/margins": 0.08572479337453842, + "rewards/margins_max": 0.22395269572734833, + "rewards/margins_min": -0.010747433640062809, + "rewards/margins_std": 0.10372080653905869, + "rewards/rejected": 0.09520746767520905, + "step": 1760 + }, + { + "dpo_losses": 0.6773605942726135, + "epoch": 0.46, + "grad_norm": 11.900600745160016, + "learning_rate": 3.2468607696883145e-07, + "logits/chosen": -2.8585307598114014, + "logits/rejected": -2.8548781871795654, + "logps/chosen": -291.2569885253906, + "logps/rejected": -283.5742492675781, + "loss": 0.6797, + "positive_losses": 0.02425079420208931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14840981364250183, + "rewards/margins": 0.03481290489435196, + "rewards/margins_max": 0.16832002997398376, + "rewards/margins_min": -0.06583338230848312, + "rewards/margins_std": 0.10414840281009674, + "rewards/rejected": 0.11359691619873047, + "step": 1770 + }, + { + "dpo_losses": 0.6759124994277954, + "epoch": 0.47, + "grad_norm": 2.0423300567114206, + "learning_rate": 3.2250285091229435e-07, + "logits/chosen": -2.799861431121826, + "logits/rejected": -2.733640670776367, + "logps/chosen": -268.4593200683594, + "logps/rejected": -248.49301147460938, + "loss": 0.6701, + "positive_losses": 0.1370445191860199, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1430630385875702, + "rewards/margins": 0.03846190497279167, + "rewards/margins_max": 0.14976127445697784, + "rewards/margins_min": -0.09638581424951553, + "rewards/margins_std": 0.10821805894374847, + "rewards/rejected": 0.10460114479064941, + "step": 1780 + }, + { + "dpo_losses": 0.6586912870407104, + "epoch": 0.47, + "grad_norm": 12.508617024992398, + "learning_rate": 3.2031357088857083e-07, + "logits/chosen": -2.816551923751831, + "logits/rejected": -2.7572274208068848, + "logps/chosen": -268.2996826171875, + "logps/rejected": -223.4669647216797, + "loss": 0.6824, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16138462722301483, + "rewards/margins": 0.07290495932102203, + "rewards/margins_max": 0.20781588554382324, + "rewards/margins_min": -0.025771930813789368, + "rewards/margins_std": 0.10554580390453339, + "rewards/rejected": 0.0884796530008316, + "step": 1790 + }, + { + "dpo_losses": 0.6669738292694092, + "epoch": 0.47, + "grad_norm": 2.193588068275639, + "learning_rate": 3.1811841970191267e-07, + "logits/chosen": -2.8713536262512207, + "logits/rejected": -2.7351841926574707, + "logps/chosen": -335.30426025390625, + "logps/rejected": -294.9971618652344, + "loss": 0.6913, + "positive_losses": 0.09362602233886719, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14786486327648163, + "rewards/margins": 0.055970776826143265, + "rewards/margins_max": 0.17564713954925537, + "rewards/margins_min": -0.04175977408885956, + "rewards/margins_std": 0.09620113670825958, + "rewards/rejected": 0.09189409017562866, + "step": 1800 + }, + { + "epoch": 0.47, + "eval_dpo_losses": 0.666235089302063, + "eval_logits/chosen": -2.7922110557556152, + "eval_logits/rejected": -2.754131555557251, + "eval_logps/chosen": -269.03106689453125, + "eval_logps/rejected": -252.44105529785156, + "eval_loss": 0.6821897625923157, + "eval_positive_losses": 0.1096610277891159, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.15463854372501373, + "eval_rewards/margins": 0.05755544453859329, + "eval_rewards/margins_max": 0.2257683426141739, + "eval_rewards/margins_min": -0.09160422533750534, + "eval_rewards/margins_std": 0.10461423546075821, + "eval_rewards/rejected": 0.09708309173583984, + "eval_runtime": 388.9754, + "eval_samples_per_second": 5.142, + "eval_steps_per_second": 0.162, + "step": 1800 + }, + { + "dpo_losses": 0.6621342897415161, + "epoch": 0.47, + "grad_norm": 2.1187524396043984, + "learning_rate": 3.1591758064681257e-07, + "logits/chosen": -2.818814992904663, + "logits/rejected": -2.7996132373809814, + "logps/chosen": -315.8829040527344, + "logps/rejected": -281.2411804199219, + "loss": 0.6787, + "positive_losses": 0.0, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.17190691828727722, + "rewards/margins": 0.06481160968542099, + "rewards/margins_max": 0.1503869891166687, + "rewards/margins_min": -0.014813661575317383, + "rewards/margins_std": 0.07438337802886963, + "rewards/rejected": 0.10709531605243683, + "step": 1810 + }, + { + "dpo_losses": 0.6597224473953247, + "epoch": 0.48, + "grad_norm": 9.000680685793558, + "learning_rate": 3.13711237492698e-07, + "logits/chosen": -2.77579402923584, + "logits/rejected": -2.6911988258361816, + "logps/chosen": -275.7292785644531, + "logps/rejected": -284.06390380859375, + "loss": 0.686, + "positive_losses": 0.08585052192211151, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1561126410961151, + "rewards/margins": 0.07060912251472473, + "rewards/margins_max": 0.1737409383058548, + "rewards/margins_min": -0.014506603591144085, + "rewards/margins_std": 0.08497828245162964, + "rewards/rejected": 0.08550353348255157, + "step": 1820 + }, + { + "dpo_losses": 0.6639354228973389, + "epoch": 0.48, + "grad_norm": 4.982395460656168, + "learning_rate": 3.1149957446858767e-07, + "logits/chosen": -2.7818925380706787, + "logits/rejected": -2.7447280883789062, + "logps/chosen": -301.8582458496094, + "logps/rejected": -402.88580322265625, + "loss": 0.6831, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16820065677165985, + "rewards/margins": 0.06484942138195038, + "rewards/margins_max": 0.21876180171966553, + "rewards/margins_min": -0.08734156936407089, + "rewards/margins_std": 0.1392730474472046, + "rewards/rejected": 0.10335125029087067, + "step": 1830 + }, + { + "dpo_losses": 0.6596516370773315, + "epoch": 0.48, + "grad_norm": 2.5992478162942168, + "learning_rate": 3.0928277624770736e-07, + "logits/chosen": -2.810455799102783, + "logits/rejected": -2.7492587566375732, + "logps/chosen": -231.97482299804688, + "logps/rejected": -233.7396697998047, + "loss": 0.6789, + "positive_losses": 0.08066530525684357, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.146644726395607, + "rewards/margins": 0.07179627567529678, + "rewards/margins_max": 0.1726509928703308, + "rewards/margins_min": -0.04879069700837135, + "rewards/margins_std": 0.10129410028457642, + "rewards/rejected": 0.07484843581914902, + "step": 1840 + }, + { + "dpo_losses": 0.6742871999740601, + "epoch": 0.48, + "grad_norm": 7.1406853671203745, + "learning_rate": 3.0706102793207073e-07, + "logits/chosen": -2.8016586303710938, + "logits/rejected": -2.7073276042938232, + "logps/chosen": -223.1007537841797, + "logps/rejected": -206.54574584960938, + "loss": 0.6759, + "positive_losses": 0.20059967041015625, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1434759795665741, + "rewards/margins": 0.04163094609975815, + "rewards/margins_max": 0.1384052038192749, + "rewards/margins_min": -0.074953094124794, + "rewards/margins_std": 0.0965457558631897, + "rewards/rejected": 0.10184504091739655, + "step": 1850 + }, + { + "dpo_losses": 0.6566171050071716, + "epoch": 0.49, + "grad_norm": 6.202071170028259, + "learning_rate": 3.048345150370226e-07, + "logits/chosen": -2.6942129135131836, + "logits/rejected": -2.657686471939087, + "logps/chosen": -268.5079345703125, + "logps/rejected": -256.33282470703125, + "loss": 0.6743, + "positive_losses": 0.3425118327140808, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16685865819454193, + "rewards/margins": 0.07956352084875107, + "rewards/margins_max": 0.22378845512866974, + "rewards/margins_min": -0.03199433535337448, + "rewards/margins_std": 0.11701379716396332, + "rewards/rejected": 0.08729512244462967, + "step": 1860 + }, + { + "dpo_losses": 0.6586459875106812, + "epoch": 0.49, + "grad_norm": 2.081671535200173, + "learning_rate": 3.0260342347574913e-07, + "logits/chosen": -2.705451488494873, + "logits/rejected": -2.723024606704712, + "logps/chosen": -259.8138732910156, + "logps/rejected": -275.05108642578125, + "loss": 0.6623, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15820066630840302, + "rewards/margins": 0.07316181063652039, + "rewards/margins_max": 0.16405954957008362, + "rewards/margins_min": -0.02333083376288414, + "rewards/margins_std": 0.08455059677362442, + "rewards/rejected": 0.08503885567188263, + "step": 1870 + }, + { + "dpo_losses": 0.6664489507675171, + "epoch": 0.49, + "grad_norm": 15.607777129020638, + "learning_rate": 3.0036793954375357e-07, + "logits/chosen": -2.789748430252075, + "logits/rejected": -2.715607166290283, + "logps/chosen": -258.93841552734375, + "logps/rejected": -248.56689453125, + "loss": 0.6671, + "positive_losses": 0.006903409957885742, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15480181574821472, + "rewards/margins": 0.05630043148994446, + "rewards/margins_max": 0.15635153651237488, + "rewards/margins_min": -0.04761496186256409, + "rewards/margins_std": 0.09293356537818909, + "rewards/rejected": 0.09850136935710907, + "step": 1880 + }, + { + "dpo_losses": 0.6808010339736938, + "epoch": 0.49, + "grad_norm": 2.1090421060742464, + "learning_rate": 2.9812824990330085e-07, + "logits/chosen": -2.8155131340026855, + "logits/rejected": -2.814131259918213, + "logps/chosen": -288.41436767578125, + "logps/rejected": -339.4684753417969, + "loss": 0.6773, + "positive_losses": 0.14153671264648438, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.12658026814460754, + "rewards/margins": 0.02811632677912712, + "rewards/margins_max": 0.14468391239643097, + "rewards/margins_min": -0.0852559506893158, + "rewards/margins_std": 0.1038467064499855, + "rewards/rejected": 0.09846396744251251, + "step": 1890 + }, + { + "dpo_losses": 0.6638901233673096, + "epoch": 0.5, + "grad_norm": 2.030096695011095, + "learning_rate": 2.958845415678316e-07, + "logits/chosen": -2.841752052307129, + "logits/rejected": -2.759030818939209, + "logps/chosen": -267.19378662109375, + "logps/rejected": -222.11886596679688, + "loss": 0.691, + "positive_losses": 0.27200716733932495, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.144225612282753, + "rewards/margins": 0.06363539397716522, + "rewards/margins_max": 0.20767728984355927, + "rewards/margins_min": -0.053487379103899, + "rewards/margins_std": 0.11568088829517365, + "rewards/rejected": 0.08059023320674896, + "step": 1900 + }, + { + "epoch": 0.5, + "eval_dpo_losses": 0.6649388074874878, + "eval_logits/chosen": -2.78464412689209, + "eval_logits/rejected": -2.746262788772583, + "eval_logps/chosen": -269.3756408691406, + "eval_logps/rejected": -253.0802459716797, + "eval_loss": 0.6836426854133606, + "eval_positive_losses": 0.13374747335910797, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.15119239687919617, + "eval_rewards/margins": 0.06050121411681175, + "eval_rewards/margins_max": 0.23447264730930328, + "eval_rewards/margins_min": -0.09601601213216782, + "eval_rewards/margins_std": 0.10917651653289795, + "eval_rewards/rejected": 0.09069117158651352, + "eval_runtime": 390.0032, + "eval_samples_per_second": 5.128, + "eval_steps_per_second": 0.162, + "step": 1900 + }, + { + "dpo_losses": 0.6559640765190125, + "epoch": 0.5, + "grad_norm": 12.147662505424783, + "learning_rate": 2.936370018863459e-07, + "logits/chosen": -2.8035902976989746, + "logits/rejected": -2.748021125793457, + "logps/chosen": -241.41848754882812, + "logps/rejected": -229.57290649414062, + "loss": 0.6783, + "positive_losses": 0.0, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.16947892308235168, + "rewards/margins": 0.0783829391002655, + "rewards/margins_max": 0.19369716942310333, + "rewards/margins_min": -0.01700790971517563, + "rewards/margins_std": 0.09164775907993317, + "rewards/rejected": 0.09109597653150558, + "step": 1910 + }, + { + "dpo_losses": 0.6684740781784058, + "epoch": 0.5, + "grad_norm": 4.552578734000376, + "learning_rate": 2.913858185277605e-07, + "logits/chosen": -2.7616660594940186, + "logits/rejected": -2.737854242324829, + "logps/chosen": -258.0267028808594, + "logps/rejected": -198.5316925048828, + "loss": 0.6805, + "positive_losses": 0.19840697944164276, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14402440190315247, + "rewards/margins": 0.052981119602918625, + "rewards/margins_max": 0.17679978907108307, + "rewards/margins_min": -0.04407616704702377, + "rewards/margins_std": 0.09694145619869232, + "rewards/rejected": 0.09104329347610474, + "step": 1920 + }, + { + "dpo_losses": 0.65994793176651, + "epoch": 0.51, + "grad_norm": 7.729074592418925, + "learning_rate": 2.89131179465238e-07, + "logits/chosen": -2.8554089069366455, + "logits/rejected": -2.733668327331543, + "logps/chosen": -340.9375305175781, + "logps/rejected": -238.2502899169922, + "loss": 0.6778, + "positive_losses": 0.10468940436840057, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14773961901664734, + "rewards/margins": 0.0713655948638916, + "rewards/margins_max": 0.19929155707359314, + "rewards/margins_min": -0.032435785979032516, + "rewards/margins_std": 0.10413695871829987, + "rewards/rejected": 0.07637403905391693, + "step": 1930 + }, + { + "dpo_losses": 0.6654216647148132, + "epoch": 0.51, + "grad_norm": 19.203100367406904, + "learning_rate": 2.8687327296049125e-07, + "logits/chosen": -2.791396379470825, + "logits/rejected": -2.7972702980041504, + "logps/chosen": -253.9713592529297, + "logps/rejected": -249.3101348876953, + "loss": 0.6877, + "positive_losses": 0.1420997679233551, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13887836039066315, + "rewards/margins": 0.05868466570973396, + "rewards/margins_max": 0.15763770043849945, + "rewards/margins_min": -0.03611772507429123, + "rewards/margins_std": 0.08720506727695465, + "rewards/rejected": 0.08019369840621948, + "step": 1940 + }, + { + "dpo_losses": 0.6656764149665833, + "epoch": 0.51, + "grad_norm": 1.9695130337905855, + "learning_rate": 2.846122875480637e-07, + "logits/chosen": -2.816713333129883, + "logits/rejected": -2.8190550804138184, + "logps/chosen": -278.9736022949219, + "logps/rejected": -269.1643981933594, + "loss": 0.6844, + "positive_losses": 0.1202617660164833, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14900079369544983, + "rewards/margins": 0.05904467776417732, + "rewards/margins_max": 0.17524728178977966, + "rewards/margins_min": -0.06684452295303345, + "rewards/margins_std": 0.10755829513072968, + "rewards/rejected": 0.0899561196565628, + "step": 1950 + }, + { + "dpo_losses": 0.6570440530776978, + "epoch": 0.51, + "grad_norm": 10.334737948863516, + "learning_rate": 2.8234841201958647e-07, + "logits/chosen": -2.8929736614227295, + "logits/rejected": -2.8422701358795166, + "logps/chosen": -280.4637756347656, + "logps/rejected": -246.18167114257812, + "loss": 0.6811, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16151626408100128, + "rewards/margins": 0.07666916400194168, + "rewards/margins_max": 0.19590993225574493, + "rewards/margins_min": -0.03747622296214104, + "rewards/margins_std": 0.10714240372180939, + "rewards/rejected": 0.0848471149802208, + "step": 1960 + }, + { + "dpo_losses": 0.6631742715835571, + "epoch": 0.52, + "grad_norm": 9.15214743507445, + "learning_rate": 2.800818354080148e-07, + "logits/chosen": -2.8967909812927246, + "logits/rejected": -2.877074718475342, + "logps/chosen": -274.1905517578125, + "logps/rejected": -249.51174926757812, + "loss": 0.6852, + "positive_losses": 0.31591281294822693, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.15203619003295898, + "rewards/margins": 0.064478799700737, + "rewards/margins_max": 0.17947709560394287, + "rewards/margins_min": -0.04000955447554588, + "rewards/margins_std": 0.09873761236667633, + "rewards/rejected": 0.08755739033222198, + "step": 1970 + }, + { + "dpo_losses": 0.6746450662612915, + "epoch": 0.52, + "grad_norm": 11.208157524989577, + "learning_rate": 2.778127469718435e-07, + "logits/chosen": -2.818582534790039, + "logits/rejected": -2.7697455883026123, + "logps/chosen": -195.31723022460938, + "logps/rejected": -208.20095825195312, + "loss": 0.6837, + "positive_losses": 0.24081268906593323, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15358874201774597, + "rewards/margins": 0.0399480015039444, + "rewards/margins_max": 0.15406204760074615, + "rewards/margins_min": -0.04506593197584152, + "rewards/margins_std": 0.09146953374147415, + "rewards/rejected": 0.11364071071147919, + "step": 1980 + }, + { + "dpo_losses": 0.6614011526107788, + "epoch": 0.52, + "grad_norm": 1.8222015140618013, + "learning_rate": 2.755413361793039e-07, + "logits/chosen": -2.841437816619873, + "logits/rejected": -2.704587936401367, + "logps/chosen": -250.64920043945312, + "logps/rejected": -241.6019287109375, + "loss": 0.6725, + "positive_losses": 0.10384368896484375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14233501255512238, + "rewards/margins": 0.0674857422709465, + "rewards/margins_max": 0.18242308497428894, + "rewards/margins_min": -0.029808182269334793, + "rewards/margins_std": 0.09713619947433472, + "rewards/rejected": 0.07484927773475647, + "step": 1990 + }, + { + "dpo_losses": 0.6818459630012512, + "epoch": 0.52, + "grad_norm": 1.9816734366893491, + "learning_rate": 2.7326779269254356e-07, + "logits/chosen": -2.785947799682617, + "logits/rejected": -2.7597334384918213, + "logps/chosen": -222.0934600830078, + "logps/rejected": -214.197021484375, + "loss": 0.6743, + "positive_losses": 0.22264710068702698, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.13967108726501465, + "rewards/margins": 0.02557896077632904, + "rewards/margins_max": 0.13581949472427368, + "rewards/margins_min": -0.0820813775062561, + "rewards/margins_std": 0.10081374645233154, + "rewards/rejected": 0.11409211158752441, + "step": 2000 + }, + { + "epoch": 0.52, + "eval_dpo_losses": 0.6653285622596741, + "eval_logits/chosen": -2.78452730178833, + "eval_logits/rejected": -2.7460243701934814, + "eval_logps/chosen": -268.96856689453125, + "eval_logps/rejected": -252.5889129638672, + "eval_loss": 0.6819599866867065, + "eval_positive_losses": 0.11702584475278854, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": 0.15526309609413147, + "eval_rewards/margins": 0.059658586978912354, + "eval_rewards/margins_max": 0.23282285034656525, + "eval_rewards/margins_min": -0.09585469961166382, + "eval_rewards/margins_std": 0.10850544273853302, + "eval_rewards/rejected": 0.09560451656579971, + "eval_runtime": 389.4379, + "eval_samples_per_second": 5.136, + "eval_steps_per_second": 0.162, + "step": 2000 + }, + { + "dpo_losses": 0.6639446020126343, + "epoch": 0.53, + "grad_norm": 2.148193567473034, + "learning_rate": 2.709923063517895e-07, + "logits/chosen": -2.7333264350891113, + "logits/rejected": -2.756164073944092, + "logps/chosen": -242.3228759765625, + "logps/rejected": -218.51513671875, + "loss": 0.6658, + "positive_losses": 0.08923111110925674, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1582655906677246, + "rewards/margins": 0.06178750470280647, + "rewards/margins_max": 0.17090222239494324, + "rewards/margins_min": -0.03684164583683014, + "rewards/margins_std": 0.09074191749095917, + "rewards/rejected": 0.09647808969020844, + "step": 2010 + }, + { + "dpo_losses": 0.6573031544685364, + "epoch": 0.53, + "grad_norm": 10.929660276279535, + "learning_rate": 2.68715067159496e-07, + "logits/chosen": -2.8864777088165283, + "logits/rejected": -2.8166232109069824, + "logps/chosen": -290.9087219238281, + "logps/rejected": -229.5659637451172, + "loss": 0.6787, + "positive_losses": 0.08258895576000214, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.17253132164478302, + "rewards/margins": 0.0766986683011055, + "rewards/margins_max": 0.20891022682189941, + "rewards/margins_min": -0.05160030722618103, + "rewards/margins_std": 0.11397655308246613, + "rewards/rejected": 0.09583264589309692, + "step": 2020 + }, + { + "dpo_losses": 0.6642154455184937, + "epoch": 0.53, + "grad_norm": 7.321003721451175, + "learning_rate": 2.664362652644806e-07, + "logits/chosen": -2.8380165100097656, + "logits/rejected": -2.8278119564056396, + "logps/chosen": -271.6103820800781, + "logps/rejected": -254.341796875, + "loss": 0.6834, + "positive_losses": 0.34309062361717224, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.148406982421875, + "rewards/margins": 0.06311032921075821, + "rewards/margins_max": 0.21719925105571747, + "rewards/margins_min": -0.05854882672429085, + "rewards/margins_std": 0.12032978236675262, + "rewards/rejected": 0.0852966457605362, + "step": 2030 + }, + { + "dpo_losses": 0.6638925075531006, + "epoch": 0.53, + "grad_norm": 2.022652020676524, + "learning_rate": 2.6415609094604555e-07, + "logits/chosen": -2.616114854812622, + "logits/rejected": -2.670973062515259, + "logps/chosen": -285.9636535644531, + "logps/rejected": -204.63404846191406, + "loss": 0.6706, + "positive_losses": 0.14490394294261932, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.137128084897995, + "rewards/margins": 0.062432728707790375, + "rewards/margins_max": 0.17718909680843353, + "rewards/margins_min": -0.04789603129029274, + "rewards/margins_std": 0.09822549670934677, + "rewards/rejected": 0.07469536364078522, + "step": 2040 + }, + { + "dpo_losses": 0.6626821160316467, + "epoch": 0.54, + "grad_norm": 17.23544803436547, + "learning_rate": 2.618747345980904e-07, + "logits/chosen": -2.7996604442596436, + "logits/rejected": -2.7959325313568115, + "logps/chosen": -266.400146484375, + "logps/rejected": -245.243896484375, + "loss": 0.6937, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.14582450687885284, + "rewards/margins": 0.06493877619504929, + "rewards/margins_max": 0.17728032171726227, + "rewards/margins_min": -0.05895475670695305, + "rewards/margins_std": 0.10525840520858765, + "rewards/rejected": 0.08088572323322296, + "step": 2050 + }, + { + "dpo_losses": 0.6693453192710876, + "epoch": 0.54, + "grad_norm": 10.744319390725588, + "learning_rate": 2.595923867132136e-07, + "logits/chosen": -2.8013222217559814, + "logits/rejected": -2.7826759815216064, + "logps/chosen": -293.9305419921875, + "logps/rejected": -248.7853240966797, + "loss": 0.6889, + "positive_losses": 0.3333267271518707, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16735167801380157, + "rewards/margins": 0.05091765522956848, + "rewards/margins_max": 0.16602104902267456, + "rewards/margins_min": -0.04544571787118912, + "rewards/margins_std": 0.09119518101215363, + "rewards/rejected": 0.1164340227842331, + "step": 2060 + }, + { + "dpo_losses": 0.6830196976661682, + "epoch": 0.54, + "grad_norm": 1.9390942567104548, + "learning_rate": 2.5730923786680667e-07, + "logits/chosen": -2.7646231651306152, + "logits/rejected": -2.766101837158203, + "logps/chosen": -214.64804077148438, + "logps/rejected": -267.1541748046875, + "loss": 0.6715, + "positive_losses": 0.0, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.14950905740261078, + "rewards/margins": 0.02300949953496456, + "rewards/margins_max": 0.11503533273935318, + "rewards/margins_min": -0.07700347900390625, + "rewards/margins_std": 0.08432716876268387, + "rewards/rejected": 0.12649956345558167, + "step": 2070 + }, + { + "dpo_losses": 0.6756635308265686, + "epoch": 0.54, + "grad_norm": 2.0422233549208757, + "learning_rate": 2.5502547870114135e-07, + "logits/chosen": -2.8308072090148926, + "logits/rejected": -2.7787580490112305, + "logps/chosen": -208.3933563232422, + "logps/rejected": -225.6388397216797, + "loss": 0.6679, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15099266171455383, + "rewards/margins": 0.03903906047344208, + "rewards/margins_max": 0.14198842644691467, + "rewards/margins_min": -0.09506646543741226, + "rewards/margins_std": 0.10874740779399872, + "rewards/rejected": 0.11195359379053116, + "step": 2080 + }, + { + "dpo_losses": 0.6744376420974731, + "epoch": 0.55, + "grad_norm": 8.139545273885084, + "learning_rate": 2.527412999094506e-07, + "logits/chosen": -2.7396240234375, + "logits/rejected": -2.7286412715911865, + "logps/chosen": -257.28582763671875, + "logps/rejected": -290.82537841796875, + "loss": 0.6855, + "positive_losses": 0.23939552903175354, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.12208755314350128, + "rewards/margins": 0.04112662002444267, + "rewards/margins_max": 0.15945208072662354, + "rewards/margins_min": -0.07624942809343338, + "rewards/margins_std": 0.10272153466939926, + "rewards/rejected": 0.08096092194318771, + "step": 2090 + }, + { + "dpo_losses": 0.6722500920295715, + "epoch": 0.55, + "grad_norm": 1.7640498739488435, + "learning_rate": 2.5045689222000636e-07, + "logits/chosen": -2.7935140132904053, + "logits/rejected": -2.794473171234131, + "logps/chosen": -237.05728149414062, + "logps/rejected": -202.26231384277344, + "loss": 0.6787, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.13129688799381256, + "rewards/margins": 0.04541920870542526, + "rewards/margins_max": 0.1597176045179367, + "rewards/margins_min": -0.04207003861665726, + "rewards/margins_std": 0.09029006958007812, + "rewards/rejected": 0.0858776792883873, + "step": 2100 + }, + { + "epoch": 0.55, + "eval_dpo_losses": 0.6646059155464172, + "eval_logits/chosen": -2.783231735229492, + "eval_logits/rejected": -2.7445175647735596, + "eval_logps/chosen": -269.0392761230469, + "eval_logps/rejected": -252.820556640625, + "eval_loss": 0.6826277375221252, + "eval_positive_losses": 0.12551376223564148, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": 0.15455636382102966, + "eval_rewards/margins": 0.06126810982823372, + "eval_rewards/margins_max": 0.2372942566871643, + "eval_rewards/margins_min": -0.09696952998638153, + "eval_rewards/margins_std": 0.11048813909292221, + "eval_rewards/rejected": 0.09328825026750565, + "eval_runtime": 399.0656, + "eval_samples_per_second": 5.012, + "eval_steps_per_second": 0.158, + "step": 2100 + }, + { + "dpo_losses": 0.6669995188713074, + "epoch": 0.55, + "grad_norm": 1.8424762916851205, + "learning_rate": 2.481724463801933e-07, + "logits/chosen": -2.7471299171447754, + "logits/rejected": -2.6674296855926514, + "logps/chosen": -251.7501983642578, + "logps/rejected": -212.1397247314453, + "loss": 0.6783, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15100225806236267, + "rewards/margins": 0.0555257685482502, + "rewards/margins_max": 0.18034568428993225, + "rewards/margins_min": -0.029644068330526352, + "rewards/margins_std": 0.09330420196056366, + "rewards/rejected": 0.09547650068998337, + "step": 2110 + }, + { + "dpo_losses": 0.6630354523658752, + "epoch": 0.55, + "grad_norm": 21.202486662507965, + "learning_rate": 2.4588815314058154e-07, + "logits/chosen": -2.729743003845215, + "logits/rejected": -2.711665391921997, + "logps/chosen": -227.17294311523438, + "logps/rejected": -248.97134399414062, + "loss": 0.673, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1562889814376831, + "rewards/margins": 0.06316865980625153, + "rewards/margins_max": 0.13936588168144226, + "rewards/margins_min": -0.014763864688575268, + "rewards/margins_std": 0.07098125666379929, + "rewards/rejected": 0.09312032163143158, + "step": 2120 + }, + { + "dpo_losses": 0.6803591847419739, + "epoch": 0.56, + "grad_norm": 2.3717799191522384, + "learning_rate": 2.4360420323899917e-07, + "logits/chosen": -2.7886428833007812, + "logits/rejected": -2.7954256534576416, + "logps/chosen": -194.2913055419922, + "logps/rejected": -247.00051879882812, + "loss": 0.6785, + "positive_losses": 0.07509269565343857, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14525488018989563, + "rewards/margins": 0.02834526263177395, + "rewards/margins_max": 0.1326114535331726, + "rewards/margins_min": -0.0706716850399971, + "rewards/margins_std": 0.09119173139333725, + "rewards/rejected": 0.11690962314605713, + "step": 2130 + }, + { + "dpo_losses": 0.6663404107093811, + "epoch": 0.56, + "grad_norm": 2.462527388212223, + "learning_rate": 2.4132078738460583e-07, + "logits/chosen": -2.8206303119659424, + "logits/rejected": -2.8253495693206787, + "logps/chosen": -274.0443420410156, + "logps/rejected": -268.05853271484375, + "loss": 0.6789, + "positive_losses": 0.10881118476390839, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.16541233658790588, + "rewards/margins": 0.0581609308719635, + "rewards/margins_max": 0.18914298713207245, + "rewards/margins_min": -0.05697988346219063, + "rewards/margins_std": 0.10790137201547623, + "rewards/rejected": 0.10725139081478119, + "step": 2140 + }, + { + "dpo_losses": 0.6610409617424011, + "epoch": 0.56, + "grad_norm": 8.627465821659698, + "learning_rate": 2.390380962419682e-07, + "logits/chosen": -2.8324084281921387, + "logits/rejected": -2.7803261280059814, + "logps/chosen": -268.6856994628906, + "logps/rejected": -203.6580810546875, + "loss": 0.6753, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1652398705482483, + "rewards/margins": 0.06922824680805206, + "rewards/margins_max": 0.20310267806053162, + "rewards/margins_min": -0.04356042295694351, + "rewards/margins_std": 0.11182417720556259, + "rewards/rejected": 0.09601160883903503, + "step": 2150 + }, + { + "dpo_losses": 0.672164797782898, + "epoch": 0.57, + "grad_norm": 6.040860598055274, + "learning_rate": 2.3675632041513977e-07, + "logits/chosen": -2.655301570892334, + "logits/rejected": -2.660374879837036, + "logps/chosen": -208.69058227539062, + "logps/rejected": -226.7170867919922, + "loss": 0.6758, + "positive_losses": 0.11751461029052734, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1308482438325882, + "rewards/margins": 0.04433317855000496, + "rewards/margins_max": 0.1322614550590515, + "rewards/margins_min": -0.02744942344725132, + "rewards/margins_std": 0.07075206935405731, + "rewards/rejected": 0.08651508390903473, + "step": 2160 + }, + { + "dpo_losses": 0.6654033064842224, + "epoch": 0.57, + "grad_norm": 9.69070952377682, + "learning_rate": 2.344756504317453e-07, + "logits/chosen": -2.625159502029419, + "logits/rejected": -2.6462783813476562, + "logps/chosen": -237.1065216064453, + "logps/rejected": -233.6138916015625, + "loss": 0.6773, + "positive_losses": 0.12059593200683594, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14112631976604462, + "rewards/margins": 0.05939141660928726, + "rewards/margins_max": 0.1860862672328949, + "rewards/margins_min": -0.05647587776184082, + "rewards/margins_std": 0.10976777970790863, + "rewards/rejected": 0.08173491060733795, + "step": 2170 + }, + { + "dpo_losses": 0.6683284640312195, + "epoch": 0.57, + "grad_norm": 20.456535597036282, + "learning_rate": 2.3219627672707237e-07, + "logits/chosen": -2.898542642593384, + "logits/rejected": -2.832404375076294, + "logps/chosen": -323.9666442871094, + "logps/rejected": -243.0486297607422, + "loss": 0.675, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16770347952842712, + "rewards/margins": 0.0541713610291481, + "rewards/margins_max": 0.19824166595935822, + "rewards/margins_min": -0.07504001259803772, + "rewards/margins_std": 0.12081035226583481, + "rewards/rejected": 0.11353211104869843, + "step": 2180 + }, + { + "dpo_losses": 0.6708992719650269, + "epoch": 0.57, + "grad_norm": 5.65471210964526, + "learning_rate": 2.2991838962816918e-07, + "logits/chosen": -2.854893207550049, + "logits/rejected": -2.881042718887329, + "logps/chosen": -269.02606201171875, + "logps/rejected": -322.48980712890625, + "loss": 0.6757, + "positive_losses": 0.005802154541015625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13117845356464386, + "rewards/margins": 0.048213545233011246, + "rewards/margins_max": 0.16716358065605164, + "rewards/margins_min": -0.08829066902399063, + "rewards/margins_std": 0.11207526922225952, + "rewards/rejected": 0.08296488225460052, + "step": 2190 + }, + { + "dpo_losses": 0.6560848951339722, + "epoch": 0.58, + "grad_norm": 2.031732237865772, + "learning_rate": 2.2764217933795297e-07, + "logits/chosen": -2.8572869300842285, + "logits/rejected": -2.7690012454986572, + "logps/chosen": -347.05364990234375, + "logps/rejected": -270.6988525390625, + "loss": 0.6738, + "positive_losses": 0.16602382063865662, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.17479580640792847, + "rewards/margins": 0.07932952046394348, + "rewards/margins_max": 0.21747556328773499, + "rewards/margins_min": -0.0325501449406147, + "rewards/margins_std": 0.11231104284524918, + "rewards/rejected": 0.09546627104282379, + "step": 2200 + }, + { + "epoch": 0.58, + "eval_dpo_losses": 0.6644929647445679, + "eval_logits/chosen": -2.780320644378662, + "eval_logits/rejected": -2.7417633533477783, + "eval_logps/chosen": -268.6586608886719, + "eval_logps/rejected": -252.46463012695312, + "eval_loss": 0.6815550327301025, + "eval_positive_losses": 0.1156652644276619, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": 0.1583622545003891, + "eval_rewards/margins": 0.06151484698057175, + "eval_rewards/margins_max": 0.23826445639133453, + "eval_rewards/margins_min": -0.09692387282848358, + "eval_rewards/margins_std": 0.11083362251520157, + "eval_rewards/rejected": 0.09684741497039795, + "eval_runtime": 389.3755, + "eval_samples_per_second": 5.136, + "eval_steps_per_second": 0.162, + "step": 2200 + }, + { + "dpo_losses": 0.6702221035957336, + "epoch": 0.58, + "grad_norm": 1.8553276570532913, + "learning_rate": 2.253678359193278e-07, + "logits/chosen": -2.837172746658325, + "logits/rejected": -2.839228868484497, + "logps/chosen": -249.7177276611328, + "logps/rejected": -244.18408203125, + "loss": 0.676, + "positive_losses": 0.37015992403030396, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16470842063426971, + "rewards/margins": 0.0498427152633667, + "rewards/margins_max": 0.17060586810112, + "rewards/margins_min": -0.08222378045320511, + "rewards/margins_std": 0.11427643150091171, + "rewards/rejected": 0.11486568301916122, + "step": 2210 + }, + { + "dpo_losses": 0.6609566807746887, + "epoch": 0.58, + "grad_norm": 2.680723118088112, + "learning_rate": 2.230955492793149e-07, + "logits/chosen": -2.8189260959625244, + "logits/rejected": -2.804194688796997, + "logps/chosen": -306.3068542480469, + "logps/rejected": -256.5824890136719, + "loss": 0.6763, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15530112385749817, + "rewards/margins": 0.06911114603281021, + "rewards/margins_max": 0.2111690789461136, + "rewards/margins_min": -0.06732382625341415, + "rewards/margins_std": 0.1288124918937683, + "rewards/rejected": 0.08619000017642975, + "step": 2220 + }, + { + "dpo_losses": 0.6690382957458496, + "epoch": 0.58, + "grad_norm": 6.015237533179085, + "learning_rate": 2.2082550915319468e-07, + "logits/chosen": -2.770582675933838, + "logits/rejected": -2.7712674140930176, + "logps/chosen": -246.59390258789062, + "logps/rejected": -265.6105041503906, + "loss": 0.6801, + "positive_losses": 0.2791542112827301, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1340925395488739, + "rewards/margins": 0.05243048071861267, + "rewards/margins_max": 0.18013811111450195, + "rewards/margins_min": -0.05579759553074837, + "rewards/margins_std": 0.10320155322551727, + "rewards/rejected": 0.08166205883026123, + "step": 2230 + }, + { + "dpo_losses": 0.667000412940979, + "epoch": 0.59, + "grad_norm": 2.614393513591382, + "learning_rate": 2.1855790508866433e-07, + "logits/chosen": -2.8280067443847656, + "logits/rejected": -2.782679557800293, + "logps/chosen": -272.31341552734375, + "logps/rejected": -226.23159790039062, + "loss": 0.6911, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15997463464736938, + "rewards/margins": 0.05621781200170517, + "rewards/margins_max": 0.1634344756603241, + "rewards/margins_min": -0.05732632428407669, + "rewards/margins_std": 0.1002826914191246, + "rewards/rejected": 0.10375680774450302, + "step": 2240 + }, + { + "dpo_losses": 0.6544098258018494, + "epoch": 0.59, + "grad_norm": 6.138037993242967, + "learning_rate": 2.162929264300107e-07, + "logits/chosen": -2.8011958599090576, + "logits/rejected": -2.787996292114258, + "logps/chosen": -271.0982360839844, + "logps/rejected": -234.0089111328125, + "loss": 0.6835, + "positive_losses": 0.12156429141759872, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1650722324848175, + "rewards/margins": 0.08291522413492203, + "rewards/margins_max": 0.24250057339668274, + "rewards/margins_min": -0.02806040272116661, + "rewards/margins_std": 0.12254680693149567, + "rewards/rejected": 0.08215700834989548, + "step": 2250 + }, + { + "dpo_losses": 0.667629599571228, + "epoch": 0.59, + "grad_norm": 2.031461968732793, + "learning_rate": 2.1403076230230005e-07, + "logits/chosen": -2.7610487937927246, + "logits/rejected": -2.715662717819214, + "logps/chosen": -227.35494995117188, + "logps/rejected": -223.0266571044922, + "loss": 0.6805, + "positive_losses": 0.009455109015107155, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.139937624335289, + "rewards/margins": 0.05409275367856026, + "rewards/margins_max": 0.167481929063797, + "rewards/margins_min": -0.03251870721578598, + "rewards/margins_std": 0.09049482643604279, + "rewards/rejected": 0.08584487438201904, + "step": 2260 + }, + { + "dpo_losses": 0.6574854850769043, + "epoch": 0.59, + "grad_norm": 2.015774625207423, + "learning_rate": 2.1177160159558596e-07, + "logits/chosen": -2.7960407733917236, + "logits/rejected": -2.708859443664551, + "logps/chosen": -246.37698364257812, + "logps/rejected": -247.0392608642578, + "loss": 0.6688, + "positive_losses": 0.21091079711914062, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.17055392265319824, + "rewards/margins": 0.07748468220233917, + "rewards/margins_max": 0.24710354208946228, + "rewards/margins_min": -0.05870268493890762, + "rewards/margins_std": 0.13558170199394226, + "rewards/rejected": 0.09306924045085907, + "step": 2270 + }, + { + "dpo_losses": 0.6679006814956665, + "epoch": 0.6, + "grad_norm": 10.93004175120596, + "learning_rate": 2.0951563294913734e-07, + "logits/chosen": -2.8326520919799805, + "logits/rejected": -2.8108348846435547, + "logps/chosen": -244.97140502929688, + "logps/rejected": -255.8402862548828, + "loss": 0.6984, + "positive_losses": 0.2469741851091385, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14643892645835876, + "rewards/margins": 0.05538954213261604, + "rewards/margins_max": 0.17577563226222992, + "rewards/margins_min": -0.08614195883274078, + "rewards/margins_std": 0.12158197164535522, + "rewards/rejected": 0.09104935824871063, + "step": 2280 + }, + { + "dpo_losses": 0.6632338762283325, + "epoch": 0.6, + "grad_norm": 1.677939068316556, + "learning_rate": 2.072630447356869e-07, + "logits/chosen": -2.7692759037017822, + "logits/rejected": -2.681032657623291, + "logps/chosen": -209.3841094970703, + "logps/rejected": -208.48489379882812, + "loss": 0.6739, + "positive_losses": 0.0, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.17293371260166168, + "rewards/margins": 0.06350459158420563, + "rewards/margins_max": 0.17482289671897888, + "rewards/margins_min": -0.030863529071211815, + "rewards/margins_std": 0.09300075471401215, + "rewards/rejected": 0.10942912101745605, + "step": 2290 + }, + { + "dpo_losses": 0.6755298376083374, + "epoch": 0.6, + "grad_norm": 9.301373484924401, + "learning_rate": 2.0501402504570232e-07, + "logits/chosen": -2.7473981380462646, + "logits/rejected": -2.7651193141937256, + "logps/chosen": -241.1534881591797, + "logps/rejected": -235.19644165039062, + "loss": 0.675, + "positive_losses": 0.027724647894501686, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14800508320331573, + "rewards/margins": 0.04038618132472038, + "rewards/margins_max": 0.1762312650680542, + "rewards/margins_min": -0.09547598659992218, + "rewards/margins_std": 0.1235724464058876, + "rewards/rejected": 0.10761890560388565, + "step": 2300 + }, + { + "epoch": 0.6, + "eval_dpo_losses": 0.6642228960990906, + "eval_logits/chosen": -2.7834320068359375, + "eval_logits/rejected": -2.744966506958008, + "eval_logps/chosen": -268.5911560058594, + "eval_logps/rejected": -252.45948791503906, + "eval_loss": 0.6816120743751526, + "eval_positive_losses": 0.12096957862377167, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.15903764963150024, + "eval_rewards/margins": 0.06213868409395218, + "eval_rewards/margins_max": 0.24037744104862213, + "eval_rewards/margins_min": -0.09743154793977737, + "eval_rewards/margins_std": 0.11178537458181381, + "eval_rewards/rejected": 0.09689898043870926, + "eval_runtime": 390.1632, + "eval_samples_per_second": 5.126, + "eval_steps_per_second": 0.161, + "step": 2300 + }, + { + "dpo_losses": 0.6589769124984741, + "epoch": 0.6, + "grad_norm": 5.2593957294275455, + "learning_rate": 2.027687616716804e-07, + "logits/chosen": -2.848520517349243, + "logits/rejected": -2.7592415809631348, + "logps/chosen": -311.2559509277344, + "logps/rejected": -236.20046997070312, + "loss": 0.6641, + "positive_losses": 0.06801052391529083, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16040074825286865, + "rewards/margins": 0.07437174022197723, + "rewards/margins_max": 0.25320133566856384, + "rewards/margins_min": -0.03677508980035782, + "rewards/margins_std": 0.1288147270679474, + "rewards/rejected": 0.08602902293205261, + "step": 2310 + }, + { + "dpo_losses": 0.677810788154602, + "epoch": 0.61, + "grad_norm": 11.593297903289386, + "learning_rate": 2.005274420924668e-07, + "logits/chosen": -2.7715988159179688, + "logits/rejected": -2.731139659881592, + "logps/chosen": -266.9964599609375, + "logps/rejected": -257.47216796875, + "loss": 0.6717, + "positive_losses": 0.07237549126148224, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16181442141532898, + "rewards/margins": 0.03579026088118553, + "rewards/margins_max": 0.17425528168678284, + "rewards/margins_min": -0.10560673475265503, + "rewards/margins_std": 0.12581433355808258, + "rewards/rejected": 0.12602415680885315, + "step": 2320 + }, + { + "dpo_losses": 0.6600149869918823, + "epoch": 0.61, + "grad_norm": 6.220344684041331, + "learning_rate": 1.9829025345760121e-07, + "logits/chosen": -2.8089206218719482, + "logits/rejected": -2.7957355976104736, + "logps/chosen": -284.0976257324219, + "logps/rejected": -308.8465881347656, + "loss": 0.6715, + "positive_losses": 0.14536181092262268, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16044309735298157, + "rewards/margins": 0.0716666579246521, + "rewards/margins_max": 0.19783233106136322, + "rewards/margins_min": -0.04586394503712654, + "rewards/margins_std": 0.1109810620546341, + "rewards/rejected": 0.08877645432949066, + "step": 2330 + }, + { + "dpo_losses": 0.679665744304657, + "epoch": 0.61, + "grad_norm": 2.0598822656338167, + "learning_rate": 1.960573825716911e-07, + "logits/chosen": -2.822514533996582, + "logits/rejected": -2.7786355018615723, + "logps/chosen": -318.4324951171875, + "logps/rejected": -318.00213623046875, + "loss": 0.676, + "positive_losses": 0.089727021753788, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.15282617509365082, + "rewards/margins": 0.03167200833559036, + "rewards/margins_max": 0.17082737386226654, + "rewards/margins_min": -0.11809341609477997, + "rewards/margins_std": 0.12609757483005524, + "rewards/rejected": 0.12115416675806046, + "step": 2340 + }, + { + "dpo_losses": 0.6711980700492859, + "epoch": 0.62, + "grad_norm": 2.115547248617438, + "learning_rate": 1.9382901587881273e-07, + "logits/chosen": -2.834235429763794, + "logits/rejected": -2.8224523067474365, + "logps/chosen": -294.26800537109375, + "logps/rejected": -243.66738891601562, + "loss": 0.6746, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1676826775074005, + "rewards/margins": 0.047168977558612823, + "rewards/margins_max": 0.14759781956672668, + "rewards/margins_min": -0.06013220548629761, + "rewards/margins_std": 0.09161853790283203, + "rewards/rejected": 0.1205136775970459, + "step": 2350 + }, + { + "dpo_losses": 0.6665584444999695, + "epoch": 0.62, + "grad_norm": 7.707595487455357, + "learning_rate": 1.9160533944694364e-07, + "logits/chosen": -2.813931703567505, + "logits/rejected": -2.763089179992676, + "logps/chosen": -270.51202392578125, + "logps/rejected": -204.17807006835938, + "loss": 0.6854, + "positive_losses": 0.49280548095703125, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.15664884448051453, + "rewards/margins": 0.05828050523996353, + "rewards/margins_max": 0.1808789223432541, + "rewards/margins_min": -0.05433495715260506, + "rewards/margins_std": 0.10690093040466309, + "rewards/rejected": 0.09836836159229279, + "step": 2360 + }, + { + "dpo_losses": 0.6780496835708618, + "epoch": 0.62, + "grad_norm": 2.712138500823685, + "learning_rate": 1.8938653895242602e-07, + "logits/chosen": -2.854897975921631, + "logits/rejected": -2.810944080352783, + "logps/chosen": -243.7271270751953, + "logps/rejected": -226.6142120361328, + "loss": 0.6882, + "positive_losses": 0.14659099280834198, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13124720752239227, + "rewards/margins": 0.0334155447781086, + "rewards/margins_max": 0.12614381313323975, + "rewards/margins_min": -0.08595071732997894, + "rewards/margins_std": 0.09597662836313248, + "rewards/rejected": 0.09783166646957397, + "step": 2370 + }, + { + "dpo_losses": 0.6700248718261719, + "epoch": 0.62, + "grad_norm": 2.1139950489863195, + "learning_rate": 1.8717279966446264e-07, + "logits/chosen": -2.8533968925476074, + "logits/rejected": -2.8190789222717285, + "logps/chosen": -231.2190704345703, + "logps/rejected": -203.7569580078125, + "loss": 0.6723, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.14022384583950043, + "rewards/margins": 0.04948741942644119, + "rewards/margins_max": 0.1588309109210968, + "rewards/margins_min": -0.044143229722976685, + "rewards/margins_std": 0.09119255840778351, + "rewards/rejected": 0.09073643386363983, + "step": 2380 + }, + { + "dpo_losses": 0.6732557415962219, + "epoch": 0.63, + "grad_norm": 23.04693061433968, + "learning_rate": 1.8496430642964694e-07, + "logits/chosen": -2.847764492034912, + "logits/rejected": -2.849989414215088, + "logps/chosen": -266.61572265625, + "logps/rejected": -279.8492431640625, + "loss": 0.6739, + "positive_losses": 0.03173675388097763, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16146275401115417, + "rewards/margins": 0.044871553778648376, + "rewards/margins_max": 0.18712307512760162, + "rewards/margins_min": -0.1083759069442749, + "rewards/margins_std": 0.12768994271755219, + "rewards/rejected": 0.1165911927819252, + "step": 2390 + }, + { + "dpo_losses": 0.6605753898620605, + "epoch": 0.63, + "grad_norm": 1.9620799350067777, + "learning_rate": 1.8276124365652855e-07, + "logits/chosen": -2.7887072563171387, + "logits/rejected": -2.7103452682495117, + "logps/chosen": -257.5548095703125, + "logps/rejected": -290.41815185546875, + "loss": 0.6821, + "positive_losses": 0.10989990085363388, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18278703093528748, + "rewards/margins": 0.07031223922967911, + "rewards/margins_max": 0.2028309404850006, + "rewards/margins_min": -0.05043686553835869, + "rewards/margins_std": 0.11253918707370758, + "rewards/rejected": 0.11247478425502777, + "step": 2400 + }, + { + "epoch": 0.63, + "eval_dpo_losses": 0.663316547870636, + "eval_logits/chosen": -2.784898042678833, + "eval_logits/rejected": -2.7465882301330566, + "eval_logps/chosen": -268.8607482910156, + "eval_logps/rejected": -252.93472290039062, + "eval_loss": 0.683151125907898, + "eval_positive_losses": 0.14105184376239777, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.15634165704250336, + "eval_rewards/margins": 0.06419505923986435, + "eval_rewards/margins_max": 0.2465353161096573, + "eval_rewards/margins_min": -0.10097683221101761, + "eval_rewards/margins_std": 0.11476168036460876, + "eval_rewards/rejected": 0.09214659780263901, + "eval_runtime": 389.2899, + "eval_samples_per_second": 5.138, + "eval_steps_per_second": 0.162, + "step": 2400 + }, + { + "dpo_losses": 0.6710997819900513, + "epoch": 0.63, + "grad_norm": 2.284679604403075, + "learning_rate": 1.805637953002149e-07, + "logits/chosen": -2.8017404079437256, + "logits/rejected": -2.7698140144348145, + "logps/chosen": -288.701416015625, + "logps/rejected": -244.4599609375, + "loss": 0.6725, + "positive_losses": 0.056781768798828125, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17063315212726593, + "rewards/margins": 0.04911976680159569, + "rewards/margins_max": 0.19049224257469177, + "rewards/margins_min": -0.08515028655529022, + "rewards/margins_std": 0.12400348484516144, + "rewards/rejected": 0.12151336669921875, + "step": 2410 + }, + { + "dpo_losses": 0.6658786535263062, + "epoch": 0.63, + "grad_norm": 11.696369253359672, + "learning_rate": 1.7837214484701153e-07, + "logits/chosen": -2.834495782852173, + "logits/rejected": -2.774224042892456, + "logps/chosen": -341.9219970703125, + "logps/rejected": -286.7921142578125, + "loss": 0.6863, + "positive_losses": 0.20940017700195312, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16808286309242249, + "rewards/margins": 0.05958018824458122, + "rewards/margins_max": 0.17783430218696594, + "rewards/margins_min": -0.08136147260665894, + "rewards/margins_std": 0.11622990667819977, + "rewards/rejected": 0.10850267112255096, + "step": 2420 + }, + { + "dpo_losses": 0.6711954474449158, + "epoch": 0.64, + "grad_norm": 2.085611496499541, + "learning_rate": 1.761864752991004e-07, + "logits/chosen": -2.8036391735076904, + "logits/rejected": -2.780700922012329, + "logps/chosen": -275.9913635253906, + "logps/rejected": -250.76846313476562, + "loss": 0.6908, + "positive_losses": 0.0, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16365042328834534, + "rewards/margins": 0.048547059297561646, + "rewards/margins_max": 0.2041003406047821, + "rewards/margins_min": -0.08558313548564911, + "rewards/margins_std": 0.12569783627986908, + "rewards/rejected": 0.11510336399078369, + "step": 2430 + }, + { + "dpo_losses": 0.6707032918930054, + "epoch": 0.64, + "grad_norm": 6.078052196704086, + "learning_rate": 1.7400696915925995e-07, + "logits/chosen": -2.7483696937561035, + "logits/rejected": -2.706458568572998, + "logps/chosen": -277.4805908203125, + "logps/rejected": -247.01528930664062, + "loss": 0.6767, + "positive_losses": 0.11072997748851776, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1552312672138214, + "rewards/margins": 0.05023397132754326, + "rewards/margins_max": 0.2068198025226593, + "rewards/margins_min": -0.0821874588727951, + "rewards/margins_std": 0.12569008767604828, + "rewards/rejected": 0.10499731451272964, + "step": 2440 + }, + { + "dpo_losses": 0.6737798452377319, + "epoch": 0.64, + "grad_norm": 1.9705347115847889, + "learning_rate": 1.718338084156254e-07, + "logits/chosen": -2.836562156677246, + "logits/rejected": -2.8479294776916504, + "logps/chosen": -281.20611572265625, + "logps/rejected": -335.68292236328125, + "loss": 0.6803, + "positive_losses": 0.18814153969287872, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1451827734708786, + "rewards/margins": 0.043305326253175735, + "rewards/margins_max": 0.14558716118335724, + "rewards/margins_min": -0.09249875694513321, + "rewards/margins_std": 0.1084173321723938, + "rewards/rejected": 0.10187745094299316, + "step": 2450 + }, + { + "dpo_losses": 0.6762491464614868, + "epoch": 0.64, + "grad_norm": 9.533198923439455, + "learning_rate": 1.696671745264937e-07, + "logits/chosen": -2.7576818466186523, + "logits/rejected": -2.7617905139923096, + "logps/chosen": -230.6830291748047, + "logps/rejected": -228.86172485351562, + "loss": 0.6838, + "positive_losses": 0.06390075385570526, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14102314412593842, + "rewards/margins": 0.038981709629297256, + "rewards/margins_max": 0.1658230423927307, + "rewards/margins_min": -0.1091441735625267, + "rewards/margins_std": 0.12223289906978607, + "rewards/rejected": 0.10204143822193146, + "step": 2460 + }, + { + "dpo_losses": 0.6576075553894043, + "epoch": 0.65, + "grad_norm": 7.047728087221175, + "learning_rate": 1.67507248405171e-07, + "logits/chosen": -2.7969839572906494, + "logits/rejected": -2.811170816421509, + "logps/chosen": -324.28948974609375, + "logps/rejected": -287.66900634765625, + "loss": 0.6849, + "positive_losses": 0.25962066650390625, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16784675419330597, + "rewards/margins": 0.07765379548072815, + "rewards/margins_max": 0.24460339546203613, + "rewards/margins_min": -0.05037788301706314, + "rewards/margins_std": 0.13253864645957947, + "rewards/rejected": 0.09019295871257782, + "step": 2470 + }, + { + "dpo_losses": 0.6706292033195496, + "epoch": 0.65, + "grad_norm": 1.7907451063422377, + "learning_rate": 1.6535421040486683e-07, + "logits/chosen": -2.9084179401397705, + "logits/rejected": -2.8807246685028076, + "logps/chosen": -287.76953125, + "logps/rejected": -237.6548614501953, + "loss": 0.681, + "positive_losses": 0.00829315185546875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15879002213478088, + "rewards/margins": 0.04883551970124245, + "rewards/margins_max": 0.16194190084934235, + "rewards/margins_min": -0.0865946039557457, + "rewards/margins_std": 0.11053447425365448, + "rewards/rejected": 0.10995452105998993, + "step": 2480 + }, + { + "dpo_losses": 0.6417179703712463, + "epoch": 0.65, + "grad_norm": 7.307945463297551, + "learning_rate": 1.6320824030363456e-07, + "logits/chosen": -2.678077220916748, + "logits/rejected": -2.6205215454101562, + "logps/chosen": -313.7894287109375, + "logps/rejected": -258.7198791503906, + "loss": 0.6577, + "positive_losses": 0.048689745366573334, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.18587902188301086, + "rewards/margins": 0.11005387455224991, + "rewards/margins_max": 0.25625452399253845, + "rewards/margins_min": -0.04094923287630081, + "rewards/margins_std": 0.12633609771728516, + "rewards/rejected": 0.07582515478134155, + "step": 2490 + }, + { + "dpo_losses": 0.6599959135055542, + "epoch": 0.65, + "grad_norm": 7.724047121209455, + "learning_rate": 1.6106951728936024e-07, + "logits/chosen": -2.7736682891845703, + "logits/rejected": -2.790349245071411, + "logps/chosen": -239.8456573486328, + "logps/rejected": -275.7630310058594, + "loss": 0.6881, + "positive_losses": 0.1184925064444542, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1587900072336197, + "rewards/margins": 0.07100746780633926, + "rewards/margins_max": 0.21270182728767395, + "rewards/margins_min": -0.03353138267993927, + "rewards/margins_std": 0.11178290843963623, + "rewards/rejected": 0.08778254687786102, + "step": 2500 + }, + { + "epoch": 0.65, + "eval_dpo_losses": 0.6630551815032959, + "eval_logits/chosen": -2.787369966506958, + "eval_logits/rejected": -2.749213695526123, + "eval_logps/chosen": -268.79345703125, + "eval_logps/rejected": -252.9271697998047, + "eval_loss": 0.6830382347106934, + "eval_positive_losses": 0.14260074496269226, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.15701442956924438, + "eval_rewards/margins": 0.06479236483573914, + "eval_rewards/margins_max": 0.24739092588424683, + "eval_rewards/margins_min": -0.10220367461442947, + "eval_rewards/margins_std": 0.11561723798513412, + "eval_rewards/rejected": 0.09222202748060226, + "eval_runtime": 389.6283, + "eval_samples_per_second": 5.133, + "eval_steps_per_second": 0.162, + "step": 2500 + }, + { + "dpo_losses": 0.6517472267150879, + "epoch": 0.66, + "grad_norm": 1.9878626534549249, + "learning_rate": 1.5893821994479994e-07, + "logits/chosen": -2.7108683586120605, + "logits/rejected": -2.649024486541748, + "logps/chosen": -240.2740478515625, + "logps/rejected": -232.99734497070312, + "loss": 0.6704, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16711577773094177, + "rewards/margins": 0.08962143212556839, + "rewards/margins_max": 0.23768293857574463, + "rewards/margins_min": -0.05823158472776413, + "rewards/margins_std": 0.12899050116539001, + "rewards/rejected": 0.07749433815479279, + "step": 2510 + }, + { + "dpo_losses": 0.6465938687324524, + "epoch": 0.66, + "grad_norm": 12.428593652764857, + "learning_rate": 1.5681452623266867e-07, + "logits/chosen": -2.8300561904907227, + "logits/rejected": -2.7787134647369385, + "logps/chosen": -244.55810546875, + "logps/rejected": -229.2414093017578, + "loss": 0.6752, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1652805656194687, + "rewards/margins": 0.10005130618810654, + "rewards/margins_max": 0.251314252614975, + "rewards/margins_min": -0.03838371858000755, + "rewards/margins_std": 0.12975125014781952, + "rewards/rejected": 0.06522925198078156, + "step": 2520 + }, + { + "dpo_losses": 0.6602810025215149, + "epoch": 0.66, + "grad_norm": 7.703697446402158, + "learning_rate": 1.546986134807801e-07, + "logits/chosen": -2.7837038040161133, + "logits/rejected": -2.819833517074585, + "logps/chosen": -269.00579833984375, + "logps/rejected": -288.3307800292969, + "loss": 0.6664, + "positive_losses": 0.2041366547346115, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14903002977371216, + "rewards/margins": 0.07221291214227676, + "rewards/margins_max": 0.24120278656482697, + "rewards/margins_min": -0.07335890829563141, + "rewards/margins_std": 0.13466687500476837, + "rewards/rejected": 0.0768171027302742, + "step": 2530 + }, + { + "dpo_losses": 0.6748021841049194, + "epoch": 0.66, + "grad_norm": 2.0470362010115877, + "learning_rate": 1.5259065836724034e-07, + "logits/chosen": -2.728024959564209, + "logits/rejected": -2.743741512298584, + "logps/chosen": -272.1194152832031, + "logps/rejected": -314.8052062988281, + "loss": 0.683, + "positive_losses": 0.09459342807531357, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.12930192053318024, + "rewards/margins": 0.04087451845407486, + "rewards/margins_max": 0.16113092005252838, + "rewards/margins_min": -0.0958721712231636, + "rewards/margins_std": 0.11435544490814209, + "rewards/rejected": 0.08842740207910538, + "step": 2540 + }, + { + "dpo_losses": 0.6604206562042236, + "epoch": 0.67, + "grad_norm": 1.6351465216728645, + "learning_rate": 1.5049083690569454e-07, + "logits/chosen": -2.8244802951812744, + "logits/rejected": -2.795187473297119, + "logps/chosen": -257.86102294921875, + "logps/rejected": -270.40325927734375, + "loss": 0.6749, + "positive_losses": 0.15022125840187073, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1673004925251007, + "rewards/margins": 0.07084228098392487, + "rewards/margins_max": 0.20384936034679413, + "rewards/margins_min": -0.05480436235666275, + "rewards/margins_std": 0.1157851442694664, + "rewards/rejected": 0.09645821899175644, + "step": 2550 + }, + { + "dpo_losses": 0.6679819822311401, + "epoch": 0.67, + "grad_norm": 2.1467085106051065, + "learning_rate": 1.4839932443063056e-07, + "logits/chosen": -2.6807057857513428, + "logits/rejected": -2.6425938606262207, + "logps/chosen": -185.5140380859375, + "logps/rejected": -180.81089782714844, + "loss": 0.6694, + "positive_losses": 0.00739707937464118, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.14547543227672577, + "rewards/margins": 0.05403967574238777, + "rewards/margins_max": 0.18117661774158478, + "rewards/margins_min": -0.05315234512090683, + "rewards/margins_std": 0.10514678806066513, + "rewards/rejected": 0.09143576771020889, + "step": 2560 + }, + { + "dpo_losses": 0.6649169921875, + "epoch": 0.67, + "grad_norm": 13.485934241396896, + "learning_rate": 1.46316295582738e-07, + "logits/chosen": -2.8397908210754395, + "logits/rejected": -2.802277088165283, + "logps/chosen": -268.9765319824219, + "logps/rejected": -267.4847717285156, + "loss": 0.6949, + "positive_losses": 0.15890884399414062, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.16603443026542664, + "rewards/margins": 0.0631035789847374, + "rewards/margins_max": 0.17990688979625702, + "rewards/margins_min": -0.059853047132492065, + "rewards/margins_std": 0.10987289249897003, + "rewards/rejected": 0.10293082892894745, + "step": 2570 + }, + { + "dpo_losses": 0.6597756147384644, + "epoch": 0.68, + "grad_norm": 9.548073651225087, + "learning_rate": 1.4424192429432655e-07, + "logits/chosen": -2.796020030975342, + "logits/rejected": -2.770440101623535, + "logps/chosen": -251.7606201171875, + "logps/rejected": -265.84588623046875, + "loss": 0.6803, + "positive_losses": 0.09473152458667755, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16906090080738068, + "rewards/margins": 0.071074478328228, + "rewards/margins_max": 0.1876489222049713, + "rewards/margins_min": -0.04980349540710449, + "rewards/margins_std": 0.10807528346776962, + "rewards/rejected": 0.09798641502857208, + "step": 2580 + }, + { + "dpo_losses": 0.6544302105903625, + "epoch": 0.68, + "grad_norm": 4.496828020922733, + "learning_rate": 1.4217638377480158e-07, + "logits/chosen": -2.798560380935669, + "logits/rejected": -2.7506070137023926, + "logps/chosen": -284.4412841796875, + "logps/rejected": -232.6147918701172, + "loss": 0.6724, + "positive_losses": 0.13090820610523224, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1532086730003357, + "rewards/margins": 0.08290395885705948, + "rewards/margins_max": 0.20642821490764618, + "rewards/margins_min": -0.04796721413731575, + "rewards/margins_std": 0.11846397072076797, + "rewards/rejected": 0.07030472904443741, + "step": 2590 + }, + { + "dpo_losses": 0.6792913675308228, + "epoch": 0.68, + "grad_norm": 13.357376038313298, + "learning_rate": 1.401198464962021e-07, + "logits/chosen": -2.6972367763519287, + "logits/rejected": -2.7522435188293457, + "logps/chosen": -216.80258178710938, + "logps/rejected": -253.00991821289062, + "loss": 0.6871, + "positive_losses": 0.30749091506004333, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12594708800315857, + "rewards/margins": 0.030662814155220985, + "rewards/margins_max": 0.13051792979240417, + "rewards/margins_min": -0.06012769415974617, + "rewards/margins_std": 0.08853994309902191, + "rewards/rejected": 0.09528429806232452, + "step": 2600 + }, + { + "epoch": 0.68, + "eval_dpo_losses": 0.6637357473373413, + "eval_logits/chosen": -2.783560037612915, + "eval_logits/rejected": -2.7451324462890625, + "eval_logps/chosen": -268.1626281738281, + "eval_logps/rejected": -252.1409149169922, + "eval_loss": 0.6808480620384216, + "eval_positive_losses": 0.11579447984695435, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.16332247853279114, + "eval_rewards/margins": 0.06323818117380142, + "eval_rewards/margins_max": 0.24468624591827393, + "eval_rewards/margins_min": -0.09912735968828201, + "eval_rewards/margins_std": 0.11344098299741745, + "eval_rewards/rejected": 0.10008430480957031, + "eval_runtime": 389.2769, + "eval_samples_per_second": 5.138, + "eval_steps_per_second": 0.162, + "step": 2600 + }, + { + "dpo_losses": 0.6520320773124695, + "epoch": 0.68, + "grad_norm": 9.598610507226221, + "learning_rate": 1.3807248417879894e-07, + "logits/chosen": -2.7677581310272217, + "logits/rejected": -2.6992528438568115, + "logps/chosen": -243.98825073242188, + "logps/rejected": -180.00013732910156, + "loss": 0.6777, + "positive_losses": 0.11720962822437286, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1732184886932373, + "rewards/margins": 0.08967794477939606, + "rewards/margins_max": 0.27268117666244507, + "rewards/margins_min": -0.042021073400974274, + "rewards/margins_std": 0.14438588917255402, + "rewards/rejected": 0.08354054391384125, + "step": 2610 + }, + { + "dpo_losses": 0.6666117906570435, + "epoch": 0.69, + "grad_norm": 6.692568560574354, + "learning_rate": 1.3603446777675665e-07, + "logits/chosen": -2.6856884956359863, + "logits/rejected": -2.6961658000946045, + "logps/chosen": -234.836181640625, + "logps/rejected": -251.9840087890625, + "loss": 0.6944, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.16049371659755707, + "rewards/margins": 0.05744044855237007, + "rewards/margins_max": 0.19652999937534332, + "rewards/margins_min": -0.07018232345581055, + "rewards/margins_std": 0.11920342594385147, + "rewards/rejected": 0.1030532717704773, + "step": 2620 + }, + { + "dpo_losses": 0.6739298105239868, + "epoch": 0.69, + "grad_norm": 2.7563088616208113, + "learning_rate": 1.3400596746385814e-07, + "logits/chosen": -2.7988457679748535, + "logits/rejected": -2.8075976371765137, + "logps/chosen": -243.51730346679688, + "logps/rejected": -238.9121551513672, + "loss": 0.6832, + "positive_losses": 0.37547796964645386, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.13830161094665527, + "rewards/margins": 0.04181267321109772, + "rewards/margins_max": 0.17611472308635712, + "rewards/margins_min": -0.087108314037323, + "rewards/margins_std": 0.11464836448431015, + "rewards/rejected": 0.09648893028497696, + "step": 2630 + }, + { + "dpo_losses": 0.6748381853103638, + "epoch": 0.69, + "grad_norm": 15.581915207796786, + "learning_rate": 1.3198715261929586e-07, + "logits/chosen": -2.8534445762634277, + "logits/rejected": -2.8705265522003174, + "logps/chosen": -301.2350769042969, + "logps/rejected": -282.84124755859375, + "loss": 0.6904, + "positive_losses": 0.09905795753002167, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1465148627758026, + "rewards/margins": 0.041211504489183426, + "rewards/margins_max": 0.20768475532531738, + "rewards/margins_min": -0.06541645526885986, + "rewards/margins_std": 0.1251172572374344, + "rewards/rejected": 0.10530336201190948, + "step": 2640 + }, + { + "dpo_losses": 0.6479775309562683, + "epoch": 0.69, + "grad_norm": 1.7643300937257929, + "learning_rate": 1.299781918135282e-07, + "logits/chosen": -2.8581717014312744, + "logits/rejected": -2.779258966445923, + "logps/chosen": -227.37393188476562, + "logps/rejected": -206.65213012695312, + "loss": 0.6719, + "positive_losses": 0.15762968361377716, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1743009388446808, + "rewards/margins": 0.09570324420928955, + "rewards/margins_max": 0.21518942713737488, + "rewards/margins_min": -0.016625795513391495, + "rewards/margins_std": 0.1057358831167221, + "rewards/rejected": 0.07859767973423004, + "step": 2650 + }, + { + "dpo_losses": 0.6573314070701599, + "epoch": 0.7, + "grad_norm": 10.786175561891858, + "learning_rate": 1.279792527942045e-07, + "logits/chosen": -2.8398871421813965, + "logits/rejected": -2.837266206741333, + "logps/chosen": -312.1396484375, + "logps/rejected": -241.70187377929688, + "loss": 0.6739, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.17937134206295013, + "rewards/margins": 0.07706346362829208, + "rewards/margins_max": 0.17430905997753143, + "rewards/margins_min": -0.03193662688136101, + "rewards/margins_std": 0.09627391397953033, + "rewards/rejected": 0.10230787843465805, + "step": 2660 + }, + { + "dpo_losses": 0.6637741923332214, + "epoch": 0.7, + "grad_norm": 5.670050041072099, + "learning_rate": 1.259905024721576e-07, + "logits/chosen": -2.8783836364746094, + "logits/rejected": -2.7891175746917725, + "logps/chosen": -275.72515869140625, + "logps/rejected": -226.322265625, + "loss": 0.6759, + "positive_losses": 0.051831819117069244, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13656029105186462, + "rewards/margins": 0.06270195543766022, + "rewards/margins_max": 0.17947831749916077, + "rewards/margins_min": -0.04513033479452133, + "rewards/margins_std": 0.10144752264022827, + "rewards/rejected": 0.07385829836130142, + "step": 2670 + }, + { + "dpo_losses": 0.6624875068664551, + "epoch": 0.7, + "grad_norm": 9.926135588667393, + "learning_rate": 1.2401210690746703e-07, + "logits/chosen": -2.85262131690979, + "logits/rejected": -2.817472457885742, + "logps/chosen": -333.7018127441406, + "logps/rejected": -368.96319580078125, + "loss": 0.6893, + "positive_losses": 0.582872748374939, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1505921632051468, + "rewards/margins": 0.06744848191738129, + "rewards/margins_max": 0.22661364078521729, + "rewards/margins_min": -0.09928081184625626, + "rewards/margins_std": 0.13975416123867035, + "rewards/rejected": 0.08314366638660431, + "step": 2680 + }, + { + "dpo_losses": 0.6623440980911255, + "epoch": 0.7, + "grad_norm": 2.2882187886567933, + "learning_rate": 1.2204423129559305e-07, + "logits/chosen": -2.705098867416382, + "logits/rejected": -2.6861870288848877, + "logps/chosen": -183.91455078125, + "logps/rejected": -223.2926025390625, + "loss": 0.68, + "positive_losses": 0.22965507209300995, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1502687633037567, + "rewards/margins": 0.06848227977752686, + "rewards/margins_max": 0.20223240554332733, + "rewards/margins_min": -0.08552606403827667, + "rewards/margins_std": 0.12745562195777893, + "rewards/rejected": 0.08178650587797165, + "step": 2690 + }, + { + "dpo_losses": 0.6769312620162964, + "epoch": 0.71, + "grad_norm": 13.211990251334853, + "learning_rate": 1.2008703995358299e-07, + "logits/chosen": -2.7353789806365967, + "logits/rejected": -2.6951496601104736, + "logps/chosen": -232.9313507080078, + "logps/rejected": -224.88162231445312, + "loss": 0.683, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14097455143928528, + "rewards/margins": 0.03472239524126053, + "rewards/margins_max": 0.12374613434076309, + "rewards/margins_min": -0.053682826459407806, + "rewards/margins_std": 0.08197776973247528, + "rewards/rejected": 0.10625217854976654, + "step": 2700 + }, + { + "epoch": 0.71, + "eval_dpo_losses": 0.6639883518218994, + "eval_logits/chosen": -2.782484769821167, + "eval_logits/rejected": -2.7438457012176514, + "eval_logps/chosen": -268.0138244628906, + "eval_logps/rejected": -251.9336395263672, + "eval_loss": 0.6799085736274719, + "eval_positive_losses": 0.10900817066431046, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.1648109257221222, + "eval_rewards/margins": 0.0626538097858429, + "eval_rewards/margins_max": 0.2421714812517166, + "eval_rewards/margins_min": -0.09798896312713623, + "eval_rewards/margins_std": 0.11240836977958679, + "eval_rewards/rejected": 0.1021571010351181, + "eval_runtime": 389.6711, + "eval_samples_per_second": 5.133, + "eval_steps_per_second": 0.162, + "step": 2700 + }, + { + "dpo_losses": 0.6747549772262573, + "epoch": 0.71, + "grad_norm": 2.0042003393483747, + "learning_rate": 1.1814069630635068e-07, + "logits/chosen": -2.9221720695495605, + "logits/rejected": -2.870772123336792, + "logps/chosen": -270.75146484375, + "logps/rejected": -264.449951171875, + "loss": 0.6896, + "positive_losses": 0.2701377868652344, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.16046664118766785, + "rewards/margins": 0.039906375110149384, + "rewards/margins_max": 0.15826861560344696, + "rewards/margins_min": -0.06516039371490479, + "rewards/margins_std": 0.10186408460140228, + "rewards/rejected": 0.12056026607751846, + "step": 2710 + }, + { + "dpo_losses": 0.6686604619026184, + "epoch": 0.71, + "grad_norm": 2.744524446300366, + "learning_rate": 1.1620536287303051e-07, + "logits/chosen": -2.77477765083313, + "logits/rejected": -2.7189297676086426, + "logps/chosen": -204.58444213867188, + "logps/rejected": -225.67245483398438, + "loss": 0.6675, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.16270171105861664, + "rewards/margins": 0.05269937589764595, + "rewards/margins_max": 0.13205501437187195, + "rewards/margins_min": -0.06587193161249161, + "rewards/margins_std": 0.09286411851644516, + "rewards/rejected": 0.11000235378742218, + "step": 2720 + }, + { + "dpo_losses": 0.6740443706512451, + "epoch": 0.71, + "grad_norm": 2.114891130066726, + "learning_rate": 1.1428120125340716e-07, + "logits/chosen": -2.720109462738037, + "logits/rejected": -2.7368974685668945, + "logps/chosen": -315.4144592285156, + "logps/rejected": -291.7226867675781, + "loss": 0.6874, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1961982250213623, + "rewards/margins": 0.04646676033735275, + "rewards/margins_max": 0.23910515010356903, + "rewards/margins_min": -0.11346033960580826, + "rewards/margins_std": 0.1614883840084076, + "rewards/rejected": 0.14973145723342896, + "step": 2730 + }, + { + "dpo_losses": 0.6759941577911377, + "epoch": 0.72, + "grad_norm": 9.69964507721924, + "learning_rate": 1.123683721144223e-07, + "logits/chosen": -2.813666343688965, + "logits/rejected": -2.8336429595947266, + "logps/chosen": -207.52584838867188, + "logps/rejected": -249.69692993164062, + "loss": 0.6861, + "positive_losses": 0.02162322960793972, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1626010537147522, + "rewards/margins": 0.03769798204302788, + "rewards/margins_max": 0.16509708762168884, + "rewards/margins_min": -0.07659967988729477, + "rewards/margins_std": 0.10486801713705063, + "rewards/rejected": 0.12490306049585342, + "step": 2740 + }, + { + "dpo_losses": 0.6610895395278931, + "epoch": 0.72, + "grad_norm": 10.986869604711686, + "learning_rate": 1.1046703517675845e-07, + "logits/chosen": -2.8068175315856934, + "logits/rejected": -2.7834575176239014, + "logps/chosen": -257.51422119140625, + "logps/rejected": -221.0384979248047, + "loss": 0.6955, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1787460744380951, + "rewards/margins": 0.06879962235689163, + "rewards/margins_max": 0.17229793965816498, + "rewards/margins_min": -0.027008920907974243, + "rewards/margins_std": 0.09230650961399078, + "rewards/rejected": 0.10994645208120346, + "step": 2750 + }, + { + "dpo_losses": 0.6526534557342529, + "epoch": 0.72, + "grad_norm": 10.098201372495632, + "learning_rate": 1.085773492015028e-07, + "logits/chosen": -2.8394532203674316, + "logits/rejected": -2.745370388031006, + "logps/chosen": -351.73455810546875, + "logps/rejected": -284.814208984375, + "loss": 0.6655, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.18526779115200043, + "rewards/margins": 0.08610363304615021, + "rewards/margins_max": 0.20476289093494415, + "rewards/margins_min": -0.028496265411376953, + "rewards/margins_std": 0.10529184341430664, + "rewards/rejected": 0.09916415065526962, + "step": 2760 + }, + { + "dpo_losses": 0.6613882780075073, + "epoch": 0.72, + "grad_norm": 1.9894765322581731, + "learning_rate": 1.0669947197689033e-07, + "logits/chosen": -2.9060561656951904, + "logits/rejected": -2.8399486541748047, + "logps/chosen": -290.3299560546875, + "logps/rejected": -259.8872985839844, + "loss": 0.693, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16502395272254944, + "rewards/margins": 0.06983888149261475, + "rewards/margins_max": 0.2175408899784088, + "rewards/margins_min": -0.08632582426071167, + "rewards/margins_std": 0.13449755311012268, + "rewards/rejected": 0.0951850563287735, + "step": 2770 + }, + { + "dpo_losses": 0.6637939214706421, + "epoch": 0.73, + "grad_norm": 1.693095361528334, + "learning_rate": 1.048335603051291e-07, + "logits/chosen": -2.7786593437194824, + "logits/rejected": -2.721432685852051, + "logps/chosen": -325.7188720703125, + "logps/rejected": -225.9957275390625, + "loss": 0.6761, + "positive_losses": 0.10735664516687393, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.17629331350326538, + "rewards/margins": 0.06277813762426376, + "rewards/margins_max": 0.18301381170749664, + "rewards/margins_min": -0.04662441462278366, + "rewards/margins_std": 0.10360904037952423, + "rewards/rejected": 0.11351517587900162, + "step": 2780 + }, + { + "dpo_losses": 0.6610652804374695, + "epoch": 0.73, + "grad_norm": 1.5668810650373743, + "learning_rate": 1.0297976998930663e-07, + "logits/chosen": -2.918388843536377, + "logits/rejected": -2.8212196826934814, + "logps/chosen": -342.0752868652344, + "logps/rejected": -275.6966857910156, + "loss": 0.6758, + "positive_losses": 0.0019851685501635075, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.19239678978919983, + "rewards/margins": 0.06978907436132431, + "rewards/margins_max": 0.19509926438331604, + "rewards/margins_min": -0.04050639644265175, + "rewards/margins_std": 0.10559085756540298, + "rewards/rejected": 0.12260772287845612, + "step": 2790 + }, + { + "dpo_losses": 0.6732099652290344, + "epoch": 0.73, + "grad_norm": 11.168298023302597, + "learning_rate": 1.0113825582038077e-07, + "logits/chosen": -2.82181978225708, + "logits/rejected": -2.7915401458740234, + "logps/chosen": -226.28927612304688, + "logps/rejected": -222.4562225341797, + "loss": 0.6785, + "positive_losses": 0.1656135618686676, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1473783701658249, + "rewards/margins": 0.04381309077143669, + "rewards/margins_max": 0.18773579597473145, + "rewards/margins_min": -0.08167463541030884, + "rewards/margins_std": 0.12001528590917587, + "rewards/rejected": 0.1035652607679367, + "step": 2800 + }, + { + "epoch": 0.73, + "eval_dpo_losses": 0.6633877158164978, + "eval_logits/chosen": -2.7828617095947266, + "eval_logits/rejected": -2.74420428276062, + "eval_logps/chosen": -268.234130859375, + "eval_logps/rejected": -252.28933715820312, + "eval_loss": 0.6808531284332275, + "eval_positive_losses": 0.11935018002986908, + "eval_rewards/accuracies": 0.7142857313156128, + "eval_rewards/chosen": 0.16260772943496704, + "eval_rewards/margins": 0.06400728970766068, + "eval_rewards/margins_max": 0.2456274777650833, + "eval_rewards/margins_min": -0.10007373243570328, + "eval_rewards/margins_std": 0.11423919349908829, + "eval_rewards/rejected": 0.09860043972730637, + "eval_runtime": 391.2169, + "eval_samples_per_second": 5.112, + "eval_steps_per_second": 0.161, + "step": 2800 + }, + { + "dpo_losses": 0.6562505960464478, + "epoch": 0.74, + "grad_norm": 2.6651011481065163, + "learning_rate": 9.930917156425475e-08, + "logits/chosen": -2.7254879474639893, + "logits/rejected": -2.7593536376953125, + "logps/chosen": -224.43960571289062, + "logps/rejected": -202.2880401611328, + "loss": 0.6673, + "positive_losses": 0.0, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16689622402191162, + "rewards/margins": 0.08031634986400604, + "rewards/margins_max": 0.19498199224472046, + "rewards/margins_min": -0.03201249986886978, + "rewards/margins_std": 0.10381577908992767, + "rewards/rejected": 0.08657988160848618, + "step": 2810 + }, + { + "dpo_losses": 0.6722387075424194, + "epoch": 0.74, + "grad_norm": 2.395446900978545, + "learning_rate": 9.749266994893754e-08, + "logits/chosen": -2.7374165058135986, + "logits/rejected": -2.7058839797973633, + "logps/chosen": -225.48974609375, + "logps/rejected": -256.9203186035156, + "loss": 0.6653, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15854783356189728, + "rewards/margins": 0.04549305886030197, + "rewards/margins_max": 0.17629720270633698, + "rewards/margins_min": -0.06231803819537163, + "rewards/margins_std": 0.10453619807958603, + "rewards/rejected": 0.1130547747015953, + "step": 2820 + }, + { + "dpo_losses": 0.6781406998634338, + "epoch": 0.74, + "grad_norm": 2.149688245659782, + "learning_rate": 9.568890265179128e-08, + "logits/chosen": -2.719682216644287, + "logits/rejected": -2.696131467819214, + "logps/chosen": -271.04443359375, + "logps/rejected": -413.84649658203125, + "loss": 0.6691, + "positive_losses": 0.12154903262853622, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.13876976072788239, + "rewards/margins": 0.0326072983443737, + "rewards/margins_max": 0.1310979127883911, + "rewards/margins_min": -0.08175252377986908, + "rewards/margins_std": 0.09425903856754303, + "rewards/rejected": 0.10616246610879898, + "step": 2830 + }, + { + "dpo_losses": 0.6699537634849548, + "epoch": 0.74, + "grad_norm": 1.9474233182763336, + "learning_rate": 9.389802028686616e-08, + "logits/chosen": -2.719886302947998, + "logits/rejected": -2.709803819656372, + "logps/chosen": -194.5177001953125, + "logps/rejected": -175.28614807128906, + "loss": 0.6668, + "positive_losses": 0.0, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14552441239356995, + "rewards/margins": 0.04965772479772568, + "rewards/margins_max": 0.16322022676467896, + "rewards/margins_min": -0.06379050016403198, + "rewards/margins_std": 0.10277913510799408, + "rewards/rejected": 0.09586669504642487, + "step": 2840 + }, + { + "dpo_losses": 0.6644850969314575, + "epoch": 0.75, + "grad_norm": 2.927693494596595, + "learning_rate": 9.212017239232426e-08, + "logits/chosen": -2.8066647052764893, + "logits/rejected": -2.832144260406494, + "logps/chosen": -265.63226318359375, + "logps/rejected": -243.2608184814453, + "loss": 0.6782, + "positive_losses": 0.06858177483081818, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.15173125267028809, + "rewards/margins": 0.06289365887641907, + "rewards/margins_max": 0.1837751567363739, + "rewards/margins_min": -0.12117429822683334, + "rewards/margins_std": 0.13747075200080872, + "rewards/rejected": 0.08883760124444962, + "step": 2850 + }, + { + "dpo_losses": 0.6735584735870361, + "epoch": 0.75, + "grad_norm": 2.2505464949789444, + "learning_rate": 9.035550741795328e-08, + "logits/chosen": -2.802375316619873, + "logits/rejected": -2.763209819793701, + "logps/chosen": -247.94754028320312, + "logps/rejected": -231.13558959960938, + "loss": 0.6617, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15921562910079956, + "rewards/margins": 0.04222042113542557, + "rewards/margins_max": 0.14945976436138153, + "rewards/margins_min": -0.07940498739480972, + "rewards/margins_std": 0.10080856084823608, + "rewards/rejected": 0.1169952005147934, + "step": 2860 + }, + { + "dpo_losses": 0.6567317247390747, + "epoch": 0.75, + "grad_norm": 2.502330647320227, + "learning_rate": 8.860417271277065e-08, + "logits/chosen": -2.6637213230133057, + "logits/rejected": -2.6016478538513184, + "logps/chosen": -274.00079345703125, + "logps/rejected": -253.66122436523438, + "loss": 0.6672, + "positive_losses": 0.0, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1512971669435501, + "rewards/margins": 0.07740378379821777, + "rewards/margins_max": 0.19368486106395721, + "rewards/margins_min": -0.03223549947142601, + "rewards/margins_std": 0.10386307537555695, + "rewards/rejected": 0.07389337569475174, + "step": 2870 + }, + { + "dpo_losses": 0.6591945290565491, + "epoch": 0.75, + "grad_norm": 6.093505541470816, + "learning_rate": 8.686631451272029e-08, + "logits/chosen": -2.8315553665161133, + "logits/rejected": -2.7398500442504883, + "logps/chosen": -287.47442626953125, + "logps/rejected": -230.60415649414062, + "loss": 0.6897, + "positive_losses": 0.5427009463310242, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14553631842136383, + "rewards/margins": 0.0738452821969986, + "rewards/margins_max": 0.2060689479112625, + "rewards/margins_min": -0.03878684341907501, + "rewards/margins_std": 0.10930664837360382, + "rewards/rejected": 0.07169099897146225, + "step": 2880 + }, + { + "dpo_losses": 0.6751716732978821, + "epoch": 0.76, + "grad_norm": 112.45037968199574, + "learning_rate": 8.514207792846168e-08, + "logits/chosen": -2.7375433444976807, + "logits/rejected": -2.722531795501709, + "logps/chosen": -269.4003601074219, + "logps/rejected": -297.16485595703125, + "loss": 0.6822, + "positive_losses": 0.5430085062980652, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1424095630645752, + "rewards/margins": 0.04013755917549133, + "rewards/margins_max": 0.16956102848052979, + "rewards/margins_min": -0.08579131960868835, + "rewards/margins_std": 0.11141952127218246, + "rewards/rejected": 0.10227201133966446, + "step": 2890 + }, + { + "dpo_losses": 0.6576107144355774, + "epoch": 0.76, + "grad_norm": 14.170585060505838, + "learning_rate": 8.343160693325355e-08, + "logits/chosen": -2.8384392261505127, + "logits/rejected": -2.801820993423462, + "logps/chosen": -288.8127136230469, + "logps/rejected": -245.9336700439453, + "loss": 0.6804, + "positive_losses": 0.13667449355125427, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15546861290931702, + "rewards/margins": 0.07626311480998993, + "rewards/margins_max": 0.18484382331371307, + "rewards/margins_min": -0.040147729218006134, + "rewards/margins_std": 0.10086791217327118, + "rewards/rejected": 0.07920549809932709, + "step": 2900 + }, + { + "epoch": 0.76, + "eval_dpo_losses": 0.6628540754318237, + "eval_logits/chosen": -2.7846555709838867, + "eval_logits/rejected": -2.7461493015289307, + "eval_logps/chosen": -268.41455078125, + "eval_logps/rejected": -252.5924835205078, + "eval_loss": 0.6822347640991211, + "eval_positive_losses": 0.13458429276943207, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.16080327332019806, + "eval_rewards/margins": 0.06523434817790985, + "eval_rewards/margins_max": 0.2495136708021164, + "eval_rewards/margins_min": -0.10225551575422287, + "eval_rewards/margins_std": 0.11622842401266098, + "eval_rewards/rejected": 0.0955689400434494, + "eval_runtime": 389.5641, + "eval_samples_per_second": 5.134, + "eval_steps_per_second": 0.162, + "step": 2900 + }, + { + "dpo_losses": 0.6652017831802368, + "epoch": 0.76, + "grad_norm": 11.459882337888212, + "learning_rate": 8.173504435093173e-08, + "logits/chosen": -2.7680153846740723, + "logits/rejected": -2.692370891571045, + "logps/chosen": -260.47747802734375, + "logps/rejected": -214.68319702148438, + "loss": 0.6668, + "positive_losses": 0.044872283935546875, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.15751729905605316, + "rewards/margins": 0.05986157804727554, + "rewards/margins_max": 0.18254561722278595, + "rewards/margins_min": -0.04402995854616165, + "rewards/margins_std": 0.10341081768274307, + "rewards/rejected": 0.09765572845935822, + "step": 2910 + }, + { + "dpo_losses": 0.6544531583786011, + "epoch": 0.76, + "grad_norm": 20.25977532189708, + "learning_rate": 8.005253184398359e-08, + "logits/chosen": -2.7313950061798096, + "logits/rejected": -2.6855309009552, + "logps/chosen": -265.0918884277344, + "logps/rejected": -204.74412536621094, + "loss": 0.6786, + "positive_losses": 0.5254768133163452, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16128066182136536, + "rewards/margins": 0.08340667188167572, + "rewards/margins_max": 0.2493068277835846, + "rewards/margins_min": -0.04549757391214371, + "rewards/margins_std": 0.12967939674854279, + "rewards/rejected": 0.07787398993968964, + "step": 2920 + }, + { + "dpo_losses": 0.6423149108886719, + "epoch": 0.77, + "grad_norm": 15.548518875088533, + "learning_rate": 7.838420990171926e-08, + "logits/chosen": -2.730849504470825, + "logits/rejected": -2.702017068862915, + "logps/chosen": -290.5140380859375, + "logps/rejected": -235.5591583251953, + "loss": 0.6799, + "positive_losses": 0.0, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.19081711769104004, + "rewards/margins": 0.10711536556482315, + "rewards/margins_max": 0.2387050837278366, + "rewards/margins_min": -0.004120032303035259, + "rewards/margins_std": 0.10651262104511261, + "rewards/rejected": 0.08370174467563629, + "step": 2930 + }, + { + "dpo_losses": 0.6588220596313477, + "epoch": 0.77, + "grad_norm": 1.7419158313182692, + "learning_rate": 7.673021782854083e-08, + "logits/chosen": -2.8104755878448486, + "logits/rejected": -2.8491647243499756, + "logps/chosen": -283.4486083984375, + "logps/rejected": -260.8815612792969, + "loss": 0.6847, + "positive_losses": 0.36250799894332886, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.171676367521286, + "rewards/margins": 0.07349254935979843, + "rewards/margins_max": 0.22684898972511292, + "rewards/margins_min": -0.038581203669309616, + "rewards/margins_std": 0.117698073387146, + "rewards/rejected": 0.09818382561206818, + "step": 2940 + }, + { + "dpo_losses": 0.6470920443534851, + "epoch": 0.77, + "grad_norm": 20.281353072975755, + "learning_rate": 7.509069373231039e-08, + "logits/chosen": -2.9167816638946533, + "logits/rejected": -2.873137950897217, + "logps/chosen": -268.0244445800781, + "logps/rejected": -281.0406188964844, + "loss": 0.6973, + "positive_losses": 0.5683601498603821, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1619308590888977, + "rewards/margins": 0.09898541867733002, + "rewards/margins_max": 0.23728612065315247, + "rewards/margins_min": -0.035527534782886505, + "rewards/margins_std": 0.11940480768680573, + "rewards/rejected": 0.06294544041156769, + "step": 2950 + }, + { + "dpo_losses": 0.6785377264022827, + "epoch": 0.77, + "grad_norm": 1.9344681429277137, + "learning_rate": 7.346577451281821e-08, + "logits/chosen": -2.729431629180908, + "logits/rejected": -2.7387375831604004, + "logps/chosen": -242.9123992919922, + "logps/rejected": -236.4193878173828, + "loss": 0.667, + "positive_losses": 0.1447097808122635, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.14629048109054565, + "rewards/margins": 0.034385181963443756, + "rewards/margins_max": 0.18973104655742645, + "rewards/margins_min": -0.11912697553634644, + "rewards/margins_std": 0.13839221000671387, + "rewards/rejected": 0.11190527677536011, + "step": 2960 + }, + { + "dpo_losses": 0.6690041422843933, + "epoch": 0.78, + "grad_norm": 8.081258553414122, + "learning_rate": 7.185559585035136e-08, + "logits/chosen": -2.8309693336486816, + "logits/rejected": -2.8083622455596924, + "logps/chosen": -219.78732299804688, + "logps/rejected": -195.7440643310547, + "loss": 0.6817, + "positive_losses": 0.05683441087603569, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14709046483039856, + "rewards/margins": 0.05245544761419296, + "rewards/margins_max": 0.172337144613266, + "rewards/margins_min": -0.05212901905179024, + "rewards/margins_std": 0.10182130336761475, + "rewards/rejected": 0.0946350246667862, + "step": 2970 + }, + { + "dpo_losses": 0.6723123788833618, + "epoch": 0.78, + "grad_norm": 2.180158448016903, + "learning_rate": 7.026029219436502e-08, + "logits/chosen": -2.7011642456054688, + "logits/rejected": -2.672740936279297, + "logps/chosen": -292.22528076171875, + "logps/rejected": -297.1006164550781, + "loss": 0.6716, + "positive_losses": 0.20120945572853088, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.15260522067546844, + "rewards/margins": 0.04637325555086136, + "rewards/margins_max": 0.19567745923995972, + "rewards/margins_min": -0.10428061336278915, + "rewards/margins_std": 0.1315806806087494, + "rewards/rejected": 0.10623196512460709, + "step": 2980 + }, + { + "dpo_losses": 0.6591792106628418, + "epoch": 0.78, + "grad_norm": 2.4447852593104873, + "learning_rate": 6.867999675225522e-08, + "logits/chosen": -2.7815351486206055, + "logits/rejected": -2.7108778953552246, + "logps/chosen": -291.34375, + "logps/rejected": -299.01239013671875, + "loss": 0.6753, + "positive_losses": 0.1278236359357834, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.1643422693014145, + "rewards/margins": 0.0730084627866745, + "rewards/margins_max": 0.2041645050048828, + "rewards/margins_min": -0.034510623663663864, + "rewards/margins_std": 0.11097339540719986, + "rewards/rejected": 0.09133382886648178, + "step": 2990 + }, + { + "dpo_losses": 0.6654072999954224, + "epoch": 0.79, + "grad_norm": 2.581985542481455, + "learning_rate": 6.711484147823662e-08, + "logits/chosen": -2.7847719192504883, + "logits/rejected": -2.7462828159332275, + "logps/chosen": -224.4231719970703, + "logps/rejected": -255.90652465820312, + "loss": 0.6741, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.16300776600837708, + "rewards/margins": 0.05929435044527054, + "rewards/margins_max": 0.18764983117580414, + "rewards/margins_min": -0.046713314950466156, + "rewards/margins_std": 0.10569945722818375, + "rewards/rejected": 0.10371343791484833, + "step": 3000 + }, + { + "epoch": 0.79, + "eval_dpo_losses": 0.6630541682243347, + "eval_logits/chosen": -2.7844808101654053, + "eval_logits/rejected": -2.7461202144622803, + "eval_logps/chosen": -268.1099548339844, + "eval_logps/rejected": -252.24093627929688, + "eval_loss": 0.6808385252952576, + "eval_positive_losses": 0.1180451363325119, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.16384930908679962, + "eval_rewards/margins": 0.06476480513811111, + "eval_rewards/margins_max": 0.2480427622795105, + "eval_rewards/margins_min": -0.10046318173408508, + "eval_rewards/margins_std": 0.11531291902065277, + "eval_rewards/rejected": 0.0990845113992691, + "eval_runtime": 389.5682, + "eval_samples_per_second": 5.134, + "eval_steps_per_second": 0.162, + "step": 3000 + }, + { + "dpo_losses": 0.6563786268234253, + "epoch": 0.79, + "grad_norm": 2.150591737799825, + "learning_rate": 6.556495706232412e-08, + "logits/chosen": -2.824885368347168, + "logits/rejected": -2.781872272491455, + "logps/chosen": -292.45086669921875, + "logps/rejected": -249.26254272460938, + "loss": 0.6716, + "positive_losses": 0.17349128425121307, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16372860968112946, + "rewards/margins": 0.07855953276157379, + "rewards/margins_max": 0.19966019690036774, + "rewards/margins_min": -0.03546585515141487, + "rewards/margins_std": 0.10650608688592911, + "rewards/rejected": 0.08516907691955566, + "step": 3010 + }, + { + "dpo_losses": 0.6640094518661499, + "epoch": 0.79, + "grad_norm": 6.759203252845801, + "learning_rate": 6.403047291942057e-08, + "logits/chosen": -2.7413415908813477, + "logits/rejected": -2.7051734924316406, + "logps/chosen": -225.3956756591797, + "logps/rejected": -183.965087890625, + "loss": 0.6774, + "positive_losses": 0.107610322535038, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1544293761253357, + "rewards/margins": 0.06344745308160782, + "rewards/margins_max": 0.18341758847236633, + "rewards/margins_min": -0.0742439404129982, + "rewards/margins_std": 0.10965760797262192, + "rewards/rejected": 0.09098193049430847, + "step": 3020 + }, + { + "dpo_losses": 0.6746552586555481, + "epoch": 0.79, + "grad_norm": 1.989812534837794, + "learning_rate": 6.251151717851021e-08, + "logits/chosen": -2.761890172958374, + "logits/rejected": -2.772231340408325, + "logps/chosen": -210.30636596679688, + "logps/rejected": -281.67205810546875, + "loss": 0.6891, + "positive_losses": 0.038213349878787994, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.15557453036308289, + "rewards/margins": 0.040930986404418945, + "rewards/margins_max": 0.14809675514698029, + "rewards/margins_min": -0.07007072120904922, + "rewards/margins_std": 0.09487451612949371, + "rewards/rejected": 0.11464353650808334, + "step": 3030 + }, + { + "dpo_losses": 0.6591284275054932, + "epoch": 0.8, + "grad_norm": 8.24123633803779, + "learning_rate": 6.100821667196041e-08, + "logits/chosen": -2.8116676807403564, + "logits/rejected": -2.753418445587158, + "logps/chosen": -283.1061706542969, + "logps/rejected": -318.7002868652344, + "loss": 0.6655, + "positive_losses": 0.12325821071863174, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.19477012753486633, + "rewards/margins": 0.07227279245853424, + "rewards/margins_max": 0.18873360753059387, + "rewards/margins_min": -0.058577846735715866, + "rewards/margins_std": 0.111363984644413, + "rewards/rejected": 0.12249733507633209, + "step": 3040 + }, + { + "dpo_losses": 0.6539517045021057, + "epoch": 0.8, + "grad_norm": 105.3660147785447, + "learning_rate": 5.952069692493061e-08, + "logits/chosen": -2.8027291297912598, + "logits/rejected": -2.773585796356201, + "logps/chosen": -280.40838623046875, + "logps/rejected": -266.98675537109375, + "loss": 0.677, + "positive_losses": 0.14699554443359375, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16817526519298553, + "rewards/margins": 0.08418744802474976, + "rewards/margins_max": 0.21738891303539276, + "rewards/margins_min": -0.03661995381116867, + "rewards/margins_std": 0.1140676960349083, + "rewards/rejected": 0.08398783951997757, + "step": 3050 + }, + { + "dpo_losses": 0.6710882186889648, + "epoch": 0.8, + "grad_norm": 2.4427182997857435, + "learning_rate": 5.8049082144891794e-08, + "logits/chosen": -2.7883102893829346, + "logits/rejected": -2.7700586318969727, + "logps/chosen": -290.86627197265625, + "logps/rejected": -272.36419677734375, + "loss": 0.6659, + "positive_losses": 0.049817658960819244, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15865769982337952, + "rewards/margins": 0.04813426360487938, + "rewards/margins_max": 0.17915073037147522, + "rewards/margins_min": -0.06630988419055939, + "rewards/margins_std": 0.11496686935424805, + "rewards/rejected": 0.11052343994379044, + "step": 3060 + }, + { + "dpo_losses": 0.6638559103012085, + "epoch": 0.8, + "grad_norm": 12.267114011928014, + "learning_rate": 5.659349521125459e-08, + "logits/chosen": -2.6516406536102295, + "logits/rejected": -2.6265311241149902, + "logps/chosen": -234.54946899414062, + "logps/rejected": -211.96896362304688, + "loss": 0.6734, + "positive_losses": 0.022372817620635033, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1678735613822937, + "rewards/margins": 0.06170881539583206, + "rewards/margins_max": 0.16219031810760498, + "rewards/margins_min": -0.03331523388624191, + "rewards/margins_std": 0.08575156331062317, + "rewards/rejected": 0.10616473853588104, + "step": 3070 + }, + { + "dpo_losses": 0.6670681834220886, + "epoch": 0.81, + "grad_norm": 7.370967473746132, + "learning_rate": 5.5154057665109e-08, + "logits/chosen": -2.777247190475464, + "logits/rejected": -2.767956495285034, + "logps/chosen": -224.2619171142578, + "logps/rejected": -257.23052978515625, + "loss": 0.6841, + "positive_losses": 0.1429794281721115, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15937839448451996, + "rewards/margins": 0.05837244912981987, + "rewards/margins_max": 0.1938154101371765, + "rewards/margins_min": -0.0703662633895874, + "rewards/margins_std": 0.11851917207241058, + "rewards/rejected": 0.10100595653057098, + "step": 3080 + }, + { + "dpo_losses": 0.6655889749526978, + "epoch": 0.81, + "grad_norm": 1.9205130736947094, + "learning_rate": 5.3730889699075853e-08, + "logits/chosen": -2.8654658794403076, + "logits/rejected": -2.750743865966797, + "logps/chosen": -295.4491271972656, + "logps/rejected": -248.1079864501953, + "loss": 0.6867, + "positive_losses": 0.30791252851486206, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1357293426990509, + "rewards/margins": 0.05936115235090256, + "rewards/margins_max": 0.17789778113365173, + "rewards/margins_min": -0.05033574625849724, + "rewards/margins_std": 0.09862684458494186, + "rewards/rejected": 0.07636817544698715, + "step": 3090 + }, + { + "dpo_losses": 0.6691070795059204, + "epoch": 0.81, + "grad_norm": 15.561858749966415, + "learning_rate": 5.2324110147270893e-08, + "logits/chosen": -2.8081393241882324, + "logits/rejected": -2.7991766929626465, + "logps/chosen": -278.1468505859375, + "logps/rejected": -275.0913391113281, + "loss": 0.6856, + "positive_losses": 0.03154640272259712, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.17184340953826904, + "rewards/margins": 0.05207213759422302, + "rewards/margins_max": 0.1620274782180786, + "rewards/margins_min": -0.06637358665466309, + "rewards/margins_std": 0.10301927477121353, + "rewards/rejected": 0.11977127939462662, + "step": 3100 + }, + { + "epoch": 0.81, + "eval_dpo_losses": 0.6627760529518127, + "eval_logits/chosen": -2.7823197841644287, + "eval_logits/rejected": -2.743809223175049, + "eval_logps/chosen": -268.22344970703125, + "eval_logps/rejected": -252.41836547851562, + "eval_loss": 0.6812266707420349, + "eval_positive_losses": 0.12757235765457153, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.162714421749115, + "eval_rewards/margins": 0.06540438532829285, + "eval_rewards/margins_max": 0.2497921884059906, + "eval_rewards/margins_min": -0.10203025490045547, + "eval_rewards/margins_std": 0.1164335235953331, + "eval_rewards/rejected": 0.09731005132198334, + "eval_runtime": 390.3466, + "eval_samples_per_second": 5.124, + "eval_steps_per_second": 0.161, + "step": 3100 + }, + { + "dpo_losses": 0.6689059138298035, + "epoch": 0.81, + "grad_norm": 2.2747325027317773, + "learning_rate": 5.0933836475381795e-08, + "logits/chosen": -2.811298131942749, + "logits/rejected": -2.7352309226989746, + "logps/chosen": -325.28240966796875, + "logps/rejected": -284.04608154296875, + "loss": 0.6792, + "positive_losses": 0.2840765118598938, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.16880102455615997, + "rewards/margins": 0.05320410802960396, + "rewards/margins_max": 0.20439854264259338, + "rewards/margins_min": -0.06459472328424454, + "rewards/margins_std": 0.12115363031625748, + "rewards/rejected": 0.11559691280126572, + "step": 3110 + }, + { + "dpo_losses": 0.6610020995140076, + "epoch": 0.82, + "grad_norm": 1.9238160418600554, + "learning_rate": 4.956018477086005e-08, + "logits/chosen": -2.8051600456237793, + "logits/rejected": -2.7575900554656982, + "logps/chosen": -258.21368408203125, + "logps/rejected": -248.81930541992188, + "loss": 0.6664, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.17074163258075714, + "rewards/margins": 0.06870418787002563, + "rewards/margins_max": 0.1845478117465973, + "rewards/margins_min": -0.056457966566085815, + "rewards/margins_std": 0.10763120651245117, + "rewards/rejected": 0.10203742980957031, + "step": 3120 + }, + { + "dpo_losses": 0.6620965003967285, + "epoch": 0.82, + "grad_norm": 1.8853223857876538, + "learning_rate": 4.820326973322763e-08, + "logits/chosen": -2.7807884216308594, + "logits/rejected": -2.77821683883667, + "logps/chosen": -241.8834686279297, + "logps/rejected": -221.46566772460938, + "loss": 0.6621, + "positive_losses": 0.169139102101326, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.169694721698761, + "rewards/margins": 0.06687848269939423, + "rewards/margins_max": 0.2040960043668747, + "rewards/margins_min": -0.06565778702497482, + "rewards/margins_std": 0.12235186249017715, + "rewards/rejected": 0.10281624644994736, + "step": 3130 + }, + { + "dpo_losses": 0.6488478779792786, + "epoch": 0.82, + "grad_norm": 2.114513665415584, + "learning_rate": 4.686320466449981e-08, + "logits/chosen": -2.6740877628326416, + "logits/rejected": -2.723538875579834, + "logps/chosen": -207.9197235107422, + "logps/rejected": -207.06576538085938, + "loss": 0.6765, + "positive_losses": 0.0, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.16196046769618988, + "rewards/margins": 0.09336929768323898, + "rewards/margins_max": 0.211838960647583, + "rewards/margins_min": -0.024275779724121094, + "rewards/margins_std": 0.10504420846700668, + "rewards/rejected": 0.0685911625623703, + "step": 3140 + }, + { + "dpo_losses": 0.6786141991615295, + "epoch": 0.82, + "grad_norm": 2.2968035836048664, + "learning_rate": 4.554010145972417e-08, + "logits/chosen": -2.783653736114502, + "logits/rejected": -2.8021936416625977, + "logps/chosen": -274.2801208496094, + "logps/rejected": -278.07366943359375, + "loss": 0.6777, + "positive_losses": 0.1276847869157791, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.15860846638679504, + "rewards/margins": 0.033648617565631866, + "rewards/margins_max": 0.18964725732803345, + "rewards/margins_min": -0.11228775978088379, + "rewards/margins_std": 0.13103614747524261, + "rewards/rejected": 0.12495984137058258, + "step": 3150 + }, + { + "dpo_losses": 0.6525992155075073, + "epoch": 0.83, + "grad_norm": 6.062725888792658, + "learning_rate": 4.423407059763745e-08, + "logits/chosen": -2.8857107162475586, + "logits/rejected": -2.815931558609009, + "logps/chosen": -311.3929443359375, + "logps/rejected": -211.2099609375, + "loss": 0.6679, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18596258759498596, + "rewards/margins": 0.08752071857452393, + "rewards/margins_max": 0.2264481484889984, + "rewards/margins_min": -0.046439796686172485, + "rewards/margins_std": 0.12269928306341171, + "rewards/rejected": 0.09844187647104263, + "step": 3160 + }, + { + "dpo_losses": 0.6678873896598816, + "epoch": 0.83, + "grad_norm": 9.53360466559852, + "learning_rate": 4.294522113144078e-08, + "logits/chosen": -2.8117034435272217, + "logits/rejected": -2.791072368621826, + "logps/chosen": -299.37103271484375, + "logps/rejected": -281.37615966796875, + "loss": 0.6766, + "positive_losses": 0.3987411558628082, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1582927405834198, + "rewards/margins": 0.05428671836853027, + "rewards/margins_max": 0.17796790599822998, + "rewards/margins_min": -0.06260037422180176, + "rewards/margins_std": 0.11026357114315033, + "rewards/rejected": 0.10400601476430893, + "step": 3170 + }, + { + "dpo_losses": 0.6631403565406799, + "epoch": 0.83, + "grad_norm": 8.95303126314368, + "learning_rate": 4.1673660679693804e-08, + "logits/chosen": -2.6631081104278564, + "logits/rejected": -2.671422243118286, + "logps/chosen": -269.2955017089844, + "logps/rejected": -231.2158203125, + "loss": 0.6828, + "positive_losses": 0.08814601600170135, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.17081721127033234, + "rewards/margins": 0.0644526481628418, + "rewards/margins_max": 0.17851416766643524, + "rewards/margins_min": -0.05811725929379463, + "rewards/margins_std": 0.10786614567041397, + "rewards/rejected": 0.10636456310749054, + "step": 3180 + }, + { + "dpo_losses": 0.660934567451477, + "epoch": 0.83, + "grad_norm": 1.9643955819256005, + "learning_rate": 4.041949541732825e-08, + "logits/chosen": -2.74068021774292, + "logits/rejected": -2.6296112537384033, + "logps/chosen": -231.2447052001953, + "logps/rejected": -222.8256072998047, + "loss": 0.6751, + "positive_losses": 0.3026248812675476, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1583462655544281, + "rewards/margins": 0.07087197154760361, + "rewards/margins_max": 0.22593124210834503, + "rewards/margins_min": -0.08451506495475769, + "rewards/margins_std": 0.14156809449195862, + "rewards/rejected": 0.08747430145740509, + "step": 3190 + }, + { + "dpo_losses": 0.6655724048614502, + "epoch": 0.84, + "grad_norm": 1.8090603516961787, + "learning_rate": 3.9182830066782605e-08, + "logits/chosen": -2.692063570022583, + "logits/rejected": -2.7058863639831543, + "logps/chosen": -261.7070007324219, + "logps/rejected": -234.983154296875, + "loss": 0.6678, + "positive_losses": 0.02298889122903347, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.144895538687706, + "rewards/margins": 0.05822502821683884, + "rewards/margins_max": 0.1596185564994812, + "rewards/margins_min": -0.04484047740697861, + "rewards/margins_std": 0.08938737213611603, + "rewards/rejected": 0.08667052537202835, + "step": 3200 + }, + { + "epoch": 0.84, + "eval_dpo_losses": 0.6627413630485535, + "eval_logits/chosen": -2.7853500843048096, + "eval_logits/rejected": -2.747154712677002, + "eval_logps/chosen": -268.1344909667969, + "eval_logps/rejected": -252.33535766601562, + "eval_loss": 0.6808730363845825, + "eval_positive_losses": 0.12440192699432373, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.1636039912700653, + "eval_rewards/margins": 0.06546396762132645, + "eval_rewards/margins_max": 0.25000351667404175, + "eval_rewards/margins_min": -0.10156844556331635, + "eval_rewards/margins_std": 0.11613842099905014, + "eval_rewards/rejected": 0.09814003854990005, + "eval_runtime": 389.6112, + "eval_samples_per_second": 5.133, + "eval_steps_per_second": 0.162, + "step": 3200 + }, + { + "dpo_losses": 0.6610216498374939, + "epoch": 0.84, + "grad_norm": 8.077637648500255, + "learning_rate": 3.79637678892577e-08, + "logits/chosen": -2.8092358112335205, + "logits/rejected": -2.786480188369751, + "logps/chosen": -241.1995391845703, + "logps/rejected": -257.9070739746094, + "loss": 0.677, + "positive_losses": 0.05450744554400444, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14913347363471985, + "rewards/margins": 0.06916774064302444, + "rewards/margins_max": 0.20991599559783936, + "rewards/margins_min": -0.05254759639501572, + "rewards/margins_std": 0.11959397792816162, + "rewards/rejected": 0.0799657329916954, + "step": 3210 + }, + { + "dpo_losses": 0.6678776741027832, + "epoch": 0.84, + "grad_norm": 2.4239630773017296, + "learning_rate": 3.6762410676094645e-08, + "logits/chosen": -2.718482494354248, + "logits/rejected": -2.672642469406128, + "logps/chosen": -247.31820678710938, + "logps/rejected": -254.6053009033203, + "loss": 0.6796, + "positive_losses": 0.03541602939367294, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1772005707025528, + "rewards/margins": 0.05370795726776123, + "rewards/margins_max": 0.16246002912521362, + "rewards/margins_min": -0.04772583767771721, + "rewards/margins_std": 0.09779927134513855, + "rewards/rejected": 0.12349263578653336, + "step": 3220 + }, + { + "dpo_losses": 0.6720653772354126, + "epoch": 0.85, + "grad_norm": 2.523629479442598, + "learning_rate": 3.557885874027497e-08, + "logits/chosen": -2.775148868560791, + "logits/rejected": -2.758457899093628, + "logps/chosen": -300.9856872558594, + "logps/rejected": -315.10015869140625, + "loss": 0.6848, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14731472730636597, + "rewards/margins": 0.048546768724918365, + "rewards/margins_max": 0.21185681223869324, + "rewards/margins_min": -0.10242996364831924, + "rewards/margins_std": 0.13624873757362366, + "rewards/rejected": 0.098767951130867, + "step": 3230 + }, + { + "dpo_losses": 0.6623659133911133, + "epoch": 0.85, + "grad_norm": 11.640982848945923, + "learning_rate": 3.441321090804469e-08, + "logits/chosen": -2.9063901901245117, + "logits/rejected": -2.827007293701172, + "logps/chosen": -249.5447998046875, + "logps/rejected": -290.7762145996094, + "loss": 0.6814, + "positive_losses": 0.161967471241951, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.1393720656633377, + "rewards/margins": 0.06511013209819794, + "rewards/margins_max": 0.16636310517787933, + "rewards/margins_min": -0.03109910525381565, + "rewards/margins_std": 0.09238677471876144, + "rewards/rejected": 0.07426193356513977, + "step": 3240 + }, + { + "dpo_losses": 0.6672154068946838, + "epoch": 0.85, + "grad_norm": 10.181580491202146, + "learning_rate": 3.326556451066234e-08, + "logits/chosen": -2.848665952682495, + "logits/rejected": -2.8185975551605225, + "logps/chosen": -284.4683532714844, + "logps/rejected": -275.9518127441406, + "loss": 0.6871, + "positive_losses": 0.097559355199337, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15505456924438477, + "rewards/margins": 0.055702775716781616, + "rewards/margins_max": 0.17600347101688385, + "rewards/margins_min": -0.041048236191272736, + "rewards/margins_std": 0.09688388556241989, + "rewards/rejected": 0.09935178607702255, + "step": 3250 + }, + { + "dpo_losses": 0.6560079455375671, + "epoch": 0.85, + "grad_norm": 14.361829479425445, + "learning_rate": 3.2136015376271946e-08, + "logits/chosen": -2.8152987957000732, + "logits/rejected": -2.8128440380096436, + "logps/chosen": -237.4013214111328, + "logps/rejected": -266.92230224609375, + "loss": 0.6727, + "positive_losses": 0.08050384372472763, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15222205221652985, + "rewards/margins": 0.0787474662065506, + "rewards/margins_max": 0.198978453874588, + "rewards/margins_min": -0.04368770867586136, + "rewards/margins_std": 0.10543539375066757, + "rewards/rejected": 0.07347457110881805, + "step": 3260 + }, + { + "dpo_losses": 0.6642253994941711, + "epoch": 0.86, + "grad_norm": 1.8469226501824152, + "learning_rate": 3.102465782190106e-08, + "logits/chosen": -2.7718749046325684, + "logits/rejected": -2.741318702697754, + "logps/chosen": -236.1094970703125, + "logps/rejected": -247.9010772705078, + "loss": 0.6721, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15767602622509003, + "rewards/margins": 0.06289416551589966, + "rewards/margins_max": 0.21374940872192383, + "rewards/margins_min": -0.06856616586446762, + "rewards/margins_std": 0.12185641378164291, + "rewards/rejected": 0.09478186070919037, + "step": 3270 + }, + { + "dpo_losses": 0.6692901849746704, + "epoch": 0.86, + "grad_norm": 2.1990584049075594, + "learning_rate": 2.993158464558565e-08, + "logits/chosen": -2.859236717224121, + "logits/rejected": -2.7646913528442383, + "logps/chosen": -198.5013427734375, + "logps/rejected": -175.32022094726562, + "loss": 0.6738, + "positive_losses": 0.055785369127988815, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1464720368385315, + "rewards/margins": 0.052502166479825974, + "rewards/margins_max": 0.19133032858371735, + "rewards/margins_min": -0.0839792937040329, + "rewards/margins_std": 0.11933982372283936, + "rewards/rejected": 0.09396988153457642, + "step": 3280 + }, + { + "dpo_losses": 0.646541953086853, + "epoch": 0.86, + "grad_norm": 9.628112685985919, + "learning_rate": 2.8856887118621358e-08, + "logits/chosen": -2.747638702392578, + "logits/rejected": -2.724984645843506, + "logps/chosen": -279.7018737792969, + "logps/rejected": -280.64251708984375, + "loss": 0.6694, + "positive_losses": 0.02620544470846653, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.17957869172096252, + "rewards/margins": 0.09934578835964203, + "rewards/margins_max": 0.22174513339996338, + "rewards/margins_min": -0.04211106896400452, + "rewards/margins_std": 0.11883123964071274, + "rewards/rejected": 0.0802329033613205, + "step": 3290 + }, + { + "dpo_losses": 0.6547456383705139, + "epoch": 0.86, + "grad_norm": 13.829934556741401, + "learning_rate": 2.7800654977942482e-08, + "logits/chosen": -2.870971202850342, + "logits/rejected": -2.828831911087036, + "logps/chosen": -324.831787109375, + "logps/rejected": -328.6128234863281, + "loss": 0.6786, + "positive_losses": 0.2483566254377365, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.1950651854276657, + "rewards/margins": 0.08232709765434265, + "rewards/margins_max": 0.21603958308696747, + "rewards/margins_min": -0.06784708052873611, + "rewards/margins_std": 0.12510669231414795, + "rewards/rejected": 0.11273808777332306, + "step": 3300 + }, + { + "epoch": 0.86, + "eval_dpo_losses": 0.6626967787742615, + "eval_logits/chosen": -2.7811217308044434, + "eval_logits/rejected": -2.7425291538238525, + "eval_logps/chosen": -268.10919189453125, + "eval_logps/rejected": -252.32167053222656, + "eval_loss": 0.6810629963874817, + "eval_positive_losses": 0.12673041224479675, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.16385717689990997, + "eval_rewards/margins": 0.06558007746934891, + "eval_rewards/margins_max": 0.2502087950706482, + "eval_rewards/margins_min": -0.10188627988100052, + "eval_rewards/margins_std": 0.1165127083659172, + "eval_rewards/rejected": 0.09827709943056107, + "eval_runtime": 389.5445, + "eval_samples_per_second": 5.134, + "eval_steps_per_second": 0.162, + "step": 3300 + }, + { + "dpo_losses": 0.6536380052566528, + "epoch": 0.87, + "grad_norm": 10.468335317113388, + "learning_rate": 2.676297641862879e-08, + "logits/chosen": -2.733778476715088, + "logits/rejected": -2.673896551132202, + "logps/chosen": -204.01513671875, + "logps/rejected": -214.28353881835938, + "loss": 0.6713, + "positive_losses": 0.09598731994628906, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14947354793548584, + "rewards/margins": 0.08431442081928253, + "rewards/margins_max": 0.2124217450618744, + "rewards/margins_min": -0.031432487070560455, + "rewards/margins_std": 0.10755596309900284, + "rewards/rejected": 0.0651591345667839, + "step": 3310 + }, + { + "dpo_losses": 0.6592745780944824, + "epoch": 0.87, + "grad_norm": 2.4810075309912802, + "learning_rate": 2.5743938086541352e-08, + "logits/chosen": -2.5734775066375732, + "logits/rejected": -2.5956883430480957, + "logps/chosen": -288.0391540527344, + "logps/rejected": -232.1073455810547, + "loss": 0.6769, + "positive_losses": 0.35831108689308167, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16860854625701904, + "rewards/margins": 0.0769532322883606, + "rewards/margins_max": 0.2719673216342926, + "rewards/margins_min": -0.0690711960196495, + "rewards/margins_std": 0.15144231915473938, + "rewards/rejected": 0.09165529161691666, + "step": 3320 + }, + { + "dpo_losses": 0.6756593585014343, + "epoch": 0.87, + "grad_norm": 11.08969308523051, + "learning_rate": 2.474362507108757e-08, + "logits/chosen": -2.821928024291992, + "logits/rejected": -2.852421760559082, + "logps/chosen": -265.82354736328125, + "logps/rejected": -295.21044921875, + "loss": 0.6664, + "positive_losses": 0.0, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.15832513570785522, + "rewards/margins": 0.03733016178011894, + "rewards/margins_max": 0.13323178887367249, + "rewards/margins_min": -0.049654535949230194, + "rewards/margins_std": 0.0812670961022377, + "rewards/rejected": 0.1209949478507042, + "step": 3330 + }, + { + "dpo_losses": 0.6848273277282715, + "epoch": 0.87, + "grad_norm": 1.935696162097246, + "learning_rate": 2.3762120898116495e-08, + "logits/chosen": -2.805917501449585, + "logits/rejected": -2.8301444053649902, + "logps/chosen": -257.66290283203125, + "logps/rejected": -310.5917053222656, + "loss": 0.6745, + "positive_losses": 0.20382681488990784, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.13177986443042755, + "rewards/margins": 0.019999397918581963, + "rewards/margins_max": 0.14217321574687958, + "rewards/margins_min": -0.11165271699428558, + "rewards/margins_std": 0.11129863560199738, + "rewards/rejected": 0.11178047955036163, + "step": 3340 + }, + { + "dpo_losses": 0.6613301038742065, + "epoch": 0.88, + "grad_norm": 6.993539280993901, + "learning_rate": 2.2799507522944044e-08, + "logits/chosen": -2.863429546356201, + "logits/rejected": -2.7525746822357178, + "logps/chosen": -277.91192626953125, + "logps/rejected": -247.7657928466797, + "loss": 0.6798, + "positive_losses": 0.2877071499824524, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16407844424247742, + "rewards/margins": 0.06978114694356918, + "rewards/margins_max": 0.21672868728637695, + "rewards/margins_min": -0.06434138119220734, + "rewards/margins_std": 0.1292802393436432, + "rewards/rejected": 0.09429730474948883, + "step": 3350 + }, + { + "dpo_losses": 0.6659659147262573, + "epoch": 0.88, + "grad_norm": 1.7625116902498015, + "learning_rate": 2.1855865323510054e-08, + "logits/chosen": -2.8760766983032227, + "logits/rejected": -2.812631845474243, + "logps/chosen": -266.16845703125, + "logps/rejected": -319.29388427734375, + "loss": 0.6708, + "positive_losses": 0.0024131773971021175, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.15420962870121002, + "rewards/margins": 0.05738378316164017, + "rewards/margins_max": 0.15622182190418243, + "rewards/margins_min": -0.0385439433157444, + "rewards/margins_std": 0.08907020092010498, + "rewards/rejected": 0.09682585299015045, + "step": 3360 + }, + { + "dpo_losses": 0.6743310689926147, + "epoch": 0.88, + "grad_norm": 8.324356228334473, + "learning_rate": 2.0931273093666573e-08, + "logits/chosen": -2.8275797367095947, + "logits/rejected": -2.8362526893615723, + "logps/chosen": -254.84768676757812, + "logps/rejected": -265.889404296875, + "loss": 0.6689, + "positive_losses": 0.006755066104233265, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16076824069023132, + "rewards/margins": 0.04187941551208496, + "rewards/margins_max": 0.18899044394493103, + "rewards/margins_min": -0.08825278282165527, + "rewards/margins_std": 0.12628883123397827, + "rewards/rejected": 0.11888883262872696, + "step": 3370 + }, + { + "dpo_losses": 0.669025182723999, + "epoch": 0.88, + "grad_norm": 1.7763929569667392, + "learning_rate": 2.002580803659873e-08, + "logits/chosen": -2.854543685913086, + "logits/rejected": -2.7524867057800293, + "logps/chosen": -269.3807373046875, + "logps/rejected": -230.5249786376953, + "loss": 0.6985, + "positive_losses": 0.07552261650562286, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.13878145813941956, + "rewards/margins": 0.05284886434674263, + "rewards/margins_max": 0.17732667922973633, + "rewards/margins_min": -0.05076334998011589, + "rewards/margins_std": 0.10122529417276382, + "rewards/rejected": 0.08593259006738663, + "step": 3380 + }, + { + "dpo_losses": 0.6597247123718262, + "epoch": 0.89, + "grad_norm": 1.943915972722519, + "learning_rate": 1.9139545758378256e-08, + "logits/chosen": -2.76768159866333, + "logits/rejected": -2.7349982261657715, + "logps/chosen": -241.35092163085938, + "logps/rejected": -259.1753845214844, + "loss": 0.6668, + "positive_losses": 0.09793557971715927, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15099754929542542, + "rewards/margins": 0.07219112664461136, + "rewards/margins_max": 0.21532170474529266, + "rewards/margins_min": -0.08124847710132599, + "rewards/margins_std": 0.12921682000160217, + "rewards/rejected": 0.07880643010139465, + "step": 3390 + }, + { + "dpo_losses": 0.6647534370422363, + "epoch": 0.89, + "grad_norm": 1.8916313078456324, + "learning_rate": 1.8272560261650277e-08, + "logits/chosen": -2.796069860458374, + "logits/rejected": -2.7577872276306152, + "logps/chosen": -260.0286865234375, + "logps/rejected": -275.72344970703125, + "loss": 0.675, + "positive_losses": 0.06406746059656143, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.14727066457271576, + "rewards/margins": 0.060552507638931274, + "rewards/margins_max": 0.1828928291797638, + "rewards/margins_min": -0.04774991795420647, + "rewards/margins_std": 0.10030458122491837, + "rewards/rejected": 0.08671815693378448, + "step": 3400 + }, + { + "epoch": 0.89, + "eval_dpo_losses": 0.6627320647239685, + "eval_logits/chosen": -2.7832562923431396, + "eval_logits/rejected": -2.744842767715454, + "eval_logps/chosen": -268.03973388671875, + "eval_logps/rejected": -252.24195861816406, + "eval_loss": 0.680766224861145, + "eval_positive_losses": 0.12215282768011093, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.16455139219760895, + "eval_rewards/margins": 0.06547729671001434, + "eval_rewards/margins_max": 0.2496509701013565, + "eval_rewards/margins_min": -0.10108717530965805, + "eval_rewards/margins_std": 0.11614864319562912, + "eval_rewards/rejected": 0.09907408058643341, + "eval_runtime": 390.0194, + "eval_samples_per_second": 5.128, + "eval_steps_per_second": 0.162, + "step": 3400 + }, + { + "dpo_losses": 0.6754037141799927, + "epoch": 0.89, + "grad_norm": 1.626861972705387, + "learning_rate": 1.742492393945427e-08, + "logits/chosen": -2.612673282623291, + "logits/rejected": -2.6271557807922363, + "logps/chosen": -228.8592071533203, + "logps/rejected": -260.7303466796875, + "loss": 0.6826, + "positive_losses": 0.2633230686187744, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11556209623813629, + "rewards/margins": 0.03941815719008446, + "rewards/margins_max": 0.1536892205476761, + "rewards/margins_min": -0.08365072309970856, + "rewards/margins_std": 0.1035921722650528, + "rewards/rejected": 0.07614392787218094, + "step": 3410 + }, + { + "dpo_losses": 0.6367681622505188, + "epoch": 0.9, + "grad_norm": 2.493987865146216, + "learning_rate": 1.6596707569179302e-08, + "logits/chosen": -2.8057358264923096, + "logits/rejected": -2.7106568813323975, + "logps/chosen": -344.5505676269531, + "logps/rejected": -280.78411865234375, + "loss": 0.6756, + "positive_losses": 0.057952117174863815, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.18896682560443878, + "rewards/margins": 0.12057340145111084, + "rewards/margins_max": 0.27455177903175354, + "rewards/margins_min": -0.008689996786415577, + "rewards/margins_std": 0.12518426775932312, + "rewards/rejected": 0.06839345395565033, + "step": 3420 + }, + { + "dpo_losses": 0.6571991443634033, + "epoch": 0.9, + "grad_norm": 2.113252819384774, + "learning_rate": 1.5787980306653848e-08, + "logits/chosen": -2.8217930793762207, + "logits/rejected": -2.7686524391174316, + "logps/chosen": -317.1985778808594, + "logps/rejected": -278.02716064453125, + "loss": 0.6726, + "positive_losses": 0.03348579257726669, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1714903563261032, + "rewards/margins": 0.07688074558973312, + "rewards/margins_max": 0.20531579852104187, + "rewards/margins_min": -0.014707878232002258, + "rewards/margins_std": 0.10292376577854156, + "rewards/rejected": 0.09460960328578949, + "step": 3430 + }, + { + "dpo_losses": 0.6721744537353516, + "epoch": 0.9, + "grad_norm": 1.8059559117382162, + "learning_rate": 1.499880968037165e-08, + "logits/chosen": -2.713167667388916, + "logits/rejected": -2.7415285110473633, + "logps/chosen": -268.3160400390625, + "logps/rejected": -277.2546691894531, + "loss": 0.6743, + "positive_losses": 0.1692478209733963, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.14324799180030823, + "rewards/margins": 0.045404307544231415, + "rewards/margins_max": 0.15480300784111023, + "rewards/margins_min": -0.07178832590579987, + "rewards/margins_std": 0.10168786346912384, + "rewards/rejected": 0.09784368425607681, + "step": 3440 + }, + { + "dpo_losses": 0.6616576313972473, + "epoch": 0.9, + "grad_norm": 10.562875594283618, + "learning_rate": 1.4229261585852803e-08, + "logits/chosen": -2.7967777252197266, + "logits/rejected": -2.7842721939086914, + "logps/chosen": -257.3331298828125, + "logps/rejected": -245.29470825195312, + "loss": 0.6779, + "positive_losses": 0.047858428210020065, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1710709184408188, + "rewards/margins": 0.06789450347423553, + "rewards/margins_max": 0.17945295572280884, + "rewards/margins_min": -0.05227668210864067, + "rewards/margins_std": 0.10495837032794952, + "rewards/rejected": 0.10317642986774445, + "step": 3450 + }, + { + "dpo_losses": 0.6492521166801453, + "epoch": 0.91, + "grad_norm": 3.0638981955525155, + "learning_rate": 1.3479400280141883e-08, + "logits/chosen": -2.8031203746795654, + "logits/rejected": -2.7489144802093506, + "logps/chosen": -299.0844421386719, + "logps/rejected": -206.0711212158203, + "loss": 0.667, + "positive_losses": 0.0, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.1845073699951172, + "rewards/margins": 0.09424134343862534, + "rewards/margins_max": 0.22934171557426453, + "rewards/margins_min": -0.028472676873207092, + "rewards/margins_std": 0.1167953833937645, + "rewards/rejected": 0.09026604890823364, + "step": 3460 + }, + { + "dpo_losses": 0.6643989682197571, + "epoch": 0.91, + "grad_norm": 6.962641343714408, + "learning_rate": 1.2749288376442042e-08, + "logits/chosen": -2.7879843711853027, + "logits/rejected": -2.724153757095337, + "logps/chosen": -260.9576721191406, + "logps/rejected": -277.0984191894531, + "loss": 0.6717, + "positive_losses": 0.01393737830221653, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.16241498291492462, + "rewards/margins": 0.062306541949510574, + "rewards/margins_max": 0.18870343267917633, + "rewards/margins_min": -0.06285803765058517, + "rewards/margins_std": 0.1129065528512001, + "rewards/rejected": 0.10010842233896255, + "step": 3470 + }, + { + "dpo_losses": 0.6731021404266357, + "epoch": 0.91, + "grad_norm": 2.0789325263747136, + "learning_rate": 1.2038986838887127e-08, + "logits/chosen": -2.820610523223877, + "logits/rejected": -2.723703384399414, + "logps/chosen": -212.0461883544922, + "logps/rejected": -224.64724731445312, + "loss": 0.6798, + "positive_losses": 0.22083091735839844, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.14202693104743958, + "rewards/margins": 0.04587015509605408, + "rewards/margins_max": 0.19555941224098206, + "rewards/margins_min": -0.09572341293096542, + "rewards/margins_std": 0.1279156655073166, + "rewards/rejected": 0.0961567685008049, + "step": 3480 + }, + { + "dpo_losses": 0.6755630970001221, + "epoch": 0.91, + "grad_norm": 8.736703415606234, + "learning_rate": 1.1348554977451131e-08, + "logits/chosen": -2.7565951347351074, + "logits/rejected": -2.7981178760528564, + "logps/chosen": -238.7095947265625, + "logps/rejected": -209.91738891601562, + "loss": 0.6776, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.15453873574733734, + "rewards/margins": 0.03820282220840454, + "rewards/margins_max": 0.14551648497581482, + "rewards/margins_min": -0.07384388148784637, + "rewards/margins_std": 0.09857748448848724, + "rewards/rejected": 0.1163359060883522, + "step": 3490 + }, + { + "dpo_losses": 0.6538613438606262, + "epoch": 0.92, + "grad_norm": 5.676467087455956, + "learning_rate": 1.06780504429958e-08, + "logits/chosen": -2.84360933303833, + "logits/rejected": -2.812001943588257, + "logps/chosen": -345.8283996582031, + "logps/rejected": -269.57781982421875, + "loss": 0.6743, + "positive_losses": 0.00730133056640625, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18043914437294006, + "rewards/margins": 0.0834948718547821, + "rewards/margins_max": 0.2217591553926468, + "rewards/margins_min": -0.031042709946632385, + "rewards/margins_std": 0.11449646949768066, + "rewards/rejected": 0.09694425761699677, + "step": 3500 + }, + { + "epoch": 0.92, + "eval_dpo_losses": 0.6627262830734253, + "eval_logits/chosen": -2.78560471534729, + "eval_logits/rejected": -2.7474148273468018, + "eval_logps/chosen": -268.0489807128906, + "eval_logps/rejected": -252.25411987304688, + "eval_loss": 0.6805341243743896, + "eval_positive_losses": 0.12150750309228897, + "eval_rewards/accuracies": 0.7242063283920288, + "eval_rewards/chosen": 0.16445913910865784, + "eval_rewards/margins": 0.0655067041516304, + "eval_rewards/margins_max": 0.2502119243144989, + "eval_rewards/margins_min": -0.10156121850013733, + "eval_rewards/margins_std": 0.1163892149925232, + "eval_rewards/rejected": 0.09895242750644684, + "eval_runtime": 389.4568, + "eval_samples_per_second": 5.135, + "eval_steps_per_second": 0.162, + "step": 3500 + }, + { + "dpo_losses": 0.6715716123580933, + "epoch": 0.92, + "grad_norm": 9.201369838734866, + "learning_rate": 1.0027529222456754e-08, + "logits/chosen": -2.766420364379883, + "logits/rejected": -2.7141261100769043, + "logps/chosen": -276.39501953125, + "logps/rejected": -255.0765380859375, + "loss": 0.6725, + "positive_losses": 0.19459286332130432, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.1418302208185196, + "rewards/margins": 0.04641687124967575, + "rewards/margins_max": 0.15345872938632965, + "rewards/margins_min": -0.06363958865404129, + "rewards/margins_std": 0.0970257893204689, + "rewards/rejected": 0.09541334956884384, + "step": 3510 + }, + { + "dpo_losses": 0.6719012260437012, + "epoch": 0.92, + "grad_norm": 2.078586039634592, + "learning_rate": 9.397045634168766e-09, + "logits/chosen": -2.7967326641082764, + "logits/rejected": -2.825498104095459, + "logps/chosen": -251.73776245117188, + "logps/rejected": -249.35696411132812, + "loss": 0.6834, + "positive_losses": 0.19272366166114807, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.12054480612277985, + "rewards/margins": 0.047685910016298294, + "rewards/margins_max": 0.21180269122123718, + "rewards/margins_min": -0.08542943000793457, + "rewards/margins_std": 0.1332327276468277, + "rewards/rejected": 0.07285889238119125, + "step": 3520 + }, + { + "dpo_losses": 0.6688061952590942, + "epoch": 0.92, + "grad_norm": 10.22580076780168, + "learning_rate": 8.78665232332998e-09, + "logits/chosen": -2.8747401237487793, + "logits/rejected": -2.8032772541046143, + "logps/chosen": -266.33514404296875, + "logps/rejected": -234.31851196289062, + "loss": 0.6704, + "positive_losses": 0.07058105617761612, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.15240497887134552, + "rewards/margins": 0.05244448781013489, + "rewards/margins_max": 0.1862691044807434, + "rewards/margins_min": -0.039194732904434204, + "rewards/margins_std": 0.09924636781215668, + "rewards/rejected": 0.09996049106121063, + "step": 3530 + }, + { + "dpo_losses": 0.6617444157600403, + "epoch": 0.93, + "grad_norm": 15.43040346440166, + "learning_rate": 8.196400257606206e-09, + "logits/chosen": -2.764500379562378, + "logits/rejected": -2.661292791366577, + "logps/chosen": -265.88470458984375, + "logps/rejected": -230.70852661132812, + "loss": 0.6896, + "positive_losses": 0.18745270371437073, + "rewards/accuracies": 0.7749999761581421, + "rewards/chosen": 0.15406164526939392, + "rewards/margins": 0.06750769913196564, + "rewards/margins_max": 0.20162923634052277, + "rewards/margins_min": -0.05225520208477974, + "rewards/margins_std": 0.11536550521850586, + "rewards/rejected": 0.08655395358800888, + "step": 3540 + }, + { + "dpo_losses": 0.6632605791091919, + "epoch": 0.93, + "grad_norm": 2.1012023844137704, + "learning_rate": 7.626338722875075e-09, + "logits/chosen": -2.7544524669647217, + "logits/rejected": -2.7378907203674316, + "logps/chosen": -245.88211059570312, + "logps/rejected": -251.2125244140625, + "loss": 0.6689, + "positive_losses": 0.0, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.17252537608146667, + "rewards/margins": 0.06453979015350342, + "rewards/margins_max": 0.20910212397575378, + "rewards/margins_min": -0.05815510079264641, + "rewards/margins_std": 0.11868518590927124, + "rewards/rejected": 0.10798557102680206, + "step": 3550 + }, + { + "dpo_losses": 0.648143470287323, + "epoch": 0.93, + "grad_norm": 10.053806126992908, + "learning_rate": 7.0765153191106875e-09, + "logits/chosen": -2.7637624740600586, + "logits/rejected": -2.686310291290283, + "logps/chosen": -264.9227600097656, + "logps/rejected": -247.6665802001953, + "loss": 0.6717, + "positive_losses": 0.025927353650331497, + "rewards/accuracies": 0.824999988079071, + "rewards/chosen": 0.15334150195121765, + "rewards/margins": 0.09508029371500015, + "rewards/margins_max": 0.22009754180908203, + "rewards/margins_min": -0.00194616022054106, + "rewards/margins_std": 0.09800895303487778, + "rewards/rejected": 0.0582612045109272, + "step": 3560 + }, + { + "dpo_losses": 0.6595361828804016, + "epoch": 0.93, + "grad_norm": 14.661114306391521, + "learning_rate": 6.54697595640899e-09, + "logits/chosen": -2.7691988945007324, + "logits/rejected": -2.7651801109313965, + "logps/chosen": -305.5066223144531, + "logps/rejected": -253.8525848388672, + "loss": 0.6729, + "positive_losses": 0.26647910475730896, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17720453441143036, + "rewards/margins": 0.07246123254299164, + "rewards/margins_max": 0.21851103007793427, + "rewards/margins_min": -0.05463584512472153, + "rewards/margins_std": 0.1229761391878128, + "rewards/rejected": 0.10474331676959991, + "step": 3570 + }, + { + "dpo_losses": 0.6499595642089844, + "epoch": 0.94, + "grad_norm": 11.351711033310751, + "learning_rate": 6.037764851154425e-09, + "logits/chosen": -2.841740131378174, + "logits/rejected": -2.756340742111206, + "logps/chosen": -291.97027587890625, + "logps/rejected": -261.6409912109375, + "loss": 0.689, + "positive_losses": 0.2165401428937912, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": 0.16067275404930115, + "rewards/margins": 0.09467877447605133, + "rewards/margins_max": 0.243547722697258, + "rewards/margins_min": -0.06149807572364807, + "rewards/margins_std": 0.1369081288576126, + "rewards/rejected": 0.06599397957324982, + "step": 3580 + }, + { + "dpo_losses": 0.6605626344680786, + "epoch": 0.94, + "grad_norm": 2.214755694002987, + "learning_rate": 5.548924522327747e-09, + "logits/chosen": -2.760728359222412, + "logits/rejected": -2.7672464847564697, + "logps/chosen": -192.11802673339844, + "logps/rejected": -197.56478881835938, + "loss": 0.6692, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.165755957365036, + "rewards/margins": 0.0702081099152565, + "rewards/margins_max": 0.18508026003837585, + "rewards/margins_min": -0.05116415023803711, + "rewards/margins_std": 0.10595384985208511, + "rewards/rejected": 0.09554782509803772, + "step": 3590 + }, + { + "dpo_losses": 0.6780807971954346, + "epoch": 0.94, + "grad_norm": 2.3958579241831295, + "learning_rate": 5.080495787955691e-09, + "logits/chosen": -2.6624696254730225, + "logits/rejected": -2.6513009071350098, + "logps/chosen": -242.3583984375, + "logps/rejected": -269.24737548828125, + "loss": 0.6778, + "positive_losses": 0.15529099106788635, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.14953303337097168, + "rewards/margins": 0.034219689667224884, + "rewards/margins_max": 0.16842004656791687, + "rewards/margins_min": -0.10593481361865997, + "rewards/margins_std": 0.12279149144887924, + "rewards/rejected": 0.11531335115432739, + "step": 3600 + }, + { + "epoch": 0.94, + "eval_dpo_losses": 0.6625929474830627, + "eval_logits/chosen": -2.7853004932403564, + "eval_logits/rejected": -2.7470314502716064, + "eval_logps/chosen": -268.0681457519531, + "eval_logps/rejected": -252.3022003173828, + "eval_loss": 0.6809914708137512, + "eval_positive_losses": 0.12787041068077087, + "eval_rewards/accuracies": 0.7182539701461792, + "eval_rewards/chosen": 0.16426753997802734, + "eval_rewards/margins": 0.06579570472240448, + "eval_rewards/margins_max": 0.25073686242103577, + "eval_rewards/margins_min": -0.10168647021055222, + "eval_rewards/margins_std": 0.11665406078100204, + "eval_rewards/rejected": 0.09847183525562286, + "eval_runtime": 389.461, + "eval_samples_per_second": 5.135, + "eval_steps_per_second": 0.162, + "step": 3600 + }, + { + "dpo_losses": 0.6752595901489258, + "epoch": 0.94, + "grad_norm": 1.9735634709955117, + "learning_rate": 4.632517761702814e-09, + "logits/chosen": -2.683030128479004, + "logits/rejected": -2.70261812210083, + "logps/chosen": -225.0770721435547, + "logps/rejected": -216.30899047851562, + "loss": 0.6706, + "positive_losses": 0.007031249813735485, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.13683125376701355, + "rewards/margins": 0.03817175701260567, + "rewards/margins_max": 0.14266720414161682, + "rewards/margins_min": -0.048948802053928375, + "rewards/margins_std": 0.08342118561267853, + "rewards/rejected": 0.09865951538085938, + "step": 3610 + }, + { + "dpo_losses": 0.6676380038261414, + "epoch": 0.95, + "grad_norm": 3.130911461961453, + "learning_rate": 4.205027849605358e-09, + "logits/chosen": -2.7730565071105957, + "logits/rejected": -2.718841552734375, + "logps/chosen": -219.79776000976562, + "logps/rejected": -233.9205322265625, + "loss": 0.6702, + "positive_losses": 0.02769775316119194, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.15636537969112396, + "rewards/margins": 0.0540301576256752, + "rewards/margins_max": 0.16362106800079346, + "rewards/margins_min": -0.02917725406587124, + "rewards/margins_std": 0.08670753240585327, + "rewards/rejected": 0.10233521461486816, + "step": 3620 + }, + { + "dpo_losses": 0.6682838797569275, + "epoch": 0.95, + "grad_norm": 6.827471599261377, + "learning_rate": 3.798061746947995e-09, + "logits/chosen": -2.680844783782959, + "logits/rejected": -2.6313278675079346, + "logps/chosen": -235.08230590820312, + "logps/rejected": -297.84027099609375, + "loss": 0.6741, + "positive_losses": 0.3615362048149109, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13550007343292236, + "rewards/margins": 0.05279763787984848, + "rewards/margins_max": 0.15825259685516357, + "rewards/margins_min": -0.054834891110658646, + "rewards/margins_std": 0.09746996313333511, + "rewards/rejected": 0.08270244300365448, + "step": 3630 + }, + { + "dpo_losses": 0.6689377427101135, + "epoch": 0.95, + "grad_norm": 5.3563624331270425, + "learning_rate": 3.411653435283157e-09, + "logits/chosen": -2.782923460006714, + "logits/rejected": -2.7497828006744385, + "logps/chosen": -250.45681762695312, + "logps/rejected": -224.9089813232422, + "loss": 0.6699, + "positive_losses": 0.007960510440170765, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15632639825344086, + "rewards/margins": 0.05212927609682083, + "rewards/margins_max": 0.17522598803043365, + "rewards/margins_min": -0.055180471390485764, + "rewards/margins_std": 0.10455608367919922, + "rewards/rejected": 0.10419712215662003, + "step": 3640 + }, + { + "dpo_losses": 0.677409291267395, + "epoch": 0.96, + "grad_norm": 13.57210864894291, + "learning_rate": 3.0458351795936698e-09, + "logits/chosen": -2.7331697940826416, + "logits/rejected": -2.759122848510742, + "logps/chosen": -266.11138916015625, + "logps/rejected": -315.8394470214844, + "loss": 0.6718, + "positive_losses": 0.013798522762954235, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.16741888225078583, + "rewards/margins": 0.0366583950817585, + "rewards/margins_max": 0.190229594707489, + "rewards/margins_min": -0.10931988805532455, + "rewards/margins_std": 0.1350203901529312, + "rewards/rejected": 0.13076052069664001, + "step": 3650 + }, + { + "dpo_losses": 0.6810728311538696, + "epoch": 0.96, + "grad_norm": 2.2443964374976946, + "learning_rate": 2.700637525598598e-09, + "logits/chosen": -2.914287805557251, + "logits/rejected": -2.8853492736816406, + "logps/chosen": -264.8750305175781, + "logps/rejected": -253.35372924804688, + "loss": 0.6766, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.14287258684635162, + "rewards/margins": 0.025848830118775368, + "rewards/margins_max": 0.1001816838979721, + "rewards/margins_min": -0.05944544076919556, + "rewards/margins_std": 0.07113735377788544, + "rewards/rejected": 0.11702374368906021, + "step": 3660 + }, + { + "dpo_losses": 0.6558116674423218, + "epoch": 0.96, + "grad_norm": 1.832206668873853, + "learning_rate": 2.3760892972027324e-09, + "logits/chosen": -2.774958372116089, + "logits/rejected": -2.752284049987793, + "logps/chosen": -277.69671630859375, + "logps/rejected": -295.549072265625, + "loss": 0.6636, + "positive_losses": 0.08751678466796875, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.17725810408592224, + "rewards/margins": 0.08058985322713852, + "rewards/margins_max": 0.25112852454185486, + "rewards/margins_min": -0.0600384883582592, + "rewards/margins_std": 0.14340457320213318, + "rewards/rejected": 0.09666825830936432, + "step": 3670 + }, + { + "dpo_losses": 0.655846357345581, + "epoch": 0.96, + "grad_norm": 9.385693364328457, + "learning_rate": 2.0722175940897645e-09, + "logits/chosen": -2.736213207244873, + "logits/rejected": -2.713000535964966, + "logps/chosen": -277.0622863769531, + "logps/rejected": -266.38720703125, + "loss": 0.6805, + "positive_losses": 0.07905082404613495, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.17580725252628326, + "rewards/margins": 0.07951674610376358, + "rewards/margins_max": 0.21051025390625, + "rewards/margins_min": -0.039799876511096954, + "rewards/margins_std": 0.11044758558273315, + "rewards/rejected": 0.09629050642251968, + "step": 3680 + }, + { + "dpo_losses": 0.6505584716796875, + "epoch": 0.97, + "grad_norm": 1.9905331193688285, + "learning_rate": 1.7890477894593748e-09, + "logits/chosen": -2.752624273300171, + "logits/rejected": -2.7264013290405273, + "logps/chosen": -312.59149169921875, + "logps/rejected": -239.69857788085938, + "loss": 0.6652, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.18325236439704895, + "rewards/margins": 0.09036721289157867, + "rewards/margins_max": 0.20896823704242706, + "rewards/margins_min": -0.037368323653936386, + "rewards/margins_std": 0.10634877532720566, + "rewards/rejected": 0.09288517385721207, + "step": 3690 + }, + { + "dpo_losses": 0.6630789637565613, + "epoch": 0.97, + "grad_norm": 7.024509844530663, + "learning_rate": 1.5266035279088708e-09, + "logits/chosen": -2.896160840988159, + "logits/rejected": -2.8630728721618652, + "logps/chosen": -314.116943359375, + "logps/rejected": -273.3288879394531, + "loss": 0.6788, + "positive_losses": 0.15244026482105255, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16923435032367706, + "rewards/margins": 0.0650079995393753, + "rewards/margins_max": 0.19643534719944, + "rewards/margins_min": -0.08276429772377014, + "rewards/margins_std": 0.12063497304916382, + "rewards/rejected": 0.10422635078430176, + "step": 3700 + }, + { + "epoch": 0.97, + "eval_dpo_losses": 0.6626591682434082, + "eval_logits/chosen": -2.781266212463379, + "eval_logits/rejected": -2.7426860332489014, + "eval_logps/chosen": -268.09796142578125, + "eval_logps/rejected": -252.3194580078125, + "eval_loss": 0.6810540556907654, + "eval_positive_losses": 0.12859095633029938, + "eval_rewards/accuracies": 0.7222222089767456, + "eval_rewards/chosen": 0.1639692187309265, + "eval_rewards/margins": 0.06567002832889557, + "eval_rewards/margins_max": 0.25067293643951416, + "eval_rewards/margins_min": -0.10208174586296082, + "eval_rewards/margins_std": 0.116749107837677, + "eval_rewards/rejected": 0.09829918295145035, + "eval_runtime": 390.2519, + "eval_samples_per_second": 5.125, + "eval_steps_per_second": 0.161, + "step": 3700 + }, + { + "dpo_losses": 0.6626378297805786, + "epoch": 0.97, + "grad_norm": 8.275436952239337, + "learning_rate": 1.2849067234584621e-09, + "logits/chosen": -2.679213762283325, + "logits/rejected": -2.680332660675049, + "logps/chosen": -285.9251708984375, + "logps/rejected": -265.3730773925781, + "loss": 0.6823, + "positive_losses": 0.0, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.1750296652317047, + "rewards/margins": 0.06485885381698608, + "rewards/margins_max": 0.17583505809307098, + "rewards/margins_min": -0.04068106785416603, + "rewards/margins_std": 0.09638006240129471, + "rewards/rejected": 0.11017082631587982, + "step": 3710 + }, + { + "dpo_losses": 0.6638874411582947, + "epoch": 0.97, + "grad_norm": 9.97430246156224, + "learning_rate": 1.0639775577218625e-09, + "logits/chosen": -2.8001160621643066, + "logits/rejected": -2.792501926422119, + "logps/chosen": -281.8971252441406, + "logps/rejected": -253.5721893310547, + "loss": 0.6714, + "positive_losses": 0.0, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.1858047991991043, + "rewards/margins": 0.06325621902942657, + "rewards/margins_max": 0.20727744698524475, + "rewards/margins_min": -0.06923703104257584, + "rewards/margins_std": 0.12179337441921234, + "rewards/rejected": 0.12254859507083893, + "step": 3720 + }, + { + "dpo_losses": 0.6496821641921997, + "epoch": 0.98, + "grad_norm": 8.743713736578215, + "learning_rate": 8.638344782207485e-10, + "logits/chosen": -2.887678861618042, + "logits/rejected": -2.821990966796875, + "logps/chosen": -320.71673583984375, + "logps/rejected": -252.2602996826172, + "loss": 0.6863, + "positive_losses": 0.0, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.19121643900871277, + "rewards/margins": 0.09353573620319366, + "rewards/margins_max": 0.24803459644317627, + "rewards/margins_min": -0.038599640130996704, + "rewards/margins_std": 0.1312188357114792, + "rewards/rejected": 0.09768068790435791, + "step": 3730 + }, + { + "dpo_losses": 0.6656588315963745, + "epoch": 0.98, + "grad_norm": 2.059055573345697, + "learning_rate": 6.844941968447149e-10, + "logits/chosen": -2.816688060760498, + "logits/rejected": -2.7649848461151123, + "logps/chosen": -317.15789794921875, + "logps/rejected": -317.58514404296875, + "loss": 0.6659, + "positive_losses": 0.028494644910097122, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.17586681246757507, + "rewards/margins": 0.06203201413154602, + "rewards/margins_max": 0.19731295108795166, + "rewards/margins_min": -0.06362716853618622, + "rewards/margins_std": 0.11939724534749985, + "rewards/rejected": 0.11383481323719025, + "step": 3740 + }, + { + "dpo_losses": 0.6846436262130737, + "epoch": 0.98, + "grad_norm": 2.128617293319985, + "learning_rate": 5.25971688455612e-10, + "logits/chosen": -2.7480340003967285, + "logits/rejected": -2.7826037406921387, + "logps/chosen": -279.6949157714844, + "logps/rejected": -252.7490234375, + "loss": 0.6753, + "positive_losses": 0.13470391929149628, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.1422509104013443, + "rewards/margins": 0.020876307040452957, + "rewards/margins_max": 0.13941016793251038, + "rewards/margins_min": -0.1157798022031784, + "rewards/margins_std": 0.11612125486135483, + "rewards/rejected": 0.12137460708618164, + "step": 3750 + }, + { + "dpo_losses": 0.6659666895866394, + "epoch": 0.98, + "grad_norm": 2.07576600608641, + "learning_rate": 3.882801896372967e-10, + "logits/chosen": -2.7559616565704346, + "logits/rejected": -2.782864570617676, + "logps/chosen": -219.6016082763672, + "logps/rejected": -228.398681640625, + "loss": 0.669, + "positive_losses": 0.0055253030732274055, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.15218304097652435, + "rewards/margins": 0.058321546763181686, + "rewards/margins_max": 0.17173783481121063, + "rewards/margins_min": -0.03377734497189522, + "rewards/margins_std": 0.09105263650417328, + "rewards/rejected": 0.09386148303747177, + "step": 3760 + }, + { + "dpo_losses": 0.6622568964958191, + "epoch": 0.99, + "grad_norm": 15.114443791711706, + "learning_rate": 2.714311975902661e-10, + "logits/chosen": -2.7868869304656982, + "logits/rejected": -2.738193988800049, + "logps/chosen": -288.66583251953125, + "logps/rejected": -321.9875183105469, + "loss": 0.6696, + "positive_losses": 0.0, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.1881106197834015, + "rewards/margins": 0.06645546853542328, + "rewards/margins_max": 0.19537228345870972, + "rewards/margins_min": -0.06403455138206482, + "rewards/margins_std": 0.11841548979282379, + "rewards/rejected": 0.1216551661491394, + "step": 3770 + }, + { + "dpo_losses": 0.6676747798919678, + "epoch": 0.99, + "grad_norm": 8.209236934388109, + "learning_rate": 1.754344691717591e-10, + "logits/chosen": -2.7726235389709473, + "logits/rejected": -2.696892023086548, + "logps/chosen": -279.5736389160156, + "logps/rejected": -267.17205810546875, + "loss": 0.6801, + "positive_losses": 0.09341277927160263, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.16620515286922455, + "rewards/margins": 0.056923191994428635, + "rewards/margins_max": 0.22022612392902374, + "rewards/margins_min": -0.12907883524894714, + "rewards/margins_std": 0.16013504564762115, + "rewards/rejected": 0.10928195714950562, + "step": 3780 + }, + { + "dpo_losses": 0.6785107851028442, + "epoch": 0.99, + "grad_norm": 15.44886268948098, + "learning_rate": 1.0029802008096333e-10, + "logits/chosen": -2.8079333305358887, + "logits/rejected": -2.8062872886657715, + "logps/chosen": -235.77810668945312, + "logps/rejected": -209.78030395507812, + "loss": 0.7003, + "positive_losses": 0.6782264709472656, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.11785753071308136, + "rewards/margins": 0.033666886389255524, + "rewards/margins_max": 0.16288526356220245, + "rewards/margins_min": -0.07857248932123184, + "rewards/margins_std": 0.10428880155086517, + "rewards/rejected": 0.08419065177440643, + "step": 3790 + }, + { + "dpo_losses": 0.6620692014694214, + "epoch": 0.99, + "grad_norm": 1.7519493145513187, + "learning_rate": 4.602812418974533e-11, + "logits/chosen": -2.693044424057007, + "logits/rejected": -2.674826145172119, + "logps/chosen": -255.14208984375, + "logps/rejected": -189.10543823242188, + "loss": 0.6668, + "positive_losses": 0.0, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.16355091333389282, + "rewards/margins": 0.06598031520843506, + "rewards/margins_max": 0.16940084099769592, + "rewards/margins_min": -0.025763630867004395, + "rewards/margins_std": 0.08741643279790878, + "rewards/rejected": 0.09757061302661896, + "step": 3800 + }, + { + "epoch": 0.99, + "eval_dpo_losses": 0.6626641154289246, + "eval_logits/chosen": -2.782968044281006, + "eval_logits/rejected": -2.7445266246795654, + "eval_logps/chosen": -268.0970458984375, + "eval_logps/rejected": -252.318603515625, + "eval_loss": 0.6810625195503235, + "eval_positive_losses": 0.12868142127990723, + "eval_rewards/accuracies": 0.7202380895614624, + "eval_rewards/chosen": 0.16397827863693237, + "eval_rewards/margins": 0.0656706765294075, + "eval_rewards/margins_max": 0.25085288286209106, + "eval_rewards/margins_min": -0.10247818380594254, + "eval_rewards/margins_std": 0.11689701676368713, + "eval_rewards/rejected": 0.09830759465694427, + "eval_runtime": 389.263, + "eval_samples_per_second": 5.138, + "eval_steps_per_second": 0.162, + "step": 3800 + }, + { + "dpo_losses": 0.6744269132614136, + "epoch": 1.0, + "grad_norm": 1.7814713725344193, + "learning_rate": 1.2629313018819309e-11, + "logits/chosen": -2.815308094024658, + "logits/rejected": -2.7851715087890625, + "logps/chosen": -297.2845458984375, + "logps/rejected": -259.3329162597656, + "loss": 0.6728, + "positive_losses": 0.046834565699100494, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.15666639804840088, + "rewards/margins": 0.04045479744672775, + "rewards/margins_max": 0.15807831287384033, + "rewards/margins_min": -0.07443277537822723, + "rewards/margins_std": 0.10473154485225677, + "rewards/rejected": 0.11621161550283432, + "step": 3810 + }, + { + "dpo_losses": 0.656509280204773, + "epoch": 1.0, + "grad_norm": 6.53868913211251, + "learning_rate": 1.0437535929996855e-13, + "logits/chosen": -2.748309373855591, + "logits/rejected": -2.7362165451049805, + "logps/chosen": -317.56524658203125, + "logps/rejected": -208.669921875, + "loss": 0.674, + "positive_losses": 0.18830108642578125, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.1658371239900589, + "rewards/margins": 0.07918829470872879, + "rewards/margins_max": 0.22351901233196259, + "rewards/margins_min": -0.048277318477630615, + "rewards/margins_std": 0.11985959112644196, + "rewards/rejected": 0.08664882928133011, + "step": 3820 + }, + { + "epoch": 1.0, + "step": 3821, + "total_flos": 0.0, + "train_loss": 0.6809778677109991, + "train_runtime": 43303.5615, + "train_samples_per_second": 1.412, + "train_steps_per_second": 0.088 + } + ], + "logging_steps": 10, + "max_steps": 3821, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}