{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.601693496929256, "learning_rate": 4.6728971962616815e-09, "logits/chosen": -2.861618995666504, "logits/rejected": -2.8205904960632324, "logps/chosen": -271.06011962890625, "logps/rejected": -211.1704559326172, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.6933066844940186, "epoch": 0.03, "grad_norm": 20.410276988279243, "learning_rate": 4.672897196261682e-08, "logits/chosen": -2.8335423469543457, "logits/rejected": -2.7909910678863525, "logps/chosen": -325.0599365234375, "logps/rejected": -274.9435119628906, "loss": 0.6965, "positive_losses": 0.050453320145606995, "rewards/accuracies": 0.2638888955116272, "rewards/chosen": -9.644959936849773e-05, "rewards/margins": -0.00031749275512993336, "rewards/margins_max": 0.0014122920110821724, "rewards/margins_min": -0.0021905910689383745, "rewards/margins_std": 0.0016328593483194709, "rewards/rejected": 0.00022104315576143563, "step": 10 }, { "dpo_losses": 0.6930336952209473, "epoch": 0.06, "grad_norm": 21.230310271372627, "learning_rate": 9.345794392523364e-08, "logits/chosen": -2.725196361541748, "logits/rejected": -2.706851005554199, "logps/chosen": -293.7933654785156, "logps/rejected": -215.7693328857422, "loss": 0.6994, "positive_losses": 0.07486093044281006, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00032553955679759383, "rewards/margins": 0.00022889305546414107, "rewards/margins_max": 0.002858394058421254, "rewards/margins_min": -0.0026940270327031612, "rewards/margins_std": 0.0024744768161326647, "rewards/rejected": 9.664653771324083e-05, "step": 20 }, { "dpo_losses": 0.6927663087844849, "epoch": 0.08, "grad_norm": 17.85805321688973, "learning_rate": 1.4018691588785045e-07, "logits/chosen": -2.819422960281372, "logits/rejected": -2.750398874282837, "logps/chosen": -303.5675048828125, "logps/rejected": -232.4523162841797, "loss": 0.6975, "positive_losses": 0.04014534875750542, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0013837231090292335, "rewards/margins": 0.0007641493575647473, "rewards/margins_max": 0.003605480073019862, "rewards/margins_min": -0.0024470784701406956, "rewards/margins_std": 0.002705852035433054, "rewards/rejected": 0.000619573867879808, "step": 30 }, { "dpo_losses": 0.6928492188453674, "epoch": 0.11, "grad_norm": 1.7332928692545067, "learning_rate": 1.8691588785046729e-07, "logits/chosen": -2.8429579734802246, "logits/rejected": -2.7627620697021484, "logps/chosen": -277.1645202636719, "logps/rejected": -225.4744415283203, "loss": 0.6937, "positive_losses": 0.007175350096076727, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003201347542926669, "rewards/margins": 0.0005985596217215061, "rewards/margins_max": 0.003463293891400099, "rewards/margins_min": -0.00237638084217906, "rewards/margins_std": 0.0026152683421969414, "rewards/rejected": 0.0026027881540358067, "step": 40 }, { "dpo_losses": 0.6924293637275696, "epoch": 0.14, "grad_norm": 2.2231645321380933, "learning_rate": 2.336448598130841e-07, "logits/chosen": -2.8112452030181885, "logits/rejected": -2.7390694618225098, "logps/chosen": -275.0422058105469, "logps/rejected": -233.49331665039062, "loss": 0.6929, "positive_losses": 0.007281684782356024, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0052792662754654884, "rewards/margins": 0.001439323415979743, "rewards/margins_max": 0.0054040043614804745, "rewards/margins_min": -0.0021666125394403934, "rewards/margins_std": 0.003360858652740717, "rewards/rejected": 0.003839943092316389, "step": 50 }, { "dpo_losses": 0.6919517517089844, "epoch": 0.17, "grad_norm": 2.413825162312939, "learning_rate": 2.803738317757009e-07, "logits/chosen": -2.7509729862213135, "logits/rejected": -2.7115964889526367, "logps/chosen": -312.9291076660156, "logps/rejected": -262.30059814453125, "loss": 0.6935, "positive_losses": 0.00518798828125, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.007105088327080011, "rewards/margins": 0.002395912306383252, "rewards/margins_max": 0.00661232927814126, "rewards/margins_min": -0.000950089015532285, "rewards/margins_std": 0.0033020805567502975, "rewards/rejected": 0.004709175787866116, "step": 60 }, { "dpo_losses": 0.6914928555488586, "epoch": 0.2, "grad_norm": 1.8246267381191894, "learning_rate": 3.271028037383177e-07, "logits/chosen": -2.78471302986145, "logits/rejected": -2.722522258758545, "logps/chosen": -321.91546630859375, "logps/rejected": -235.0277557373047, "loss": 0.6913, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.009457603096961975, "rewards/margins": 0.0033192108385264874, "rewards/margins_max": 0.009357670322060585, "rewards/margins_min": -0.001532487804070115, "rewards/margins_std": 0.0049352385103702545, "rewards/rejected": 0.006138390861451626, "step": 70 }, { "dpo_losses": 0.6906715631484985, "epoch": 0.23, "grad_norm": 1.8774986784177312, "learning_rate": 3.7383177570093457e-07, "logits/chosen": -2.8345515727996826, "logits/rejected": -2.7776432037353516, "logps/chosen": -320.06634521484375, "logps/rejected": -285.67156982421875, "loss": 0.6909, "positive_losses": 0.0006896972772665322, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.012041566893458366, "rewards/margins": 0.004970206413418055, "rewards/margins_max": 0.013391142711043358, "rewards/margins_min": -0.0014745177468284965, "rewards/margins_std": 0.006737919058650732, "rewards/rejected": 0.007071360945701599, "step": 80 }, { "dpo_losses": 0.6892363429069519, "epoch": 0.25, "grad_norm": 2.6194297181523774, "learning_rate": 4.205607476635514e-07, "logits/chosen": -2.7694807052612305, "logits/rejected": -2.726945400238037, "logps/chosen": -274.4305725097656, "logps/rejected": -198.20901489257812, "loss": 0.6894, "positive_losses": 0.0005071639898233116, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015641968697309494, "rewards/margins": 0.007860125042498112, "rewards/margins_max": 0.01880500093102455, "rewards/margins_min": -0.0005177496350370347, "rewards/margins_std": 0.008911145851016045, "rewards/rejected": 0.007781843654811382, "step": 90 }, { "dpo_losses": 0.6890109777450562, "epoch": 0.28, "grad_norm": 1.9205518680432034, "learning_rate": 4.672897196261682e-07, "logits/chosen": -2.80806303024292, "logits/rejected": -2.7697219848632812, "logps/chosen": -279.78045654296875, "logps/rejected": -261.3872375488281, "loss": 0.6884, "positive_losses": 0.0013298034900799394, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.016312729567289352, "rewards/margins": 0.00831080786883831, "rewards/margins_max": 0.018694214522838593, "rewards/margins_min": 0.00036361132515594363, "rewards/margins_std": 0.00843831431120634, "rewards/rejected": 0.008001920767128468, "step": 100 }, { "epoch": 0.28, "eval_dpo_losses": 0.6917725205421448, "eval_logits/chosen": -2.8041651248931885, "eval_logits/rejected": -2.7650842666625977, "eval_logps/chosen": -283.2318420410156, "eval_logps/rejected": -257.4949035644531, "eval_loss": 0.6931358575820923, "eval_positive_losses": 0.011085770092904568, "eval_rewards/accuracies": 0.6079999804496765, "eval_rewards/chosen": 0.0136156240478158, "eval_rewards/margins": 0.0027753659524023533, "eval_rewards/margins_max": 0.017850197851657867, "eval_rewards/margins_min": -0.010674619115889072, "eval_rewards/margins_std": 0.009409897960722446, "eval_rewards/rejected": 0.010840258561074734, "eval_runtime": 428.0491, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 100 }, { "dpo_losses": 0.6845903396606445, "epoch": 0.31, "grad_norm": 2.377556980384867, "learning_rate": 4.999879018839287e-07, "logits/chosen": -2.8750226497650146, "logits/rejected": -2.7789218425750732, "logps/chosen": -372.0056457519531, "logps/rejected": -285.5588073730469, "loss": 0.6848, "positive_losses": 0.0, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.026000410318374634, "rewards/margins": 0.017250509932637215, "rewards/margins_max": 0.0338827520608902, "rewards/margins_min": 0.0025702244602143764, "rewards/margins_std": 0.01393597386777401, "rewards/rejected": 0.008749897591769695, "step": 110 }, { "dpo_losses": 0.6825487613677979, "epoch": 0.34, "grad_norm": 2.115970951796753, "learning_rate": 4.997728568369408e-07, "logits/chosen": -2.844456911087036, "logits/rejected": -2.790116786956787, "logps/chosen": -345.28887939453125, "logps/rejected": -265.2372741699219, "loss": 0.6828, "positive_losses": 0.0, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.03097311221063137, "rewards/margins": 0.02140369825065136, "rewards/margins_max": 0.042990267276763916, "rewards/margins_min": 0.005240675527602434, "rewards/margins_std": 0.017378441989421844, "rewards/rejected": 0.009569412097334862, "step": 120 }, { "dpo_losses": 0.679545521736145, "epoch": 0.37, "grad_norm": 1.8392631764823357, "learning_rate": 4.992892309373227e-07, "logits/chosen": -2.8388783931732178, "logits/rejected": -2.7566184997558594, "logps/chosen": -378.5120849609375, "logps/rejected": -299.33154296875, "loss": 0.6796, "positive_losses": 0.00618324289098382, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.0385219044983387, "rewards/margins": 0.027528375387191772, "rewards/margins_max": 0.05331053584814072, "rewards/margins_min": 0.003396461484953761, "rewards/margins_std": 0.021965984255075455, "rewards/rejected": 0.01099353563040495, "step": 130 }, { "dpo_losses": 0.6778298020362854, "epoch": 0.39, "grad_norm": 1.4924163642849055, "learning_rate": 4.985375442281968e-07, "logits/chosen": -2.791003704071045, "logits/rejected": -2.751797914505005, "logps/chosen": -314.1080322265625, "logps/rejected": -268.6662902832031, "loss": 0.6798, "positive_losses": 0.0, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.04306010901927948, "rewards/margins": 0.03122769668698311, "rewards/margins_max": 0.06796383112668991, "rewards/margins_min": 0.004374162759631872, "rewards/margins_std": 0.02954345941543579, "rewards/rejected": 0.011832410469651222, "step": 140 }, { "dpo_losses": 0.6768456101417542, "epoch": 0.42, "grad_norm": 2.003305207476629, "learning_rate": 4.975186049985817e-07, "logits/chosen": -2.7922985553741455, "logits/rejected": -2.752359628677368, "logps/chosen": -284.1427307128906, "logps/rejected": -211.85299682617188, "loss": 0.6759, "positive_losses": 0.000370025634765625, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.04542272537946701, "rewards/margins": 0.033125050365924835, "rewards/margins_max": 0.06794625520706177, "rewards/margins_min": 0.004188986029475927, "rewards/margins_std": 0.029155880212783813, "rewards/rejected": 0.012297680601477623, "step": 150 }, { "dpo_losses": 0.6737908720970154, "epoch": 0.45, "grad_norm": 1.673622074152846, "learning_rate": 4.962335089142375e-07, "logits/chosen": -2.770982265472412, "logits/rejected": -2.716006278991699, "logps/chosen": -295.1300048828125, "logps/rejected": -232.91378784179688, "loss": 0.6732, "positive_losses": 0.00577545166015625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.054168350994586945, "rewards/margins": 0.03937327116727829, "rewards/margins_max": 0.0770280510187149, "rewards/margins_min": 0.005630848463624716, "rewards/margins_std": 0.03246045857667923, "rewards/rejected": 0.014795074239373207, "step": 160 }, { "dpo_losses": 0.6671137809753418, "epoch": 0.48, "grad_norm": 1.849316679565777, "learning_rate": 4.946836378394966e-07, "logits/chosen": -2.8234620094299316, "logits/rejected": -2.76485276222229, "logps/chosen": -323.1846923828125, "logps/rejected": -295.15374755859375, "loss": 0.6687, "positive_losses": 0.0, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.06436900794506073, "rewards/margins": 0.05324960872530937, "rewards/margins_max": 0.1005321741104126, "rewards/margins_min": 0.010768366977572441, "rewards/margins_std": 0.04033435881137848, "rewards/rejected": 0.011119391769170761, "step": 170 }, { "dpo_losses": 0.6649172902107239, "epoch": 0.51, "grad_norm": 1.9724747678160601, "learning_rate": 4.92870658351344e-07, "logits/chosen": -2.815851926803589, "logits/rejected": -2.727149248123169, "logps/chosen": -324.7388916015625, "logps/rejected": -292.8143615722656, "loss": 0.6646, "positive_losses": 0.0, "rewards/accuracies": 0.9375, "rewards/chosen": 0.07161912322044373, "rewards/margins": 0.057913679629564285, "rewards/margins_max": 0.1177336573600769, "rewards/margins_min": 0.01562586799263954, "rewards/margins_std": 0.04572517052292824, "rewards/rejected": 0.013705444522202015, "step": 180 }, { "dpo_losses": 0.6602508425712585, "epoch": 0.54, "grad_norm": 2.366543924703805, "learning_rate": 4.90796519947347e-07, "logits/chosen": -2.776212215423584, "logits/rejected": -2.6835803985595703, "logps/chosen": -313.767578125, "logps/rejected": -236.1117706298828, "loss": 0.6621, "positive_losses": 0.008851242251694202, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.08193298429250717, "rewards/margins": 0.0681738331913948, "rewards/margins_max": 0.1408631056547165, "rewards/margins_min": 0.011917511001229286, "rewards/margins_std": 0.059797000139951706, "rewards/rejected": 0.013759145513176918, "step": 190 }, { "dpo_losses": 0.659784197807312, "epoch": 0.56, "grad_norm": 8.125986362153837, "learning_rate": 4.88463452949359e-07, "logits/chosen": -2.851494312286377, "logits/rejected": -2.774346113204956, "logps/chosen": -330.9751281738281, "logps/rejected": -263.9167175292969, "loss": 0.6627, "positive_losses": 0.008233070373535156, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.08557725697755814, "rewards/margins": 0.06877057254314423, "rewards/margins_max": 0.13887056708335876, "rewards/margins_min": 0.01435057632625103, "rewards/margins_std": 0.05443096160888672, "rewards/rejected": 0.016806693747639656, "step": 200 }, { "epoch": 0.56, "eval_dpo_losses": 0.6858174800872803, "eval_logits/chosen": -2.7871391773223877, "eval_logits/rejected": -2.7485153675079346, "eval_logps/chosen": -279.9480895996094, "eval_logps/rejected": -255.46397399902344, "eval_loss": 0.6995190382003784, "eval_positive_losses": 0.1222764253616333, "eval_rewards/accuracies": 0.5960000157356262, "eval_rewards/chosen": 0.04645317792892456, "eval_rewards/margins": 0.015303360298275948, "eval_rewards/margins_max": 0.08988457173109055, "eval_rewards/margins_min": -0.049583785235881805, "eval_rewards/margins_std": 0.04650707170367241, "eval_rewards/rejected": 0.031149819493293762, "eval_runtime": 428.2785, "eval_samples_per_second": 4.67, "eval_steps_per_second": 0.292, "step": 200 }, { "dpo_losses": 0.6529717445373535, "epoch": 0.59, "grad_norm": 1.9509652365320336, "learning_rate": 4.858739661052539e-07, "logits/chosen": -2.7506725788116455, "logits/rejected": -2.722893238067627, "logps/chosen": -310.4784240722656, "logps/rejected": -255.10678100585938, "loss": 0.6562, "positive_losses": 0.017461013048887253, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.09705875813961029, "rewards/margins": 0.08387880027294159, "rewards/margins_max": 0.17185379564762115, "rewards/margins_min": 0.010865801945328712, "rewards/margins_std": 0.07355490326881409, "rewards/rejected": 0.013179955072700977, "step": 210 }, { "dpo_losses": 0.6484541893005371, "epoch": 0.62, "grad_norm": 1.8456619177361002, "learning_rate": 4.830308438912687e-07, "logits/chosen": -2.8496181964874268, "logits/rejected": -2.7542800903320312, "logps/chosen": -342.6438293457031, "logps/rejected": -272.774169921875, "loss": 0.6524, "positive_losses": 0.0, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.10812418162822723, "rewards/margins": 0.0925908237695694, "rewards/margins_max": 0.1654980480670929, "rewards/margins_min": 0.022400889545679092, "rewards/margins_std": 0.06456473469734192, "rewards/rejected": 0.015533369965851307, "step": 220 }, { "dpo_losses": 0.6464593410491943, "epoch": 0.65, "grad_norm": 1.9535872255784013, "learning_rate": 4.799371435178545e-07, "logits/chosen": -2.849377393722534, "logits/rejected": -2.7559587955474854, "logps/chosen": -360.3794860839844, "logps/rejected": -260.52764892578125, "loss": 0.6444, "positive_losses": 0.015559768304228783, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1129622682929039, "rewards/margins": 0.09730460494756699, "rewards/margins_max": 0.18746943771839142, "rewards/margins_min": 0.02147417888045311, "rewards/margins_std": 0.07491712272167206, "rewards/rejected": 0.015657661482691765, "step": 230 }, { "dpo_losses": 0.653369128704071, "epoch": 0.68, "grad_norm": 1.576015913761948, "learning_rate": 4.765961916422574e-07, "logits/chosen": -2.7950048446655273, "logits/rejected": -2.725090742111206, "logps/chosen": -314.9557800292969, "logps/rejected": -281.58880615234375, "loss": 0.6537, "positive_losses": 0.016252517700195312, "rewards/accuracies": 0.875, "rewards/chosen": 0.09841253608465195, "rewards/margins": 0.08255226910114288, "rewards/margins_max": 0.16673865914344788, "rewards/margins_min": 0.015284004621207714, "rewards/margins_std": 0.06914026290178299, "rewards/rejected": 0.015860268846154213, "step": 240 }, { "dpo_losses": 0.6478245258331299, "epoch": 0.7, "grad_norm": 1.8968881320655915, "learning_rate": 4.730115807913626e-07, "logits/chosen": -2.792325258255005, "logits/rejected": -2.715862989425659, "logps/chosen": -300.74859619140625, "logps/rejected": -249.9875030517578, "loss": 0.6546, "positive_losses": 0.023810099810361862, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1053255945444107, "rewards/margins": 0.09475791454315186, "rewards/margins_max": 0.19134044647216797, "rewards/margins_min": 0.01620624028146267, "rewards/margins_std": 0.08089859038591385, "rewards/rejected": 0.01056766789406538, "step": 250 }, { "dpo_losses": 0.6441227197647095, "epoch": 0.73, "grad_norm": 1.7119013888667354, "learning_rate": 4.691871654986485e-07, "logits/chosen": -2.839354991912842, "logits/rejected": -2.756789445877075, "logps/chosen": -280.3956298828125, "logps/rejected": -257.5377197265625, "loss": 0.6395, "positive_losses": 0.004910898394882679, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.11697886139154434, "rewards/margins": 0.10304615646600723, "rewards/margins_max": 0.20939341187477112, "rewards/margins_min": 0.02202121913433075, "rewards/margins_std": 0.08740874379873276, "rewards/rejected": 0.01393270492553711, "step": 260 }, { "dpo_losses": 0.6456023454666138, "epoch": 0.76, "grad_norm": 1.984892770095366, "learning_rate": 4.6512705815940536e-07, "logits/chosen": -2.783090591430664, "logits/rejected": -2.730664014816284, "logps/chosen": -297.0513000488281, "logps/rejected": -260.38958740234375, "loss": 0.6522, "positive_losses": 0.08494539558887482, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.12064293771982193, "rewards/margins": 0.09954321384429932, "rewards/margins_max": 0.18979784846305847, "rewards/margins_min": 0.028997600078582764, "rewards/margins_std": 0.07252895832061768, "rewards/rejected": 0.021099697798490524, "step": 270 }, { "dpo_losses": 0.6418065428733826, "epoch": 0.79, "grad_norm": 1.7581310746474135, "learning_rate": 4.6083562460867544e-07, "logits/chosen": -2.75692081451416, "logits/rejected": -2.717991590499878, "logps/chosen": -290.9193420410156, "logps/rejected": -270.82281494140625, "loss": 0.6449, "positive_losses": 0.07803992927074432, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1253766566514969, "rewards/margins": 0.10763327777385712, "rewards/margins_max": 0.20058684051036835, "rewards/margins_min": 0.02709307335317135, "rewards/margins_std": 0.07942862808704376, "rewards/rejected": 0.017743363976478577, "step": 280 }, { "dpo_losses": 0.6451443433761597, "epoch": 0.82, "grad_norm": 1.741728444197785, "learning_rate": 4.563174794266683e-07, "logits/chosen": -2.7625350952148438, "logits/rejected": -2.7133007049560547, "logps/chosen": -275.4216003417969, "logps/rejected": -262.64422607421875, "loss": 0.655, "positive_losses": 0.12835732102394104, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.11494598537683487, "rewards/margins": 0.10120587050914764, "rewards/margins_max": 0.21776075661182404, "rewards/margins_min": 0.010339265689253807, "rewards/margins_std": 0.09577289968729019, "rewards/rejected": 0.013740080408751965, "step": 290 }, { "dpo_losses": 0.6270988583564758, "epoch": 0.85, "grad_norm": 1.8935337862274761, "learning_rate": 4.515774809767012e-07, "logits/chosen": -2.8255112171173096, "logits/rejected": -2.748577833175659, "logps/chosen": -331.1516418457031, "logps/rejected": -288.11700439453125, "loss": 0.6293, "positive_losses": 0.02850809134542942, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1536138951778412, "rewards/margins": 0.1404499113559723, "rewards/margins_max": 0.27252131700515747, "rewards/margins_min": 0.027482062578201294, "rewards/margins_std": 0.10937514156103134, "rewards/rejected": 0.01316398847848177, "step": 300 }, { "epoch": 0.85, "eval_dpo_losses": 0.6802704930305481, "eval_logits/chosen": -2.768414258956909, "eval_logits/rejected": -2.730605363845825, "eval_logps/chosen": -277.8385009765625, "eval_logps/rejected": -254.60328674316406, "eval_loss": 0.7193401455879211, "eval_positive_losses": 0.35523319244384766, "eval_rewards/accuracies": 0.5960000157356262, "eval_rewards/chosen": 0.06754905730485916, "eval_rewards/margins": 0.027792593464255333, "eval_rewards/margins_max": 0.16008007526397705, "eval_rewards/margins_min": -0.08630798757076263, "eval_rewards/margins_std": 0.08258534967899323, "eval_rewards/rejected": 0.03975646197795868, "eval_runtime": 428.0151, "eval_samples_per_second": 4.673, "eval_steps_per_second": 0.292, "step": 300 }, { "dpo_losses": 0.6279357671737671, "epoch": 0.87, "grad_norm": 1.7270459495090738, "learning_rate": 4.4662072618099887e-07, "logits/chosen": -2.820918560028076, "logits/rejected": -2.7266643047332764, "logps/chosen": -282.9930725097656, "logps/rejected": -213.27236938476562, "loss": 0.6267, "positive_losses": 0.0, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.14822781085968018, "rewards/margins": 0.1387120932340622, "rewards/margins_max": 0.27559134364128113, "rewards/margins_min": 0.03083965554833412, "rewards/margins_std": 0.10963471978902817, "rewards/rejected": 0.009515730664134026, "step": 310 }, { "dpo_losses": 0.6319230794906616, "epoch": 0.9, "grad_norm": 7.736909208406228, "learning_rate": 4.414525450399712e-07, "logits/chosen": -2.78419828414917, "logits/rejected": -2.72727370262146, "logps/chosen": -294.29937744140625, "logps/rejected": -259.6752014160156, "loss": 0.6334, "positive_losses": 0.07355575263500214, "rewards/accuracies": 0.9375, "rewards/chosen": 0.13973799347877502, "rewards/margins": 0.12978938221931458, "rewards/margins_max": 0.26664164662361145, "rewards/margins_min": 0.02372616156935692, "rewards/margins_std": 0.10929105430841446, "rewards/rejected": 0.009948636405169964, "step": 320 }, { "dpo_losses": 0.6180365085601807, "epoch": 0.93, "grad_norm": 1.9077047206667683, "learning_rate": 4.360784949008615e-07, "logits/chosen": -2.8559248447418213, "logits/rejected": -2.775313138961792, "logps/chosen": -338.38507080078125, "logps/rejected": -267.0349426269531, "loss": 0.6252, "positive_losses": 0.016361523419618607, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.15934768319129944, "rewards/margins": 0.16030099987983704, "rewards/margins_max": 0.2862982153892517, "rewards/margins_min": 0.039303008466959, "rewards/margins_std": 0.11248087882995605, "rewards/rejected": -0.0009533234988339245, "step": 330 }, { "dpo_losses": 0.6223454475402832, "epoch": 0.96, "grad_norm": 1.835838607397251, "learning_rate": 4.305043544819289e-07, "logits/chosen": -2.741628646850586, "logits/rejected": -2.690565586090088, "logps/chosen": -283.5008239746094, "logps/rejected": -249.989013671875, "loss": 0.621, "positive_losses": 0.06820764392614365, "rewards/accuracies": 0.9375, "rewards/chosen": 0.16344675421714783, "rewards/margins": 0.15164029598236084, "rewards/margins_max": 0.32016128301620483, "rewards/margins_min": 0.039167340844869614, "rewards/margins_std": 0.12687677145004272, "rewards/rejected": 0.011806446127593517, "step": 340 }, { "dpo_losses": 0.6281483173370361, "epoch": 0.99, "grad_norm": 5.3160905260769695, "learning_rate": 4.247361176585903e-07, "logits/chosen": -2.7237563133239746, "logits/rejected": -2.6812942028045654, "logps/chosen": -300.01416015625, "logps/rejected": -285.3011779785156, "loss": 0.6314, "positive_losses": 0.09978675842285156, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1494494080543518, "rewards/margins": 0.1379416286945343, "rewards/margins_max": 0.26943984627723694, "rewards/margins_min": 0.03328308090567589, "rewards/margins_std": 0.10527817159891129, "rewards/rejected": 0.011507781222462654, "step": 350 }, { "dpo_losses": 0.6205986142158508, "epoch": 1.01, "grad_norm": 1.8675515047500797, "learning_rate": 4.187799870182038e-07, "logits/chosen": -2.76216983795166, "logits/rejected": -2.704974412918091, "logps/chosen": -284.5892028808594, "logps/rejected": -252.9727783203125, "loss": 0.6217, "positive_losses": 0.02007141150534153, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1552152931690216, "rewards/margins": 0.15586945414543152, "rewards/margins_max": 0.3017035126686096, "rewards/margins_min": 0.036038514226675034, "rewards/margins_std": 0.11902729421854019, "rewards/rejected": -0.0006541303591802716, "step": 360 }, { "dpo_losses": 0.6088902354240417, "epoch": 1.04, "grad_norm": 1.9531792595002184, "learning_rate": 4.126423671904236e-07, "logits/chosen": -2.7716732025146484, "logits/rejected": -2.7078254222869873, "logps/chosen": -336.2352600097656, "logps/rejected": -244.32177734375, "loss": 0.6125, "positive_losses": 0.02638239786028862, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17914316058158875, "rewards/margins": 0.18257828056812286, "rewards/margins_max": 0.3468923568725586, "rewards/margins_min": 0.04474998638033867, "rewards/margins_std": 0.1425740271806717, "rewards/rejected": -0.003435105085372925, "step": 370 }, { "dpo_losses": 0.6105533838272095, "epoch": 1.07, "grad_norm": 1.9479255132083297, "learning_rate": 4.0632985796030007e-07, "logits/chosen": -2.7388248443603516, "logits/rejected": -2.6531803607940674, "logps/chosen": -268.4936218261719, "logps/rejected": -236.57095336914062, "loss": 0.61, "positive_losses": 0.008747863583266735, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.1682673990726471, "rewards/margins": 0.17755943536758423, "rewards/margins_max": 0.3241942226886749, "rewards/margins_min": 0.05380522087216377, "rewards/margins_std": 0.125107079744339, "rewards/rejected": -0.00929203350096941, "step": 380 }, { "dpo_losses": 0.6172813773155212, "epoch": 1.1, "grad_norm": 8.407134849747322, "learning_rate": 3.9984924717152713e-07, "logits/chosen": -2.7552475929260254, "logits/rejected": -2.7269043922424316, "logps/chosen": -285.97760009765625, "logps/rejected": -262.98431396484375, "loss": 0.6282, "positive_losses": 0.06859800964593887, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.15436658263206482, "rewards/margins": 0.1631786823272705, "rewards/margins_max": 0.3312760889530182, "rewards/margins_min": 0.041763290762901306, "rewards/margins_std": 0.13355527818202972, "rewards/rejected": -0.008812101557850838, "step": 390 }, { "dpo_losses": 0.6174243688583374, "epoch": 1.13, "grad_norm": 3.651013920669953, "learning_rate": 3.932075034274723e-07, "logits/chosen": -2.758373975753784, "logits/rejected": -2.6981711387634277, "logps/chosen": -276.49871826171875, "logps/rejected": -245.08224487304688, "loss": 0.6236, "positive_losses": 0.0625574141740799, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.15731404721736908, "rewards/margins": 0.16287846863269806, "rewards/margins_max": 0.3262539505958557, "rewards/margins_min": 0.043569326400756836, "rewards/margins_std": 0.13064977526664734, "rewards/rejected": -0.005564402788877487, "step": 400 }, { "epoch": 1.13, "eval_dpo_losses": 0.6756274700164795, "eval_logits/chosen": -2.749445915222168, "eval_logits/rejected": -2.711932420730591, "eval_logps/chosen": -276.5967712402344, "eval_logps/rejected": -254.45849609375, "eval_loss": 0.7519370317459106, "eval_positive_losses": 0.6894010305404663, "eval_rewards/accuracies": 0.609000027179718, "eval_rewards/chosen": 0.07996628433465958, "eval_rewards/margins": 0.03876199945807457, "eval_rewards/margins_max": 0.21824941039085388, "eval_rewards/margins_min": -0.11400224268436432, "eval_rewards/margins_std": 0.11133058369159698, "eval_rewards/rejected": 0.0412042960524559, "eval_runtime": 427.9231, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 400 }, { "dpo_losses": 0.6167701482772827, "epoch": 1.15, "grad_norm": 1.9654856670345158, "learning_rate": 3.8641176859783383e-07, "logits/chosen": -2.7697155475616455, "logits/rejected": -2.707765817642212, "logps/chosen": -278.96673583984375, "logps/rejected": -248.4454345703125, "loss": 0.6213, "positive_losses": 0.21929892897605896, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.17709851264953613, "rewards/margins": 0.16643425822257996, "rewards/margins_max": 0.3442317843437195, "rewards/margins_min": 0.015078430995345116, "rewards/margins_std": 0.15533670783042908, "rewards/rejected": 0.010664242319762707, "step": 410 }, { "dpo_losses": 0.6068592071533203, "epoch": 1.18, "grad_norm": 2.3100164325344315, "learning_rate": 3.7946935013898606e-07, "logits/chosen": -2.7368392944335938, "logits/rejected": -2.677018642425537, "logps/chosen": -298.12542724609375, "logps/rejected": -266.35406494140625, "loss": 0.6085, "positive_losses": 0.038385771214962006, "rewards/accuracies": 0.9375, "rewards/chosen": 0.19851230084896088, "rewards/margins": 0.18865473568439484, "rewards/margins_max": 0.4086156487464905, "rewards/margins_min": 0.0322859063744545, "rewards/margins_std": 0.16774147748947144, "rewards/rejected": 0.00985757913440466, "step": 420 }, { "dpo_losses": 0.6006068587303162, "epoch": 1.21, "grad_norm": 6.525192737290529, "learning_rate": 3.7238771323626817e-07, "logits/chosen": -2.768768787384033, "logits/rejected": -2.681751012802124, "logps/chosen": -323.71514892578125, "logps/rejected": -260.93621826171875, "loss": 0.6082, "positive_losses": 0.09947166591882706, "rewards/accuracies": 0.9375, "rewards/chosen": 0.1950441300868988, "rewards/margins": 0.2026427984237671, "rewards/margins_max": 0.3679996430873871, "rewards/margins_min": 0.06226770952343941, "rewards/margins_std": 0.1420767605304718, "rewards/rejected": -0.007598669268190861, "step": 430 }, { "dpo_losses": 0.6032605171203613, "epoch": 1.24, "grad_norm": 1.6385345947087144, "learning_rate": 3.651744727766676e-07, "logits/chosen": -2.768833637237549, "logits/rejected": -2.702613353729248, "logps/chosen": -273.583740234375, "logps/rejected": -246.775390625, "loss": 0.6049, "positive_losses": 0.0, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17607031762599945, "rewards/margins": 0.1937752068042755, "rewards/margins_max": 0.34389567375183105, "rewards/margins_min": 0.06312290579080582, "rewards/margins_std": 0.1278633177280426, "rewards/rejected": -0.017704878002405167, "step": 440 }, { "dpo_losses": 0.6007322072982788, "epoch": 1.27, "grad_norm": 1.9767106796026328, "learning_rate": 3.5783738516052897e-07, "logits/chosen": -2.7184462547302246, "logits/rejected": -2.658703327178955, "logps/chosen": -294.31878662109375, "logps/rejected": -261.46221923828125, "loss": 0.6054, "positive_losses": 0.05530524253845215, "rewards/accuracies": 1.0, "rewards/chosen": 0.17491844296455383, "rewards/margins": 0.20017877221107483, "rewards/margins_max": 0.37646952271461487, "rewards/margins_min": 0.05014311149716377, "rewards/margins_std": 0.14955812692642212, "rewards/rejected": -0.02526034787297249, "step": 450 }, { "dpo_losses": 0.5913182497024536, "epoch": 1.3, "grad_norm": 1.9303555587009904, "learning_rate": 3.5038433996109404e-07, "logits/chosen": -2.786879777908325, "logits/rejected": -2.722357749938965, "logps/chosen": -335.79766845703125, "logps/rejected": -299.84027099609375, "loss": 0.5936, "positive_losses": 0.03368115425109863, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19536344707012177, "rewards/margins": 0.22316019237041473, "rewards/margins_max": 0.4146662652492523, "rewards/margins_min": 0.05830659344792366, "rewards/margins_std": 0.1629330813884735, "rewards/rejected": -0.027796756476163864, "step": 460 }, { "dpo_losses": 0.5880565643310547, "epoch": 1.32, "grad_norm": 5.289024274003296, "learning_rate": 3.428233514408398e-07, "logits/chosen": -2.7258143424987793, "logits/rejected": -2.703859806060791, "logps/chosen": -296.7102355957031, "logps/rejected": -250.7404022216797, "loss": 0.6047, "positive_losses": 0.15512129664421082, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.19506558775901794, "rewards/margins": 0.23098190128803253, "rewards/margins_max": 0.4174633026123047, "rewards/margins_min": 0.0681539997458458, "rewards/margins_std": 0.15801438689231873, "rewards/rejected": -0.0359162911772728, "step": 470 }, { "dpo_losses": 0.5843189358711243, "epoch": 1.35, "grad_norm": 1.5003059393102265, "learning_rate": 3.3516254993373945e-07, "logits/chosen": -2.755491018295288, "logits/rejected": -2.681509256362915, "logps/chosen": -311.5091247558594, "logps/rejected": -245.88937377929688, "loss": 0.5939, "positive_losses": 0.055457282811403275, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.19728389382362366, "rewards/margins": 0.23806998133659363, "rewards/margins_max": 0.4085385203361511, "rewards/margins_min": 0.06948045641183853, "rewards/margins_std": 0.15313410758972168, "rewards/rejected": -0.04078609496355057, "step": 480 }, { "dpo_losses": 0.588597297668457, "epoch": 1.38, "grad_norm": 2.2336443793585277, "learning_rate": 3.274101731027105e-07, "logits/chosen": -2.709615707397461, "logits/rejected": -2.663390636444092, "logps/chosen": -318.6739807128906, "logps/rejected": -286.79541015625, "loss": 0.593, "positive_losses": 0.0, "rewards/accuracies": 0.9375, "rewards/chosen": 0.20303764939308167, "rewards/margins": 0.23158419132232666, "rewards/margins_max": 0.457578182220459, "rewards/margins_min": 0.04808598756790161, "rewards/margins_std": 0.18310314416885376, "rewards/rejected": -0.028546560555696487, "step": 490 }, { "dpo_losses": 0.598129153251648, "epoch": 1.41, "grad_norm": 1.719544868467305, "learning_rate": 3.1957455708165314e-07, "logits/chosen": -2.6921727657318115, "logits/rejected": -2.678170680999756, "logps/chosen": -280.46063232421875, "logps/rejected": -249.769287109375, "loss": 0.6009, "positive_losses": 0.036452483385801315, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.17304545640945435, "rewards/margins": 0.2069230079650879, "rewards/margins_max": 0.40413007140159607, "rewards/margins_min": 0.054103028029203415, "rewards/margins_std": 0.15941500663757324, "rewards/rejected": -0.03387756645679474, "step": 500 }, { "epoch": 1.41, "eval_dpo_losses": 0.6718186140060425, "eval_logits/chosen": -2.7294745445251465, "eval_logits/rejected": -2.692014694213867, "eval_logps/chosen": -278.2060852050781, "eval_logps/rejected": -257.0343322753906, "eval_loss": 0.8434445858001709, "eval_positive_losses": 1.5495002269744873, "eval_rewards/accuracies": 0.609000027179718, "eval_rewards/chosen": 0.06387308984994888, "eval_rewards/margins": 0.04842698201537132, "eval_rewards/margins_max": 0.2708706855773926, "eval_rewards/margins_min": -0.1440476030111313, "eval_rewards/margins_std": 0.1388508826494217, "eval_rewards/rejected": 0.015446108765900135, "eval_runtime": 427.8728, "eval_samples_per_second": 4.674, "eval_steps_per_second": 0.292, "step": 500 }, { "dpo_losses": 0.5843192338943481, "epoch": 1.44, "grad_norm": 3.509981635934115, "learning_rate": 3.116641275116018e-07, "logits/chosen": -2.7144105434417725, "logits/rejected": -2.628718614578247, "logps/chosen": -321.0252380371094, "logps/rejected": -263.8638000488281, "loss": 0.5918, "positive_losses": 0.06754092872142792, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1961548924446106, "rewards/margins": 0.24090898036956787, "rewards/margins_max": 0.46518999338150024, "rewards/margins_min": 0.07308268547058105, "rewards/margins_std": 0.1766786277294159, "rewards/rejected": -0.04475412517786026, "step": 510 }, { "dpo_losses": 0.5933550596237183, "epoch": 1.46, "grad_norm": 1.7887167117515541, "learning_rate": 3.036873904806295e-07, "logits/chosen": -2.7657430171966553, "logits/rejected": -2.6845974922180176, "logps/chosen": -281.34649658203125, "logps/rejected": -241.7858123779297, "loss": 0.6137, "positive_losses": 0.2598763406276703, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19071096181869507, "rewards/margins": 0.22014352679252625, "rewards/margins_max": 0.44077128171920776, "rewards/margins_min": 0.0637890174984932, "rewards/margins_std": 0.17364180088043213, "rewards/rejected": -0.02943255938589573, "step": 520 }, { "dpo_losses": 0.5821532011032104, "epoch": 1.49, "grad_norm": 10.11338724174373, "learning_rate": 2.956529233772492e-07, "logits/chosen": -2.769479513168335, "logits/rejected": -2.7147421836853027, "logps/chosen": -314.7546081542969, "logps/rejected": -300.4794921875, "loss": 0.618, "positive_losses": 0.35480666160583496, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.19147683680057526, "rewards/margins": 0.2443142831325531, "rewards/margins_max": 0.41471806168556213, "rewards/margins_min": 0.07710012048482895, "rewards/margins_std": 0.14999482035636902, "rewards/rejected": -0.05283746123313904, "step": 530 }, { "dpo_losses": 0.5760180950164795, "epoch": 1.52, "grad_norm": 1.9182115700216993, "learning_rate": 2.875693656671431e-07, "logits/chosen": -2.761521577835083, "logits/rejected": -2.678863286972046, "logps/chosen": -320.120849609375, "logps/rejected": -249.3918914794922, "loss": 0.5887, "positive_losses": 0.12351331859827042, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2093885838985443, "rewards/margins": 0.26241612434387207, "rewards/margins_max": 0.5384218692779541, "rewards/margins_min": 0.05588661879301071, "rewards/margins_std": 0.2135990560054779, "rewards/rejected": -0.05302751809358597, "step": 540 }, { "dpo_losses": 0.5680991411209106, "epoch": 1.55, "grad_norm": 1.947863946973032, "learning_rate": 2.794454096031429e-07, "logits/chosen": -2.7631947994232178, "logits/rejected": -2.695374011993408, "logps/chosen": -349.5855407714844, "logps/rejected": -319.5202941894531, "loss": 0.5909, "positive_losses": 0.10299448668956757, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23324398696422577, "rewards/margins": 0.2784574627876282, "rewards/margins_max": 0.5278440713882446, "rewards/margins_min": 0.08051635324954987, "rewards/margins_std": 0.20143046975135803, "rewards/rejected": -0.04521343857049942, "step": 550 }, { "dpo_losses": 0.5882316827774048, "epoch": 1.58, "grad_norm": 6.883043057999946, "learning_rate": 2.7128979087844593e-07, "logits/chosen": -2.733025312423706, "logits/rejected": -2.6712567806243896, "logps/chosen": -285.25115966796875, "logps/rejected": -269.1498107910156, "loss": 0.5847, "positive_losses": 0.1969757080078125, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.1997162401676178, "rewards/margins": 0.23153042793273926, "rewards/margins_max": 0.4535094201564789, "rewards/margins_min": 0.05927438288927078, "rewards/margins_std": 0.17876756191253662, "rewards/rejected": -0.03181419149041176, "step": 560 }, { "dpo_losses": 0.5802090764045715, "epoch": 1.61, "grad_norm": 1.7758243312216686, "learning_rate": 2.6311127923312153e-07, "logits/chosen": -2.7622435092926025, "logits/rejected": -2.6936147212982178, "logps/chosen": -315.955322265625, "logps/rejected": -297.02313232421875, "loss": 0.5892, "positive_losses": 0.21824970841407776, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1967122107744217, "rewards/margins": 0.2509615123271942, "rewards/margins_max": 0.4743427336215973, "rewards/margins_min": 0.049542300403118134, "rewards/margins_std": 0.19265086948871613, "rewards/rejected": -0.05424932390451431, "step": 570 }, { "dpo_losses": 0.5920171737670898, "epoch": 1.63, "grad_norm": 1.860643226808477, "learning_rate": 2.5491866902400565e-07, "logits/chosen": -2.7078280448913574, "logits/rejected": -2.6779098510742188, "logps/chosen": -269.2068786621094, "logps/rejected": -271.0436096191406, "loss": 0.5991, "positive_losses": 0.12158451229333878, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.17235751450061798, "rewards/margins": 0.22299370169639587, "rewards/margins_max": 0.43409761786460876, "rewards/margins_min": 0.05049045756459236, "rewards/margins_std": 0.1718008816242218, "rewards/rejected": -0.05063622072339058, "step": 580 }, { "dpo_losses": 0.5816795229911804, "epoch": 1.66, "grad_norm": 24.09510524317517, "learning_rate": 2.4672076976812543e-07, "logits/chosen": -2.720360279083252, "logits/rejected": -2.6442887783050537, "logps/chosen": -288.3584289550781, "logps/rejected": -267.7664794921875, "loss": 0.5961, "positive_losses": 0.3148724436759949, "rewards/accuracies": 0.9375, "rewards/chosen": 0.183799147605896, "rewards/margins": 0.24748222529888153, "rewards/margins_max": 0.4496613144874573, "rewards/margins_min": 0.06704072654247284, "rewards/margins_std": 0.17722192406654358, "rewards/rejected": -0.06368308514356613, "step": 590 }, { "dpo_losses": 0.576720118522644, "epoch": 1.69, "grad_norm": 2.1497917207479507, "learning_rate": 2.385263966698222e-07, "logits/chosen": -2.728609085083008, "logits/rejected": -2.6675384044647217, "logps/chosen": -273.47369384765625, "logps/rejected": -296.99310302734375, "loss": 0.6136, "positive_losses": 0.32597580552101135, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.19901221990585327, "rewards/margins": 0.26053741574287415, "rewards/margins_max": 0.5135586261749268, "rewards/margins_min": 0.06456376612186432, "rewards/margins_std": 0.20331530272960663, "rewards/rejected": -0.06152517348527908, "step": 600 }, { "epoch": 1.69, "eval_dpo_losses": 0.6691358685493469, "eval_logits/chosen": -2.7203145027160645, "eval_logits/rejected": -2.682708263397217, "eval_logps/chosen": -277.7243957519531, "eval_logps/rejected": -257.23602294921875, "eval_loss": 0.8727347254753113, "eval_positive_losses": 1.8302409648895264, "eval_rewards/accuracies": 0.6129999756813049, "eval_rewards/chosen": 0.06869003176689148, "eval_rewards/margins": 0.055261291563510895, "eval_rewards/margins_max": 0.30490291118621826, "eval_rewards/margins_min": -0.15945661067962646, "eval_rewards/margins_std": 0.1552800089120865, "eval_rewards/rejected": 0.013428742997348309, "eval_runtime": 427.6004, "eval_samples_per_second": 4.677, "eval_steps_per_second": 0.292, "step": 600 }, { "dpo_losses": 0.5747733116149902, "epoch": 1.72, "grad_norm": 6.65096965403295, "learning_rate": 2.3034436114175838e-07, "logits/chosen": -2.7278988361358643, "logits/rejected": -2.6467108726501465, "logps/chosen": -305.2555847167969, "logps/rejected": -246.3316192626953, "loss": 0.5995, "positive_losses": 0.20225219428539276, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.22444753348827362, "rewards/margins": 0.2688870429992676, "rewards/margins_max": 0.5200773477554321, "rewards/margins_min": 0.04230537265539169, "rewards/margins_std": 0.21579697728157043, "rewards/rejected": -0.044439516961574554, "step": 610 }, { "dpo_losses": 0.5641311407089233, "epoch": 1.75, "grad_norm": 11.81079993553305, "learning_rate": 2.2218346133000264e-07, "logits/chosen": -2.7867112159729004, "logits/rejected": -2.71598744392395, "logps/chosen": -318.68121337890625, "logps/rejected": -289.48309326171875, "loss": 0.6115, "positive_losses": 0.15610671043395996, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.22943755984306335, "rewards/margins": 0.29184073209762573, "rewards/margins_max": 0.5524952411651611, "rewards/margins_min": 0.07403835654258728, "rewards/margins_std": 0.2165592461824417, "rewards/rejected": -0.06240314990282059, "step": 620 }, { "dpo_losses": 0.5642560720443726, "epoch": 1.77, "grad_norm": 7.357647930061899, "learning_rate": 2.1405247265337917e-07, "logits/chosen": -2.693892002105713, "logits/rejected": -2.6591708660125732, "logps/chosen": -349.5628356933594, "logps/rejected": -286.9067687988281, "loss": 0.5786, "positive_losses": 0.0, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2449304610490799, "rewards/margins": 0.28937476873397827, "rewards/margins_max": 0.5107169151306152, "rewards/margins_min": 0.09730032831430435, "rewards/margins_std": 0.18998651206493378, "rewards/rejected": -0.04444430395960808, "step": 630 }, { "dpo_losses": 0.5737847089767456, "epoch": 1.8, "grad_norm": 1.3349176272864234, "learning_rate": 2.0596013836725657e-07, "logits/chosen": -2.779600143432617, "logits/rejected": -2.700307846069336, "logps/chosen": -290.4892578125, "logps/rejected": -238.4832305908203, "loss": 0.5953, "positive_losses": 0.20754241943359375, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.2137231081724167, "rewards/margins": 0.2684459686279297, "rewards/margins_max": 0.4815581440925598, "rewards/margins_min": 0.08431941270828247, "rewards/margins_std": 0.1738905906677246, "rewards/rejected": -0.0547228567302227, "step": 640 }, { "dpo_losses": 0.5745794773101807, "epoch": 1.83, "grad_norm": 2.223670875381027, "learning_rate": 1.9791516016192213e-07, "logits/chosen": -2.7533204555511475, "logits/rejected": -2.6931979656219482, "logps/chosen": -308.6822814941406, "logps/rejected": -296.13787841796875, "loss": 0.6091, "positive_losses": 0.41213884949684143, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.21507827937602997, "rewards/margins": 0.26380831003189087, "rewards/margins_max": 0.4691304564476013, "rewards/margins_min": 0.051321595907211304, "rewards/margins_std": 0.18508048355579376, "rewards/rejected": -0.04873000457882881, "step": 650 }, { "dpo_losses": 0.5978478193283081, "epoch": 1.86, "grad_norm": 1.9229193850326198, "learning_rate": 1.8992618880565036e-07, "logits/chosen": -2.676647663116455, "logits/rejected": -2.6538872718811035, "logps/chosen": -228.5106658935547, "logps/rejected": -218.3628387451172, "loss": 0.6228, "positive_losses": 0.36836346983909607, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.17657354474067688, "rewards/margins": 0.20908644795417786, "rewards/margins_max": 0.38408344984054565, "rewards/margins_min": 0.0495840385556221, "rewards/margins_std": 0.1525198072195053, "rewards/rejected": -0.03251289203763008, "step": 660 }, { "dpo_losses": 0.5676581859588623, "epoch": 1.89, "grad_norm": 10.31975804008308, "learning_rate": 1.8200181484252885e-07, "logits/chosen": -2.772998809814453, "logits/rejected": -2.742371082305908, "logps/chosen": -282.56695556640625, "logps/rejected": -246.2154083251953, "loss": 0.5836, "positive_losses": 0.19931070506572723, "rewards/accuracies": 1.0, "rewards/chosen": 0.23336899280548096, "rewards/margins": 0.28042665123939514, "rewards/margins_max": 0.5176241397857666, "rewards/margins_min": 0.08305230736732483, "rewards/margins_std": 0.19908221065998077, "rewards/rejected": -0.047057636082172394, "step": 670 }, { "dpo_losses": 0.570050835609436, "epoch": 1.92, "grad_norm": 6.2395285814671375, "learning_rate": 1.7415055935504233e-07, "logits/chosen": -2.791836738586426, "logits/rejected": -2.7229678630828857, "logps/chosen": -289.1046142578125, "logps/rejected": -257.9730224609375, "loss": 0.5814, "positive_losses": 0.15246662497520447, "rewards/accuracies": 0.9375, "rewards/chosen": 0.21605853736400604, "rewards/margins": 0.274807333946228, "rewards/margins_max": 0.46929216384887695, "rewards/margins_min": 0.07633324712514877, "rewards/margins_std": 0.17314451932907104, "rewards/rejected": -0.05874878168106079, "step": 680 }, { "dpo_losses": 0.5896421074867249, "epoch": 1.94, "grad_norm": 9.413363887459912, "learning_rate": 1.6638086480134952e-07, "logits/chosen": -2.7295353412628174, "logits/rejected": -2.695042133331299, "logps/chosen": -219.7963409423828, "logps/rejected": -206.1276397705078, "loss": 0.5879, "positive_losses": 0.11815419048070908, "rewards/accuracies": 0.9375, "rewards/chosen": 0.18556587398052216, "rewards/margins": 0.22936753928661346, "rewards/margins_max": 0.44191575050354004, "rewards/margins_min": 0.05077395588159561, "rewards/margins_std": 0.17514900863170624, "rewards/rejected": -0.0438016876578331, "step": 690 }, { "dpo_losses": 0.5733457207679749, "epoch": 1.97, "grad_norm": 5.98740527370756, "learning_rate": 1.5870108593710471e-07, "logits/chosen": -2.6704626083374023, "logits/rejected": -2.5912749767303467, "logps/chosen": -277.55157470703125, "logps/rejected": -214.6815948486328, "loss": 0.5918, "positive_losses": 0.17365257441997528, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.21979399025440216, "rewards/margins": 0.26772385835647583, "rewards/margins_max": 0.4699093699455261, "rewards/margins_min": 0.09799458831548691, "rewards/margins_std": 0.17043128609657288, "rewards/rejected": -0.047929856926202774, "step": 700 }, { "epoch": 1.97, "eval_dpo_losses": 0.6676550507545471, "eval_logits/chosen": -2.7172420024871826, "eval_logits/rejected": -2.679697036743164, "eval_logps/chosen": -277.88079833984375, "eval_logps/rejected": -257.7734069824219, "eval_loss": 0.8997932076454163, "eval_positive_losses": 2.0811269283294678, "eval_rewards/accuracies": 0.621999979019165, "eval_rewards/chosen": 0.06712605804204941, "eval_rewards/margins": 0.0590706393122673, "eval_rewards/margins_max": 0.32306957244873047, "eval_rewards/margins_min": -0.16852563619613647, "eval_rewards/margins_std": 0.1640576869249344, "eval_rewards/rejected": 0.00805541779845953, "eval_runtime": 427.6887, "eval_samples_per_second": 4.676, "eval_steps_per_second": 0.292, "step": 700 }, { "dpo_losses": 0.5846664309501648, "epoch": 2.0, "grad_norm": 2.012900588641833, "learning_rate": 1.5111948083158528e-07, "logits/chosen": -2.665996551513672, "logits/rejected": -2.6332955360412598, "logps/chosen": -233.13113403320312, "logps/rejected": -231.0898895263672, "loss": 0.5777, "positive_losses": 0.08777942508459091, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.17990438640117645, "rewards/margins": 0.23856917023658752, "rewards/margins_max": 0.4145428240299225, "rewards/margins_min": 0.06277941167354584, "rewards/margins_std": 0.15543320775032043, "rewards/rejected": -0.058664750307798386, "step": 710 }, { "dpo_losses": 0.5619566440582275, "epoch": 2.03, "grad_norm": 8.927999929393918, "learning_rate": 1.4364420198778658e-07, "logits/chosen": -2.803697109222412, "logits/rejected": -2.725480556488037, "logps/chosen": -341.5400085449219, "logps/rejected": -310.74261474609375, "loss": 0.5941, "positive_losses": 0.07888917624950409, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2251276969909668, "rewards/margins": 0.2960202395915985, "rewards/margins_max": 0.5476168394088745, "rewards/margins_min": 0.08313913643360138, "rewards/margins_std": 0.21061666309833527, "rewards/rejected": -0.0708925798535347, "step": 720 }, { "dpo_losses": 0.5583115816116333, "epoch": 2.06, "grad_norm": 1.8517093519732466, "learning_rate": 1.3628328757603242e-07, "logits/chosen": -2.7375550270080566, "logits/rejected": -2.673107862472534, "logps/chosen": -314.8241271972656, "logps/rejected": -281.47967529296875, "loss": 0.5671, "positive_losses": 0.10358180850744247, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.23430731892585754, "rewards/margins": 0.3044988512992859, "rewards/margins_max": 0.5319117307662964, "rewards/margins_min": 0.0928182452917099, "rewards/margins_std": 0.199130579829216, "rewards/rejected": -0.07019157707691193, "step": 730 }, { "dpo_losses": 0.5659884810447693, "epoch": 2.08, "grad_norm": 9.541706290572293, "learning_rate": 1.2904465279052723e-07, "logits/chosen": -2.7077741622924805, "logits/rejected": -2.6461472511291504, "logps/chosen": -299.73486328125, "logps/rejected": -277.62725830078125, "loss": 0.6025, "positive_losses": 0.5914154052734375, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20683905482292175, "rewards/margins": 0.28546684980392456, "rewards/margins_max": 0.5176615715026855, "rewards/margins_min": 0.09277740865945816, "rewards/margins_std": 0.1960216760635376, "rewards/rejected": -0.07862778007984161, "step": 740 }, { "dpo_losses": 0.5690917372703552, "epoch": 2.11, "grad_norm": 1.8282721044580463, "learning_rate": 1.219360813381446e-07, "logits/chosen": -2.6811976432800293, "logits/rejected": -2.6621005535125732, "logps/chosen": -266.5814514160156, "logps/rejected": -242.2976837158203, "loss": 0.5786, "positive_losses": 0.02016754075884819, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.20154385268688202, "rewards/margins": 0.27638477087020874, "rewards/margins_max": 0.4697951376438141, "rewards/margins_min": 0.1011359691619873, "rewards/margins_std": 0.16673585772514343, "rewards/rejected": -0.0748409777879715, "step": 750 }, { "dpo_losses": 0.5634604096412659, "epoch": 2.14, "grad_norm": 8.095290107453106, "learning_rate": 1.149652170686039e-07, "logits/chosen": -2.648603916168213, "logits/rejected": -2.6005616188049316, "logps/chosen": -267.0931396484375, "logps/rejected": -269.1485595703125, "loss": 0.5637, "positive_losses": 0.0, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2069512903690338, "rewards/margins": 0.2928617596626282, "rewards/margins_max": 0.559156596660614, "rewards/margins_min": 0.08480425924062729, "rewards/margins_std": 0.21423542499542236, "rewards/rejected": -0.08591042459011078, "step": 760 }, { "dpo_losses": 0.5496206879615784, "epoch": 2.17, "grad_norm": 6.6085926070789, "learning_rate": 1.0813955575503587e-07, "logits/chosen": -2.6958391666412354, "logits/rejected": -2.6329185962677, "logps/chosen": -281.89190673828125, "logps/rejected": -222.1160125732422, "loss": 0.5739, "positive_losses": 0.15439815819263458, "rewards/accuracies": 1.0, "rewards/chosen": 0.22945666313171387, "rewards/margins": 0.3267485499382019, "rewards/margins_max": 0.5543676018714905, "rewards/margins_min": 0.1200588196516037, "rewards/margins_std": 0.20475919544696808, "rewards/rejected": -0.09729186445474625, "step": 770 }, { "dpo_losses": 0.5734694004058838, "epoch": 2.2, "grad_norm": 5.752056251728322, "learning_rate": 1.0146643703377486e-07, "logits/chosen": -2.751596212387085, "logits/rejected": -2.658079147338867, "logps/chosen": -297.37677001953125, "logps/rejected": -233.66806030273438, "loss": 0.6086, "positive_losses": 0.2737409174442291, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.20195269584655762, "rewards/margins": 0.268646776676178, "rewards/margins_max": 0.4993320405483246, "rewards/margins_min": 0.051596127450466156, "rewards/margins_std": 0.20203988254070282, "rewards/rejected": -0.06669410318136215, "step": 780 }, { "dpo_losses": 0.5670967102050781, "epoch": 2.23, "grad_norm": 2.435776321138929, "learning_rate": 9.495303651204494e-08, "logits/chosen": -2.712763786315918, "logits/rejected": -2.630561113357544, "logps/chosen": -300.8298645019531, "logps/rejected": -252.09829711914062, "loss": 0.5701, "positive_losses": 0.03285813331604004, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2152232676744461, "rewards/margins": 0.2839241027832031, "rewards/margins_max": 0.5225785970687866, "rewards/margins_min": 0.08764372766017914, "rewards/margins_std": 0.1984366476535797, "rewards/rejected": -0.06870082765817642, "step": 790 }, { "dpo_losses": 0.5538218021392822, "epoch": 2.25, "grad_norm": 7.082501471048372, "learning_rate": 8.860635805202615e-08, "logits/chosen": -2.7329680919647217, "logits/rejected": -2.6982383728027344, "logps/chosen": -299.06622314453125, "logps/rejected": -320.72845458984375, "loss": 0.5636, "positive_losses": 0.2872983515262604, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.2248470038175583, "rewards/margins": 0.3167414963245392, "rewards/margins_max": 0.5594197511672974, "rewards/margins_min": 0.11445317417383194, "rewards/margins_std": 0.19880005717277527, "rewards/rejected": -0.0918944776058197, "step": 800 }, { "epoch": 2.25, "eval_dpo_losses": 0.6666676998138428, "eval_logits/chosen": -2.7115840911865234, "eval_logits/rejected": -2.6734206676483154, "eval_logps/chosen": -278.4820251464844, "eval_logps/rejected": -258.64727783203125, "eval_loss": 0.9370628595352173, "eval_positive_losses": 2.4201414585113525, "eval_rewards/accuracies": 0.6259999871253967, "eval_rewards/chosen": 0.06111405789852142, "eval_rewards/margins": 0.06179738789796829, "eval_rewards/margins_max": 0.3369947075843811, "eval_rewards/margins_min": -0.17766639590263367, "eval_rewards/margins_std": 0.17163515090942383, "eval_rewards/rejected": -0.0006833283696323633, "eval_runtime": 428.0428, "eval_samples_per_second": 4.672, "eval_steps_per_second": 0.292, "step": 800 }, { "dpo_losses": 0.561680793762207, "epoch": 2.28, "grad_norm": 6.853436586712954, "learning_rate": 8.24332262395994e-08, "logits/chosen": -2.770097255706787, "logits/rejected": -2.7309253215789795, "logps/chosen": -273.7652587890625, "logps/rejected": -278.20025634765625, "loss": 0.5736, "positive_losses": 0.0, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21009182929992676, "rewards/margins": 0.2937797009944916, "rewards/margins_max": 0.5182637572288513, "rewards/margins_min": 0.0909772664308548, "rewards/margins_std": 0.1924486607313156, "rewards/rejected": -0.0836879163980484, "step": 810 }, { "dpo_losses": 0.5531629920005798, "epoch": 2.31, "grad_norm": 2.2576986633959164, "learning_rate": 7.644027904586586e-08, "logits/chosen": -2.707874059677124, "logits/rejected": -2.6698241233825684, "logps/chosen": -305.3839416503906, "logps/rejected": -288.03485107421875, "loss": 0.5736, "positive_losses": 0.12947359681129456, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.24521002173423767, "rewards/margins": 0.32028818130493164, "rewards/margins_max": 0.5808194875717163, "rewards/margins_min": 0.09252934157848358, "rewards/margins_std": 0.21892566978931427, "rewards/rejected": -0.07507814466953278, "step": 820 }, { "dpo_losses": 0.5370634198188782, "epoch": 2.34, "grad_norm": 2.1285258942046736, "learning_rate": 7.063396068933469e-08, "logits/chosen": -2.7070274353027344, "logits/rejected": -2.6111221313476562, "logps/chosen": -358.33660888671875, "logps/rejected": -278.06463623046875, "loss": 0.5593, "positive_losses": 0.08015155792236328, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2682201564311981, "rewards/margins": 0.35513558983802795, "rewards/margins_max": 0.5836988091468811, "rewards/margins_min": 0.14265316724777222, "rewards/margins_std": 0.19289033114910126, "rewards/rejected": -0.08691541850566864, "step": 830 }, { "dpo_losses": 0.5690010786056519, "epoch": 2.37, "grad_norm": 8.70012334977324, "learning_rate": 6.502051470645148e-08, "logits/chosen": -2.7239556312561035, "logits/rejected": -2.6661264896392822, "logps/chosen": -305.3841857910156, "logps/rejected": -253.9221649169922, "loss": 0.5836, "positive_losses": 0.07451782375574112, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.20678095519542694, "rewards/margins": 0.2766776978969574, "rewards/margins_max": 0.5134538412094116, "rewards/margins_min": 0.10238063335418701, "rewards/margins_std": 0.18080976605415344, "rewards/rejected": -0.06989672034978867, "step": 840 }, { "dpo_losses": 0.5521928071975708, "epoch": 2.39, "grad_norm": 2.0434453054659096, "learning_rate": 5.960597723792194e-08, "logits/chosen": -2.7380244731903076, "logits/rejected": -2.6666131019592285, "logps/chosen": -271.77508544921875, "logps/rejected": -247.48190307617188, "loss": 0.5717, "positive_losses": 0.22315779328346252, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2434767186641693, "rewards/margins": 0.3210228681564331, "rewards/margins_max": 0.5941131711006165, "rewards/margins_min": 0.11233635246753693, "rewards/margins_std": 0.21944975852966309, "rewards/rejected": -0.077546127140522, "step": 850 }, { "dpo_losses": 0.5401885509490967, "epoch": 2.42, "grad_norm": 2.1235783253077907, "learning_rate": 5.4396170538046486e-08, "logits/chosen": -2.7592415809631348, "logits/rejected": -2.7021079063415527, "logps/chosen": -321.6221923828125, "logps/rejected": -266.2222900390625, "loss": 0.5618, "positive_losses": 0.12075729668140411, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2578734755516052, "rewards/margins": 0.35080546140670776, "rewards/margins_max": 0.625725269317627, "rewards/margins_min": 0.10036028921604156, "rewards/margins_std": 0.2362469732761383, "rewards/rejected": -0.09293195605278015, "step": 860 }, { "dpo_losses": 0.5563656687736511, "epoch": 2.45, "grad_norm": 1.8366012996504069, "learning_rate": 4.93966967140487e-08, "logits/chosen": -2.7032082080841064, "logits/rejected": -2.6532039642333984, "logps/chosen": -290.3298034667969, "logps/rejected": -287.1317138671875, "loss": 0.5634, "positive_losses": 0.1806434690952301, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2245173156261444, "rewards/margins": 0.3118094205856323, "rewards/margins_max": 0.5898565053939819, "rewards/margins_min": 0.10215729475021362, "rewards/margins_std": 0.22476975619792938, "rewards/rejected": -0.08729207515716553, "step": 870 }, { "dpo_losses": 0.5541495084762573, "epoch": 2.48, "grad_norm": 6.44345291592453, "learning_rate": 4.4612931702126433e-08, "logits/chosen": -2.8107595443725586, "logits/rejected": -2.7267684936523438, "logps/chosen": -304.7294006347656, "logps/rejected": -270.358642578125, "loss": 0.5863, "positive_losses": 0.24675989151000977, "rewards/accuracies": 1.0, "rewards/chosen": 0.2371862679719925, "rewards/margins": 0.31598663330078125, "rewards/margins_max": 0.5958508253097534, "rewards/margins_min": 0.08327536284923553, "rewards/margins_std": 0.23089298605918884, "rewards/rejected": -0.07880039513111115, "step": 880 }, { "dpo_losses": 0.5567336082458496, "epoch": 2.51, "grad_norm": 7.843101687444187, "learning_rate": 4.005001948670605e-08, "logits/chosen": -2.775538682937622, "logits/rejected": -2.697503089904785, "logps/chosen": -322.6019287109375, "logps/rejected": -273.635986328125, "loss": 0.5736, "positive_losses": 0.37784984707832336, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2152644395828247, "rewards/margins": 0.3087191581726074, "rewards/margins_max": 0.5784704685211182, "rewards/margins_min": 0.09841950982809067, "rewards/margins_std": 0.21595220267772675, "rewards/rejected": -0.09345470368862152, "step": 890 }, { "dpo_losses": 0.543450117111206, "epoch": 2.54, "grad_norm": 8.016670976771136, "learning_rate": 3.571286656911376e-08, "logits/chosen": -2.77311635017395, "logits/rejected": -2.6821038722991943, "logps/chosen": -304.2124938964844, "logps/rejected": -293.931884765625, "loss": 0.5736, "positive_losses": 0.34936437010765076, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2413357049226761, "rewards/margins": 0.34193912148475647, "rewards/margins_max": 0.5774747133255005, "rewards/margins_min": 0.10269900411367416, "rewards/margins_std": 0.21173134446144104, "rewards/rejected": -0.10060342401266098, "step": 900 }, { "epoch": 2.54, "eval_dpo_losses": 0.6658776998519897, "eval_logits/chosen": -2.7107093334198, "eval_logits/rejected": -2.672572612762451, "eval_logps/chosen": -278.8089904785156, "eval_logps/rejected": -259.1817321777344, "eval_loss": 0.9591056704521179, "eval_positive_losses": 2.626781702041626, "eval_rewards/accuracies": 0.6320000290870667, "eval_rewards/chosen": 0.0578441321849823, "eval_rewards/margins": 0.06387220323085785, "eval_rewards/margins_max": 0.34665346145629883, "eval_rewards/margins_min": -0.18229486048221588, "eval_rewards/margins_std": 0.1764223277568817, "eval_rewards/rejected": -0.006028064992278814, "eval_runtime": 427.6906, "eval_samples_per_second": 4.676, "eval_steps_per_second": 0.292, "step": 900 }, { "dpo_losses": 0.5519391894340515, "epoch": 2.56, "grad_norm": 25.625793147024005, "learning_rate": 3.160613669161255e-08, "logits/chosen": -2.73115873336792, "logits/rejected": -2.670806407928467, "logps/chosen": -299.96832275390625, "logps/rejected": -239.826904296875, "loss": 0.5768, "positive_losses": 0.3415166437625885, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23844274878501892, "rewards/margins": 0.3223797678947449, "rewards/margins_max": 0.5813696980476379, "rewards/margins_min": 0.09977176040410995, "rewards/margins_std": 0.21656760573387146, "rewards/rejected": -0.08393705636262894, "step": 910 }, { "dpo_losses": 0.5531378388404846, "epoch": 2.59, "grad_norm": 2.1656044654918523, "learning_rate": 2.7734245822478436e-08, "logits/chosen": -2.7032344341278076, "logits/rejected": -2.612525463104248, "logps/chosen": -289.3112487792969, "logps/rejected": -218.96713256835938, "loss": 0.5749, "positive_losses": 0.18140992522239685, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.2416727989912033, "rewards/margins": 0.32405149936676025, "rewards/margins_max": 0.6709355115890503, "rewards/margins_min": 0.08473904430866241, "rewards/margins_std": 0.26692530512809753, "rewards/rejected": -0.08237870782613754, "step": 920 }, { "dpo_losses": 0.5606621503829956, "epoch": 2.62, "grad_norm": 8.473170854444827, "learning_rate": 2.410135740750821e-08, "logits/chosen": -2.735752582550049, "logits/rejected": -2.671776294708252, "logps/chosen": -259.04132080078125, "logps/rejected": -235.8688201904297, "loss": 0.5834, "positive_losses": 0.36757582426071167, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.22701683640480042, "rewards/margins": 0.30118703842163086, "rewards/margins_max": 0.5452762246131897, "rewards/margins_min": 0.08702322095632553, "rewards/margins_std": 0.20802298188209534, "rewards/rejected": -0.07417017966508865, "step": 930 }, { "dpo_losses": 0.5595995187759399, "epoch": 2.65, "grad_norm": 6.88019656676779, "learning_rate": 2.071137789306418e-08, "logits/chosen": -2.7638261318206787, "logits/rejected": -2.685157537460327, "logps/chosen": -326.85858154296875, "logps/rejected": -302.33209228515625, "loss": 0.572, "positive_losses": 0.20179709792137146, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.2141149938106537, "rewards/margins": 0.3031490445137024, "rewards/margins_max": 0.5786975026130676, "rewards/margins_min": 0.07930903881788254, "rewards/margins_std": 0.2305254191160202, "rewards/rejected": -0.08903402090072632, "step": 940 }, { "dpo_losses": 0.5905728936195374, "epoch": 2.68, "grad_norm": 2.0169170880957616, "learning_rate": 1.7567952525471107e-08, "logits/chosen": -2.696136951446533, "logits/rejected": -2.6454825401306152, "logps/chosen": -245.9570770263672, "logps/rejected": -213.52334594726562, "loss": 0.6002, "positive_losses": 0.25799694657325745, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.1696554720401764, "rewards/margins": 0.22757229208946228, "rewards/margins_max": 0.4704417288303375, "rewards/margins_min": 0.05507839843630791, "rewards/margins_std": 0.18926933407783508, "rewards/rejected": -0.057916827499866486, "step": 950 }, { "dpo_losses": 0.560580313205719, "epoch": 2.7, "grad_norm": 7.04290537877064, "learning_rate": 1.467446143128101e-08, "logits/chosen": -2.787949800491333, "logits/rejected": -2.7239291667938232, "logps/chosen": -260.6053161621094, "logps/rejected": -223.7249755859375, "loss": 0.5671, "positive_losses": 0.06990127265453339, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.22634287178516388, "rewards/margins": 0.29788738489151, "rewards/margins_max": 0.5511468648910522, "rewards/margins_min": 0.08614132553339005, "rewards/margins_std": 0.2106626331806183, "rewards/rejected": -0.07154452800750732, "step": 960 }, { "dpo_losses": 0.5514933466911316, "epoch": 2.73, "grad_norm": 1.9051880301646325, "learning_rate": 1.2034015982622243e-08, "logits/chosen": -2.7561697959899902, "logits/rejected": -2.6536028385162354, "logps/chosen": -329.5728759765625, "logps/rejected": -290.1389465332031, "loss": 0.5909, "positive_losses": 0.08531048148870468, "rewards/accuracies": 1.0, "rewards/chosen": 0.22805485129356384, "rewards/margins": 0.3209593594074249, "rewards/margins_max": 0.5957485437393188, "rewards/margins_min": 0.09393687546253204, "rewards/margins_std": 0.2260352075099945, "rewards/rejected": -0.09290449321269989, "step": 970 }, { "dpo_losses": 0.5581797957420349, "epoch": 2.76, "grad_norm": 13.216663962544713, "learning_rate": 9.649455451539418e-09, "logits/chosen": -2.6970033645629883, "logits/rejected": -2.651496410369873, "logps/chosen": -266.47296142578125, "logps/rejected": -233.91098022460938, "loss": 0.5837, "positive_losses": 0.05353069305419922, "rewards/accuracies": 1.0, "rewards/chosen": 0.21983060240745544, "rewards/margins": 0.30383530259132385, "rewards/margins_max": 0.517738401889801, "rewards/margins_min": 0.09517470002174377, "rewards/margins_std": 0.19478817284107208, "rewards/rejected": -0.08400467783212662, "step": 980 }, { "dpo_losses": 0.5490652322769165, "epoch": 2.79, "grad_norm": 2.0486277290431736, "learning_rate": 7.523343956923194e-09, "logits/chosen": -2.7556817531585693, "logits/rejected": -2.6662416458129883, "logps/chosen": -294.7388916015625, "logps/rejected": -267.9339904785156, "loss": 0.5647, "positive_losses": 0.10996627807617188, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.2385723888874054, "rewards/margins": 0.3286248743534088, "rewards/margins_max": 0.6235631108283997, "rewards/margins_min": 0.11854185163974762, "rewards/margins_std": 0.2294415682554245, "rewards/rejected": -0.09005247056484222, "step": 990 }, { "dpo_losses": 0.5738095045089722, "epoch": 2.82, "grad_norm": 6.03858746514603, "learning_rate": 5.6579677073121945e-09, "logits/chosen": -2.7139980792999268, "logits/rejected": -2.6967344284057617, "logps/chosen": -273.75885009765625, "logps/rejected": -276.884033203125, "loss": 0.5825, "positive_losses": 0.2053932249546051, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.1967349648475647, "rewards/margins": 0.2664409279823303, "rewards/margins_max": 0.49395784735679626, "rewards/margins_min": 0.09026306867599487, "rewards/margins_std": 0.1819494068622589, "rewards/rejected": -0.06970598548650742, "step": 1000 }, { "epoch": 2.82, "eval_dpo_losses": 0.665816068649292, "eval_logits/chosen": -2.712677001953125, "eval_logits/rejected": -2.674919366836548, "eval_logps/chosen": -278.6134033203125, "eval_logps/rejected": -259.0028076171875, "eval_loss": 0.9543355107307434, "eval_positive_losses": 2.581048011779785, "eval_rewards/accuracies": 0.6290000081062317, "eval_rewards/chosen": 0.0598001554608345, "eval_rewards/margins": 0.06403880566358566, "eval_rewards/margins_max": 0.34747231006622314, "eval_rewards/margins_min": -0.1826418787240982, "eval_rewards/margins_std": 0.17671293020248413, "eval_rewards/rejected": -0.004238648805767298, "eval_runtime": 427.7348, "eval_samples_per_second": 4.676, "eval_steps_per_second": 0.292, "step": 1000 }, { "dpo_losses": 0.5594234466552734, "epoch": 2.85, "grad_norm": 10.058501971668669, "learning_rate": 4.0553325425319585e-09, "logits/chosen": -2.789537191390991, "logits/rejected": -2.7190775871276855, "logps/chosen": -301.9007568359375, "logps/rejected": -313.83831787109375, "loss": 0.6005, "positive_losses": 0.10010738670825958, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.21791288256645203, "rewards/margins": 0.30005866289138794, "rewards/margins_max": 0.5106627345085144, "rewards/margins_min": 0.13350318372249603, "rewards/margins_std": 0.18063752353191376, "rewards/rejected": -0.0821457952260971, "step": 1010 }, { "dpo_losses": 0.5577588081359863, "epoch": 2.87, "grad_norm": 1.8795598215831135, "learning_rate": 2.717161776814747e-09, "logits/chosen": -2.7825233936309814, "logits/rejected": -2.7229418754577637, "logps/chosen": -291.11932373046875, "logps/rejected": -272.02923583984375, "loss": 0.5722, "positive_losses": 0.11093978583812714, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.22510568797588348, "rewards/margins": 0.30276286602020264, "rewards/margins_max": 0.5086154341697693, "rewards/margins_min": 0.1081671267747879, "rewards/margins_std": 0.1743108481168747, "rewards/rejected": -0.07765716314315796, "step": 1020 }, { "dpo_losses": 0.555158257484436, "epoch": 2.9, "grad_norm": 1.7202517104603963, "learning_rate": 1.6448943457189613e-09, "logits/chosen": -2.6988253593444824, "logits/rejected": -2.6433205604553223, "logps/chosen": -286.4718322753906, "logps/rejected": -265.35052490234375, "loss": 0.5888, "positive_losses": 0.37335652112960815, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23134306073188782, "rewards/margins": 0.31557995080947876, "rewards/margins_max": 0.594192624092102, "rewards/margins_min": 0.10240230709314346, "rewards/margins_std": 0.2274932563304901, "rewards/rejected": -0.08423684537410736, "step": 1030 }, { "dpo_losses": 0.5689266324043274, "epoch": 2.93, "grad_norm": 15.663649464629284, "learning_rate": 8.396832588411229e-10, "logits/chosen": -2.6604883670806885, "logits/rejected": -2.586012840270996, "logps/chosen": -273.3360290527344, "logps/rejected": -243.2672882080078, "loss": 0.5953, "positive_losses": 0.21942074596881866, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.21510818600654602, "rewards/margins": 0.2806702256202698, "rewards/margins_max": 0.5532082319259644, "rewards/margins_min": 0.06844080239534378, "rewards/margins_std": 0.22315998375415802, "rewards/rejected": -0.06556206941604614, "step": 1040 }, { "dpo_losses": 0.5607948303222656, "epoch": 2.96, "grad_norm": 7.720480782752179, "learning_rate": 3.0239435998430374e-10, "logits/chosen": -2.7420949935913086, "logits/rejected": -2.663933038711548, "logps/chosen": -278.32171630859375, "logps/rejected": -232.65087890625, "loss": 0.58, "positive_losses": 0.3233460783958435, "rewards/accuracies": 0.9375, "rewards/chosen": 0.23241767287254333, "rewards/margins": 0.2982451915740967, "rewards/margins_max": 0.537132978439331, "rewards/margins_min": 0.06783739477396011, "rewards/margins_std": 0.2153671681880951, "rewards/rejected": -0.06582748889923096, "step": 1050 }, { "dpo_losses": 0.5575666427612305, "epoch": 2.99, "grad_norm": 1.9829415978985032, "learning_rate": 3.360539611582669e-11, "logits/chosen": -2.6903703212738037, "logits/rejected": -2.675687789916992, "logps/chosen": -256.32183837890625, "logps/rejected": -261.22113037109375, "loss": 0.5684, "positive_losses": 0.07837722450494766, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.23069465160369873, "rewards/margins": 0.30815768241882324, "rewards/margins_max": 0.5907589793205261, "rewards/margins_min": 0.066887766122818, "rewards/margins_std": 0.2346896380186081, "rewards/rejected": -0.0774630457162857, "step": 1060 }, { "epoch": 3.0, "step": 1065, "total_flos": 0.0, "train_loss": 0.6146632369135467, "train_runtime": 13546.83, "train_samples_per_second": 1.257, "train_steps_per_second": 0.079 } ], "logging_steps": 10, "max_steps": 1065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }