{ "best_metric": 0.8734214901924133, "best_model_checkpoint": "saves/Mistral-7B-Instruct-v0.3/lora/orpo/checkpoint-1500", "epoch": 2.997999555456768, "eval_steps": 500, "global_step": 1686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017781729273171815, "grad_norm": 2.492755651473999, "learning_rate": 4.9995745934141085e-06, "logits/chosen": -2.952331781387329, "logits/rejected": -2.973951816558838, "logps/chosen": -1.0092018842697144, "logps/rejected": -1.3774441480636597, "loss": 1.0773, "odds_ratio_loss": 0.6805658936500549, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1009201854467392, "rewards/margins": 0.03682423382997513, "rewards/rejected": -0.13774441182613373, "sft_loss": 1.0092018842697144, "step": 10 }, { "epoch": 0.03556345854634363, "grad_norm": 8.398221969604492, "learning_rate": 4.9982812903243405e-06, "logits/chosen": -2.924294948577881, "logits/rejected": -2.994157314300537, "logps/chosen": -1.0329482555389404, "logps/rejected": -1.2759336233139038, "loss": 1.1014, "odds_ratio_loss": 0.6848658323287964, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.10329482704401016, "rewards/margins": 0.024298548698425293, "rewards/rejected": -0.12759338319301605, "sft_loss": 1.0329482555389404, "step": 20 }, { "epoch": 0.05334518781951545, "grad_norm": 2.1793289184570312, "learning_rate": 4.996120496405222e-06, "logits/chosen": -2.9549760818481445, "logits/rejected": -2.9626007080078125, "logps/chosen": -1.0005769729614258, "logps/rejected": -1.488245964050293, "loss": 1.0616, "odds_ratio_loss": 0.6106585264205933, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1000577062368393, "rewards/margins": 0.048766884952783585, "rewards/rejected": -0.14882458746433258, "sft_loss": 1.0005769729614258, "step": 30 }, { "epoch": 0.07112691709268726, "grad_norm": 2.8343796730041504, "learning_rate": 4.99309296196014e-06, "logits/chosen": -2.924588203430176, "logits/rejected": -2.9867076873779297, "logps/chosen": -1.0675694942474365, "logps/rejected": -1.2621403932571411, "loss": 1.1331, "odds_ratio_loss": 0.6557044982910156, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.10675694793462753, "rewards/margins": 0.019457101821899414, "rewards/rejected": -0.12621404230594635, "sft_loss": 1.0675694942474365, "step": 40 }, { "epoch": 0.08890864636585907, "grad_norm": 2.04829478263855, "learning_rate": 4.989199738255166e-06, "logits/chosen": -2.956892967224121, "logits/rejected": -2.9954347610473633, "logps/chosen": -0.9171065092086792, "logps/rejected": -1.2301478385925293, "loss": 0.9797, "odds_ratio_loss": 0.6256455183029175, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0917106494307518, "rewards/margins": 0.03130412846803665, "rewards/rejected": -0.12301478534936905, "sft_loss": 0.9171065092086792, "step": 50 }, { "epoch": 0.1066903756390309, "grad_norm": 5.50786018371582, "learning_rate": 4.984442177154031e-06, "logits/chosen": -2.9277195930480957, "logits/rejected": -2.9476375579833984, "logps/chosen": -0.9940068125724792, "logps/rejected": -1.2362287044525146, "loss": 1.0632, "odds_ratio_loss": 0.6921108365058899, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09940069168806076, "rewards/margins": 0.02422218956053257, "rewards/rejected": -0.12362287193536758, "sft_loss": 0.9940068125724792, "step": 60 }, { "epoch": 0.12447210491220272, "grad_norm": 1.7581864595413208, "learning_rate": 4.978821930648704e-06, "logits/chosen": -2.9205572605133057, "logits/rejected": -2.973936080932617, "logps/chosen": -0.9317066073417664, "logps/rejected": -1.0809520483016968, "loss": 1.0027, "odds_ratio_loss": 0.7100493907928467, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.093170665204525, "rewards/margins": 0.014924542978405952, "rewards/rejected": -0.1080952063202858, "sft_loss": 0.9317066073417664, "step": 70 }, { "epoch": 0.14225383418537452, "grad_norm": 3.336517572402954, "learning_rate": 4.97234095028576e-06, "logits/chosen": -2.9701972007751465, "logits/rejected": -2.971057415008545, "logps/chosen": -0.9333993792533875, "logps/rejected": -1.1716864109039307, "loss": 0.9964, "odds_ratio_loss": 0.6299672722816467, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0933399349451065, "rewards/margins": 0.023828700184822083, "rewards/rejected": -0.11716864258050919, "sft_loss": 0.9333993792533875, "step": 80 }, { "epoch": 0.16003556345854633, "grad_norm": 1.333382248878479, "learning_rate": 4.965001486488743e-06, "logits/chosen": -2.9220926761627197, "logits/rejected": -2.951408863067627, "logps/chosen": -0.8873022198677063, "logps/rejected": -1.1284812688827515, "loss": 0.9472, "odds_ratio_loss": 0.5987495183944702, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08873023092746735, "rewards/margins": 0.02411791868507862, "rewards/rejected": -0.11284814029932022, "sft_loss": 0.8873022198677063, "step": 90 }, { "epoch": 0.17781729273171815, "grad_norm": 1.715163230895996, "learning_rate": 4.956806087776732e-06, "logits/chosen": -3.0303444862365723, "logits/rejected": -3.04186749458313, "logps/chosen": -0.9242479205131531, "logps/rejected": -1.3088445663452148, "loss": 0.9857, "odds_ratio_loss": 0.6147152185440063, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.09242479503154755, "rewards/margins": 0.038459669798612595, "rewards/rejected": -0.13088446855545044, "sft_loss": 0.9242479205131531, "step": 100 }, { "epoch": 0.19559902200489, "grad_norm": 2.949481248855591, "learning_rate": 4.947757599879411e-06, "logits/chosen": -3.0064456462860107, "logits/rejected": -3.0399320125579834, "logps/chosen": -0.9601238965988159, "logps/rejected": -1.2331488132476807, "loss": 1.0259, "odds_ratio_loss": 0.6574784517288208, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.09601239114999771, "rewards/margins": 0.027302492409944534, "rewards/rejected": -0.12331489473581314, "sft_loss": 0.9601238965988159, "step": 110 }, { "epoch": 0.2133807512780618, "grad_norm": 1.2405259609222412, "learning_rate": 4.937859164748931e-06, "logits/chosen": -3.0256314277648926, "logits/rejected": -3.044879913330078, "logps/chosen": -0.8803631067276001, "logps/rejected": -1.0130887031555176, "loss": 0.9473, "odds_ratio_loss": 0.668988823890686, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08803631365299225, "rewards/margins": 0.013272559270262718, "rewards/rejected": -0.10130886733531952, "sft_loss": 0.8803631067276001, "step": 120 }, { "epoch": 0.23116248055123362, "grad_norm": 2.040465831756592, "learning_rate": 4.92711421946891e-06, "logits/chosen": -3.0067856311798096, "logits/rejected": -2.970612049102783, "logps/chosen": -0.8932172060012817, "logps/rejected": -1.1789153814315796, "loss": 0.9558, "odds_ratio_loss": 0.6254903674125671, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08932172507047653, "rewards/margins": 0.028569817543029785, "rewards/rejected": -0.11789155006408691, "sft_loss": 0.8932172060012817, "step": 130 }, { "epoch": 0.24894420982440543, "grad_norm": 1.586767554283142, "learning_rate": 4.915526495060961e-06, "logits/chosen": -3.0685572624206543, "logits/rejected": -3.0535078048706055, "logps/chosen": -0.8625435829162598, "logps/rejected": -1.1399943828582764, "loss": 0.9238, "odds_ratio_loss": 0.612372636795044, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08625435084104538, "rewards/margins": 0.027745097875595093, "rewards/rejected": -0.11399944871664047, "sft_loss": 0.8625435829162598, "step": 140 }, { "epoch": 0.26672593909757725, "grad_norm": 1.953273057937622, "learning_rate": 4.903100015189153e-06, "logits/chosen": -3.0217204093933105, "logits/rejected": -3.059971570968628, "logps/chosen": -0.8424757719039917, "logps/rejected": -1.0430591106414795, "loss": 0.9062, "odds_ratio_loss": 0.6374109983444214, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08424757421016693, "rewards/margins": 0.020058346912264824, "rewards/rejected": -0.1043059229850769, "sft_loss": 0.8424757719039917, "step": 150 }, { "epoch": 0.28450766837074903, "grad_norm": 4.5785298347473145, "learning_rate": 4.889839094762848e-06, "logits/chosen": -3.001889705657959, "logits/rejected": -3.0023865699768066, "logps/chosen": -0.887285053730011, "logps/rejected": -1.1001445055007935, "loss": 0.952, "odds_ratio_loss": 0.6474493741989136, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08872850239276886, "rewards/margins": 0.02128593623638153, "rewards/rejected": -0.11001445353031158, "sft_loss": 0.887285053730011, "step": 160 }, { "epoch": 0.3022893976439209, "grad_norm": 1.2961128950119019, "learning_rate": 4.875748338438416e-06, "logits/chosen": -3.055670738220215, "logits/rejected": -3.0634965896606445, "logps/chosen": -0.8919625282287598, "logps/rejected": -1.0326893329620361, "loss": 0.9602, "odds_ratio_loss": 0.6827921271324158, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08919624984264374, "rewards/margins": 0.014072686433792114, "rewards/rejected": -0.10326894372701645, "sft_loss": 0.8919625282287598, "step": 170 }, { "epoch": 0.32007112691709266, "grad_norm": 2.0726120471954346, "learning_rate": 4.8608326390203386e-06, "logits/chosen": -3.05631685256958, "logits/rejected": -3.0371289253234863, "logps/chosen": -0.8544119000434875, "logps/rejected": -1.0456076860427856, "loss": 0.9188, "odds_ratio_loss": 0.6443353891372681, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08544120192527771, "rewards/margins": 0.01911957561969757, "rewards/rejected": -0.10456077009439468, "sft_loss": 0.8544119000434875, "step": 180 }, { "epoch": 0.3378528561902645, "grad_norm": 2.612196922302246, "learning_rate": 4.845097175762251e-06, "logits/chosen": -3.0822339057922363, "logits/rejected": -3.1029491424560547, "logps/chosen": -0.907193660736084, "logps/rejected": -1.0687024593353271, "loss": 0.9743, "odds_ratio_loss": 0.6714697480201721, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.09071935713291168, "rewards/margins": 0.01615087315440178, "rewards/rejected": -0.10687023401260376, "sft_loss": 0.907193660736084, "step": 190 }, { "epoch": 0.3556345854634363, "grad_norm": 3.9038424491882324, "learning_rate": 4.8285474125685286e-06, "logits/chosen": -3.066904067993164, "logits/rejected": -3.086334705352783, "logps/chosen": -0.9056366086006165, "logps/rejected": -1.0252189636230469, "loss": 0.9757, "odds_ratio_loss": 0.7004884481430054, "rewards/accuracies": 0.53125, "rewards/chosen": -0.09056366980075836, "rewards/margins": 0.01195824146270752, "rewards/rejected": -0.10252189636230469, "sft_loss": 0.9056366086006165, "step": 200 }, { "epoch": 0.37341631473660813, "grad_norm": 1.57925283908844, "learning_rate": 4.811189096097025e-06, "logits/chosen": -3.044316530227661, "logits/rejected": -3.068372964859009, "logps/chosen": -0.882292628288269, "logps/rejected": -1.1092549562454224, "loss": 0.9473, "odds_ratio_loss": 0.6500683426856995, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08822925388813019, "rewards/margins": 0.0226962361484766, "rewards/rejected": -0.11092549562454224, "sft_loss": 0.882292628288269, "step": 210 }, { "epoch": 0.39119804400978, "grad_norm": 3.1554384231567383, "learning_rate": 4.793028253763633e-06, "logits/chosen": -3.1082234382629395, "logits/rejected": -3.1198127269744873, "logps/chosen": -0.878674328327179, "logps/rejected": -1.0521525144577026, "loss": 0.9507, "odds_ratio_loss": 0.7204707860946655, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08786743879318237, "rewards/margins": 0.01734781637787819, "rewards/rejected": -0.10521525144577026, "sft_loss": 0.878674328327179, "step": 220 }, { "epoch": 0.40897977328295176, "grad_norm": 3.0515213012695312, "learning_rate": 4.774071191649352e-06, "logits/chosen": -3.0294933319091797, "logits/rejected": -3.036970615386963, "logps/chosen": -0.8506752252578735, "logps/rejected": -1.1133465766906738, "loss": 0.9107, "odds_ratio_loss": 0.6004607677459717, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08506752550601959, "rewards/margins": 0.026267144829034805, "rewards/rejected": -0.1113346666097641, "sft_loss": 0.8506752252578735, "step": 230 }, { "epoch": 0.4267615025561236, "grad_norm": 4.107941627502441, "learning_rate": 4.7543244923105975e-06, "logits/chosen": -3.052797794342041, "logits/rejected": -3.0861849784851074, "logps/chosen": -0.9088889956474304, "logps/rejected": -0.9762862324714661, "loss": 0.9829, "odds_ratio_loss": 0.7397087812423706, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.09088890254497528, "rewards/margins": 0.006739714182913303, "rewards/rejected": -0.09762861579656601, "sft_loss": 0.9088889956474304, "step": 240 }, { "epoch": 0.4445432318292954, "grad_norm": 1.7963005304336548, "learning_rate": 4.733795012493506e-06, "logits/chosen": -3.077770471572876, "logits/rejected": -3.1305344104766846, "logps/chosen": -0.9044251441955566, "logps/rejected": -1.036949872970581, "loss": 0.9739, "odds_ratio_loss": 0.6946715712547302, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.09044251590967178, "rewards/margins": 0.013252484612166882, "rewards/rejected": -0.10369499772787094, "sft_loss": 0.9044251441955566, "step": 250 }, { "epoch": 0.46232496110246724, "grad_norm": 1.4697704315185547, "learning_rate": 4.712489880753035e-06, "logits/chosen": -3.078249931335449, "logits/rejected": -3.072510004043579, "logps/chosen": -0.81315678358078, "logps/rejected": -0.9732586741447449, "loss": 0.8776, "odds_ratio_loss": 0.644811749458313, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08131568133831024, "rewards/margins": 0.016010191291570663, "rewards/rejected": -0.0973258763551712, "sft_loss": 0.81315678358078, "step": 260 }, { "epoch": 0.480106690375639, "grad_norm": 5.489832878112793, "learning_rate": 4.690416494977673e-06, "logits/chosen": -3.067095994949341, "logits/rejected": -3.109727382659912, "logps/chosen": -0.8310638666152954, "logps/rejected": -1.1116752624511719, "loss": 0.8925, "odds_ratio_loss": 0.6144498586654663, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0831063836812973, "rewards/margins": 0.028061147779226303, "rewards/rejected": -0.1111675351858139, "sft_loss": 0.8310638666152954, "step": 270 }, { "epoch": 0.49788841964881086, "grad_norm": 1.4339563846588135, "learning_rate": 4.667582519820639e-06, "logits/chosen": -3.068760395050049, "logits/rejected": -3.1055545806884766, "logps/chosen": -0.9461262822151184, "logps/rejected": -1.0382800102233887, "loss": 1.018, "odds_ratio_loss": 0.7186475992202759, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.09461262822151184, "rewards/margins": 0.009215375408530235, "rewards/rejected": -0.10382799804210663, "sft_loss": 0.9461262822151184, "step": 280 }, { "epoch": 0.5156701489219827, "grad_norm": 3.6930854320526123, "learning_rate": 4.643995884038443e-06, "logits/chosen": -3.0967042446136475, "logits/rejected": -3.1315600872039795, "logps/chosen": -0.8749726414680481, "logps/rejected": -1.058611273765564, "loss": 0.9404, "odds_ratio_loss": 0.6541867256164551, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08749726414680481, "rewards/margins": 0.018363865092396736, "rewards/rejected": -0.1058611273765564, "sft_loss": 0.8749726414680481, "step": 290 }, { "epoch": 0.5334518781951545, "grad_norm": 1.7125145196914673, "learning_rate": 4.6196647777377475e-06, "logits/chosen": -3.0732457637786865, "logits/rejected": -3.093071222305298, "logps/chosen": -0.8488075137138367, "logps/rejected": -0.9796191453933716, "loss": 0.9176, "odds_ratio_loss": 0.6876064538955688, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0848807543516159, "rewards/margins": 0.013081158511340618, "rewards/rejected": -0.0979619026184082, "sft_loss": 0.8488075137138367, "step": 300 }, { "epoch": 0.5512336074683263, "grad_norm": 1.6855430603027344, "learning_rate": 4.59459764953147e-06, "logits/chosen": -3.115689754486084, "logits/rejected": -3.099546194076538, "logps/chosen": -0.8772395253181458, "logps/rejected": -1.0352530479431152, "loss": 0.9431, "odds_ratio_loss": 0.6584862470626831, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08772395551204681, "rewards/margins": 0.015801362693309784, "rewards/rejected": -0.1035253182053566, "sft_loss": 0.8772395253181458, "step": 310 }, { "epoch": 0.5690153367414981, "grad_norm": 3.039783239364624, "learning_rate": 4.568803203605133e-06, "logits/chosen": -3.1416523456573486, "logits/rejected": -3.1220498085021973, "logps/chosen": -0.8318166732788086, "logps/rejected": -1.035842776298523, "loss": 0.8969, "odds_ratio_loss": 0.650640606880188, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08318166434764862, "rewards/margins": 0.020402604714035988, "rewards/rejected": -0.10358426719903946, "sft_loss": 0.8318166732788086, "step": 320 }, { "epoch": 0.58679706601467, "grad_norm": 1.5947670936584473, "learning_rate": 4.542290396694462e-06, "logits/chosen": -3.100538969039917, "logits/rejected": -3.1203720569610596, "logps/chosen": -0.855880081653595, "logps/rejected": -1.0065386295318604, "loss": 0.9255, "odds_ratio_loss": 0.6964801549911499, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.0855880007147789, "rewards/margins": 0.01506584882736206, "rewards/rejected": -0.10065384954214096, "sft_loss": 0.855880081653595, "step": 330 }, { "epoch": 0.6045787952878418, "grad_norm": 3.104470729827881, "learning_rate": 4.515068434975298e-06, "logits/chosen": -3.0526375770568848, "logits/rejected": -3.0920848846435547, "logps/chosen": -0.8729322552680969, "logps/rejected": -1.0834085941314697, "loss": 0.9376, "odds_ratio_loss": 0.6469117403030396, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08729322254657745, "rewards/margins": 0.021047625690698624, "rewards/rejected": -0.10834084451198578, "sft_loss": 0.8729322552680969, "step": 340 }, { "epoch": 0.6223605245610135, "grad_norm": 1.5185527801513672, "learning_rate": 4.487146770866887e-06, "logits/chosen": -3.1082205772399902, "logits/rejected": -3.146754503250122, "logps/chosen": -0.866405189037323, "logps/rejected": -0.985508143901825, "loss": 0.9343, "odds_ratio_loss": 0.6789035797119141, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08664052188396454, "rewards/margins": 0.011910290457308292, "rewards/rejected": -0.09855081886053085, "sft_loss": 0.866405189037323, "step": 350 }, { "epoch": 0.6401422538341853, "grad_norm": 2.0399420261383057, "learning_rate": 4.458535099749666e-06, "logits/chosen": -3.114278793334961, "logits/rejected": -3.1290249824523926, "logps/chosen": -0.9554277658462524, "logps/rejected": -1.0395957231521606, "loss": 1.0312, "odds_ratio_loss": 0.7574664950370789, "rewards/accuracies": 0.53125, "rewards/chosen": -0.095542773604393, "rewards/margins": 0.008416806347668171, "rewards/rejected": -0.1039595827460289, "sft_loss": 0.9554277658462524, "step": 360 }, { "epoch": 0.6579239831073572, "grad_norm": 2.1999988555908203, "learning_rate": 4.429243356598694e-06, "logits/chosen": -3.0874438285827637, "logits/rejected": -3.098285436630249, "logps/chosen": -0.8949627876281738, "logps/rejected": -1.1512229442596436, "loss": 0.9596, "odds_ratio_loss": 0.645936131477356, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08949627727270126, "rewards/margins": 0.02562602423131466, "rewards/rejected": -0.11512230336666107, "sft_loss": 0.8949627876281738, "step": 370 }, { "epoch": 0.675705712380529, "grad_norm": 5.087428092956543, "learning_rate": 4.399281712533875e-06, "logits/chosen": -3.118114709854126, "logits/rejected": -3.1233677864074707, "logps/chosen": -0.8167802095413208, "logps/rejected": -0.9615100026130676, "loss": 0.8862, "odds_ratio_loss": 0.6943861246109009, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08167801797389984, "rewards/margins": 0.01447297353297472, "rewards/rejected": -0.09615099430084229, "sft_loss": 0.8167802095413208, "step": 380 }, { "epoch": 0.6934874416537008, "grad_norm": 2.3240132331848145, "learning_rate": 4.368660571288192e-06, "logits/chosen": -3.1258320808410645, "logits/rejected": -3.1687591075897217, "logps/chosen": -0.8471567034721375, "logps/rejected": -0.9503539204597473, "loss": 0.917, "odds_ratio_loss": 0.6983430981636047, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08471567928791046, "rewards/margins": 0.010319720953702927, "rewards/rejected": -0.09503538906574249, "sft_loss": 0.8471567034721375, "step": 390 }, { "epoch": 0.7112691709268726, "grad_norm": 1.7417421340942383, "learning_rate": 4.337390565595163e-06, "logits/chosen": -3.0782721042633057, "logits/rejected": -3.099292278289795, "logps/chosen": -0.9293394088745117, "logps/rejected": -0.9703164100646973, "loss": 1.0041, "odds_ratio_loss": 0.7478191256523132, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.09293393790721893, "rewards/margins": 0.004097697325050831, "rewards/rejected": -0.09703163802623749, "sft_loss": 0.9293394088745117, "step": 400 }, { "epoch": 0.7290509002000445, "grad_norm": 2.362359046936035, "learning_rate": 4.305482553496786e-06, "logits/chosen": -3.0271878242492676, "logits/rejected": -3.0372941493988037, "logps/chosen": -0.8028362989425659, "logps/rejected": -0.994833767414093, "loss": 0.868, "odds_ratio_loss": 0.6515198945999146, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08028362691402435, "rewards/margins": 0.01919974386692047, "rewards/rejected": -0.09948337078094482, "sft_loss": 0.8028362989425659, "step": 410 }, { "epoch": 0.7468326294732163, "grad_norm": 2.797231674194336, "learning_rate": 4.272947614573244e-06, "logits/chosen": -3.0782933235168457, "logits/rejected": -3.116833448410034, "logps/chosen": -0.8883565068244934, "logps/rejected": -1.025665283203125, "loss": 0.9551, "odds_ratio_loss": 0.6675280332565308, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08883564919233322, "rewards/margins": 0.013730885460972786, "rewards/rejected": -0.10256652534008026, "sft_loss": 0.8883565068244934, "step": 420 }, { "epoch": 0.7646143587463881, "grad_norm": 1.3282934427261353, "learning_rate": 4.23979704609569e-06, "logits/chosen": -3.10003399848938, "logits/rejected": -3.1360583305358887, "logps/chosen": -0.8379910588264465, "logps/rejected": -0.9617422819137573, "loss": 0.9024, "odds_ratio_loss": 0.6443312168121338, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08379910886287689, "rewards/margins": 0.01237512193620205, "rewards/rejected": -0.0961742252111435, "sft_loss": 0.8379910588264465, "step": 430 }, { "epoch": 0.78239608801956, "grad_norm": 2.023909330368042, "learning_rate": 4.206042359103435e-06, "logits/chosen": -3.0400068759918213, "logits/rejected": -3.081937313079834, "logps/chosen": -0.8709232211112976, "logps/rejected": -1.0840847492218018, "loss": 0.9355, "odds_ratio_loss": 0.6454750299453735, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0870923325419426, "rewards/margins": 0.021316152065992355, "rewards/rejected": -0.10840848833322525, "sft_loss": 0.8709232211112976, "step": 440 }, { "epoch": 0.8001778172927317, "grad_norm": 1.6410523653030396, "learning_rate": 4.17169527440691e-06, "logits/chosen": -3.09321928024292, "logits/rejected": -3.0952792167663574, "logps/chosen": -0.858233630657196, "logps/rejected": -0.9470105171203613, "loss": 0.9306, "odds_ratio_loss": 0.7236040830612183, "rewards/accuracies": 0.46875, "rewards/chosen": -0.08582336455583572, "rewards/margins": 0.008877689950168133, "rewards/rejected": -0.09470105171203613, "sft_loss": 0.858233630657196, "step": 450 }, { "epoch": 0.8179595465659035, "grad_norm": 3.3464248180389404, "learning_rate": 4.136767718517797e-06, "logits/chosen": -3.0975563526153564, "logits/rejected": -3.112638235092163, "logps/chosen": -0.7778853178024292, "logps/rejected": -0.9875160455703735, "loss": 0.8402, "odds_ratio_loss": 0.6234691143035889, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07778853178024292, "rewards/margins": 0.020963061600923538, "rewards/rejected": -0.09875159710645676, "sft_loss": 0.7778853178024292, "step": 460 }, { "epoch": 0.8357412758390753, "grad_norm": 6.556829929351807, "learning_rate": 4.1012718195077196e-06, "logits/chosen": -3.1534528732299805, "logits/rejected": -3.208789348602295, "logps/chosen": -0.8605148196220398, "logps/rejected": -0.9714852571487427, "loss": 0.9289, "odds_ratio_loss": 0.6839339137077332, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08605148643255234, "rewards/margins": 0.011097034439444542, "rewards/rejected": -0.09714852273464203, "sft_loss": 0.8605148196220398, "step": 470 }, { "epoch": 0.8535230051122472, "grad_norm": 1.461613655090332, "learning_rate": 4.065219902796953e-06, "logits/chosen": -3.090115785598755, "logits/rejected": -3.088887929916382, "logps/chosen": -0.8349069356918335, "logps/rejected": -1.0508782863616943, "loss": 0.8996, "odds_ratio_loss": 0.6472839713096619, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08349069207906723, "rewards/margins": 0.021597128361463547, "rewards/rejected": -0.10508781671524048, "sft_loss": 0.8349069356918335, "step": 480 }, { "epoch": 0.871304734385419, "grad_norm": 1.3518534898757935, "learning_rate": 4.028624486874608e-06, "logits/chosen": -3.1022493839263916, "logits/rejected": -3.1475415229797363, "logps/chosen": -0.8089026212692261, "logps/rejected": -1.0444796085357666, "loss": 0.8747, "odds_ratio_loss": 0.6580694317817688, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08089026063680649, "rewards/margins": 0.023557689040899277, "rewards/rejected": -0.10444796085357666, "sft_loss": 0.8089026212692261, "step": 490 }, { "epoch": 0.8890864636585908, "grad_norm": 1.6888097524642944, "learning_rate": 3.99149827895177e-06, "logits/chosen": -3.127162456512451, "logits/rejected": -3.143782138824463, "logps/chosen": -0.8767441511154175, "logps/rejected": -0.9676705598831177, "loss": 0.9464, "odds_ratio_loss": 0.6963816285133362, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08767442405223846, "rewards/margins": 0.009092646650969982, "rewards/rejected": -0.09676706790924072, "sft_loss": 0.8767441511154175, "step": 500 }, { "epoch": 0.8890864636585908, "eval_logits/chosen": -3.114872932434082, "eval_logits/rejected": -3.143216371536255, "eval_logps/chosen": -0.828136146068573, "eval_logps/rejected": -1.0306241512298584, "eval_loss": 0.8918758630752563, "eval_odds_ratio_loss": 0.6373972296714783, "eval_rewards/accuracies": 0.5690000057220459, "eval_rewards/chosen": -0.08281362056732178, "eval_rewards/margins": 0.020248806104063988, "eval_rewards/rejected": -0.10306241363286972, "eval_runtime": 348.9195, "eval_samples_per_second": 2.866, "eval_sft_loss": 0.828136146068573, "eval_steps_per_second": 1.433, "step": 500 }, { "epoch": 0.9068681929317626, "grad_norm": 1.511196494102478, "learning_rate": 3.953854170549114e-06, "logits/chosen": -3.118255138397217, "logits/rejected": -3.1173043251037598, "logps/chosen": -0.8566571474075317, "logps/rejected": -0.9489420056343079, "loss": 0.9245, "odds_ratio_loss": 0.6780352592468262, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08566570281982422, "rewards/margins": 0.009228493086993694, "rewards/rejected": -0.09489420056343079, "sft_loss": 0.8566571474075317, "step": 510 }, { "epoch": 0.9246499222049345, "grad_norm": 2.5393214225769043, "learning_rate": 3.91570523302051e-06, "logits/chosen": -3.1395115852355957, "logits/rejected": -3.147805690765381, "logps/chosen": -0.7916607856750488, "logps/rejected": -0.9899943470954895, "loss": 0.8583, "odds_ratio_loss": 0.6660428047180176, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.07916607707738876, "rewards/margins": 0.01983334682881832, "rewards/rejected": -0.09899942576885223, "sft_loss": 0.7916607856750488, "step": 520 }, { "epoch": 0.9424316514781063, "grad_norm": 1.5944111347198486, "learning_rate": 3.8770647130141996e-06, "logits/chosen": -3.150245428085327, "logits/rejected": -3.141481876373291, "logps/chosen": -0.8228055834770203, "logps/rejected": -0.9976710081100464, "loss": 0.8888, "odds_ratio_loss": 0.6599084734916687, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08228056132793427, "rewards/margins": 0.017486536875367165, "rewards/rejected": -0.09976708889007568, "sft_loss": 0.8228055834770203, "step": 530 }, { "epoch": 0.960213380751278, "grad_norm": 2.3844027519226074, "learning_rate": 3.837946027873086e-06, "logits/chosen": -3.106717586517334, "logits/rejected": -3.109330177307129, "logps/chosen": -0.8973621129989624, "logps/rejected": -1.0649579763412476, "loss": 0.966, "odds_ratio_loss": 0.6861368417739868, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.089736208319664, "rewards/margins": 0.01675957441329956, "rewards/rejected": -0.10649579763412476, "sft_loss": 0.8973621129989624, "step": 540 }, { "epoch": 0.9779951100244498, "grad_norm": 5.164077281951904, "learning_rate": 3.7983627609757713e-06, "logits/chosen": -3.167064666748047, "logits/rejected": -3.16302490234375, "logps/chosen": -0.8979376554489136, "logps/rejected": -0.9942687153816223, "loss": 0.9667, "odds_ratio_loss": 0.6874598264694214, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08979376405477524, "rewards/margins": 0.009633105248212814, "rewards/rejected": -0.09942687302827835, "sft_loss": 0.8979376554489136, "step": 550 }, { "epoch": 0.9957768392976217, "grad_norm": 1.5917680263519287, "learning_rate": 3.758328657019924e-06, "logits/chosen": -3.1346166133880615, "logits/rejected": -3.1376471519470215, "logps/chosen": -0.8218947649002075, "logps/rejected": -1.0310758352279663, "loss": 0.8891, "odds_ratio_loss": 0.6719549298286438, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08218947798013687, "rewards/margins": 0.020918114110827446, "rewards/rejected": -0.10310759395360947, "sft_loss": 0.8218947649002075, "step": 560 }, { "epoch": 1.0135585685707935, "grad_norm": 6.842823505401611, "learning_rate": 3.717857617249642e-06, "logits/chosen": -3.1036324501037598, "logits/rejected": -3.145653009414673, "logps/chosen": -0.8951196670532227, "logps/rejected": -1.0871955156326294, "loss": 0.9655, "odds_ratio_loss": 0.7041261792182922, "rewards/accuracies": 0.53125, "rewards/chosen": -0.08951196819543839, "rewards/margins": 0.01920759119093418, "rewards/rejected": -0.10871955007314682, "sft_loss": 0.8951196670532227, "step": 570 }, { "epoch": 1.0313402978439654, "grad_norm": 1.3233413696289062, "learning_rate": 3.6769636946284543e-06, "logits/chosen": -3.145310878753662, "logits/rejected": -3.1411328315734863, "logps/chosen": -0.8030536770820618, "logps/rejected": -0.9519485235214233, "loss": 0.8686, "odds_ratio_loss": 0.6551788449287415, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08030536770820618, "rewards/margins": 0.01488947682082653, "rewards/rejected": -0.09519485384225845, "sft_loss": 0.8030536770820618, "step": 580 }, { "epoch": 1.049122027117137, "grad_norm": 1.561957597732544, "learning_rate": 3.6356610889596355e-06, "logits/chosen": -3.1137917041778564, "logits/rejected": -3.15521502494812, "logps/chosen": -0.8285630941390991, "logps/rejected": -0.9533591270446777, "loss": 0.895, "odds_ratio_loss": 0.6645855903625488, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08285631239414215, "rewards/margins": 0.012479597702622414, "rewards/rejected": -0.09533590078353882, "sft_loss": 0.8285630941390991, "step": 590 }, { "epoch": 1.066903756390309, "grad_norm": 2.0521960258483887, "learning_rate": 3.593964141955541e-06, "logits/chosen": -3.0969531536102295, "logits/rejected": -3.0988070964813232, "logps/chosen": -0.8090001344680786, "logps/rejected": -0.9104982614517212, "loss": 0.8782, "odds_ratio_loss": 0.6919649839401245, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08090001344680786, "rewards/margins": 0.01014980860054493, "rewards/rejected": -0.09104982018470764, "sft_loss": 0.8090001344680786, "step": 600 }, { "epoch": 1.0846854856634809, "grad_norm": 1.7750905752182007, "learning_rate": 3.5518873322576573e-06, "logits/chosen": -3.044728994369507, "logits/rejected": -3.0994975566864014, "logps/chosen": -0.8208731412887573, "logps/rejected": -0.9647499918937683, "loss": 0.8855, "odds_ratio_loss": 0.6465209126472473, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08208731561899185, "rewards/margins": 0.014387677423655987, "rewards/rejected": -0.09647499024868011, "sft_loss": 0.8208731412887573, "step": 610 }, { "epoch": 1.1024672149366526, "grad_norm": 1.3415883779525757, "learning_rate": 3.5094452704091143e-06, "logits/chosen": -3.0923125743865967, "logits/rejected": -3.084038734436035, "logps/chosen": -0.7946149110794067, "logps/rejected": -0.9547470808029175, "loss": 0.8602, "odds_ratio_loss": 0.6559656858444214, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0794614925980568, "rewards/margins": 0.016013216227293015, "rewards/rejected": -0.09547470510005951, "sft_loss": 0.7946149110794067, "step": 620 }, { "epoch": 1.1202489442098245, "grad_norm": 3.5497653484344482, "learning_rate": 3.46665269378139e-06, "logits/chosen": -3.059072971343994, "logits/rejected": -3.069256067276001, "logps/chosen": -0.8376399874687195, "logps/rejected": -0.9781678318977356, "loss": 0.9066, "odds_ratio_loss": 0.6894850134849548, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08376399427652359, "rewards/margins": 0.01405278779566288, "rewards/rejected": -0.09781678020954132, "sft_loss": 0.8376399874687195, "step": 630 }, { "epoch": 1.1380306734829961, "grad_norm": 3.2695467472076416, "learning_rate": 3.4235244614569794e-06, "logits/chosen": -3.0830600261688232, "logits/rejected": -3.0843684673309326, "logps/chosen": -0.8990565538406372, "logps/rejected": -0.9821575880050659, "loss": 0.9716, "odds_ratio_loss": 0.7258378267288208, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08990565687417984, "rewards/margins": 0.008310111239552498, "rewards/rejected": -0.09821576625108719, "sft_loss": 0.8990565538406372, "step": 640 }, { "epoch": 1.155812402756168, "grad_norm": 1.196513056755066, "learning_rate": 3.3800755490698008e-06, "logits/chosen": -3.126264810562134, "logits/rejected": -3.124204635620117, "logps/chosen": -0.8168405294418335, "logps/rejected": -1.0534611940383911, "loss": 0.8775, "odds_ratio_loss": 0.6068293452262878, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08168406784534454, "rewards/margins": 0.023662051185965538, "rewards/rejected": -0.10534612834453583, "sft_loss": 0.8168405294418335, "step": 650 }, { "epoch": 1.17359413202934, "grad_norm": 1.7081139087677002, "learning_rate": 3.3363210436051287e-06, "logits/chosen": -3.130343198776245, "logits/rejected": -3.126983165740967, "logps/chosen": -0.8528251647949219, "logps/rejected": -1.019565224647522, "loss": 0.9228, "odds_ratio_loss": 0.6997644901275635, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08528250455856323, "rewards/margins": 0.01667400822043419, "rewards/rejected": -0.10195653140544891, "sft_loss": 0.8528251647949219, "step": 660 }, { "epoch": 1.1913758613025116, "grad_norm": 1.8305083513259888, "learning_rate": 3.292276138160867e-06, "logits/chosen": -3.109675645828247, "logits/rejected": -3.1157774925231934, "logps/chosen": -0.7888280153274536, "logps/rejected": -0.9577935338020325, "loss": 0.8538, "odds_ratio_loss": 0.6501932144165039, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07888280600309372, "rewards/margins": 0.016896549612283707, "rewards/rejected": -0.09577935189008713, "sft_loss": 0.7888280153274536, "step": 670 }, { "epoch": 1.2091575905756835, "grad_norm": 2.812506675720215, "learning_rate": 3.2479561266719694e-06, "logits/chosen": -3.1019396781921387, "logits/rejected": -3.107755184173584, "logps/chosen": -0.8298002481460571, "logps/rejected": -0.9901537895202637, "loss": 0.8946, "odds_ratio_loss": 0.6482952833175659, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08298002183437347, "rewards/margins": 0.01603536494076252, "rewards/rejected": -0.09901538491249084, "sft_loss": 0.8298002481460571, "step": 680 }, { "epoch": 1.2269393198488552, "grad_norm": 3.2340750694274902, "learning_rate": 3.2033763985998533e-06, "logits/chosen": -3.121992588043213, "logits/rejected": -3.124979257583618, "logps/chosen": -0.7747536301612854, "logps/rejected": -1.1079862117767334, "loss": 0.8347, "odds_ratio_loss": 0.5990911722183228, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07747535407543182, "rewards/margins": 0.033323269337415695, "rewards/rejected": -0.11079863458871841, "sft_loss": 0.7747536301612854, "step": 690 }, { "epoch": 1.244721049122027, "grad_norm": 1.9602211713790894, "learning_rate": 3.1585524335886335e-06, "logits/chosen": -3.1363680362701416, "logits/rejected": -3.1302547454833984, "logps/chosen": -0.7745245695114136, "logps/rejected": -0.9697211980819702, "loss": 0.838, "odds_ratio_loss": 0.6345950365066528, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07745245844125748, "rewards/margins": 0.019519677385687828, "rewards/rejected": -0.09697212278842926, "sft_loss": 0.7745245695114136, "step": 700 }, { "epoch": 1.262502778395199, "grad_norm": 3.0812952518463135, "learning_rate": 3.1134997960900536e-06, "logits/chosen": -3.0771961212158203, "logits/rejected": -3.08510160446167, "logps/chosen": -0.7646561861038208, "logps/rejected": -1.0509836673736572, "loss": 0.8246, "odds_ratio_loss": 0.5993521809577942, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07646562159061432, "rewards/margins": 0.028632745146751404, "rewards/rejected": -0.10509836673736572, "sft_loss": 0.7646561861038208, "step": 710 }, { "epoch": 1.2802845076683709, "grad_norm": 1.5706931352615356, "learning_rate": 3.0682341299589583e-06, "logits/chosen": -3.096446990966797, "logits/rejected": -3.110931873321533, "logps/chosen": -0.8055674433708191, "logps/rejected": -0.9553298950195312, "loss": 0.8716, "odds_ratio_loss": 0.6605285406112671, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08055675029754639, "rewards/margins": 0.014976252801716328, "rewards/rejected": -0.09553299844264984, "sft_loss": 0.8055674433708191, "step": 720 }, { "epoch": 1.2980662369415426, "grad_norm": 1.670327067375183, "learning_rate": 3.022771153021201e-06, "logits/chosen": -3.127776622772217, "logits/rejected": -3.1598572731018066, "logps/chosen": -0.7699373960494995, "logps/rejected": -0.9526535272598267, "loss": 0.8354, "odds_ratio_loss": 0.654297947883606, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07699373364448547, "rewards/margins": 0.018271619454026222, "rewards/rejected": -0.09526535123586655, "sft_loss": 0.7699373960494995, "step": 730 }, { "epoch": 1.3158479662147144, "grad_norm": 1.666502833366394, "learning_rate": 2.9771266516158625e-06, "logits/chosen": -3.0938611030578613, "logits/rejected": -3.111356735229492, "logps/chosen": -0.795330822467804, "logps/rejected": -0.9487611055374146, "loss": 0.8641, "odds_ratio_loss": 0.687196671962738, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.07953307777643204, "rewards/margins": 0.015343038365244865, "rewards/rejected": -0.09487612545490265, "sft_loss": 0.795330822467804, "step": 740 }, { "epoch": 1.3336296954878861, "grad_norm": 1.529642939567566, "learning_rate": 2.9313164751136802e-06, "logits/chosen": -3.082942485809326, "logits/rejected": -3.1158337593078613, "logps/chosen": -0.789255678653717, "logps/rejected": -0.9912340044975281, "loss": 0.8503, "odds_ratio_loss": 0.6099725961685181, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07892556488513947, "rewards/margins": 0.020197834819555283, "rewards/rejected": -0.09912340342998505, "sft_loss": 0.789255678653717, "step": 750 }, { "epoch": 1.351411424761058, "grad_norm": 2.9339799880981445, "learning_rate": 2.8853565304135956e-06, "logits/chosen": -3.1478281021118164, "logits/rejected": -3.144963264465332, "logps/chosen": -0.8711767196655273, "logps/rejected": -0.9750477075576782, "loss": 0.9427, "odds_ratio_loss": 0.7154635190963745, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08711767941713333, "rewards/margins": 0.010387107729911804, "rewards/rejected": -0.09750477969646454, "sft_loss": 0.8711767196655273, "step": 760 }, { "epoch": 1.36919315403423, "grad_norm": 3.5656025409698486, "learning_rate": 2.839262776419313e-06, "logits/chosen": -3.1182093620300293, "logits/rejected": -3.1154582500457764, "logps/chosen": -0.7866981029510498, "logps/rejected": -1.0985205173492432, "loss": 0.8467, "odds_ratio_loss": 0.5998324155807495, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07866980135440826, "rewards/margins": 0.031182238832116127, "rewards/rejected": -0.10985203832387924, "sft_loss": 0.7866981029510498, "step": 770 }, { "epoch": 1.3869748833074016, "grad_norm": 2.08962345123291, "learning_rate": 2.793051218497817e-06, "logits/chosen": -3.1209683418273926, "logits/rejected": -3.1391050815582275, "logps/chosen": -0.797200083732605, "logps/rejected": -0.8991384506225586, "loss": 0.8653, "odds_ratio_loss": 0.6806570887565613, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07972002029418945, "rewards/margins": 0.010193833149969578, "rewards/rejected": -0.08991385996341705, "sft_loss": 0.797200083732605, "step": 780 }, { "epoch": 1.4047566125805735, "grad_norm": 1.7654404640197754, "learning_rate": 2.7467379029217437e-06, "logits/chosen": -3.092345714569092, "logits/rejected": -3.099000930786133, "logps/chosen": -0.7963561415672302, "logps/rejected": -0.9906966090202332, "loss": 0.8603, "odds_ratio_loss": 0.639264702796936, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0796356275677681, "rewards/margins": 0.019434038549661636, "rewards/rejected": -0.09906966984272003, "sft_loss": 0.7963561415672302, "step": 790 }, { "epoch": 1.4225383418537452, "grad_norm": 1.4254413843154907, "learning_rate": 2.7003389112975546e-06, "logits/chosen": -3.1396844387054443, "logits/rejected": -3.180053949356079, "logps/chosen": -0.844267725944519, "logps/rejected": -0.9890397191047668, "loss": 0.9108, "odds_ratio_loss": 0.6649594902992249, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.08442677557468414, "rewards/margins": 0.01447719894349575, "rewards/rejected": -0.09890398383140564, "sft_loss": 0.844267725944519, "step": 800 }, { "epoch": 1.440320071126917, "grad_norm": 3.8261585235595703, "learning_rate": 2.653870354981437e-06, "logits/chosen": -3.123039722442627, "logits/rejected": -3.1270766258239746, "logps/chosen": -0.7622265219688416, "logps/rejected": -0.9670180082321167, "loss": 0.8268, "odds_ratio_loss": 0.6459091901779175, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.07622265070676804, "rewards/margins": 0.020479146391153336, "rewards/rejected": -0.09670180082321167, "sft_loss": 0.7622265219688416, "step": 810 }, { "epoch": 1.458101800400089, "grad_norm": 6.478664875030518, "learning_rate": 2.6073483694848777e-06, "logits/chosen": -3.0914266109466553, "logits/rejected": -3.1468262672424316, "logps/chosen": -0.7940482497215271, "logps/rejected": -0.9618217349052429, "loss": 0.8608, "odds_ratio_loss": 0.6678633093833923, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07940482348203659, "rewards/margins": 0.016777347773313522, "rewards/rejected": -0.09618218243122101, "sft_loss": 0.7940482497215271, "step": 820 }, { "epoch": 1.4758835296732609, "grad_norm": 1.7955982685089111, "learning_rate": 2.560789108871847e-06, "logits/chosen": -3.087249755859375, "logits/rejected": -3.099762439727783, "logps/chosen": -0.8293372392654419, "logps/rejected": -1.0816946029663086, "loss": 0.8942, "odds_ratio_loss": 0.6483136415481567, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08293372392654419, "rewards/margins": 0.025235742330551147, "rewards/rejected": -0.10816947370767593, "sft_loss": 0.8293372392654419, "step": 830 }, { "epoch": 1.4936652589464325, "grad_norm": 4.553436279296875, "learning_rate": 2.514208740149544e-06, "logits/chosen": -3.123802900314331, "logits/rejected": -3.1615843772888184, "logps/chosen": -0.8601408004760742, "logps/rejected": -1.0482033491134644, "loss": 0.9282, "odds_ratio_loss": 0.6806772947311401, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08601407706737518, "rewards/margins": 0.018806258216500282, "rewards/rejected": -0.10482033342123032, "sft_loss": 0.8601408004760742, "step": 840 }, { "epoch": 1.5114469882196042, "grad_norm": 3.1794512271881104, "learning_rate": 2.46762343765464e-06, "logits/chosen": -3.1444077491760254, "logits/rejected": -3.1544933319091797, "logps/chosen": -0.8352905511856079, "logps/rejected": -1.0490363836288452, "loss": 0.898, "odds_ratio_loss": 0.6273452639579773, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0835290476679802, "rewards/margins": 0.021374579519033432, "rewards/rejected": -0.10490362346172333, "sft_loss": 0.8352905511856079, "step": 850 }, { "epoch": 1.5292287174927761, "grad_norm": 1.8062447309494019, "learning_rate": 2.4210493774369903e-06, "logits/chosen": -3.0938150882720947, "logits/rejected": -3.102355718612671, "logps/chosen": -0.8377913236618042, "logps/rejected": -0.9871052503585815, "loss": 0.9059, "odds_ratio_loss": 0.6812715530395508, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08377913385629654, "rewards/margins": 0.014931400306522846, "rewards/rejected": -0.09871052205562592, "sft_loss": 0.8377913236618042, "step": 860 }, { "epoch": 1.547010446765948, "grad_norm": 1.5386985540390015, "learning_rate": 2.374502731642732e-06, "logits/chosen": -3.1051342487335205, "logits/rejected": -3.1245017051696777, "logps/chosen": -0.8524861335754395, "logps/rejected": -1.017881155014038, "loss": 0.9179, "odds_ratio_loss": 0.6543157696723938, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08524861931800842, "rewards/margins": 0.016539499163627625, "rewards/rejected": -0.10178811848163605, "sft_loss": 0.8524861335754395, "step": 870 }, { "epoch": 1.56479217603912, "grad_norm": 2.0160138607025146, "learning_rate": 2.3279996628987556e-06, "logits/chosen": -3.093174457550049, "logits/rejected": -3.1271913051605225, "logps/chosen": -0.8324817419052124, "logps/rejected": -0.9784995913505554, "loss": 0.8998, "odds_ratio_loss": 0.6732369661331177, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.08324816823005676, "rewards/margins": 0.01460178941488266, "rewards/rejected": -0.09784995764493942, "sft_loss": 0.8324817419052124, "step": 880 }, { "epoch": 1.5825739053122916, "grad_norm": 1.5362610816955566, "learning_rate": 2.281556318700474e-06, "logits/chosen": -3.1063926219940186, "logits/rejected": -3.150496482849121, "logps/chosen": -0.7895249128341675, "logps/rejected": -0.9089393615722656, "loss": 0.8592, "odds_ratio_loss": 0.6968772411346436, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07895249128341675, "rewards/margins": 0.011941445991396904, "rewards/rejected": -0.0908939391374588, "sft_loss": 0.7895249128341675, "step": 890 }, { "epoch": 1.6003556345854635, "grad_norm": 3.5220394134521484, "learning_rate": 2.2351888258048408e-06, "logits/chosen": -3.0469326972961426, "logits/rejected": -3.095856189727783, "logps/chosen": -0.798681914806366, "logps/rejected": -0.9773387908935547, "loss": 0.8628, "odds_ratio_loss": 0.641067385673523, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07986819744110107, "rewards/margins": 0.017865682020783424, "rewards/rejected": -0.09773387759923935, "sft_loss": 0.798681914806366, "step": 900 }, { "epoch": 1.6181373638586352, "grad_norm": 2.17846941947937, "learning_rate": 2.188913284630584e-06, "logits/chosen": -3.1179308891296387, "logits/rejected": -3.14939284324646, "logps/chosen": -0.8766034841537476, "logps/rejected": -0.9665753245353699, "loss": 0.9481, "odds_ratio_loss": 0.7149003148078918, "rewards/accuracies": 0.5, "rewards/chosen": -0.08766035735607147, "rewards/margins": 0.008997179567813873, "rewards/rejected": -0.09665753692388535, "sft_loss": 0.8766034841537476, "step": 910 }, { "epoch": 1.635919093131807, "grad_norm": 6.3163251876831055, "learning_rate": 2.1427457636675652e-06, "logits/chosen": -3.118419647216797, "logits/rejected": -3.1435821056365967, "logps/chosen": -0.8221105337142944, "logps/rejected": -0.9653439521789551, "loss": 0.8905, "odds_ratio_loss": 0.6840168833732605, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08221106976270676, "rewards/margins": 0.014323326759040356, "rewards/rejected": -0.09653439372777939, "sft_loss": 0.8221105337142944, "step": 920 }, { "epoch": 1.653700822404979, "grad_norm": 2.3323636054992676, "learning_rate": 2.096702293897247e-06, "logits/chosen": -3.118809700012207, "logits/rejected": -3.1240172386169434, "logps/chosen": -0.7927727699279785, "logps/rejected": -1.0566480159759521, "loss": 0.856, "odds_ratio_loss": 0.6325381994247437, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.07927727699279785, "rewards/margins": 0.026387536898255348, "rewards/rejected": -0.10566481202840805, "sft_loss": 0.7927727699279785, "step": 930 }, { "epoch": 1.6714825516781509, "grad_norm": 2.5871617794036865, "learning_rate": 2.0507988632261672e-06, "logits/chosen": -3.0783491134643555, "logits/rejected": -3.142695188522339, "logps/chosen": -0.788642406463623, "logps/rejected": -0.9959260821342468, "loss": 0.8502, "odds_ratio_loss": 0.6157304048538208, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07886423915624619, "rewards/margins": 0.020728373900055885, "rewards/rejected": -0.09959261119365692, "sft_loss": 0.788642406463623, "step": 940 }, { "epoch": 1.6892642809513225, "grad_norm": 6.09738302230835, "learning_rate": 2.005051410934382e-06, "logits/chosen": -3.1027965545654297, "logits/rejected": -3.1486849784851074, "logps/chosen": -0.8924347162246704, "logps/rejected": -1.025657057762146, "loss": 0.9605, "odds_ratio_loss": 0.6808988451957703, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08924347162246704, "rewards/margins": 0.013322234153747559, "rewards/rejected": -0.1025657057762146, "sft_loss": 0.8924347162246704, "step": 950 }, { "epoch": 1.7070460102244942, "grad_norm": 2.1956799030303955, "learning_rate": 1.9594758221407843e-06, "logits/chosen": -3.1190600395202637, "logits/rejected": -3.1192500591278076, "logps/chosen": -0.7558837532997131, "logps/rejected": -0.9841734766960144, "loss": 0.8162, "odds_ratio_loss": 0.6032260060310364, "rewards/accuracies": 0.625, "rewards/chosen": -0.07558837532997131, "rewards/margins": 0.0228289682418108, "rewards/rejected": -0.09841735661029816, "sft_loss": 0.7558837532997131, "step": 960 }, { "epoch": 1.724827739497666, "grad_norm": 3.2010116577148438, "learning_rate": 1.9140879222872408e-06, "logits/chosen": -3.119006872177124, "logits/rejected": -3.144542694091797, "logps/chosen": -0.7783070206642151, "logps/rejected": -0.8883264660835266, "loss": 0.8488, "odds_ratio_loss": 0.70525062084198, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.07783070206642151, "rewards/margins": 0.011001949198544025, "rewards/rejected": -0.08883266150951385, "sft_loss": 0.7783070206642151, "step": 970 }, { "epoch": 1.742609468770838, "grad_norm": 1.4073106050491333, "learning_rate": 1.8689034716434346e-06, "logits/chosen": -3.143158197402954, "logits/rejected": -3.162026882171631, "logps/chosen": -0.849888026714325, "logps/rejected": -0.9449575543403625, "loss": 0.9213, "odds_ratio_loss": 0.7145692706108093, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08498881012201309, "rewards/margins": 0.009506945498287678, "rewards/rejected": -0.0944957509636879, "sft_loss": 0.849888026714325, "step": 980 }, { "epoch": 1.76039119804401, "grad_norm": 1.643964171409607, "learning_rate": 1.8239381598343576e-06, "logits/chosen": -3.1246304512023926, "logits/rejected": -3.1464321613311768, "logps/chosen": -0.7999427914619446, "logps/rejected": -0.9506388902664185, "loss": 0.8683, "odds_ratio_loss": 0.6839095950126648, "rewards/accuracies": 0.53125, "rewards/chosen": -0.07999428361654282, "rewards/margins": 0.015069609507918358, "rewards/rejected": -0.09506388753652573, "sft_loss": 0.7999427914619446, "step": 990 }, { "epoch": 1.7781729273171816, "grad_norm": 4.767948150634766, "learning_rate": 1.779207600392312e-06, "logits/chosen": -3.123835325241089, "logits/rejected": -3.1298935413360596, "logps/chosen": -0.8067057728767395, "logps/rejected": -0.9445611238479614, "loss": 0.8737, "odds_ratio_loss": 0.6700451970100403, "rewards/accuracies": 0.5625, "rewards/chosen": -0.080670565366745, "rewards/margins": 0.013785535469651222, "rewards/rejected": -0.09445609152317047, "sft_loss": 0.8067057728767395, "step": 1000 }, { "epoch": 1.7781729273171816, "eval_logits/chosen": -3.1139109134674072, "eval_logits/rejected": -3.1430606842041016, "eval_logps/chosen": -0.8136406540870667, "eval_logps/rejected": -1.018557071685791, "eval_loss": 0.8773505687713623, "eval_odds_ratio_loss": 0.6370999217033386, "eval_rewards/accuracies": 0.5759999752044678, "eval_rewards/chosen": -0.08136406540870667, "eval_rewards/margins": 0.020491650328040123, "eval_rewards/rejected": -0.10185571014881134, "eval_runtime": 189.1267, "eval_samples_per_second": 5.287, "eval_sft_loss": 0.8136406540870667, "eval_steps_per_second": 2.644, "step": 1000 }, { "epoch": 1.7959546565903532, "grad_norm": 2.2980809211730957, "learning_rate": 1.7347273253353552e-06, "logits/chosen": -3.0896313190460205, "logits/rejected": -3.117469310760498, "logps/chosen": -0.8154736757278442, "logps/rejected": -0.9821268320083618, "loss": 0.8833, "odds_ratio_loss": 0.6783260107040405, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.08154736459255219, "rewards/margins": 0.016665320843458176, "rewards/rejected": -0.09821268171072006, "sft_loss": 0.8154736757278442, "step": 1010 }, { "epoch": 1.8137363858635251, "grad_norm": 4.3619232177734375, "learning_rate": 1.690512779774029e-06, "logits/chosen": -3.108875036239624, "logits/rejected": -3.119654655456543, "logps/chosen": -0.8301160931587219, "logps/rejected": -1.0722554922103882, "loss": 0.8927, "odds_ratio_loss": 0.6254863142967224, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.08301161974668503, "rewards/margins": 0.024213943630456924, "rewards/rejected": -0.10722555965185165, "sft_loss": 0.8301160931587219, "step": 1020 }, { "epoch": 1.831518115136697, "grad_norm": 2.628239870071411, "learning_rate": 1.6465793165482838e-06, "logits/chosen": -3.098904609680176, "logits/rejected": -3.1030189990997314, "logps/chosen": -0.7733818888664246, "logps/rejected": -0.9600175619125366, "loss": 0.8352, "odds_ratio_loss": 0.6180769205093384, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07733818888664246, "rewards/margins": 0.018663574010133743, "rewards/rejected": -0.0960017591714859, "sft_loss": 0.7733818888664246, "step": 1030 }, { "epoch": 1.849299844409869, "grad_norm": 2.7811410427093506, "learning_rate": 1.6029421908964305e-06, "logits/chosen": -3.0989787578582764, "logits/rejected": -3.1128220558166504, "logps/chosen": -0.7662326693534851, "logps/rejected": -1.2116987705230713, "loss": 0.8252, "odds_ratio_loss": 0.5896168351173401, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07662326097488403, "rewards/margins": 0.044546615332365036, "rewards/rejected": -0.12116988748311996, "sft_loss": 0.7662326693534851, "step": 1040 }, { "epoch": 1.8670815736830408, "grad_norm": 2.588897466659546, "learning_rate": 1.559616555157985e-06, "logits/chosen": -3.1540348529815674, "logits/rejected": -3.1318535804748535, "logps/chosen": -0.8036566972732544, "logps/rejected": -0.9966574907302856, "loss": 0.8694, "odds_ratio_loss": 0.656964123249054, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08036566525697708, "rewards/margins": 0.019300078973174095, "rewards/rejected": -0.09966574609279633, "sft_loss": 0.8036566972732544, "step": 1050 }, { "epoch": 1.8848633029562125, "grad_norm": 3.195645332336426, "learning_rate": 1.516617453512252e-06, "logits/chosen": -3.133869171142578, "logits/rejected": -3.1599550247192383, "logps/chosen": -0.8567641377449036, "logps/rejected": -0.9691047668457031, "loss": 0.9289, "odds_ratio_loss": 0.7213839888572693, "rewards/accuracies": 0.5, "rewards/chosen": -0.0856764167547226, "rewards/margins": 0.011234072968363762, "rewards/rejected": -0.0969104915857315, "sft_loss": 0.8567641377449036, "step": 1060 }, { "epoch": 1.9026450322293842, "grad_norm": 3.544257164001465, "learning_rate": 1.473959816754449e-06, "logits/chosen": -3.1071698665618896, "logits/rejected": -3.119621992111206, "logps/chosen": -0.8016077280044556, "logps/rejected": -0.9158931970596313, "loss": 0.8714, "odds_ratio_loss": 0.6980700492858887, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08016077429056168, "rewards/margins": 0.011428546160459518, "rewards/rejected": -0.09158932417631149, "sft_loss": 0.8016077280044556, "step": 1070 }, { "epoch": 1.920426761502556, "grad_norm": 2.2053537368774414, "learning_rate": 1.4316584571112213e-06, "logits/chosen": -3.1642978191375732, "logits/rejected": -3.1734910011291504, "logps/chosen": -0.8405769467353821, "logps/rejected": -0.9534690976142883, "loss": 0.9088, "odds_ratio_loss": 0.6824837327003479, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08405770361423492, "rewards/margins": 0.011289209127426147, "rewards/rejected": -0.09534691274166107, "sft_loss": 0.8405769467353821, "step": 1080 }, { "epoch": 1.938208490775728, "grad_norm": 3.7732386589050293, "learning_rate": 1.389728063097306e-06, "logits/chosen": -3.134726047515869, "logits/rejected": -3.1553549766540527, "logps/chosen": -0.8409829139709473, "logps/rejected": -1.037058711051941, "loss": 0.9054, "odds_ratio_loss": 0.6446219682693481, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08409829437732697, "rewards/margins": 0.019607581198215485, "rewards/rejected": -0.10370586812496185, "sft_loss": 0.8409829139709473, "step": 1090 }, { "epoch": 1.9559902200488999, "grad_norm": 1.9941437244415283, "learning_rate": 1.348183194415179e-06, "logits/chosen": -3.12330961227417, "logits/rejected": -3.0894432067871094, "logps/chosen": -0.8183244466781616, "logps/rejected": -1.0717554092407227, "loss": 0.8794, "odds_ratio_loss": 0.6103757619857788, "rewards/accuracies": 0.625, "rewards/chosen": -0.08183245360851288, "rewards/margins": 0.025343094021081924, "rewards/rejected": -0.1071755513548851, "sft_loss": 0.8183244466781616, "step": 1100 }, { "epoch": 1.9737719493220716, "grad_norm": 2.1723690032958984, "learning_rate": 1.3070382768994015e-06, "logits/chosen": -3.1375839710235596, "logits/rejected": -3.1476972103118896, "logps/chosen": -0.8107814788818359, "logps/rejected": -0.9439038038253784, "loss": 0.8756, "odds_ratio_loss": 0.6477454900741577, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.08107815682888031, "rewards/margins": 0.01331222616136074, "rewards/rejected": -0.0943903774023056, "sft_loss": 0.8107814788818359, "step": 1110 }, { "epoch": 1.9915536785952432, "grad_norm": 2.5445873737335205, "learning_rate": 1.2663075975074746e-06, "logits/chosen": -3.1265861988067627, "logits/rejected": -3.1423563957214355, "logps/chosen": -0.79461270570755, "logps/rejected": -1.0579864978790283, "loss": 0.8606, "odds_ratio_loss": 0.6598424911499023, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07946126163005829, "rewards/margins": 0.026337390765547752, "rewards/rejected": -0.10579866170883179, "sft_loss": 0.79461270570755, "step": 1120 }, { "epoch": 2.009335407868415, "grad_norm": 5.374589920043945, "learning_rate": 1.2260052993589034e-06, "logits/chosen": -3.117713212966919, "logits/rejected": -3.1392993927001953, "logps/chosen": -0.9073926210403442, "logps/rejected": -0.9984840154647827, "loss": 0.9829, "odds_ratio_loss": 0.7546229362487793, "rewards/accuracies": 0.46875, "rewards/chosen": -0.09073926508426666, "rewards/margins": 0.009109143167734146, "rewards/rejected": -0.09984840452671051, "sft_loss": 0.9073926210403442, "step": 1130 }, { "epoch": 2.027117137141587, "grad_norm": 1.554049015045166, "learning_rate": 1.1861453768242099e-06, "logits/chosen": -3.1674160957336426, "logits/rejected": -3.16347599029541, "logps/chosen": -0.730399489402771, "logps/rejected": -0.9565631747245789, "loss": 0.7897, "odds_ratio_loss": 0.5933364629745483, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0730399638414383, "rewards/margins": 0.02261636219918728, "rewards/rejected": -0.09565632045269012, "sft_loss": 0.730399489402771, "step": 1140 }, { "epoch": 2.044898866414759, "grad_norm": 10.319910049438477, "learning_rate": 1.1467416706655982e-06, "logits/chosen": -3.141704559326172, "logits/rejected": -3.173985719680786, "logps/chosen": -0.8747559785842896, "logps/rejected": -1.045388102531433, "loss": 0.9448, "odds_ratio_loss": 0.7000676989555359, "rewards/accuracies": 0.5, "rewards/chosen": -0.08747559785842896, "rewards/margins": 0.017063220962882042, "rewards/rejected": -0.10453881323337555, "sft_loss": 0.8747559785842896, "step": 1150 }, { "epoch": 2.062680595687931, "grad_norm": 2.64601993560791, "learning_rate": 1.1078078632309559e-06, "logits/chosen": -3.1251769065856934, "logits/rejected": -3.154083251953125, "logps/chosen": -0.7768465280532837, "logps/rejected": -0.9674509167671204, "loss": 0.8405, "odds_ratio_loss": 0.6370204091072083, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0776846632361412, "rewards/margins": 0.01906043104827404, "rewards/rejected": -0.0967450961470604, "sft_loss": 0.7768465280532837, "step": 1160 }, { "epoch": 2.0804623249611023, "grad_norm": 8.88864517211914, "learning_rate": 1.0693574737028627e-06, "logits/chosen": -3.1327998638153076, "logits/rejected": -3.1513829231262207, "logps/chosen": -0.811104416847229, "logps/rejected": -0.959033191204071, "loss": 0.8792, "odds_ratio_loss": 0.6805382966995239, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08111042529344559, "rewards/margins": 0.01479288749396801, "rewards/rejected": -0.09590331465005875, "sft_loss": 0.811104416847229, "step": 1170 }, { "epoch": 2.098244054234274, "grad_norm": 2.3200793266296387, "learning_rate": 1.0314038534042586e-06, "logits/chosen": -3.154930830001831, "logits/rejected": -3.1501238346099854, "logps/chosen": -0.7636488676071167, "logps/rejected": -0.9957521557807922, "loss": 0.8285, "odds_ratio_loss": 0.6481651067733765, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.07636488974094391, "rewards/margins": 0.023210326209664345, "rewards/rejected": -0.0995752140879631, "sft_loss": 0.7636488676071167, "step": 1180 }, { "epoch": 2.116025783507446, "grad_norm": 3.470479965209961, "learning_rate": 9.939601811623946e-07, "logits/chosen": -3.1409804821014404, "logits/rejected": -3.14042329788208, "logps/chosen": -0.813196063041687, "logps/rejected": -0.9785780906677246, "loss": 0.8813, "odds_ratio_loss": 0.6813095808029175, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08131961524486542, "rewards/margins": 0.01653820462524891, "rewards/rejected": -0.09785781800746918, "sft_loss": 0.813196063041687, "step": 1190 }, { "epoch": 2.133807512780618, "grad_norm": 2.298424005508423, "learning_rate": 9.570394587326825e-07, "logits/chosen": -3.1406095027923584, "logits/rejected": -3.138267993927002, "logps/chosen": -0.7988274693489075, "logps/rejected": -1.0399543046951294, "loss": 0.8608, "odds_ratio_loss": 0.6200910806655884, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07988274842500687, "rewards/margins": 0.024112680926918983, "rewards/rejected": -0.1039954274892807, "sft_loss": 0.7988274693489075, "step": 1200 }, { "epoch": 2.15158924205379, "grad_norm": 1.9331620931625366, "learning_rate": 9.206545062840302e-07, "logits/chosen": -3.181776285171509, "logits/rejected": -3.1430013179779053, "logps/chosen": -0.7699235081672668, "logps/rejected": -1.0029253959655762, "loss": 0.831, "odds_ratio_loss": 0.6103402376174927, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07699234783649445, "rewards/margins": 0.02330019325017929, "rewards/rejected": -0.10029254108667374, "sft_loss": 0.7699235081672668, "step": 1210 }, { "epoch": 2.1693709713269618, "grad_norm": 1.9117600917816162, "learning_rate": 8.848179579472285e-07, "logits/chosen": -3.16937518119812, "logits/rejected": -3.171738862991333, "logps/chosen": -0.7665938138961792, "logps/rejected": -0.8684927821159363, "loss": 0.8349, "odds_ratio_loss": 0.6835185885429382, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.07665937393903732, "rewards/margins": 0.010189898312091827, "rewards/rejected": -0.08684928715229034, "sft_loss": 0.7665938138961792, "step": 1220 }, { "epoch": 2.1871527006001332, "grad_norm": 5.053982734680176, "learning_rate": 8.495422574279403e-07, "logits/chosen": -3.1011910438537598, "logits/rejected": -3.092721462249756, "logps/chosen": -0.7057160139083862, "logps/rejected": -0.9676464796066284, "loss": 0.7646, "odds_ratio_loss": 0.5885148048400879, "rewards/accuracies": 0.625, "rewards/chosen": -0.07057160139083862, "rewards/margins": 0.026193052530288696, "rewards/rejected": -0.09676466137170792, "sft_loss": 0.7057160139083862, "step": 1230 }, { "epoch": 2.204934429873305, "grad_norm": 2.2379298210144043, "learning_rate": 8.148396536858063e-07, "logits/chosen": -3.1442742347717285, "logits/rejected": -3.1458396911621094, "logps/chosen": -0.8305691480636597, "logps/rejected": -1.0573723316192627, "loss": 0.8959, "odds_ratio_loss": 0.6531893610954285, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08305691182613373, "rewards/margins": 0.02268032357096672, "rewards/rejected": -0.10573724657297134, "sft_loss": 0.8305691480636597, "step": 1240 }, { "epoch": 2.222716159146477, "grad_norm": 2.2036707401275635, "learning_rate": 7.807221966811815e-07, "logits/chosen": -3.1296424865722656, "logits/rejected": -3.142879009246826, "logps/chosen": -0.815384566783905, "logps/rejected": -0.9788722991943359, "loss": 0.8822, "odds_ratio_loss": 0.6686090230941772, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.08153846114873886, "rewards/margins": 0.01634877361357212, "rewards/rejected": -0.09788723289966583, "sft_loss": 0.815384566783905, "step": 1250 }, { "epoch": 2.240497888419649, "grad_norm": 1.7958269119262695, "learning_rate": 7.47201733190962e-07, "logits/chosen": -3.1007513999938965, "logits/rejected": -3.1123993396759033, "logps/chosen": -0.7537363767623901, "logps/rejected": -0.9363679885864258, "loss": 0.8162, "odds_ratio_loss": 0.6245176792144775, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07537363469600677, "rewards/margins": 0.018263157457113266, "rewards/rejected": -0.09363678842782974, "sft_loss": 0.7537363767623901, "step": 1260 }, { "epoch": 2.258279617692821, "grad_norm": 2.2002153396606445, "learning_rate": 7.142899026949721e-07, "logits/chosen": -3.181532382965088, "logits/rejected": -3.189258098602295, "logps/chosen": -0.7867833971977234, "logps/rejected": -0.9312666654586792, "loss": 0.8504, "odds_ratio_loss": 0.6360999941825867, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07867833971977234, "rewards/margins": 0.01444832980632782, "rewards/rejected": -0.09312666952610016, "sft_loss": 0.7867833971977234, "step": 1270 }, { "epoch": 2.2760613469659923, "grad_norm": 5.216893672943115, "learning_rate": 6.819981333343273e-07, "logits/chosen": -3.0660347938537598, "logits/rejected": -3.095858097076416, "logps/chosen": -0.7724840641021729, "logps/rejected": -0.9914291501045227, "loss": 0.8347, "odds_ratio_loss": 0.6223303079605103, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07724839448928833, "rewards/margins": 0.021894508972764015, "rewards/rejected": -0.09914292395114899, "sft_loss": 0.7724840641021729, "step": 1280 }, { "epoch": 2.293843076239164, "grad_norm": 2.3061211109161377, "learning_rate": 6.503376379431839e-07, "logits/chosen": -3.1206648349761963, "logits/rejected": -3.1211998462677, "logps/chosen": -0.8609710931777954, "logps/rejected": -0.918415904045105, "loss": 0.9299, "odds_ratio_loss": 0.6894447803497314, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0860971063375473, "rewards/margins": 0.005744467489421368, "rewards/rejected": -0.09184158593416214, "sft_loss": 0.8609710931777954, "step": 1290 }, { "epoch": 2.311624805512336, "grad_norm": 1.7814314365386963, "learning_rate": 6.193194101552502e-07, "logits/chosen": -3.126063346862793, "logits/rejected": -3.1108012199401855, "logps/chosen": -0.7555452585220337, "logps/rejected": -1.0133960247039795, "loss": 0.8139, "odds_ratio_loss": 0.5832154154777527, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.07555452734231949, "rewards/margins": 0.025785094127058983, "rewards/rejected": -0.10133961588144302, "sft_loss": 0.7555452585220337, "step": 1300 }, { "epoch": 2.329406534785508, "grad_norm": 3.5369389057159424, "learning_rate": 5.889542205864083e-07, "logits/chosen": -3.1118927001953125, "logits/rejected": -3.1322388648986816, "logps/chosen": -0.809761643409729, "logps/rejected": -1.0040373802185059, "loss": 0.873, "odds_ratio_loss": 0.6319769620895386, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.08097617328166962, "rewards/margins": 0.019427578896284103, "rewards/rejected": -0.10040374100208282, "sft_loss": 0.809761643409729, "step": 1310 }, { "epoch": 2.34718826405868, "grad_norm": 1.8906471729278564, "learning_rate": 5.592526130947862e-07, "logits/chosen": -3.094481945037842, "logits/rejected": -3.1228842735290527, "logps/chosen": -0.8294750452041626, "logps/rejected": -1.0103859901428223, "loss": 0.8988, "odds_ratio_loss": 0.6931812763214111, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.0829475075006485, "rewards/margins": 0.018091093748807907, "rewards/rejected": -0.10103859752416611, "sft_loss": 0.8294750452041626, "step": 1320 }, { "epoch": 2.3649699933318518, "grad_norm": 2.300255298614502, "learning_rate": 5.302249011195507e-07, "logits/chosen": -3.091862916946411, "logits/rejected": -3.1117231845855713, "logps/chosen": -0.7831630110740662, "logps/rejected": -0.9011325836181641, "loss": 0.8506, "odds_ratio_loss": 0.6743569374084473, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07831630855798721, "rewards/margins": 0.01179695688188076, "rewards/rejected": -0.09011325985193253, "sft_loss": 0.7831630110740662, "step": 1330 }, { "epoch": 2.382751722605023, "grad_norm": 2.0519402027130127, "learning_rate": 5.018811640997307e-07, "logits/chosen": -3.1082584857940674, "logits/rejected": -3.143366813659668, "logps/chosen": -0.8331505656242371, "logps/rejected": -1.1331783533096313, "loss": 0.8955, "odds_ratio_loss": 0.6230587363243103, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.08331505209207535, "rewards/margins": 0.030002791434526443, "rewards/rejected": -0.1133178323507309, "sft_loss": 0.8331505656242371, "step": 1340 }, { "epoch": 2.400533451878195, "grad_norm": 2.004222869873047, "learning_rate": 4.7423124397427105e-07, "logits/chosen": -3.0787928104400635, "logits/rejected": -3.1223533153533936, "logps/chosen": -0.8188081979751587, "logps/rejected": -0.9587352871894836, "loss": 0.8855, "odds_ratio_loss": 0.6670054197311401, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.08188082277774811, "rewards/margins": 0.013992704451084137, "rewards/rejected": -0.09587351977825165, "sft_loss": 0.8188081979751587, "step": 1350 }, { "epoch": 2.418315181151367, "grad_norm": 2.007474422454834, "learning_rate": 4.472847417645787e-07, "logits/chosen": -3.1503520011901855, "logits/rejected": -3.1351065635681152, "logps/chosen": -0.8101593255996704, "logps/rejected": -1.086388349533081, "loss": 0.8703, "odds_ratio_loss": 0.6015632748603821, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0810159370303154, "rewards/margins": 0.027622899040579796, "rewards/rejected": -0.10863884538412094, "sft_loss": 0.8101593255996704, "step": 1360 }, { "epoch": 2.436096910424539, "grad_norm": 1.4029066562652588, "learning_rate": 4.210510142406993e-07, "logits/chosen": -3.122816562652588, "logits/rejected": -3.095937490463257, "logps/chosen": -0.7846102714538574, "logps/rejected": -1.0122342109680176, "loss": 0.8472, "odds_ratio_loss": 0.626270055770874, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07846103608608246, "rewards/margins": 0.022762387990951538, "rewards/rejected": -0.101223424077034, "sft_loss": 0.7846102714538574, "step": 1370 }, { "epoch": 2.4538786396977104, "grad_norm": 1.7324745655059814, "learning_rate": 3.9553917067232966e-07, "logits/chosen": -3.122037172317505, "logits/rejected": -3.1394925117492676, "logps/chosen": -0.8041805028915405, "logps/rejected": -0.9916130900382996, "loss": 0.87, "odds_ratio_loss": 0.658187210559845, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08041805773973465, "rewards/margins": 0.018743254244327545, "rewards/rejected": -0.0991613045334816, "sft_loss": 0.8041805028915405, "step": 1380 }, { "epoch": 2.4716603689708823, "grad_norm": 2.2863593101501465, "learning_rate": 3.707580696657509e-07, "logits/chosen": -3.118274450302124, "logits/rejected": -3.109182834625244, "logps/chosen": -0.7898752689361572, "logps/rejected": -0.945044219493866, "loss": 0.8546, "odds_ratio_loss": 0.6472212672233582, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.07898753136396408, "rewards/margins": 0.015516892075538635, "rewards/rejected": -0.09450441598892212, "sft_loss": 0.7898752689361572, "step": 1390 }, { "epoch": 2.489442098244054, "grad_norm": 2.1385016441345215, "learning_rate": 3.4671631608781815e-07, "logits/chosen": -3.125810384750366, "logits/rejected": -3.1365230083465576, "logps/chosen": -0.8170459866523743, "logps/rejected": -1.0128613710403442, "loss": 0.8858, "odds_ratio_loss": 0.6880038380622864, "rewards/accuracies": 0.5625, "rewards/chosen": -0.08170458674430847, "rewards/margins": 0.019581545144319534, "rewards/rejected": -0.1012861356139183, "sft_loss": 0.8170459866523743, "step": 1400 }, { "epoch": 2.507223827517226, "grad_norm": 2.561035633087158, "learning_rate": 3.234222580780405e-07, "logits/chosen": -3.1027114391326904, "logits/rejected": -3.124307632446289, "logps/chosen": -0.7941089868545532, "logps/rejected": -0.9503856897354126, "loss": 0.8579, "odds_ratio_loss": 0.6381289958953857, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07941089570522308, "rewards/margins": 0.015627671033143997, "rewards/rejected": -0.09503857046365738, "sft_loss": 0.7941089868545532, "step": 1410 }, { "epoch": 2.525005556790398, "grad_norm": 2.1497950553894043, "learning_rate": 3.0088398414982375e-07, "logits/chosen": -3.0808122158050537, "logits/rejected": -3.118448257446289, "logps/chosen": -0.8251555562019348, "logps/rejected": -1.0561994314193726, "loss": 0.8918, "odds_ratio_loss": 0.6662226319313049, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.08251555263996124, "rewards/margins": 0.023104388266801834, "rewards/rejected": -0.10561992973089218, "sft_loss": 0.8251555562019348, "step": 1420 }, { "epoch": 2.54278728606357, "grad_norm": 2.1733312606811523, "learning_rate": 2.7910932038184487e-07, "logits/chosen": -3.0798656940460205, "logits/rejected": -3.0586531162261963, "logps/chosen": -0.8029570579528809, "logps/rejected": -0.9888774156570435, "loss": 0.8665, "odds_ratio_loss": 0.6356409192085266, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08029570430517197, "rewards/margins": 0.018592040985822678, "rewards/rejected": -0.09888775646686554, "sft_loss": 0.8029570579528809, "step": 1430 }, { "epoch": 2.5605690153367417, "grad_norm": 2.0504164695739746, "learning_rate": 2.5810582770057325e-07, "logits/chosen": -3.1239726543426514, "logits/rejected": -3.1643452644348145, "logps/chosen": -0.7773226499557495, "logps/rejected": -0.9956240653991699, "loss": 0.8421, "odds_ratio_loss": 0.6477575302124023, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.07773226499557495, "rewards/margins": 0.02183014526963234, "rewards/rejected": -0.09956242144107819, "sft_loss": 0.7773226499557495, "step": 1440 }, { "epoch": 2.578350744609913, "grad_norm": 2.4383292198181152, "learning_rate": 2.3788079925484402e-07, "logits/chosen": -3.1351797580718994, "logits/rejected": -3.1292059421539307, "logps/chosen": -0.8360335230827332, "logps/rejected": -0.9335094690322876, "loss": 0.9052, "odds_ratio_loss": 0.6920183300971985, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.08360335975885391, "rewards/margins": 0.009747589938342571, "rewards/rejected": -0.09335094690322876, "sft_loss": 0.8360335230827332, "step": 1450 }, { "epoch": 2.596132473883085, "grad_norm": 2.4566612243652344, "learning_rate": 2.1844125788342661e-07, "logits/chosen": -3.108156681060791, "logits/rejected": -3.1151247024536133, "logps/chosen": -0.7554203867912292, "logps/rejected": -1.1023544073104858, "loss": 0.8163, "odds_ratio_loss": 0.6091145277023315, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07554203271865845, "rewards/margins": 0.034693412482738495, "rewards/rejected": -0.11023545265197754, "sft_loss": 0.7554203867912292, "step": 1460 }, { "epoch": 2.613914203156257, "grad_norm": 2.3035502433776855, "learning_rate": 1.9979395367644428e-07, "logits/chosen": -3.143681049346924, "logits/rejected": -3.1587703227996826, "logps/chosen": -0.7682673931121826, "logps/rejected": -0.9972553253173828, "loss": 0.8278, "odds_ratio_loss": 0.5949780344963074, "rewards/accuracies": 0.59375, "rewards/chosen": -0.07682673633098602, "rewards/margins": 0.02289879135787487, "rewards/rejected": -0.09972552955150604, "sft_loss": 0.7682673931121826, "step": 1470 }, { "epoch": 2.631695932429429, "grad_norm": 3.8865010738372803, "learning_rate": 1.81945361631512e-07, "logits/chosen": -3.1585988998413086, "logits/rejected": -3.178792715072632, "logps/chosen": -0.8142994046211243, "logps/rejected": -0.951032817363739, "loss": 0.8828, "odds_ratio_loss": 0.6848469972610474, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.08142994344234467, "rewards/margins": 0.013673332519829273, "rewards/rejected": -0.09510327875614166, "sft_loss": 0.8142994046211243, "step": 1480 }, { "epoch": 2.6494776617026004, "grad_norm": 3.1666336059570312, "learning_rate": 1.6490167940538343e-07, "logits/chosen": -3.154137372970581, "logits/rejected": -3.1491308212280273, "logps/chosen": -0.7683095932006836, "logps/rejected": -1.0064373016357422, "loss": 0.8283, "odds_ratio_loss": 0.6001058220863342, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.07683096826076508, "rewards/margins": 0.023812763392925262, "rewards/rejected": -0.10064373165369034, "sft_loss": 0.7683095932006836, "step": 1490 }, { "epoch": 2.6672593909757722, "grad_norm": 9.307645797729492, "learning_rate": 1.4866882516191339e-07, "logits/chosen": -3.0799524784088135, "logits/rejected": -3.1244568824768066, "logps/chosen": -0.8257862329483032, "logps/rejected": -1.011817216873169, "loss": 0.8923, "odds_ratio_loss": 0.6649435758590698, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0825786143541336, "rewards/margins": 0.018603112548589706, "rewards/rejected": -0.10118173062801361, "sft_loss": 0.8257862329483032, "step": 1500 }, { "epoch": 2.6672593909757722, "eval_logits/chosen": -3.1171207427978516, "eval_logits/rejected": -3.145516872406006, "eval_logps/chosen": -0.8098240494728088, "eval_logps/rejected": -1.0174543857574463, "eval_loss": 0.8734214901924133, "eval_odds_ratio_loss": 0.6359757781028748, "eval_rewards/accuracies": 0.5720000267028809, "eval_rewards/chosen": -0.08098240196704865, "eval_rewards/margins": 0.02076304331421852, "eval_rewards/rejected": -0.10174543410539627, "eval_runtime": 237.6146, "eval_samples_per_second": 4.208, "eval_sft_loss": 0.8098240494728088, "eval_steps_per_second": 2.104, "step": 1500 }, { "epoch": 2.685041120248944, "grad_norm": 4.906961441040039, "learning_rate": 1.3325243551706057e-07, "logits/chosen": -3.0958564281463623, "logits/rejected": -3.1364972591400146, "logps/chosen": -0.7746607065200806, "logps/rejected": -1.0890486240386963, "loss": 0.834, "odds_ratio_loss": 0.5929327607154846, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.07746607810258865, "rewards/margins": 0.03143879026174545, "rewards/rejected": -0.10890486091375351, "sft_loss": 0.7746607065200806, "step": 1510 }, { "epoch": 2.702822849522116, "grad_norm": 8.813859939575195, "learning_rate": 1.1865786358165737e-07, "logits/chosen": -3.1081910133361816, "logits/rejected": -3.160123586654663, "logps/chosen": -0.7778806686401367, "logps/rejected": -0.9513812065124512, "loss": 0.842, "odds_ratio_loss": 0.6413436532020569, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.07778806984424591, "rewards/margins": 0.017350060865283012, "rewards/rejected": -0.09513812512159348, "sft_loss": 0.7778806686401367, "step": 1520 }, { "epoch": 2.720604578795288, "grad_norm": 5.624754428863525, "learning_rate": 1.0489017710262311e-07, "logits/chosen": -3.1469523906707764, "logits/rejected": -3.178358554840088, "logps/chosen": -0.8407548666000366, "logps/rejected": -1.1098471879959106, "loss": 0.9062, "odds_ratio_loss": 0.6548250317573547, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.08407549560070038, "rewards/margins": 0.026909226551651955, "rewards/rejected": -0.11098472028970718, "sft_loss": 0.8407548666000366, "step": 1530 }, { "epoch": 2.73838630806846, "grad_norm": 3.942481756210327, "learning_rate": 9.195415670326446e-08, "logits/chosen": -3.13153076171875, "logits/rejected": -3.1526730060577393, "logps/chosen": -0.8119581341743469, "logps/rejected": -1.0097941160202026, "loss": 0.8766, "odds_ratio_loss": 0.6465227007865906, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.08119582384824753, "rewards/margins": 0.019783606752753258, "rewards/rejected": -0.10097942501306534, "sft_loss": 0.8119581341743469, "step": 1540 }, { "epoch": 2.7561680373416317, "grad_norm": 3.0953104496002197, "learning_rate": 7.985429422327384e-08, "logits/chosen": -3.1453542709350586, "logits/rejected": -3.165792226791382, "logps/chosen": -0.8054162263870239, "logps/rejected": -0.9290239214897156, "loss": 0.875, "odds_ratio_loss": 0.6959558129310608, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.08054163306951523, "rewards/margins": 0.01236076932400465, "rewards/rejected": -0.09290239959955215, "sft_loss": 0.8054162263870239, "step": 1550 }, { "epoch": 2.773949766614803, "grad_norm": 1.9901708364486694, "learning_rate": 6.859479115900818e-08, "logits/chosen": -3.118248224258423, "logits/rejected": -3.158768892288208, "logps/chosen": -0.7784756422042847, "logps/rejected": -0.9923986196517944, "loss": 0.8408, "odds_ratio_loss": 0.6230874061584473, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.07784756273031235, "rewards/margins": 0.021392302587628365, "rewards/rejected": -0.09923987090587616, "sft_loss": 0.7784756422042847, "step": 1560 }, { "epoch": 2.791731495887975, "grad_norm": 10.119742393493652, "learning_rate": 5.817955720457902e-08, "logits/chosen": -3.107785701751709, "logits/rejected": -3.1253585815429688, "logps/chosen": -0.8034731149673462, "logps/rejected": -0.9698166847229004, "loss": 0.8686, "odds_ratio_loss": 0.6507803201675415, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.08034731447696686, "rewards/margins": 0.016634367406368256, "rewards/rejected": -0.09698168933391571, "sft_loss": 0.8034731149673462, "step": 1570 }, { "epoch": 2.809513225161147, "grad_norm": 3.319011926651001, "learning_rate": 4.861220889427199e-08, "logits/chosen": -3.1124069690704346, "logits/rejected": -3.131798505783081, "logps/chosen": -0.8197135925292969, "logps/rejected": -0.9885567426681519, "loss": 0.887, "odds_ratio_loss": 0.6726602911949158, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.08197136968374252, "rewards/margins": 0.01688431203365326, "rewards/rejected": -0.09885567426681519, "sft_loss": 0.8197135925292969, "step": 1580 }, { "epoch": 2.827294954434319, "grad_norm": 2.032493829727173, "learning_rate": 3.9896068346758074e-08, "logits/chosen": -3.134514331817627, "logits/rejected": -3.1544039249420166, "logps/chosen": -0.8260439038276672, "logps/rejected": -0.9724828004837036, "loss": 0.8923, "odds_ratio_loss": 0.6625251770019531, "rewards/accuracies": 0.59375, "rewards/chosen": -0.08260440081357956, "rewards/margins": 0.01464388333261013, "rewards/rejected": -0.09724827855825424, "sft_loss": 0.8260439038276672, "step": 1590 }, { "epoch": 2.8450766837074903, "grad_norm": 4.936295986175537, "learning_rate": 3.203416211153832e-08, "logits/chosen": -3.1045830249786377, "logits/rejected": -3.163196086883545, "logps/chosen": -0.8155530691146851, "logps/rejected": -0.9749042391777039, "loss": 0.8845, "odds_ratio_loss": 0.6895264983177185, "rewards/accuracies": 0.5, "rewards/chosen": -0.0815553218126297, "rewards/margins": 0.015935102477669716, "rewards/rejected": -0.09749042987823486, "sft_loss": 0.8155530691146851, "step": 1600 }, { "epoch": 2.8628584129806622, "grad_norm": 3.0522594451904297, "learning_rate": 2.5029220118019393e-08, "logits/chosen": -3.0816335678100586, "logits/rejected": -3.120738983154297, "logps/chosen": -0.8227775692939758, "logps/rejected": -0.9608209729194641, "loss": 0.8883, "odds_ratio_loss": 0.6554335355758667, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.08227775990962982, "rewards/margins": 0.013804334215819836, "rewards/rejected": -0.09608209133148193, "sft_loss": 0.8227775692939758, "step": 1610 }, { "epoch": 2.880640142253834, "grad_norm": 6.9008331298828125, "learning_rate": 1.8883674727586122e-08, "logits/chosen": -3.120664119720459, "logits/rejected": -3.1391146183013916, "logps/chosen": -0.7796843647956848, "logps/rejected": -1.048107385635376, "loss": 0.8417, "odds_ratio_loss": 0.620618999004364, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.07796843349933624, "rewards/margins": 0.026842307299375534, "rewards/rejected": -0.10481073707342148, "sft_loss": 0.7796843647956848, "step": 1620 }, { "epoch": 2.898421871527006, "grad_norm": 1.4706188440322876, "learning_rate": 1.3599659889000639e-08, "logits/chosen": -3.1607601642608643, "logits/rejected": -3.1758437156677246, "logps/chosen": -0.7644230723381042, "logps/rejected": -0.8909217715263367, "loss": 0.8322, "odds_ratio_loss": 0.6777127981185913, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.07644230127334595, "rewards/margins": 0.01264987699687481, "rewards/rejected": -0.0890921801328659, "sft_loss": 0.7644230723381042, "step": 1630 }, { "epoch": 2.916203600800178, "grad_norm": 1.4733339548110962, "learning_rate": 9.179010397421528e-09, "logits/chosen": -3.1298046112060547, "logits/rejected": -3.1558828353881836, "logps/chosen": -0.7814117670059204, "logps/rejected": -0.9674129486083984, "loss": 0.8469, "odds_ratio_loss": 0.6549249887466431, "rewards/accuracies": 0.5, "rewards/chosen": -0.07814116775989532, "rewards/margins": 0.01860012486577034, "rewards/rejected": -0.09674130380153656, "sft_loss": 0.7814117670059204, "step": 1640 }, { "epoch": 2.93398533007335, "grad_norm": 1.6659821271896362, "learning_rate": 5.623261257296509e-09, "logits/chosen": -3.100876569747925, "logits/rejected": -3.1546549797058105, "logps/chosen": -0.7405164241790771, "logps/rejected": -0.9196218252182007, "loss": 0.8037, "odds_ratio_loss": 0.6319458484649658, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0740516409277916, "rewards/margins": 0.017910538241267204, "rewards/rejected": -0.09196218103170395, "sft_loss": 0.7405164241790771, "step": 1650 }, { "epoch": 2.9517670593465217, "grad_norm": 2.577908754348755, "learning_rate": 2.933647149357122e-09, "logits/chosen": -3.1165366172790527, "logits/rejected": -3.137150764465332, "logps/chosen": -0.7795825600624084, "logps/rejected": -0.9781227111816406, "loss": 0.8445, "odds_ratio_loss": 0.648685097694397, "rewards/accuracies": 0.5625, "rewards/chosen": -0.07795824855566025, "rewards/margins": 0.01985403150320053, "rewards/rejected": -0.09781228005886078, "sft_loss": 0.7795825600624084, "step": 1660 }, { "epoch": 2.969548788619693, "grad_norm": 2.166626453399658, "learning_rate": 1.1111020018930717e-09, "logits/chosen": -3.156930446624756, "logits/rejected": -3.1462855339050293, "logps/chosen": -0.8264468908309937, "logps/rejected": -0.9435287714004517, "loss": 0.8929, "odds_ratio_loss": 0.6643285751342773, "rewards/accuracies": 0.5, "rewards/chosen": -0.08264468610286713, "rewards/margins": 0.011708182282745838, "rewards/rejected": -0.09435287117958069, "sft_loss": 0.8264468908309937, "step": 1670 }, { "epoch": 2.987330517892865, "grad_norm": 2.1216466426849365, "learning_rate": 1.5625866646051813e-10, "logits/chosen": -3.1541049480438232, "logits/rejected": -3.1485564708709717, "logps/chosen": -0.779746413230896, "logps/rejected": -0.9904630780220032, "loss": 0.8399, "odds_ratio_loss": 0.6012987494468689, "rewards/accuracies": 0.625, "rewards/chosen": -0.07797463238239288, "rewards/margins": 0.021071670576930046, "rewards/rejected": -0.09904631227254868, "sft_loss": 0.779746413230896, "step": 1680 }, { "epoch": 2.997999555456768, "step": 1686, "total_flos": 1.9814178520144282e+18, "train_loss": 0.8985705958283811, "train_runtime": 25618.6466, "train_samples_per_second": 1.054, "train_steps_per_second": 0.066 } ], "logging_steps": 10, "max_steps": 1686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 1.9814178520144282e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }