{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 40000000000000000, "global_step": 522, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009578544061302681, "grad_norm": 19.977911424248134, "learning_rate": 4.777070063694267e-09, "logits/chosen": 0.9882557988166809, "logits/rejected": 0.86724853515625, "logps/chosen": -0.6835294961929321, "logps/rejected": -0.7026089429855347, "loss": 5.255, "nll_loss": 0.6835293769836426, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.835294246673584, "rewards/margins": 0.19079475104808807, "rewards/rejected": -7.026089668273926, "step": 5 }, { "epoch": 0.019157088122605363, "grad_norm": 15.532794462607491, "learning_rate": 9.554140127388535e-09, "logits/chosen": 0.8198736906051636, "logits/rejected": 0.7917336225509644, "logps/chosen": -0.7847039103507996, "logps/rejected": -0.7573590278625488, "loss": 5.3866, "nll_loss": 0.7847039103507996, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.847039222717285, "rewards/margins": -0.2734483778476715, "rewards/rejected": -7.573590278625488, "step": 10 }, { "epoch": 0.028735632183908046, "grad_norm": 17.18263537036945, "learning_rate": 1.4331210191082803e-08, "logits/chosen": 0.6409434080123901, "logits/rejected": 0.8134855031967163, "logps/chosen": -0.7896786332130432, "logps/rejected": -0.7245721220970154, "loss": 5.1665, "nll_loss": 0.7896786332130432, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -7.896786689758301, "rewards/margins": -0.6510653495788574, "rewards/rejected": -7.245721340179443, "step": 15 }, { "epoch": 0.038314176245210725, "grad_norm": 14.59673969815782, "learning_rate": 1.910828025477707e-08, "logits/chosen": 0.7096026539802551, "logits/rejected": 0.8205940127372742, "logps/chosen": -0.7373861074447632, "logps/rejected": -0.7078397870063782, "loss": 5.229, "nll_loss": 0.7373861074447632, "rewards/accuracies": 0.0, "rewards/chosen": -7.373861789703369, "rewards/margins": -0.29546356201171875, "rewards/rejected": -7.07839822769165, "step": 20 }, { "epoch": 0.04789272030651341, "grad_norm": 15.05129246603728, "learning_rate": 2.3885350318471336e-08, "logits/chosen": 0.9928520321846008, "logits/rejected": 0.9827763438224792, "logps/chosen": -0.5954749584197998, "logps/rejected": -0.654929518699646, "loss": 5.4048, "nll_loss": 0.595474898815155, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.954749584197998, "rewards/margins": 0.5945457220077515, "rewards/rejected": -6.549294948577881, "step": 25 }, { "epoch": 0.05747126436781609, "grad_norm": 20.985831817061353, "learning_rate": 2.8662420382165606e-08, "logits/chosen": 0.7672659158706665, "logits/rejected": 0.8577300310134888, "logps/chosen": -0.6465980410575867, "logps/rejected": -0.7104107737541199, "loss": 5.2394, "nll_loss": 0.6465979814529419, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.465980529785156, "rewards/margins": 0.6381272077560425, "rewards/rejected": -7.104107856750488, "step": 30 }, { "epoch": 0.06704980842911877, "grad_norm": 16.252866127879475, "learning_rate": 3.343949044585987e-08, "logits/chosen": 0.8599953651428223, "logits/rejected": 0.8298212289810181, "logps/chosen": -0.7009156346321106, "logps/rejected": -0.7079470157623291, "loss": 5.3018, "nll_loss": 0.700915515422821, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.009156227111816, "rewards/margins": 0.07031383365392685, "rewards/rejected": -7.079470157623291, "step": 35 }, { "epoch": 0.07662835249042145, "grad_norm": 16.99436823968821, "learning_rate": 3.821656050955414e-08, "logits/chosen": 0.6351531744003296, "logits/rejected": 0.7295518517494202, "logps/chosen": -0.7221059799194336, "logps/rejected": -0.7749537825584412, "loss": 5.2718, "nll_loss": 0.722105860710144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.221059322357178, "rewards/margins": 0.5284790992736816, "rewards/rejected": -7.749538421630859, "step": 40 }, { "epoch": 0.08620689655172414, "grad_norm": 16.474791128586975, "learning_rate": 4.29936305732484e-08, "logits/chosen": 0.9291857481002808, "logits/rejected": 0.83441162109375, "logps/chosen": -0.6729990243911743, "logps/rejected": -0.7266682386398315, "loss": 5.2957, "nll_loss": 0.6729990243911743, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.729989528656006, "rewards/margins": 0.5366925001144409, "rewards/rejected": -7.2666826248168945, "step": 45 }, { "epoch": 0.09578544061302682, "grad_norm": 17.51495070571998, "learning_rate": 4.777070063694267e-08, "logits/chosen": 0.776635468006134, "logits/rejected": 0.845638632774353, "logps/chosen": -0.7210168838500977, "logps/rejected": -0.6794952154159546, "loss": 5.2687, "nll_loss": 0.7210168838500977, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.210168361663818, "rewards/margins": -0.41521579027175903, "rewards/rejected": -6.794952392578125, "step": 50 }, { "epoch": 0.1053639846743295, "grad_norm": 15.080710084896578, "learning_rate": 5.2547770700636935e-08, "logits/chosen": 0.8091619610786438, "logits/rejected": 0.8544187545776367, "logps/chosen": -0.7046722769737244, "logps/rejected": -0.703011691570282, "loss": 5.276, "nll_loss": 0.7046722769737244, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.046723365783691, "rewards/margins": -0.01660604402422905, "rewards/rejected": -7.030117034912109, "step": 55 }, { "epoch": 0.11494252873563218, "grad_norm": 22.50777831402302, "learning_rate": 5.732484076433121e-08, "logits/chosen": 0.8281611204147339, "logits/rejected": 0.8864553570747375, "logps/chosen": -0.6377026438713074, "logps/rejected": -0.7030226588249207, "loss": 5.2442, "nll_loss": 0.6377025842666626, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.377026557922363, "rewards/margins": 0.6532003283500671, "rewards/rejected": -7.030226707458496, "step": 60 }, { "epoch": 0.12452107279693486, "grad_norm": 19.79864592347249, "learning_rate": 6.210191082802548e-08, "logits/chosen": 1.0290864706039429, "logits/rejected": 0.7254050374031067, "logps/chosen": -0.6748142838478088, "logps/rejected": -0.6928014755249023, "loss": 5.3121, "nll_loss": 0.6748142838478088, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.748143196105957, "rewards/margins": 0.1798720359802246, "rewards/rejected": -6.928015232086182, "step": 65 }, { "epoch": 0.13409961685823754, "grad_norm": 15.639368843476637, "learning_rate": 6.687898089171974e-08, "logits/chosen": 0.9622044563293457, "logits/rejected": 0.9868735074996948, "logps/chosen": -0.7220475077629089, "logps/rejected": -0.8194485902786255, "loss": 5.2694, "nll_loss": 0.7220475077629089, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.220475196838379, "rewards/margins": 0.9740108251571655, "rewards/rejected": -8.194485664367676, "step": 70 }, { "epoch": 0.14367816091954022, "grad_norm": 15.338789064769195, "learning_rate": 7.165605095541401e-08, "logits/chosen": 0.8242565989494324, "logits/rejected": 0.8403714299201965, "logps/chosen": -0.8075603246688843, "logps/rejected": -0.7985510230064392, "loss": 5.307, "nll_loss": 0.8075603246688843, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.075602531433105, "rewards/margins": -0.09009275585412979, "rewards/rejected": -7.985510349273682, "step": 75 }, { "epoch": 0.1532567049808429, "grad_norm": 16.558919395561837, "learning_rate": 7.643312101910828e-08, "logits/chosen": 0.852637767791748, "logits/rejected": 0.6942145228385925, "logps/chosen": -0.6923746466636658, "logps/rejected": -0.748778760433197, "loss": 5.2346, "nll_loss": 0.6923746466636658, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.923746585845947, "rewards/margins": 0.564041256904602, "rewards/rejected": -7.48778772354126, "step": 80 }, { "epoch": 0.16283524904214558, "grad_norm": 14.992266742573277, "learning_rate": 8.121019108280254e-08, "logits/chosen": 0.8808382749557495, "logits/rejected": 0.9680411219596863, "logps/chosen": -0.6730803847312927, "logps/rejected": -0.8849495649337769, "loss": 5.2452, "nll_loss": 0.6730804443359375, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.730803489685059, "rewards/margins": 2.118691921234131, "rewards/rejected": -8.849495887756348, "step": 85 }, { "epoch": 0.1724137931034483, "grad_norm": 14.696295604697424, "learning_rate": 8.59872611464968e-08, "logits/chosen": 0.6036122441291809, "logits/rejected": 0.6832916140556335, "logps/chosen": -0.7643388509750366, "logps/rejected": -0.7602866888046265, "loss": 5.3007, "nll_loss": 0.7643388509750366, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.643387794494629, "rewards/margins": -0.040520571172237396, "rewards/rejected": -7.602867126464844, "step": 90 }, { "epoch": 0.18199233716475097, "grad_norm": 14.017222797723742, "learning_rate": 9.076433121019108e-08, "logits/chosen": 0.9275333285331726, "logits/rejected": 0.7763484120368958, "logps/chosen": -0.7543958425521851, "logps/rejected": -0.7292922139167786, "loss": 5.2406, "nll_loss": 0.7543958425521851, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.5439581871032715, "rewards/margins": -0.25103625655174255, "rewards/rejected": -7.292922019958496, "step": 95 }, { "epoch": 0.19157088122605365, "grad_norm": 14.22025157094158, "learning_rate": 9.554140127388534e-08, "logits/chosen": 0.9820090532302856, "logits/rejected": 0.8248282670974731, "logps/chosen": -0.6709014177322388, "logps/rejected": -0.7284021973609924, "loss": 5.461, "nll_loss": 0.6709014177322388, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.709013938903809, "rewards/margins": 0.5750080943107605, "rewards/rejected": -7.284021854400635, "step": 100 }, { "epoch": 0.20114942528735633, "grad_norm": 15.684437324553063, "learning_rate": 1.0031847133757961e-07, "logits/chosen": 0.8076725006103516, "logits/rejected": 0.8857797384262085, "logps/chosen": -0.6815362572669983, "logps/rejected": -0.6763182878494263, "loss": 5.3235, "nll_loss": 0.6815363168716431, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.815362453460693, "rewards/margins": -0.05218010023236275, "rewards/rejected": -6.763182163238525, "step": 105 }, { "epoch": 0.210727969348659, "grad_norm": 24.773788939839015, "learning_rate": 1.0509554140127387e-07, "logits/chosen": 0.7703748941421509, "logits/rejected": 0.747571587562561, "logps/chosen": -0.6411559581756592, "logps/rejected": -0.6272796392440796, "loss": 5.1527, "nll_loss": 0.6411559581756592, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.411559104919434, "rewards/margins": -0.13876314461231232, "rewards/rejected": -6.272796630859375, "step": 110 }, { "epoch": 0.22030651340996169, "grad_norm": 15.503433446190241, "learning_rate": 1.0987261146496813e-07, "logits/chosen": 0.7363082766532898, "logits/rejected": 0.7384678721427917, "logps/chosen": -0.7916821241378784, "logps/rejected": -0.8139774203300476, "loss": 5.2975, "nll_loss": 0.7916821241378784, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.916821479797363, "rewards/margins": 0.2229524552822113, "rewards/rejected": -8.139774322509766, "step": 115 }, { "epoch": 0.22988505747126436, "grad_norm": 14.405865015988635, "learning_rate": 1.1464968152866242e-07, "logits/chosen": 0.988644003868103, "logits/rejected": 0.6871722936630249, "logps/chosen": -0.8117585182189941, "logps/rejected": -0.7687948942184448, "loss": 5.2603, "nll_loss": 0.8117585182189941, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -8.117586135864258, "rewards/margins": -0.42963656783103943, "rewards/rejected": -7.687948703765869, "step": 120 }, { "epoch": 0.23946360153256704, "grad_norm": 15.441159671033866, "learning_rate": 1.194267515923567e-07, "logits/chosen": 0.9181255102157593, "logits/rejected": 1.0879169702529907, "logps/chosen": -0.7296438813209534, "logps/rejected": -0.7042439579963684, "loss": 5.3496, "nll_loss": 0.7296438217163086, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.296438694000244, "rewards/margins": -0.25399884581565857, "rewards/rejected": -7.0424394607543945, "step": 125 }, { "epoch": 0.24904214559386972, "grad_norm": 14.525160680247224, "learning_rate": 1.2420382165605095e-07, "logits/chosen": 1.0101947784423828, "logits/rejected": 0.9281114339828491, "logps/chosen": -0.6179158091545105, "logps/rejected": -0.5991578102111816, "loss": 5.349, "nll_loss": 0.6179158091545105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.1791582107543945, "rewards/margins": -0.1875801980495453, "rewards/rejected": -5.991578102111816, "step": 130 }, { "epoch": 0.25862068965517243, "grad_norm": 14.83309451045206, "learning_rate": 1.2898089171974521e-07, "logits/chosen": 0.6288986802101135, "logits/rejected": 0.8249040842056274, "logps/chosen": -0.671284019947052, "logps/rejected": -0.7514477968215942, "loss": 5.2492, "nll_loss": 0.671284019947052, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.7128400802612305, "rewards/margins": 0.801638126373291, "rewards/rejected": -7.5144782066345215, "step": 135 }, { "epoch": 0.2681992337164751, "grad_norm": 15.778143495292198, "learning_rate": 1.3375796178343948e-07, "logits/chosen": 0.979997992515564, "logits/rejected": 0.8755594491958618, "logps/chosen": -0.5514487028121948, "logps/rejected": -0.675363302230835, "loss": 5.3335, "nll_loss": 0.5514487028121948, "rewards/accuracies": 1.0, "rewards/chosen": -5.514487266540527, "rewards/margins": 1.2391456365585327, "rewards/rejected": -6.75363302230835, "step": 140 }, { "epoch": 0.2777777777777778, "grad_norm": 18.462671529719298, "learning_rate": 1.3853503184713377e-07, "logits/chosen": 0.8637372851371765, "logits/rejected": 0.9276704788208008, "logps/chosen": -0.7762002944946289, "logps/rejected": -1.0306679010391235, "loss": 5.2209, "nll_loss": 0.7762002348899841, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.762002468109131, "rewards/margins": 2.5446763038635254, "rewards/rejected": -10.306678771972656, "step": 145 }, { "epoch": 0.28735632183908044, "grad_norm": 14.528423213748091, "learning_rate": 1.4331210191082803e-07, "logits/chosen": 0.9764812588691711, "logits/rejected": 1.0769740343093872, "logps/chosen": -0.7632086277008057, "logps/rejected": -0.7627191543579102, "loss": 5.1848, "nll_loss": 0.7632086277008057, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.632086277008057, "rewards/margins": -0.004894733428955078, "rewards/rejected": -7.627191066741943, "step": 150 }, { "epoch": 0.29693486590038315, "grad_norm": 14.670904215756279, "learning_rate": 1.480891719745223e-07, "logits/chosen": 0.7151988744735718, "logits/rejected": 0.8136464953422546, "logps/chosen": -0.8410874605178833, "logps/rejected": -0.6722872853279114, "loss": 5.322, "nll_loss": 0.8410874605178833, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -8.410874366760254, "rewards/margins": -1.688001036643982, "rewards/rejected": -6.722872734069824, "step": 155 }, { "epoch": 0.3065134099616858, "grad_norm": 16.96923463521279, "learning_rate": 1.5286624203821656e-07, "logits/chosen": 1.0444055795669556, "logits/rejected": 0.8398516774177551, "logps/chosen": -0.7191808819770813, "logps/rejected": -0.7984825372695923, "loss": 5.0657, "nll_loss": 0.7191808819770813, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.191809177398682, "rewards/margins": 0.7930164337158203, "rewards/rejected": -7.98482608795166, "step": 160 }, { "epoch": 0.3160919540229885, "grad_norm": 14.547279963040138, "learning_rate": 1.5764331210191082e-07, "logits/chosen": 0.694339394569397, "logits/rejected": 0.7881234884262085, "logps/chosen": -0.6559264063835144, "logps/rejected": -0.6876171827316284, "loss": 5.3857, "nll_loss": 0.6559264063835144, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.559264183044434, "rewards/margins": 0.3169073164463043, "rewards/rejected": -6.876172065734863, "step": 165 }, { "epoch": 0.32567049808429116, "grad_norm": 15.883172546108769, "learning_rate": 1.6242038216560508e-07, "logits/chosen": 0.9939007759094238, "logits/rejected": 0.9327294230461121, "logps/chosen": -0.514694094657898, "logps/rejected": -0.6553934812545776, "loss": 5.1601, "nll_loss": 0.514694094657898, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -5.146941184997559, "rewards/margins": 1.4069939851760864, "rewards/rejected": -6.5539350509643555, "step": 170 }, { "epoch": 0.33524904214559387, "grad_norm": 14.974676446039048, "learning_rate": 1.6719745222929935e-07, "logits/chosen": 0.8872886896133423, "logits/rejected": 0.8021435737609863, "logps/chosen": -0.7085338830947876, "logps/rejected": -0.7116855382919312, "loss": 5.3339, "nll_loss": 0.7085338830947876, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.085339546203613, "rewards/margins": 0.031516265124082565, "rewards/rejected": -7.116854667663574, "step": 175 }, { "epoch": 0.3448275862068966, "grad_norm": 17.49142572813654, "learning_rate": 1.719745222929936e-07, "logits/chosen": 0.7162724733352661, "logits/rejected": 0.6837285757064819, "logps/chosen": -0.8341668844223022, "logps/rejected": -0.7984797358512878, "loss": 5.211, "nll_loss": 0.8341668248176575, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.341669082641602, "rewards/margins": -0.35687151551246643, "rewards/rejected": -7.984797477722168, "step": 180 }, { "epoch": 0.3544061302681992, "grad_norm": 15.695305726655395, "learning_rate": 1.7675159235668787e-07, "logits/chosen": 0.9962735176086426, "logits/rejected": 0.8522371053695679, "logps/chosen": -0.824070930480957, "logps/rejected": -0.7453502416610718, "loss": 5.1938, "nll_loss": 0.824070930480957, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.24070930480957, "rewards/margins": -0.7872062921524048, "rewards/rejected": -7.453502655029297, "step": 185 }, { "epoch": 0.36398467432950193, "grad_norm": 15.933522154415847, "learning_rate": 1.8152866242038216e-07, "logits/chosen": 0.7882771492004395, "logits/rejected": 0.7340660691261292, "logps/chosen": -0.6636666059494019, "logps/rejected": -0.7198070287704468, "loss": 5.3231, "nll_loss": 0.6636666655540466, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.636666774749756, "rewards/margins": 0.5614040493965149, "rewards/rejected": -7.198070526123047, "step": 190 }, { "epoch": 0.3735632183908046, "grad_norm": 14.263964254379374, "learning_rate": 1.8630573248407643e-07, "logits/chosen": 0.6639770269393921, "logits/rejected": 0.7358517050743103, "logps/chosen": -0.7163713574409485, "logps/rejected": -0.7314590215682983, "loss": 5.2454, "nll_loss": 0.7163712978363037, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.1637139320373535, "rewards/margins": 0.1508767157793045, "rewards/rejected": -7.3145904541015625, "step": 195 }, { "epoch": 0.3831417624521073, "grad_norm": 15.417127684289229, "learning_rate": 1.910828025477707e-07, "logits/chosen": 0.6845916509628296, "logits/rejected": 0.8717025518417358, "logps/chosen": -0.7460684776306152, "logps/rejected": -0.8934140205383301, "loss": 5.3951, "nll_loss": 0.7460684776306152, "rewards/accuracies": 1.0, "rewards/chosen": -7.460684776306152, "rewards/margins": 1.4734549522399902, "rewards/rejected": -8.9341402053833, "step": 200 }, { "epoch": 0.39272030651340994, "grad_norm": 15.29361256055114, "learning_rate": 1.9585987261146495e-07, "logits/chosen": 0.8160010576248169, "logits/rejected": 0.7316077351570129, "logps/chosen": -0.8079813122749329, "logps/rejected": -0.7992033958435059, "loss": 5.1982, "nll_loss": 0.8079813122749329, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.079813003540039, "rewards/margins": -0.08777885138988495, "rewards/rejected": -7.992033958435059, "step": 205 }, { "epoch": 0.40229885057471265, "grad_norm": 18.735791443990614, "learning_rate": 2.0063694267515922e-07, "logits/chosen": 0.8831043243408203, "logits/rejected": 0.9638816118240356, "logps/chosen": -0.8348730802536011, "logps/rejected": -0.7787247896194458, "loss": 5.2804, "nll_loss": 0.8348730802536011, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -8.348731994628906, "rewards/margins": -0.5614833235740662, "rewards/rejected": -7.787248134613037, "step": 210 }, { "epoch": 0.4118773946360153, "grad_norm": 15.020231482587125, "learning_rate": 2.0541401273885348e-07, "logits/chosen": 0.8089305758476257, "logits/rejected": 0.7801268696784973, "logps/chosen": -0.9548230171203613, "logps/rejected": -0.8521941304206848, "loss": 5.4231, "nll_loss": 0.9548231363296509, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -9.548230171203613, "rewards/margins": -1.0262889862060547, "rewards/rejected": -8.521940231323242, "step": 215 }, { "epoch": 0.421455938697318, "grad_norm": 16.66122418369343, "learning_rate": 2.1019108280254774e-07, "logits/chosen": 0.8265215754508972, "logits/rejected": 0.7605193257331848, "logps/chosen": -0.6805425882339478, "logps/rejected": -0.7011183500289917, "loss": 5.2324, "nll_loss": 0.6805425882339478, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.80542516708374, "rewards/margins": 0.20575818419456482, "rewards/rejected": -7.011183261871338, "step": 220 }, { "epoch": 0.43103448275862066, "grad_norm": 15.84641449526937, "learning_rate": 2.14968152866242e-07, "logits/chosen": 0.9168558120727539, "logits/rejected": 0.8388528823852539, "logps/chosen": -0.8054409027099609, "logps/rejected": -0.7549694776535034, "loss": 5.2786, "nll_loss": 0.8054410219192505, "rewards/accuracies": 0.0, "rewards/chosen": -8.054409980773926, "rewards/margins": -0.5047143697738647, "rewards/rejected": -7.549695014953613, "step": 225 }, { "epoch": 0.44061302681992337, "grad_norm": 15.381291199652642, "learning_rate": 2.1974522292993627e-07, "logits/chosen": 0.8924915194511414, "logits/rejected": 0.9192771911621094, "logps/chosen": -0.7253198623657227, "logps/rejected": -0.6863486766815186, "loss": 5.3113, "nll_loss": 0.7253197431564331, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.253198146820068, "rewards/margins": -0.38971146941185, "rewards/rejected": -6.863486289978027, "step": 230 }, { "epoch": 0.4501915708812261, "grad_norm": 22.207135774874516, "learning_rate": 2.2452229299363056e-07, "logits/chosen": 0.6804489493370056, "logits/rejected": 0.8135985136032104, "logps/chosen": -0.7592498064041138, "logps/rejected": -0.8050843477249146, "loss": 5.306, "nll_loss": 0.7592498064041138, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.592497825622559, "rewards/margins": 0.4583454132080078, "rewards/rejected": -8.050843238830566, "step": 235 }, { "epoch": 0.45977011494252873, "grad_norm": 15.418776278146453, "learning_rate": 2.2929936305732485e-07, "logits/chosen": 0.740727961063385, "logits/rejected": 0.7790459394454956, "logps/chosen": -0.6311203837394714, "logps/rejected": -0.7699800729751587, "loss": 5.3202, "nll_loss": 0.6311203837394714, "rewards/accuracies": 1.0, "rewards/chosen": -6.311203956604004, "rewards/margins": 1.388596534729004, "rewards/rejected": -7.69980001449585, "step": 240 }, { "epoch": 0.46934865900383144, "grad_norm": 18.062610898602617, "learning_rate": 2.340764331210191e-07, "logits/chosen": 0.7905587553977966, "logits/rejected": 0.70208740234375, "logps/chosen": -0.8448599576950073, "logps/rejected": -0.726888120174408, "loss": 5.2108, "nll_loss": 0.8448599576950073, "rewards/accuracies": 0.0, "rewards/chosen": -8.448599815368652, "rewards/margins": -1.179718255996704, "rewards/rejected": -7.268881320953369, "step": 245 }, { "epoch": 0.4789272030651341, "grad_norm": 20.317601991362707, "learning_rate": 2.388535031847134e-07, "logits/chosen": 0.8352301716804504, "logits/rejected": 0.7928398847579956, "logps/chosen": -0.7886701822280884, "logps/rejected": -0.7603856325149536, "loss": 5.3794, "nll_loss": 0.7886701822280884, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.886702060699463, "rewards/margins": -0.28284597396850586, "rewards/rejected": -7.603856086730957, "step": 250 }, { "epoch": 0.4885057471264368, "grad_norm": 17.92061038766115, "learning_rate": 2.4363057324840764e-07, "logits/chosen": 0.7246678471565247, "logits/rejected": 0.8966231346130371, "logps/chosen": -0.8691000938415527, "logps/rejected": -0.7771162986755371, "loss": 5.2854, "nll_loss": 0.8691000938415527, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.691000938415527, "rewards/margins": -0.9198387861251831, "rewards/rejected": -7.771162509918213, "step": 255 }, { "epoch": 0.49808429118773945, "grad_norm": 17.57639103239166, "learning_rate": 2.484076433121019e-07, "logits/chosen": 0.9574035406112671, "logits/rejected": 0.6729179620742798, "logps/chosen": -0.6743525266647339, "logps/rejected": -0.7344107031822205, "loss": 5.2276, "nll_loss": 0.6743525266647339, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.743525505065918, "rewards/margins": 0.6005817651748657, "rewards/rejected": -7.344107151031494, "step": 260 }, { "epoch": 0.5076628352490421, "grad_norm": 21.283801923925182, "learning_rate": 2.5318471337579616e-07, "logits/chosen": 0.8346107602119446, "logits/rejected": 0.9647878408432007, "logps/chosen": -0.6870938539505005, "logps/rejected": -0.6139359474182129, "loss": 5.3469, "nll_loss": 0.6870938539505005, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -6.870938301086426, "rewards/margins": -0.7315785884857178, "rewards/rejected": -6.139359951019287, "step": 265 }, { "epoch": 0.5172413793103449, "grad_norm": 14.399379764504936, "learning_rate": 2.5796178343949043e-07, "logits/chosen": 0.864532470703125, "logits/rejected": 0.8542212247848511, "logps/chosen": -0.7113697528839111, "logps/rejected": -0.732496976852417, "loss": 5.177, "nll_loss": 0.7113697528839111, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.1136980056762695, "rewards/margins": 0.2112717628479004, "rewards/rejected": -7.324969291687012, "step": 270 }, { "epoch": 0.5268199233716475, "grad_norm": 14.537680856211797, "learning_rate": 2.627388535031847e-07, "logits/chosen": 0.9982441067695618, "logits/rejected": 0.8428419232368469, "logps/chosen": -0.687545120716095, "logps/rejected": -0.7962311506271362, "loss": 5.1719, "nll_loss": 0.687545120716095, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -6.87545108795166, "rewards/margins": 1.0868605375289917, "rewards/rejected": -7.9623122215271, "step": 275 }, { "epoch": 0.5363984674329502, "grad_norm": 19.6142347909319, "learning_rate": 2.6751592356687895e-07, "logits/chosen": 0.7479467988014221, "logits/rejected": 0.7518871426582336, "logps/chosen": -0.7905808687210083, "logps/rejected": -0.7879656553268433, "loss": 5.2129, "nll_loss": 0.7905808687210083, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.905808448791504, "rewards/margins": -0.02615184709429741, "rewards/rejected": -7.8796563148498535, "step": 280 }, { "epoch": 0.5459770114942529, "grad_norm": 15.380742906594616, "learning_rate": 2.722929936305732e-07, "logits/chosen": 0.6082831025123596, "logits/rejected": 0.6754466891288757, "logps/chosen": -0.866966724395752, "logps/rejected": -0.822973370552063, "loss": 5.2728, "nll_loss": 0.8669666051864624, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.66966724395752, "rewards/margins": -0.4399335980415344, "rewards/rejected": -8.229734420776367, "step": 285 }, { "epoch": 0.5555555555555556, "grad_norm": 16.49675270885239, "learning_rate": 2.7707006369426753e-07, "logits/chosen": 0.8076263666152954, "logits/rejected": 0.7774611711502075, "logps/chosen": -0.7558648586273193, "logps/rejected": -0.8593353033065796, "loss": 5.2718, "nll_loss": 0.7558648586273193, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.558648586273193, "rewards/margins": 1.0347039699554443, "rewards/rejected": -8.593353271484375, "step": 290 }, { "epoch": 0.5651340996168582, "grad_norm": 15.91951747554686, "learning_rate": 2.818471337579618e-07, "logits/chosen": 0.7341046333312988, "logits/rejected": 0.6761490106582642, "logps/chosen": -0.7117196321487427, "logps/rejected": -0.8302199244499207, "loss": 5.3494, "nll_loss": 0.7117196321487427, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.117195129394531, "rewards/margins": 1.1850030422210693, "rewards/rejected": -8.302199363708496, "step": 295 }, { "epoch": 0.5747126436781609, "grad_norm": 17.296732181243744, "learning_rate": 2.8662420382165606e-07, "logits/chosen": 0.8246825933456421, "logits/rejected": 0.649739146232605, "logps/chosen": -0.823817253112793, "logps/rejected": -0.8983147740364075, "loss": 5.2106, "nll_loss": 0.8238171339035034, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.23817253112793, "rewards/margins": 0.7449753880500793, "rewards/rejected": -8.983147621154785, "step": 300 }, { "epoch": 0.5842911877394636, "grad_norm": 14.832912322402217, "learning_rate": 2.914012738853503e-07, "logits/chosen": 0.6315704584121704, "logits/rejected": 0.614464521408081, "logps/chosen": -0.8551123738288879, "logps/rejected": -0.7884758114814758, "loss": 5.2191, "nll_loss": 0.8551123738288879, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.55112361907959, "rewards/margins": -0.6663654446601868, "rewards/rejected": -7.884758949279785, "step": 305 }, { "epoch": 0.5938697318007663, "grad_norm": 22.263623540675233, "learning_rate": 2.961783439490446e-07, "logits/chosen": 0.8310597538948059, "logits/rejected": 0.7741198539733887, "logps/chosen": -0.769278883934021, "logps/rejected": -0.7818907499313354, "loss": 5.3222, "nll_loss": 0.7692790031433105, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.692789554595947, "rewards/margins": 0.1261179894208908, "rewards/rejected": -7.818907737731934, "step": 310 }, { "epoch": 0.603448275862069, "grad_norm": 15.209695881976554, "learning_rate": 2.999999067864633e-07, "logits/chosen": 0.6580372452735901, "logits/rejected": 0.5932300090789795, "logps/chosen": -0.8376361727714539, "logps/rejected": -0.9159227609634399, "loss": 5.1548, "nll_loss": 0.8376361131668091, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.376360893249512, "rewards/margins": 0.782866358757019, "rewards/rejected": -9.15922737121582, "step": 315 }, { "epoch": 0.6130268199233716, "grad_norm": 16.786502978942124, "learning_rate": 2.9999664432484305e-07, "logits/chosen": 0.779415488243103, "logits/rejected": 0.8173543810844421, "logps/chosen": -0.8148608207702637, "logps/rejected": -0.8662956357002258, "loss": 5.0858, "nll_loss": 0.8148608207702637, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.148608207702637, "rewards/margins": 0.5143473744392395, "rewards/rejected": -8.662956237792969, "step": 320 }, { "epoch": 0.6226053639846744, "grad_norm": 14.809215737259183, "learning_rate": 2.999887213022373e-07, "logits/chosen": 0.9049234390258789, "logits/rejected": 0.6404236555099487, "logps/chosen": -0.9479631185531616, "logps/rejected": -0.8360861539840698, "loss": 5.282, "nll_loss": 0.9479631185531616, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -9.479631423950195, "rewards/margins": -1.1187700033187866, "rewards/rejected": -8.360861778259277, "step": 325 }, { "epoch": 0.632183908045977, "grad_norm": 16.116801233519272, "learning_rate": 2.999761379648231e-07, "logits/chosen": 0.5830576419830322, "logits/rejected": 0.4962643086910248, "logps/chosen": -0.8123816251754761, "logps/rejected": -0.915209174156189, "loss": 5.0586, "nll_loss": 0.8123816251754761, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.12381649017334, "rewards/margins": 1.0282756090164185, "rewards/rejected": -9.152092933654785, "step": 330 }, { "epoch": 0.6417624521072797, "grad_norm": 17.606725914130628, "learning_rate": 2.999588947035786e-07, "logits/chosen": 0.666958212852478, "logits/rejected": 0.6473885774612427, "logps/chosen": -0.8091378211975098, "logps/rejected": -0.8131651878356934, "loss": 5.1567, "nll_loss": 0.8091378211975098, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.091379165649414, "rewards/margins": 0.04027414321899414, "rewards/rejected": -8.13165283203125, "step": 335 }, { "epoch": 0.6513409961685823, "grad_norm": 15.2118101330663, "learning_rate": 2.999369920542709e-07, "logits/chosen": 0.7825851440429688, "logits/rejected": 0.6557101011276245, "logps/chosen": -0.8586159944534302, "logps/rejected": -0.795002818107605, "loss": 5.2093, "nll_loss": 0.8586158752441406, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -8.586160659790039, "rewards/margins": -0.6361311078071594, "rewards/rejected": -7.950028419494629, "step": 340 }, { "epoch": 0.6609195402298851, "grad_norm": 21.51236359546376, "learning_rate": 2.9991043069743953e-07, "logits/chosen": 0.8391911387443542, "logits/rejected": 0.9106936454772949, "logps/chosen": -0.9227014780044556, "logps/rejected": -0.8664349317550659, "loss": 5.1782, "nll_loss": 0.9227014780044556, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.22701358795166, "rewards/margins": -0.5626646280288696, "rewards/rejected": -8.664348602294922, "step": 345 }, { "epoch": 0.6704980842911877, "grad_norm": 18.995163332531835, "learning_rate": 2.9987921145837506e-07, "logits/chosen": 0.5608048439025879, "logits/rejected": 0.504362940788269, "logps/chosen": -0.9775077700614929, "logps/rejected": -0.8174030184745789, "loss": 5.2409, "nll_loss": 0.9775077700614929, "rewards/accuracies": 0.0, "rewards/chosen": -9.775077819824219, "rewards/margins": -1.6010481119155884, "rewards/rejected": -8.174030303955078, "step": 350 }, { "epoch": 0.6800766283524904, "grad_norm": 18.321608160126484, "learning_rate": 2.998433353070936e-07, "logits/chosen": 0.5228645205497742, "logits/rejected": 0.5875726342201233, "logps/chosen": -0.8083820343017578, "logps/rejected": -0.9094891548156738, "loss": 5.3646, "nll_loss": 0.8083820343017578, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.083820343017578, "rewards/margins": 1.011070966720581, "rewards/rejected": -9.094891548156738, "step": 355 }, { "epoch": 0.6896551724137931, "grad_norm": 39.97684130432745, "learning_rate": 2.998028033583067e-07, "logits/chosen": 0.635855495929718, "logits/rejected": 0.6091259717941284, "logps/chosen": -0.8663408160209656, "logps/rejected": -0.8845375776290894, "loss": 5.3618, "nll_loss": 0.8663408160209656, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.663408279418945, "rewards/margins": 0.18196812272071838, "rewards/rejected": -8.845376014709473, "step": 360 }, { "epoch": 0.6992337164750958, "grad_norm": 20.393894377350467, "learning_rate": 2.9975761687138675e-07, "logits/chosen": 0.8516343832015991, "logits/rejected": 0.9076502919197083, "logps/chosen": -0.7819638848304749, "logps/rejected": -0.8371108174324036, "loss": 5.1922, "nll_loss": 0.7819639444351196, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.819638252258301, "rewards/margins": 0.5514693260192871, "rewards/rejected": -8.371108055114746, "step": 365 }, { "epoch": 0.7088122605363985, "grad_norm": 15.92022688698728, "learning_rate": 2.997077772503276e-07, "logits/chosen": 0.6935936212539673, "logits/rejected": 0.7612776160240173, "logps/chosen": -0.8113873600959778, "logps/rejected": -0.8486092686653137, "loss": 5.2959, "nll_loss": 0.8113872408866882, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.113873481750488, "rewards/margins": 0.37221986055374146, "rewards/rejected": -8.486093521118164, "step": 370 }, { "epoch": 0.7183908045977011, "grad_norm": 19.844305829223966, "learning_rate": 2.9965328604370115e-07, "logits/chosen": 0.7288967370986938, "logits/rejected": 0.8481170535087585, "logps/chosen": -0.7829886674880981, "logps/rejected": -0.7796565294265747, "loss": 5.2523, "nll_loss": 0.7829886674880981, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.829885959625244, "rewards/margins": -0.033321283757686615, "rewards/rejected": -7.796565055847168, "step": 375 }, { "epoch": 0.7279693486590039, "grad_norm": 22.21588650736091, "learning_rate": 2.9959414494460934e-07, "logits/chosen": 0.7555449604988098, "logits/rejected": 0.8247362971305847, "logps/chosen": -0.8521485328674316, "logps/rejected": -0.9153167605400085, "loss": 5.1015, "nll_loss": 0.8521484136581421, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.521485328674316, "rewards/margins": 0.6316838264465332, "rewards/rejected": -9.153168678283691, "step": 380 }, { "epoch": 0.7375478927203065, "grad_norm": 16.32420836624451, "learning_rate": 2.995303557906312e-07, "logits/chosen": 0.8759990930557251, "logits/rejected": 0.684215784072876, "logps/chosen": -0.9286302328109741, "logps/rejected": -0.9789536595344543, "loss": 5.3124, "nll_loss": 0.9286301732063293, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -9.28630256652832, "rewards/margins": 0.5032343864440918, "rewards/rejected": -9.789536476135254, "step": 385 }, { "epoch": 0.7471264367816092, "grad_norm": 15.80345388242498, "learning_rate": 2.99461920563766e-07, "logits/chosen": 0.5319703817367554, "logits/rejected": 0.6946344971656799, "logps/chosen": -0.9111973643302917, "logps/rejected": -0.8061173558235168, "loss": 5.227, "nll_loss": 0.911197304725647, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -9.111973762512207, "rewards/margins": -1.0507996082305908, "rewards/rejected": -8.061173439025879, "step": 390 }, { "epoch": 0.7567049808429118, "grad_norm": 20.47124302367669, "learning_rate": 2.993888413903716e-07, "logits/chosen": 0.875220000743866, "logits/rejected": 0.9313627481460571, "logps/chosen": -0.6892002820968628, "logps/rejected": -0.7782602310180664, "loss": 5.1504, "nll_loss": 0.689200222492218, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -6.892002105712891, "rewards/margins": 0.8905998468399048, "rewards/rejected": -7.782601833343506, "step": 395 }, { "epoch": 0.7662835249042146, "grad_norm": 17.699773611139502, "learning_rate": 2.9931112054109855e-07, "logits/chosen": 0.5683273673057556, "logits/rejected": 0.6470497846603394, "logps/chosen": -0.8484388589859009, "logps/rejected": -0.8746269345283508, "loss": 5.1348, "nll_loss": 0.8484388589859009, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.48438835144043, "rewards/margins": 0.26188117265701294, "rewards/rejected": -8.746268272399902, "step": 400 }, { "epoch": 0.7758620689655172, "grad_norm": 19.019674807304686, "learning_rate": 2.992287604308192e-07, "logits/chosen": 0.6370224356651306, "logits/rejected": 0.500920295715332, "logps/chosen": -1.061689853668213, "logps/rejected": -1.0553849935531616, "loss": 5.2272, "nll_loss": 1.0616897344589233, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -10.616897583007812, "rewards/margins": -0.06304798275232315, "rewards/rejected": -10.553851127624512, "step": 405 }, { "epoch": 0.7854406130268199, "grad_norm": 22.32813440003362, "learning_rate": 2.9914176361855286e-07, "logits/chosen": 0.773957371711731, "logits/rejected": 0.5596843957901001, "logps/chosen": -0.8565713167190552, "logps/rejected": -0.8435229063034058, "loss": 5.1566, "nll_loss": 0.8565713167190552, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.565712928771973, "rewards/margins": -0.13048286736011505, "rewards/rejected": -8.435229301452637, "step": 410 }, { "epoch": 0.7950191570881227, "grad_norm": 18.60336417027728, "learning_rate": 2.9905013280738643e-07, "logits/chosen": 0.5330244898796082, "logits/rejected": 0.52126145362854, "logps/chosen": -0.9946764707565308, "logps/rejected": -1.0159534215927124, "loss": 5.3385, "nll_loss": 0.9946764707565308, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.94676399230957, "rewards/margins": 0.21276941895484924, "rewards/rejected": -10.159533500671387, "step": 415 }, { "epoch": 0.8045977011494253, "grad_norm": 17.890613875131685, "learning_rate": 2.9895387084439007e-07, "logits/chosen": 0.9610258340835571, "logits/rejected": 0.7646621465682983, "logps/chosen": -0.7954119443893433, "logps/rejected": -0.8386955261230469, "loss": 5.1982, "nll_loss": 0.7954119443893433, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.954119682312012, "rewards/margins": 0.43283557891845703, "rewards/rejected": -8.386955261230469, "step": 420 }, { "epoch": 0.814176245210728, "grad_norm": 16.55166479015083, "learning_rate": 2.9885298072052896e-07, "logits/chosen": 0.6969675421714783, "logits/rejected": 0.8127206563949585, "logps/chosen": -0.7167531251907349, "logps/rejected": -0.8444196581840515, "loss": 5.1488, "nll_loss": 0.7167531847953796, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.167531490325928, "rewards/margins": 1.2766650915145874, "rewards/rejected": -8.444195747375488, "step": 425 }, { "epoch": 0.8237547892720306, "grad_norm": 17.66990705043301, "learning_rate": 2.987474655705706e-07, "logits/chosen": 0.6976544260978699, "logits/rejected": 0.45688027143478394, "logps/chosen": -0.984510600566864, "logps/rejected": -0.9671609997749329, "loss": 5.1164, "nll_loss": 0.9845105409622192, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.84510612487793, "rewards/margins": -0.17349663376808167, "rewards/rejected": -9.671609878540039, "step": 430 }, { "epoch": 0.8333333333333334, "grad_norm": 17.864698049191677, "learning_rate": 2.9863732867298676e-07, "logits/chosen": 0.5837534070014954, "logits/rejected": 0.4828642010688782, "logps/chosen": -0.7947196960449219, "logps/rejected": -0.8388618230819702, "loss": 5.1483, "nll_loss": 0.7947196960449219, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -7.947196960449219, "rewards/margins": 0.44142085313796997, "rewards/rejected": -8.388618469238281, "step": 435 }, { "epoch": 0.842911877394636, "grad_norm": 19.80331125773151, "learning_rate": 2.985225734498523e-07, "logits/chosen": 0.6473149061203003, "logits/rejected": 0.6882539987564087, "logps/chosen": -0.8926659822463989, "logps/rejected": -1.0417662858963013, "loss": 5.0516, "nll_loss": 0.8926659822463989, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.926658630371094, "rewards/margins": 1.4910037517547607, "rewards/rejected": -10.417662620544434, "step": 440 }, { "epoch": 0.8524904214559387, "grad_norm": 23.020768804171617, "learning_rate": 2.984032034667383e-07, "logits/chosen": 0.723087728023529, "logits/rejected": 0.7870732545852661, "logps/chosen": -0.9583943486213684, "logps/rejected": -0.9407602548599243, "loss": 5.2786, "nll_loss": 0.9583943486213684, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.583943367004395, "rewards/margins": -0.17634105682373047, "rewards/rejected": -9.40760326385498, "step": 445 }, { "epoch": 0.8620689655172413, "grad_norm": 19.047172710756705, "learning_rate": 2.982792224326018e-07, "logits/chosen": 0.6330351829528809, "logits/rejected": 0.6864033341407776, "logps/chosen": -0.7574523687362671, "logps/rejected": -0.8687712550163269, "loss": 5.1046, "nll_loss": 0.7574523091316223, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.574522972106934, "rewards/margins": 1.113189458847046, "rewards/rejected": -8.687711715698242, "step": 450 }, { "epoch": 0.8716475095785441, "grad_norm": 16.669062938958124, "learning_rate": 2.9815063419966994e-07, "logits/chosen": 0.5700467824935913, "logits/rejected": 0.4501461088657379, "logps/chosen": -0.8433354496955872, "logps/rejected": -1.0070374011993408, "loss": 5.224, "nll_loss": 0.8433355093002319, "rewards/accuracies": 1.0, "rewards/chosen": -8.433355331420898, "rewards/margins": 1.637019157409668, "rewards/rejected": -10.070374488830566, "step": 455 }, { "epoch": 0.8812260536398467, "grad_norm": 18.696544325644073, "learning_rate": 2.9801744276332095e-07, "logits/chosen": 0.7160075306892395, "logits/rejected": 0.7029515504837036, "logps/chosen": -0.929043173789978, "logps/rejected": -0.8915184140205383, "loss": 5.2651, "nll_loss": 0.9290431141853333, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.29043197631836, "rewards/margins": -0.37524765729904175, "rewards/rejected": -8.915184020996094, "step": 460 }, { "epoch": 0.8908045977011494, "grad_norm": 17.828488483344373, "learning_rate": 2.978796522619593e-07, "logits/chosen": 0.48793935775756836, "logits/rejected": 0.5863816142082214, "logps/chosen": -0.8918827772140503, "logps/rejected": -0.9165046811103821, "loss": 5.1141, "nll_loss": 0.8918827772140503, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.918828010559082, "rewards/margins": 0.24621906876564026, "rewards/rejected": -9.165046691894531, "step": 465 }, { "epoch": 0.9003831417624522, "grad_norm": 17.225972538621157, "learning_rate": 2.9773726697688786e-07, "logits/chosen": 0.8548671007156372, "logits/rejected": 0.6914275884628296, "logps/chosen": -0.7240065932273865, "logps/rejected": -0.7344146370887756, "loss": 5.2492, "nll_loss": 0.7240065336227417, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -7.2400665283203125, "rewards/margins": 0.10407984256744385, "rewards/rejected": -7.34414529800415, "step": 470 }, { "epoch": 0.9099616858237548, "grad_norm": 19.7970366847853, "learning_rate": 2.975902913321742e-07, "logits/chosen": 0.6560118794441223, "logits/rejected": 0.9177835583686829, "logps/chosen": -0.7790594100952148, "logps/rejected": -0.7989363670349121, "loss": 5.3694, "nll_loss": 0.7790594100952148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -7.79059362411499, "rewards/margins": 0.19877009093761444, "rewards/rejected": -7.989363670349121, "step": 475 }, { "epoch": 0.9195402298850575, "grad_norm": 25.366911327657693, "learning_rate": 2.974387298945135e-07, "logits/chosen": 0.4641974866390228, "logits/rejected": 0.5562915802001953, "logps/chosen": -1.0258574485778809, "logps/rejected": -0.9660851359367371, "loss": 5.2603, "nll_loss": 1.0258575677871704, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -10.258573532104492, "rewards/margins": -0.5977233052253723, "rewards/rejected": -9.66085147857666, "step": 480 }, { "epoch": 0.9291187739463601, "grad_norm": 28.44742966645083, "learning_rate": 2.9728258737308666e-07, "logits/chosen": 0.7019563913345337, "logits/rejected": 0.9221956133842468, "logps/chosen": -0.7030410766601562, "logps/rejected": -0.7920758724212646, "loss": 5.1616, "nll_loss": 0.7030410766601562, "rewards/accuracies": 1.0, "rewards/chosen": -7.0304107666015625, "rewards/margins": 0.8903471231460571, "rewards/rejected": -7.920758247375488, "step": 485 }, { "epoch": 0.9386973180076629, "grad_norm": 20.06782347986855, "learning_rate": 2.9712186861941376e-07, "logits/chosen": 0.5050719976425171, "logits/rejected": 0.5799335241317749, "logps/chosen": -0.9137505292892456, "logps/rejected": -0.8959131240844727, "loss": 5.0958, "nll_loss": 0.9137505292892456, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -9.137506484985352, "rewards/margins": -0.17837515473365784, "rewards/rejected": -8.959131240844727, "step": 490 }, { "epoch": 0.9482758620689655, "grad_norm": 25.038776687357906, "learning_rate": 2.9695657862720366e-07, "logits/chosen": 0.6863113641738892, "logits/rejected": 0.617378294467926, "logps/chosen": -0.8844548463821411, "logps/rejected": -0.8485180139541626, "loss": 5.2545, "nll_loss": 0.8844548463821411, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.844549179077148, "rewards/margins": -0.3593680262565613, "rewards/rejected": -8.485180854797363, "step": 495 }, { "epoch": 0.9578544061302682, "grad_norm": 17.692781929545614, "learning_rate": 2.967867225321984e-07, "logits/chosen": 0.37053728103637695, "logits/rejected": 0.6822131276130676, "logps/chosen": -0.8528249859809875, "logps/rejected": -0.8191383481025696, "loss": 5.1402, "nll_loss": 0.8528249859809875, "rewards/accuracies": 0.4000000059604645, "rewards/chosen": -8.528249740600586, "rewards/margins": -0.33686715364456177, "rewards/rejected": -8.191383361816406, "step": 500 }, { "epoch": 0.9674329501915708, "grad_norm": 21.29237339971544, "learning_rate": 2.96612305612014e-07, "logits/chosen": 0.45431119203567505, "logits/rejected": 0.6691686511039734, "logps/chosen": -0.8379117250442505, "logps/rejected": -0.7931283116340637, "loss": 5.1782, "nll_loss": 0.8379117846488953, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -8.379117965698242, "rewards/margins": -0.44783440232276917, "rewards/rejected": -7.931282997131348, "step": 505 }, { "epoch": 0.9770114942528736, "grad_norm": 18.325302400422004, "learning_rate": 2.9643333328597636e-07, "logits/chosen": 0.4990893006324768, "logits/rejected": 0.44439974427223206, "logps/chosen": -0.8343612551689148, "logps/rejected": -0.861991286277771, "loss": 5.3056, "nll_loss": 0.8343612551689148, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -8.343611717224121, "rewards/margins": 0.27630099654197693, "rewards/rejected": -8.619913101196289, "step": 510 }, { "epoch": 0.9865900383141762, "grad_norm": 19.49284906216681, "learning_rate": 2.9624981111495277e-07, "logits/chosen": 0.6197850704193115, "logits/rejected": 0.7587565183639526, "logps/chosen": -0.7680959105491638, "logps/rejected": -0.9193935394287109, "loss": 5.0512, "nll_loss": 0.768095850944519, "rewards/accuracies": 1.0, "rewards/chosen": -7.6809587478637695, "rewards/margins": 1.5129766464233398, "rewards/rejected": -9.19393539428711, "step": 515 }, { "epoch": 0.9961685823754789, "grad_norm": 17.910862409831864, "learning_rate": 2.960617448011793e-07, "logits/chosen": 0.4001706540584564, "logits/rejected": 0.5382306575775146, "logps/chosen": -0.9400454759597778, "logps/rejected": -0.9204422235488892, "loss": 5.1904, "nll_loss": 0.9400454759597778, "rewards/accuracies": 0.20000000298023224, "rewards/chosen": -9.400455474853516, "rewards/margins": -0.19603300094604492, "rewards/rejected": -9.204421997070312, "step": 520 } ], "logging_steps": 5, "max_steps": 3132, "num_input_tokens_seen": 0, "num_train_epochs": 6, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }