{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 8.432772549922241, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -2.5992650985717773, "logits/rejected": -2.567516326904297, "logps/chosen": -272.1844482421875, "logps/rejected": -362.26898193359375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "grad_norm": 9.993362324491976, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.4130637645721436, "logits/rejected": -2.3763909339904785, "logps/chosen": -268.1092834472656, "logps/rejected": -252.8538360595703, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": 0.00011327523679938167, "rewards/margins": 0.00018297109636478126, "rewards/rejected": -6.969591049710289e-05, "step": 10 }, { "epoch": 0.04, "grad_norm": 8.212783868686264, "learning_rate": 2.0833333333333333e-07, "logits/chosen": -2.472649097442627, "logits/rejected": -2.4103596210479736, "logps/chosen": -283.3070068359375, "logps/rejected": -297.09979248046875, "loss": 0.6928, "rewards/accuracies": 0.53125, "rewards/chosen": 0.001224780222401023, "rewards/margins": 0.0005858406075276434, "rewards/rejected": 0.0006389396148733795, "step": 20 }, { "epoch": 0.06, "grad_norm": 7.684799704050697, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.508185386657715, "logits/rejected": -2.415645122528076, "logps/chosen": -301.5997619628906, "logps/rejected": -265.80426025390625, "loss": 0.6906, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0062199728563427925, "rewards/margins": 0.0035330094397068024, "rewards/rejected": 0.002686963649466634, "step": 30 }, { "epoch": 0.08, "grad_norm": 7.49113948544429, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.404700756072998, "logits/rejected": -2.350811243057251, "logps/chosen": -268.32647705078125, "logps/rejected": -247.6831817626953, "loss": 0.6849, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0182647742331028, "rewards/margins": 0.020485591143369675, "rewards/rejected": -0.0022208169102668762, "step": 40 }, { "epoch": 0.1, "grad_norm": 7.812080735900241, "learning_rate": 4.999733114418725e-07, "logits/chosen": -2.3245081901550293, "logits/rejected": -2.2887587547302246, "logps/chosen": -280.1948547363281, "logps/rejected": -293.09405517578125, "loss": 0.6753, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.022629689425230026, "rewards/margins": 0.031411103904247284, "rewards/rejected": -0.00878141075372696, "step": 50 }, { "epoch": 0.13, "grad_norm": 8.653367610484782, "learning_rate": 4.990398100856366e-07, "logits/chosen": -2.3789138793945312, "logits/rejected": -2.3128437995910645, "logps/chosen": -271.4405822753906, "logps/rejected": -303.5579528808594, "loss": 0.6602, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.029351558536291122, "rewards/margins": 0.06681646406650543, "rewards/rejected": -0.03746490180492401, "step": 60 }, { "epoch": 0.15, "grad_norm": 9.793859330498844, "learning_rate": 4.967775735898179e-07, "logits/chosen": -2.1905629634857178, "logits/rejected": -2.1994009017944336, "logps/chosen": -267.55340576171875, "logps/rejected": -273.4862976074219, "loss": 0.6361, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.0027915718965232372, "rewards/margins": 0.1346043348312378, "rewards/rejected": -0.13739590346813202, "step": 70 }, { "epoch": 0.17, "grad_norm": 13.837088401780129, "learning_rate": 4.931986719649298e-07, "logits/chosen": -2.3278985023498535, "logits/rejected": -2.243424892425537, "logps/chosen": -337.2379455566406, "logps/rejected": -292.844970703125, "loss": 0.6223, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.09643656760454178, "rewards/margins": 0.17360267043113708, "rewards/rejected": -0.27003923058509827, "step": 80 }, { "epoch": 0.19, "grad_norm": 17.06156274259609, "learning_rate": 4.883222001996351e-07, "logits/chosen": -2.1236023902893066, "logits/rejected": -2.0597236156463623, "logps/chosen": -276.4493103027344, "logps/rejected": -299.7818908691406, "loss": 0.5987, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.15940961241722107, "rewards/margins": 0.3067048490047455, "rewards/rejected": -0.46611452102661133, "step": 90 }, { "epoch": 0.21, "grad_norm": 15.017538794455808, "learning_rate": 4.821741763807186e-07, "logits/chosen": -2.0527923107147217, "logits/rejected": -1.9835008382797241, "logps/chosen": -294.0035400390625, "logps/rejected": -321.8211669921875, "loss": 0.5965, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -0.27848348021507263, "rewards/margins": 0.3403889834880829, "rewards/rejected": -0.6188725233078003, "step": 100 }, { "epoch": 0.21, "eval_logits/chosen": -2.1194002628326416, "eval_logits/rejected": -2.0640361309051514, "eval_logps/chosen": -308.44342041015625, "eval_logps/rejected": -344.93780517578125, "eval_loss": 0.6008175015449524, "eval_rewards/accuracies": 0.71484375, "eval_rewards/chosen": -0.43487486243247986, "eval_rewards/margins": 0.3607807159423828, "eval_rewards/rejected": -0.7956556081771851, "eval_runtime": 39.9329, "eval_samples_per_second": 50.084, "eval_steps_per_second": 0.801, "step": 100 }, { "epoch": 0.23, "grad_norm": 13.512613063149377, "learning_rate": 4.747874028753375e-07, "logits/chosen": -2.1186444759368896, "logits/rejected": -1.960219383239746, "logps/chosen": -346.79327392578125, "logps/rejected": -330.68634033203125, "loss": 0.6016, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -0.373442143201828, "rewards/margins": 0.33256274461746216, "rewards/rejected": -0.7060048580169678, "step": 110 }, { "epoch": 0.25, "grad_norm": 16.771221195438027, "learning_rate": 4.662012913161997e-07, "logits/chosen": -1.878488302230835, "logits/rejected": -1.82696533203125, "logps/chosen": -322.34173583984375, "logps/rejected": -339.63104248046875, "loss": 0.5803, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.3693729043006897, "rewards/margins": 0.3852415680885315, "rewards/rejected": -0.754614531993866, "step": 120 }, { "epoch": 0.27, "grad_norm": 14.00034800920836, "learning_rate": 4.5646165232345103e-07, "logits/chosen": -1.8512026071548462, "logits/rejected": -1.7661195993423462, "logps/chosen": -322.70599365234375, "logps/rejected": -353.8482360839844, "loss": 0.5644, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.3978124260902405, "rewards/margins": 0.42583903670310974, "rewards/rejected": -0.8236514925956726, "step": 130 }, { "epoch": 0.29, "grad_norm": 15.3625036150752, "learning_rate": 4.456204510851956e-07, "logits/chosen": -1.7981727123260498, "logits/rejected": -1.7398284673690796, "logps/chosen": -359.68994140625, "logps/rejected": -386.440185546875, "loss": 0.5654, "rewards/accuracies": 0.75, "rewards/chosen": -0.40198105573654175, "rewards/margins": 0.43909168243408203, "rewards/rejected": -0.841072678565979, "step": 140 }, { "epoch": 0.31, "grad_norm": 20.80601584306436, "learning_rate": 4.337355301007335e-07, "logits/chosen": -1.7028295993804932, "logits/rejected": -1.5830708742141724, "logps/chosen": -345.12286376953125, "logps/rejected": -376.55859375, "loss": 0.5775, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.5939025282859802, "rewards/margins": 0.37951546907424927, "rewards/rejected": -0.9734179377555847, "step": 150 }, { "epoch": 0.33, "grad_norm": 18.137407218927724, "learning_rate": 4.2087030056579986e-07, "logits/chosen": -1.6537895202636719, "logits/rejected": -1.4418971538543701, "logps/chosen": -318.0480651855469, "logps/rejected": -349.61431884765625, "loss": 0.5693, "rewards/accuracies": 0.75, "rewards/chosen": -0.43722066283226013, "rewards/margins": 0.5587003827095032, "rewards/rejected": -0.9959210157394409, "step": 160 }, { "epoch": 0.36, "grad_norm": 16.981581647441832, "learning_rate": 4.070934040463998e-07, "logits/chosen": -1.5120352506637573, "logits/rejected": -1.4046074151992798, "logps/chosen": -306.04840087890625, "logps/rejected": -330.2176208496094, "loss": 0.5659, "rewards/accuracies": 0.71875, "rewards/chosen": -0.538715124130249, "rewards/margins": 0.42584919929504395, "rewards/rejected": -0.9645644426345825, "step": 170 }, { "epoch": 0.38, "grad_norm": 14.617848943306955, "learning_rate": 3.9247834624635404e-07, "logits/chosen": -1.3031604290008545, "logits/rejected": -1.1622366905212402, "logps/chosen": -317.8174743652344, "logps/rejected": -331.2264404296875, "loss": 0.5424, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6144936084747314, "rewards/margins": 0.4146398603916168, "rewards/rejected": -1.0291334390640259, "step": 180 }, { "epoch": 0.4, "grad_norm": 18.83954708764831, "learning_rate": 3.7710310482256523e-07, "logits/chosen": -1.3724639415740967, "logits/rejected": -1.2839093208312988, "logps/chosen": -320.2606506347656, "logps/rejected": -353.50677490234375, "loss": 0.5567, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -0.47087445855140686, "rewards/margins": 0.4510224461555481, "rewards/rejected": -0.9218968152999878, "step": 190 }, { "epoch": 0.42, "grad_norm": 27.897186532435434, "learning_rate": 3.610497133404795e-07, "logits/chosen": -1.14837646484375, "logits/rejected": -1.072177767753601, "logps/chosen": -318.0636291503906, "logps/rejected": -362.46044921875, "loss": 0.5688, "rewards/accuracies": 0.75, "rewards/chosen": -0.5620325207710266, "rewards/margins": 0.5343230962753296, "rewards/rejected": -1.096355676651001, "step": 200 }, { "epoch": 0.42, "eval_logits/chosen": -1.2653636932373047, "eval_logits/rejected": -1.1455148458480835, "eval_logps/chosen": -328.60369873046875, "eval_logps/rejected": -382.0739440917969, "eval_loss": 0.558937132358551, "eval_rewards/accuracies": 0.73828125, "eval_rewards/chosen": -0.6364771723747253, "eval_rewards/margins": 0.5305400490760803, "eval_rewards/rejected": -1.1670172214508057, "eval_runtime": 39.8802, "eval_samples_per_second": 50.15, "eval_steps_per_second": 0.802, "step": 200 }, { "epoch": 0.44, "grad_norm": 15.625329725139888, "learning_rate": 3.4440382358952115e-07, "logits/chosen": -1.0910618305206299, "logits/rejected": -0.9091793298721313, "logps/chosen": -360.6944885253906, "logps/rejected": -366.00146484375, "loss": 0.5724, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6661044359207153, "rewards/margins": 0.46460071206092834, "rewards/rejected": -1.1307051181793213, "step": 210 }, { "epoch": 0.46, "grad_norm": 19.24668760178638, "learning_rate": 3.272542485937368e-07, "logits/chosen": -0.9850679636001587, "logits/rejected": -0.7914190292358398, "logps/chosen": -309.50775146484375, "logps/rejected": -338.098876953125, "loss": 0.5582, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.44118037819862366, "rewards/margins": 0.524976372718811, "rewards/rejected": -0.9661566019058228, "step": 220 }, { "epoch": 0.48, "grad_norm": 19.97223623454459, "learning_rate": 3.096924887558854e-07, "logits/chosen": -0.4917120039463043, "logits/rejected": -0.298466295003891, "logps/chosen": -313.9906005859375, "logps/rejected": -375.44989013671875, "loss": 0.5541, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.6258713603019714, "rewards/margins": 0.6441494226455688, "rewards/rejected": -1.2700207233428955, "step": 230 }, { "epoch": 0.5, "grad_norm": 21.91087703960587, "learning_rate": 2.9181224366319943e-07, "logits/chosen": 0.09583790600299835, "logits/rejected": 0.32567495107650757, "logps/chosen": -339.2015380859375, "logps/rejected": -384.8148498535156, "loss": 0.521, "rewards/accuracies": 0.78125, "rewards/chosen": -0.7344726324081421, "rewards/margins": 0.6358748078346252, "rewards/rejected": -1.370347499847412, "step": 240 }, { "epoch": 0.52, "grad_norm": 19.954418058737403, "learning_rate": 2.7370891215954565e-07, "logits/chosen": 0.5116527676582336, "logits/rejected": 0.8739731907844543, "logps/chosen": -363.95684814453125, "logps/rejected": -394.17877197265625, "loss": 0.5327, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -0.6146451830863953, "rewards/margins": 0.7260497808456421, "rewards/rejected": -1.3406950235366821, "step": 250 }, { "epoch": 0.54, "grad_norm": 29.50918535565258, "learning_rate": 2.55479083351317e-07, "logits/chosen": 0.7538167834281921, "logits/rejected": 1.1193482875823975, "logps/chosen": -365.6874694824219, "logps/rejected": -389.31396484375, "loss": 0.5412, "rewards/accuracies": 0.71875, "rewards/chosen": -0.6788283586502075, "rewards/margins": 0.6221181154251099, "rewards/rejected": -1.3009464740753174, "step": 260 }, { "epoch": 0.56, "grad_norm": 24.53746609446516, "learning_rate": 2.3722002126275822e-07, "logits/chosen": 1.132846474647522, "logits/rejected": 1.5623472929000854, "logps/chosen": -345.86700439453125, "logps/rejected": -378.31719970703125, "loss": 0.5414, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.6558988690376282, "rewards/margins": 0.5384365320205688, "rewards/rejected": -1.1943353414535522, "step": 270 }, { "epoch": 0.59, "grad_norm": 27.660029561250692, "learning_rate": 2.19029145890313e-07, "logits/chosen": 1.715608835220337, "logits/rejected": 2.1731343269348145, "logps/chosen": -352.11346435546875, "logps/rejected": -406.60772705078125, "loss": 0.5375, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.8623917698860168, "rewards/margins": 0.7531214952468872, "rewards/rejected": -1.6155132055282593, "step": 280 }, { "epoch": 0.61, "grad_norm": 25.659061335686694, "learning_rate": 2.0100351342479216e-07, "logits/chosen": 1.6510066986083984, "logits/rejected": 1.7990186214447021, "logps/chosen": -337.61016845703125, "logps/rejected": -396.5470886230469, "loss": 0.5336, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8643373250961304, "rewards/margins": 0.6685428023338318, "rewards/rejected": -1.5328800678253174, "step": 290 }, { "epoch": 0.63, "grad_norm": 22.694419610449454, "learning_rate": 1.8323929841460178e-07, "logits/chosen": 1.5950249433517456, "logits/rejected": 2.302058696746826, "logps/chosen": -376.0797424316406, "logps/rejected": -401.6100769042969, "loss": 0.5121, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7354680299758911, "rewards/margins": 0.6922025680541992, "rewards/rejected": -1.4276707172393799, "step": 300 }, { "epoch": 0.63, "eval_logits/chosen": 1.7225008010864258, "eval_logits/rejected": 2.1388766765594482, "eval_logps/chosen": -334.2620849609375, "eval_logps/rejected": -418.3771667480469, "eval_loss": 0.5288156270980835, "eval_rewards/accuracies": 0.76171875, "eval_rewards/chosen": -0.6930612325668335, "eval_rewards/margins": 0.8369885683059692, "eval_rewards/rejected": -1.5300499200820923, "eval_runtime": 39.9288, "eval_samples_per_second": 50.089, "eval_steps_per_second": 0.801, "step": 300 }, { "epoch": 0.65, "grad_norm": 24.37280438119094, "learning_rate": 1.6583128063291573e-07, "logits/chosen": 2.1118528842926025, "logits/rejected": 2.5268707275390625, "logps/chosen": -376.37969970703125, "logps/rejected": -417.34869384765625, "loss": 0.507, "rewards/accuracies": 0.75, "rewards/chosen": -0.778560996055603, "rewards/margins": 0.7997097969055176, "rewards/rejected": -1.5782709121704102, "step": 310 }, { "epoch": 0.67, "grad_norm": 26.080136074985454, "learning_rate": 1.488723393865766e-07, "logits/chosen": 2.5625953674316406, "logits/rejected": 3.1481173038482666, "logps/chosen": -383.0509338378906, "logps/rejected": -411.533935546875, "loss": 0.5013, "rewards/accuracies": 0.75, "rewards/chosen": -0.8686873316764832, "rewards/margins": 0.7724698781967163, "rewards/rejected": -1.6411571502685547, "step": 320 }, { "epoch": 0.69, "grad_norm": 27.228237079063305, "learning_rate": 1.3245295796480788e-07, "logits/chosen": 2.7803778648376465, "logits/rejected": 3.247398853302002, "logps/chosen": -351.1916809082031, "logps/rejected": -420.370849609375, "loss": 0.5142, "rewards/accuracies": 0.71875, "rewards/chosen": -0.8757610321044922, "rewards/margins": 0.7058261632919312, "rewards/rejected": -1.5815874338150024, "step": 330 }, { "epoch": 0.71, "grad_norm": 28.892127434127993, "learning_rate": 1.1666074087171627e-07, "logits/chosen": 2.754971504211426, "logits/rejected": 3.230527400970459, "logps/chosen": -377.8298645019531, "logps/rejected": -465.24761962890625, "loss": 0.5138, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.8463465571403503, "rewards/margins": 0.987470269203186, "rewards/rejected": -1.8338168859481812, "step": 340 }, { "epoch": 0.73, "grad_norm": 26.281931375691812, "learning_rate": 1.0157994641835734e-07, "logits/chosen": 2.723754405975342, "logits/rejected": 3.361722230911255, "logps/chosen": -351.22900390625, "logps/rejected": -415.8351135253906, "loss": 0.4828, "rewards/accuracies": 0.75, "rewards/chosen": -0.8969828486442566, "rewards/margins": 0.857469916343689, "rewards/rejected": -1.7544529438018799, "step": 350 }, { "epoch": 0.75, "grad_norm": 24.286834238524502, "learning_rate": 8.729103716819111e-08, "logits/chosen": 2.8787496089935303, "logits/rejected": 3.6532554626464844, "logps/chosen": -402.9510192871094, "logps/rejected": -443.6593322753906, "loss": 0.5325, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -0.9707611203193665, "rewards/margins": 0.8318966627120972, "rewards/rejected": -1.8026577234268188, "step": 360 }, { "epoch": 0.77, "grad_norm": 23.72120672745611, "learning_rate": 7.387025063449081e-08, "logits/chosen": 3.308849811553955, "logits/rejected": 3.967015504837036, "logps/chosen": -388.5707702636719, "logps/rejected": -417.2923889160156, "loss": 0.5145, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -1.0928517580032349, "rewards/margins": 0.6804816722869873, "rewards/rejected": -1.7733335494995117, "step": 370 }, { "epoch": 0.79, "grad_norm": 24.194836344161388, "learning_rate": 6.138919252022435e-08, "logits/chosen": 3.4659945964813232, "logits/rejected": 3.6677188873291016, "logps/chosen": -360.42303466796875, "logps/rejected": -468.11322021484375, "loss": 0.5064, "rewards/accuracies": 0.75, "rewards/chosen": -1.1740145683288574, "rewards/margins": 0.8953350186347961, "rewards/rejected": -2.069349765777588, "step": 380 }, { "epoch": 0.82, "grad_norm": 33.960328274537595, "learning_rate": 4.991445467064689e-08, "logits/chosen": 3.0402557849884033, "logits/rejected": 3.3952622413635254, "logps/chosen": -395.9051208496094, "logps/rejected": -456.98162841796875, "loss": 0.5003, "rewards/accuracies": 0.71875, "rewards/chosen": -0.9203785061836243, "rewards/margins": 0.77605140209198, "rewards/rejected": -1.6964296102523804, "step": 390 }, { "epoch": 0.84, "grad_norm": 24.96336693295718, "learning_rate": 3.9507259776993954e-08, "logits/chosen": 3.402864456176758, "logits/rejected": 3.9089291095733643, "logps/chosen": -373.3275146484375, "logps/rejected": -453.81109619140625, "loss": 0.5208, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0015218257904053, "rewards/margins": 0.86052405834198, "rewards/rejected": -1.8620456457138062, "step": 400 }, { "epoch": 0.84, "eval_logits/chosen": 2.9372177124023438, "eval_logits/rejected": 3.4323720932006836, "eval_logps/chosen": -352.0043029785156, "eval_logps/rejected": -445.87408447265625, "eval_loss": 0.5152841210365295, "eval_rewards/accuracies": 0.7578125, "eval_rewards/chosen": -0.8704833984375, "eval_rewards/margins": 0.9345353841781616, "eval_rewards/rejected": -1.8050185441970825, "eval_runtime": 39.8891, "eval_samples_per_second": 50.139, "eval_steps_per_second": 0.802, "step": 400 }, { "epoch": 0.86, "grad_norm": 26.45544298088616, "learning_rate": 3.022313472693447e-08, "logits/chosen": 3.266558885574341, "logits/rejected": 4.045865535736084, "logps/chosen": -393.68505859375, "logps/rejected": -454.821044921875, "loss": 0.5226, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -0.9244238138198853, "rewards/margins": 0.9223299026489258, "rewards/rejected": -1.8467538356781006, "step": 410 }, { "epoch": 0.88, "grad_norm": 24.753221828065943, "learning_rate": 2.2111614344599684e-08, "logits/chosen": 3.017789363861084, "logits/rejected": 3.619795322418213, "logps/chosen": -396.9893798828125, "logps/rejected": -455.4769592285156, "loss": 0.5062, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.9954174160957336, "rewards/margins": 0.8088730573654175, "rewards/rejected": -1.804290533065796, "step": 420 }, { "epoch": 0.9, "grad_norm": 28.64072501651785, "learning_rate": 1.521597710086439e-08, "logits/chosen": 3.4113173484802246, "logits/rejected": 3.839292526245117, "logps/chosen": -386.08624267578125, "logps/rejected": -451.10894775390625, "loss": 0.4865, "rewards/accuracies": 0.78125, "rewards/chosen": -1.0454809665679932, "rewards/margins": 0.8698482513427734, "rewards/rejected": -1.9153292179107666, "step": 430 }, { "epoch": 0.92, "grad_norm": 28.408098716357102, "learning_rate": 9.57301420397924e-09, "logits/chosen": 2.9448680877685547, "logits/rejected": 3.614654541015625, "logps/chosen": -380.95782470703125, "logps/rejected": -454.02191162109375, "loss": 0.5045, "rewards/accuracies": 0.78125, "rewards/chosen": -0.9367244839668274, "rewards/margins": 0.8637276887893677, "rewards/rejected": -1.8004519939422607, "step": 440 }, { "epoch": 0.94, "grad_norm": 27.41806430030018, "learning_rate": 5.212833302556258e-09, "logits/chosen": 3.149013042449951, "logits/rejected": 3.4816536903381348, "logps/chosen": -401.301513671875, "logps/rejected": -495.34002685546875, "loss": 0.5059, "rewards/accuracies": 0.6875, "rewards/chosen": -1.096161127090454, "rewards/margins": 0.7614067792892456, "rewards/rejected": -1.8575680255889893, "step": 450 }, { "epoch": 0.96, "grad_norm": 30.97252598966743, "learning_rate": 2.158697848236607e-09, "logits/chosen": 3.359788417816162, "logits/rejected": 3.880640745162964, "logps/chosen": -376.32879638671875, "logps/rejected": -425.67010498046875, "loss": 0.5099, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -1.0073617696762085, "rewards/margins": 0.8302923440933228, "rewards/rejected": -1.8376541137695312, "step": 460 }, { "epoch": 0.98, "grad_norm": 23.032101494576157, "learning_rate": 4.269029751107489e-10, "logits/chosen": 3.195591688156128, "logits/rejected": 3.7321903705596924, "logps/chosen": -378.28631591796875, "logps/rejected": -458.43096923828125, "loss": 0.4995, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -1.0239256620407104, "rewards/margins": 0.7799959778785706, "rewards/rejected": -1.8039219379425049, "step": 470 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.5581105443723032, "train_runtime": 5172.5891, "train_samples_per_second": 11.819, "train_steps_per_second": 0.092 } ], "logging_steps": 10, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }