{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9974099364257123, "eval_steps": 50, "global_step": 353, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "chosen_logps": -92.0396499633789, "chosen_rewards": 0.0, "epoch": 0.0028255238992229807, "grad_norm": 12.203563280219937, "learning_rate": 2.7777777777777774e-08, "log_diff_policy": 2.422942638397217, "logits": -1.2872235774993896, "logp_accuracy": 0.625, "loss": 0.6931, "objective": 0.6931471824645996, "ranking_simple": 0.625, "rejected_logps": -94.46258544921875, "rejected_rewards": 0.0, "reward_accuracy": 0.0, "step": 1 }, { "chosen_logps": -94.69133758544922, "chosen_rewards": -0.00179989542812109, "epoch": 0.014127619496114905, "grad_norm": 13.769130010396626, "learning_rate": 1.3888888888888888e-07, "log_diff_policy": -0.9685585498809814, "logits": -1.223134994506836, "logp_accuracy": 0.4739583432674408, "loss": 0.6927, "objective": 0.6933824419975281, "ranking_simple": 0.4739583432674408, "rejected_logps": -93.7227783203125, "rejected_rewards": -0.0013557692291215062, "reward_accuracy": 0.3697916567325592, "step": 5 }, { "chosen_logps": -95.4830551147461, "chosen_rewards": -0.02674178034067154, "epoch": 0.02825523899222981, "grad_norm": 13.285605890484408, "learning_rate": 2.7777777777777776e-07, "log_diff_policy": -1.0804779529571533, "logits": -1.239374041557312, "logp_accuracy": 0.4833333194255829, "loss": 0.691, "objective": 0.6885988712310791, "ranking_simple": 0.4833333194255829, "rejected_logps": -94.40258026123047, "rejected_rewards": -0.03924858197569847, "reward_accuracy": 0.5416666865348816, "step": 10 }, { "chosen_logps": -95.61571502685547, "chosen_rewards": -0.051721397787332535, "epoch": 0.042382858488344714, "grad_norm": 12.319792066922028, "learning_rate": 4.1666666666666667e-07, "log_diff_policy": -0.5638642907142639, "logits": -1.2099182605743408, "logp_accuracy": 0.5, "loss": 0.6872, "objective": 0.6906775236129761, "ranking_simple": 0.5, "rejected_logps": -95.0518569946289, "rejected_rewards": -0.06499442458152771, "reward_accuracy": 0.5375000238418579, "step": 15 }, { "chosen_logps": -96.26466369628906, "chosen_rewards": 0.06014590337872505, "epoch": 0.05651047798445962, "grad_norm": 12.196449773003847, "learning_rate": 5.555555555555555e-07, "log_diff_policy": 0.15712101757526398, "logits": -1.3111374378204346, "logp_accuracy": 0.5166666507720947, "loss": 0.6904, "objective": 0.6839234828948975, "ranking_simple": 0.5166666507720947, "rejected_logps": -96.42178344726562, "rejected_rewards": 0.029283961281180382, "reward_accuracy": 0.5791666507720947, "step": 20 }, { "chosen_logps": -89.93444061279297, "chosen_rewards": 0.19082187116146088, "epoch": 0.07063809748057452, "grad_norm": 11.658480489992849, "learning_rate": 6.944444444444444e-07, "log_diff_policy": 1.8696421384811401, "logits": -1.275890588760376, "logp_accuracy": 0.5333333611488342, "loss": 0.6882, "objective": 0.6797720789909363, "ranking_simple": 0.5333333611488342, "rejected_logps": -91.80408477783203, "rejected_rewards": 0.14768332242965698, "reward_accuracy": 0.5916666388511658, "step": 25 }, { "chosen_logps": -91.81427764892578, "chosen_rewards": 0.14018863439559937, "epoch": 0.08476571697668943, "grad_norm": 15.037591193931753, "learning_rate": 8.333333333333333e-07, "log_diff_policy": 2.39349102973938, "logits": -1.212536334991455, "logp_accuracy": 0.5458333492279053, "loss": 0.6845, "objective": 0.6916062831878662, "ranking_simple": 0.5458333492279053, "rejected_logps": -94.20777130126953, "rejected_rewards": 0.11354698240756989, "reward_accuracy": 0.5458333492279053, "step": 30 }, { "chosen_logps": -90.74636840820312, "chosen_rewards": 0.15737393498420715, "epoch": 0.09889333647280434, "grad_norm": 10.604933845962982, "learning_rate": 9.722222222222222e-07, "log_diff_policy": 1.6841083765029907, "logits": -1.174207329750061, "logp_accuracy": 0.512499988079071, "loss": 0.671, "objective": 0.6680408716201782, "ranking_simple": 0.512499988079071, "rejected_logps": -92.43048858642578, "rejected_rewards": 0.08380100876092911, "reward_accuracy": 0.612500011920929, "step": 35 }, { "chosen_logps": -95.68059539794922, "chosen_rewards": 0.0697811022400856, "epoch": 0.11302095596891924, "grad_norm": 11.456756804174194, "learning_rate": 9.996071883688332e-07, "log_diff_policy": -0.9100133180618286, "logits": -1.1049649715423584, "logp_accuracy": 0.48750001192092896, "loss": 0.6785, "objective": 0.6945610642433167, "ranking_simple": 0.48750001192092896, "rejected_logps": -94.77057647705078, "rejected_rewards": 0.040437690913677216, "reward_accuracy": 0.5166666507720947, "step": 40 }, { "chosen_logps": -93.884765625, "chosen_rewards": -0.09519442170858383, "epoch": 0.12714857546503414, "grad_norm": 12.792596919876136, "learning_rate": 9.980124488638773e-07, "log_diff_policy": 1.2741607427597046, "logits": -1.2097392082214355, "logp_accuracy": 0.512499988079071, "loss": 0.6877, "objective": 0.6832014918327332, "ranking_simple": 0.512499988079071, "rejected_logps": -95.158935546875, "rejected_rewards": -0.15457913279533386, "reward_accuracy": 0.5083333253860474, "step": 45 }, { "chosen_logps": -96.72931671142578, "chosen_rewards": -0.19579552114009857, "epoch": 0.14127619496114904, "grad_norm": 11.519646507473169, "learning_rate": 9.95195142656885e-07, "log_diff_policy": 0.6534870862960815, "logits": -1.207014799118042, "logp_accuracy": 0.5208333134651184, "loss": 0.6657, "objective": 0.6546425223350525, "ranking_simple": 0.5208333134651184, "rejected_logps": -97.38280487060547, "rejected_rewards": -0.30787384510040283, "reward_accuracy": 0.5791666507720947, "step": 50 }, { "epoch": 0.14127619496114904, "eval_chosen_logps": -96.49317932128906, "eval_chosen_rewards": -0.23376186192035675, "eval_log_diff_policy": 1.5223942995071411, "eval_logits": -1.105320930480957, "eval_logp_accuracy": 0.5193236470222473, "eval_loss": 0.6741412878036499, "eval_objective": 0.6712923049926758, "eval_ranking_simple": 0.5193236470222473, "eval_rejected_logps": -98.01555633544922, "eval_rejected_rewards": -0.3036104738712311, "eval_reward_accuracy": 0.5851449370384216, "eval_runtime": 591.2486, "eval_samples_per_second": 16.8, "eval_steps_per_second": 0.7, "step": 50 }, { "chosen_logps": -95.2642593383789, "chosen_rewards": -0.2244856208562851, "epoch": 0.15540381445726395, "grad_norm": 11.816802501387212, "learning_rate": 9.91162185929904e-07, "log_diff_policy": 2.4890987873077393, "logits": -1.2382166385650635, "logp_accuracy": 0.5666666626930237, "loss": 0.6625, "objective": 0.6524732708930969, "ranking_simple": 0.5666666626930237, "rejected_logps": -97.75334930419922, "rejected_rewards": -0.3347455561161041, "reward_accuracy": 0.6541666388511658, "step": 55 }, { "chosen_logps": -96.38868713378906, "chosen_rewards": -0.31856828927993774, "epoch": 0.16953143395337886, "grad_norm": 15.028016832871401, "learning_rate": 9.859234791555355e-07, "log_diff_policy": 2.37857985496521, "logits": -1.270559549331665, "logp_accuracy": 0.5041666626930237, "loss": 0.658, "objective": 0.6463251709938049, "ranking_simple": 0.5041666626930237, "rejected_logps": -98.76726531982422, "rejected_rewards": -0.4456353485584259, "reward_accuracy": 0.625, "step": 60 }, { "chosen_logps": -96.91040802001953, "chosen_rewards": -0.38393592834472656, "epoch": 0.18365905344949376, "grad_norm": 12.842079906907882, "learning_rate": 9.794918827923456e-07, "log_diff_policy": 2.754225730895996, "logits": -1.2090051174163818, "logp_accuracy": 0.5333333611488342, "loss": 0.6552, "objective": 0.659140944480896, "ranking_simple": 0.5333333611488342, "rejected_logps": -99.66463470458984, "rejected_rewards": -0.5064182877540588, "reward_accuracy": 0.5874999761581421, "step": 65 }, { "chosen_logps": -99.7697525024414, "chosen_rewards": -0.36345040798187256, "epoch": 0.19778667294560867, "grad_norm": 11.642459528097591, "learning_rate": 9.718831857138307e-07, "log_diff_policy": -0.3823479115962982, "logits": -1.2754501104354858, "logp_accuracy": 0.5166666507720947, "loss": 0.652, "objective": 0.6486606597900391, "ranking_simple": 0.5166666507720947, "rejected_logps": -99.38741302490234, "rejected_rewards": -0.5056981444358826, "reward_accuracy": 0.6541666388511658, "step": 70 }, { "chosen_logps": -98.25231170654297, "chosen_rewards": -0.44365358352661133, "epoch": 0.21191429244172358, "grad_norm": 13.778053208316393, "learning_rate": 9.631160664484398e-07, "log_diff_policy": 2.420900821685791, "logits": -1.3257941007614136, "logp_accuracy": 0.5416666865348816, "loss": 0.645, "objective": 0.6380378603935242, "ranking_simple": 0.5416666865348816, "rejected_logps": -100.67321014404297, "rejected_rewards": -0.6107330918312073, "reward_accuracy": 0.6416666507720947, "step": 75 }, { "chosen_logps": -102.11405181884766, "chosen_rewards": -0.5838820338249207, "epoch": 0.22604191193783849, "grad_norm": 11.54119422499371, "learning_rate": 9.532120473258074e-07, "log_diff_policy": 0.3980850577354431, "logits": -1.410662055015564, "logp_accuracy": 0.5291666388511658, "loss": 0.6505, "objective": 0.6602904796600342, "ranking_simple": 0.5291666388511658, "rejected_logps": -102.51213836669922, "rejected_rewards": -0.716762125492096, "reward_accuracy": 0.612500011920929, "step": 80 }, { "chosen_logps": -99.7666015625, "chosen_rewards": -0.6009302735328674, "epoch": 0.2401695314339534, "grad_norm": 12.196843414681535, "learning_rate": 9.421954416417624e-07, "log_diff_policy": 3.4806392192840576, "logits": -1.403199315071106, "logp_accuracy": 0.550000011920929, "loss": 0.6356, "objective": 0.6589958071708679, "ranking_simple": 0.550000011920929, "rejected_logps": -103.24723815917969, "rejected_rewards": -0.7273377776145935, "reward_accuracy": 0.6041666865348816, "step": 85 }, { "chosen_logps": -99.78125, "chosen_rewards": -0.6102154850959778, "epoch": 0.25429715093006827, "grad_norm": 11.884683628001579, "learning_rate": 9.300932939718157e-07, "log_diff_policy": 4.280076026916504, "logits": -1.3959789276123047, "logp_accuracy": 0.5708333253860474, "loss": 0.6393, "objective": 0.6565433144569397, "ranking_simple": 0.5708333253860474, "rejected_logps": -104.06131744384766, "rejected_rewards": -0.732826292514801, "reward_accuracy": 0.637499988079071, "step": 90 }, { "chosen_logps": -100.35063171386719, "chosen_rewards": -0.5101503133773804, "epoch": 0.2684247704261832, "grad_norm": 11.580908746020523, "learning_rate": 9.169353137796533e-07, "log_diff_policy": 2.76478910446167, "logits": -1.453112006187439, "logp_accuracy": 0.5541666746139526, "loss": 0.6437, "objective": 0.6427834033966064, "ranking_simple": 0.5541666746139526, "rejected_logps": -103.11542510986328, "rejected_rewards": -0.6783679127693176, "reward_accuracy": 0.6499999761581421, "step": 95 }, { "chosen_logps": -98.85354614257812, "chosen_rewards": -0.5268033146858215, "epoch": 0.2825523899222981, "grad_norm": 13.403248828603669, "learning_rate": 9.027538024836141e-07, "log_diff_policy": 3.670335531234741, "logits": -1.3224259614944458, "logp_accuracy": 0.5791666507720947, "loss": 0.6364, "objective": 0.6203222274780273, "ranking_simple": 0.5791666507720947, "rejected_logps": -102.5239028930664, "rejected_rewards": -0.7540122270584106, "reward_accuracy": 0.6333333253860474, "step": 100 }, { "epoch": 0.2825523899222981, "eval_chosen_logps": -100.81198120117188, "eval_chosen_rewards": -0.6656423211097717, "eval_log_diff_policy": 2.193570375442505, "eval_logits": -1.2277085781097412, "eval_logp_accuracy": 0.5404589176177979, "eval_loss": 0.6705255508422852, "eval_objective": 0.6646167635917664, "eval_ranking_simple": 0.5404589176177979, "eval_rejected_logps": -103.00554656982422, "eval_rejected_rewards": -0.8026086091995239, "eval_reward_accuracy": 0.5972222089767456, "eval_runtime": 581.9012, "eval_samples_per_second": 17.07, "eval_steps_per_second": 0.711, "step": 100 }, { "chosen_logps": -102.4442367553711, "chosen_rewards": -0.6342382431030273, "epoch": 0.296680009418413, "grad_norm": 13.882342204535156, "learning_rate": 8.875835741602029e-07, "log_diff_policy": 1.7450906038284302, "logits": -1.2776763439178467, "logp_accuracy": 0.5375000238418579, "loss": 0.6375, "objective": 0.638464093208313, "ranking_simple": 0.5375000238418579, "rejected_logps": -104.18933868408203, "rejected_rewards": -0.8383311629295349, "reward_accuracy": 0.6291666626930237, "step": 105 }, { "chosen_logps": -98.33605194091797, "chosen_rewards": -0.4787895083427429, "epoch": 0.3108076289145279, "grad_norm": 11.129672527849843, "learning_rate": 8.714618700792975e-07, "log_diff_policy": 2.9387338161468506, "logits": -1.260119915008545, "logp_accuracy": 0.5333333611488342, "loss": 0.639, "objective": 0.6362030506134033, "ranking_simple": 0.5333333611488342, "rejected_logps": -101.2747802734375, "rejected_rewards": -0.6714141964912415, "reward_accuracy": 0.6458333134651184, "step": 110 }, { "chosen_logps": -98.392822265625, "chosen_rewards": -0.3023277521133423, "epoch": 0.3249352484106428, "grad_norm": 11.136874519373743, "learning_rate": 8.544282672808578e-07, "log_diff_policy": 1.8658004999160767, "logits": -1.2985230684280396, "logp_accuracy": 0.5416666865348816, "loss": 0.6397, "objective": 0.6552284359931946, "ranking_simple": 0.5416666865348816, "rejected_logps": -100.25862884521484, "rejected_rewards": -0.44404318928718567, "reward_accuracy": 0.637499988079071, "step": 115 }, { "chosen_logps": -97.15535736083984, "chosen_rewards": -0.30868878960609436, "epoch": 0.3390628679067577, "grad_norm": 10.158587216555272, "learning_rate": 8.365245814175743e-07, "log_diff_policy": 1.9471272230148315, "logits": -1.3143467903137207, "logp_accuracy": 0.5166666507720947, "loss": 0.6249, "objective": 0.6263554096221924, "ranking_simple": 0.5166666507720947, "rejected_logps": -99.10248565673828, "rejected_rewards": -0.5047985911369324, "reward_accuracy": 0.675000011920929, "step": 120 }, { "chosen_logps": -97.54443359375, "chosen_rewards": -0.41942229866981506, "epoch": 0.3531904874028726, "grad_norm": 11.091561717382723, "learning_rate": 8.17794764101962e-07, "log_diff_policy": 2.919205665588379, "logits": -1.3511691093444824, "logp_accuracy": 0.5625, "loss": 0.6224, "objective": 0.6153813004493713, "ranking_simple": 0.5625, "rejected_logps": -100.46363830566406, "rejected_rewards": -0.6644426584243774, "reward_accuracy": 0.6291666626930237, "step": 125 }, { "chosen_logps": -98.49415588378906, "chosen_rewards": -0.6381217241287231, "epoch": 0.36731810689898753, "grad_norm": 13.19781497022718, "learning_rate": 7.982847950099055e-07, "log_diff_policy": 2.8969905376434326, "logits": -1.2903234958648682, "logp_accuracy": 0.5625, "loss": 0.6365, "objective": 0.6469713449478149, "ranking_simple": 0.5625, "rejected_logps": -101.39115142822266, "rejected_rewards": -0.8170153498649597, "reward_accuracy": 0.6291666626930237, "step": 130 }, { "chosen_logps": -100.84342193603516, "chosen_rewards": -0.6817469000816345, "epoch": 0.38144572639510244, "grad_norm": 12.642228781148605, "learning_rate": 7.780425690055274e-07, "log_diff_policy": 2.6813039779663086, "logits": -1.3165868520736694, "logp_accuracy": 0.5166666507720947, "loss": 0.6271, "objective": 0.6345203518867493, "ranking_simple": 0.5166666507720947, "rejected_logps": -103.52472686767578, "rejected_rewards": -0.9179552793502808, "reward_accuracy": 0.5958333611488342, "step": 135 }, { "chosen_logps": -100.93543243408203, "chosen_rewards": -0.5448787808418274, "epoch": 0.39557334589121734, "grad_norm": 11.726885643648867, "learning_rate": 7.571177785644766e-07, "log_diff_policy": -0.2563031017780304, "logits": -1.3719837665557861, "logp_accuracy": 0.5208333134651184, "loss": 0.6264, "objective": 0.6360562443733215, "ranking_simple": 0.5208333134651184, "rejected_logps": -100.67912292480469, "rejected_rewards": -0.7725273370742798, "reward_accuracy": 0.6666666865348816, "step": 140 }, { "chosen_logps": -99.08710479736328, "chosen_rewards": -0.442624032497406, "epoch": 0.40970096538733225, "grad_norm": 12.322663872224792, "learning_rate": 7.35561791784275e-07, "log_diff_policy": 1.6613519191741943, "logits": -1.4259085655212402, "logp_accuracy": 0.5041666626930237, "loss": 0.6134, "objective": 0.6079509854316711, "ranking_simple": 0.5041666626930237, "rejected_logps": -100.74845123291016, "rejected_rewards": -0.7379883527755737, "reward_accuracy": 0.6708333492279053, "step": 145 }, { "chosen_logps": -97.87836456298828, "chosen_rewards": -0.4350048005580902, "epoch": 0.42382858488344716, "grad_norm": 11.246546091497954, "learning_rate": 7.134275262811934e-07, "log_diff_policy": 0.15529422461986542, "logits": -1.3733505010604858, "logp_accuracy": 0.5083333253860474, "loss": 0.6244, "objective": 0.6462458968162537, "ranking_simple": 0.5083333253860474, "rejected_logps": -98.03366088867188, "rejected_rewards": -0.6196256279945374, "reward_accuracy": 0.6333333253860474, "step": 150 }, { "epoch": 0.42382858488344716, "eval_chosen_logps": -98.61821746826172, "eval_chosen_rewards": -0.4462670087814331, "eval_log_diff_policy": 2.290201187133789, "eval_logits": -1.3178818225860596, "eval_logp_accuracy": 0.5368357300758362, "eval_loss": 0.6577388048171997, "eval_objective": 0.6551039814949036, "eval_ranking_simple": 0.5368357300758362, "eval_rejected_logps": -100.90841674804688, "eval_rejected_rewards": -0.5928963422775269, "eval_reward_accuracy": 0.6086956262588501, "eval_runtime": 578.9907, "eval_samples_per_second": 17.156, "eval_steps_per_second": 0.715, "step": 150 }, { "chosen_logps": -96.16273498535156, "chosen_rewards": -0.3375628888607025, "epoch": 0.43795620437956206, "grad_norm": 10.921897162913947, "learning_rate": 6.907693192832262e-07, "log_diff_policy": 3.625213146209717, "logits": -1.4034603834152222, "logp_accuracy": 0.5458333492279053, "loss": 0.6199, "objective": 0.6075600981712341, "ranking_simple": 0.5458333492279053, "rejected_logps": -99.7879409790039, "rejected_rewards": -0.5962837934494019, "reward_accuracy": 0.6916666626930237, "step": 155 }, { "chosen_logps": -95.9808349609375, "chosen_rewards": -0.23865890502929688, "epoch": 0.45208382387567697, "grad_norm": 10.607229520458072, "learning_rate": 6.676427942380741e-07, "log_diff_policy": 3.0014472007751465, "logits": -1.33468759059906, "logp_accuracy": 0.512499988079071, "loss": 0.6085, "objective": 0.6010857820510864, "ranking_simple": 0.512499988079071, "rejected_logps": -98.9822769165039, "rejected_rewards": -0.5223438739776611, "reward_accuracy": 0.6458333134651184, "step": 160 }, { "chosen_logps": -99.7357406616211, "chosen_rewards": -0.28111231327056885, "epoch": 0.4662114433717919, "grad_norm": 13.093121089444578, "learning_rate": 6.441047242635946e-07, "log_diff_policy": 2.3945486545562744, "logits": -1.4235426187515259, "logp_accuracy": 0.5208333134651184, "loss": 0.608, "objective": 0.6093403100967407, "ranking_simple": 0.5208333134651184, "rejected_logps": -102.13028717041016, "rejected_rewards": -0.570986807346344, "reward_accuracy": 0.6666666865348816, "step": 165 }, { "chosen_logps": -97.75370788574219, "chosen_rewards": -0.318727046251297, "epoch": 0.4803390628679068, "grad_norm": 11.412190377149692, "learning_rate": 6.20212892775939e-07, "log_diff_policy": 1.4041647911071777, "logits": -1.3291094303131104, "logp_accuracy": 0.5166666507720947, "loss": 0.6015, "objective": 0.6086380481719971, "ranking_simple": 0.5166666507720947, "rejected_logps": -99.15787506103516, "rejected_rewards": -0.5835815072059631, "reward_accuracy": 0.6791666746139526, "step": 170 }, { "chosen_logps": -99.83616638183594, "chosen_rewards": -0.4011620283126831, "epoch": 0.49446668236402164, "grad_norm": 12.49488940662025, "learning_rate": 5.960259516375133e-07, "log_diff_policy": 4.319859027862549, "logits": -1.42414128780365, "logp_accuracy": 0.5791666507720947, "loss": 0.6082, "objective": 0.5974529385566711, "ranking_simple": 0.5791666507720947, "rejected_logps": -104.1560287475586, "rejected_rewards": -0.6947523951530457, "reward_accuracy": 0.6916666626930237, "step": 175 }, { "chosen_logps": -99.31816864013672, "chosen_rewards": -0.44157731533050537, "epoch": 0.5085943018601365, "grad_norm": 14.016856666647364, "learning_rate": 5.716032771730007e-07, "log_diff_policy": 2.4290826320648193, "logits": -1.3892205953598022, "logp_accuracy": 0.5833333134651184, "loss": 0.6092, "objective": 0.59227454662323, "ranking_simple": 0.5833333134651184, "rejected_logps": -101.74726104736328, "rejected_rewards": -0.7656379342079163, "reward_accuracy": 0.7124999761581421, "step": 180 }, { "chosen_logps": -99.5169906616211, "chosen_rewards": -0.39382320642471313, "epoch": 0.5227219213562515, "grad_norm": 12.18994561625345, "learning_rate": 5.470048244069055e-07, "log_diff_policy": 3.986778974533081, "logits": -1.364132285118103, "logp_accuracy": 0.5625, "loss": 0.6046, "objective": 0.5779610276222229, "ranking_simple": 0.5625, "rejected_logps": -103.50377655029297, "rejected_rewards": -0.7597634792327881, "reward_accuracy": 0.7250000238418579, "step": 185 }, { "chosen_logps": -100.92544555664062, "chosen_rewards": -0.47165414690971375, "epoch": 0.5368495408523664, "grad_norm": 13.57415809299262, "learning_rate": 5.222909798804514e-07, "log_diff_policy": 1.5828216075897217, "logits": -1.4004924297332764, "logp_accuracy": 0.5583333373069763, "loss": 0.6109, "objective": 0.6085138916969299, "ranking_simple": 0.5583333373069763, "rejected_logps": -102.50827026367188, "rejected_rewards": -0.7680691480636597, "reward_accuracy": 0.6333333253860474, "step": 190 }, { "chosen_logps": -98.55032348632812, "chosen_rewards": -0.42258918285369873, "epoch": 0.5509771603484813, "grad_norm": 12.118293622606005, "learning_rate": 4.97522413409155e-07, "log_diff_policy": 2.5970427989959717, "logits": -1.3773627281188965, "logp_accuracy": 0.5458333492279053, "loss": 0.5975, "objective": 0.578415036201477, "ranking_simple": 0.5458333492279053, "rejected_logps": -101.1473617553711, "rejected_rewards": -0.7796388864517212, "reward_accuracy": 0.7291666865348816, "step": 195 }, { "chosen_logps": -99.49516296386719, "chosen_rewards": -0.38766908645629883, "epoch": 0.5651047798445962, "grad_norm": 11.832876951855871, "learning_rate": 4.7275992914498865e-07, "log_diff_policy": 0.977001428604126, "logits": -1.3331760168075562, "logp_accuracy": 0.5083333253860474, "loss": 0.5938, "objective": 0.6072806119918823, "ranking_simple": 0.5083333253860474, "rejected_logps": -100.47218322753906, "rejected_rewards": -0.6541637182235718, "reward_accuracy": 0.637499988079071, "step": 200 }, { "epoch": 0.5651047798445962, "eval_chosen_logps": -99.37232208251953, "eval_chosen_rewards": -0.5216771960258484, "eval_log_diff_policy": 2.475198745727539, "eval_logits": -1.2857621908187866, "eval_logp_accuracy": 0.5362318754196167, "eval_loss": 0.6590211391448975, "eval_objective": 0.6558353304862976, "eval_ranking_simple": 0.5362318754196167, "eval_rejected_logps": -101.84752655029297, "eval_rejected_rewards": -0.6868062615394592, "eval_reward_accuracy": 0.6159420013427734, "eval_runtime": 585.2119, "eval_samples_per_second": 16.973, "eval_steps_per_second": 0.707, "step": 200 }, { "chosen_logps": -96.24632263183594, "chosen_rewards": -0.3795066773891449, "epoch": 0.5792323993407111, "grad_norm": 11.462490179104472, "learning_rate": 4.4806431630876436e-07, "log_diff_policy": 4.63961124420166, "logits": -1.266701102256775, "logp_accuracy": 0.550000011920929, "loss": 0.6066, "objective": 0.6177704334259033, "ranking_simple": 0.550000011920929, "rejected_logps": -100.88591766357422, "rejected_rewards": -0.6309120655059814, "reward_accuracy": 0.637499988079071, "step": 205 }, { "chosen_logps": -100.29016876220703, "chosen_rewards": -0.4374944269657135, "epoch": 0.593360018836826, "grad_norm": 11.468388905148808, "learning_rate": 4.234961999591705e-07, "log_diff_policy": 5.617704391479492, "logits": -1.4379286766052246, "logp_accuracy": 0.6083333492279053, "loss": 0.6018, "objective": 0.60161954164505, "ranking_simple": 0.6083333492279053, "rejected_logps": -105.90787506103516, "rejected_rewards": -0.7697920203208923, "reward_accuracy": 0.6583333611488342, "step": 210 }, { "chosen_logps": -98.36739349365234, "chosen_rewards": -0.47102805972099304, "epoch": 0.607487638332941, "grad_norm": 11.544777522117554, "learning_rate": 3.9911589216480955e-07, "log_diff_policy": 1.489052176475525, "logits": -1.4814375638961792, "logp_accuracy": 0.5041666626930237, "loss": 0.6048, "objective": 0.6275675296783447, "ranking_simple": 0.5041666626930237, "rejected_logps": -99.8564453125, "rejected_rewards": -0.7114861011505127, "reward_accuracy": 0.675000011920929, "step": 215 }, { "chosen_logps": -98.07539367675781, "chosen_rewards": -0.40747135877609253, "epoch": 0.6216152578290558, "grad_norm": 11.7772812850058, "learning_rate": 3.7498324394459245e-07, "log_diff_policy": 3.3520638942718506, "logits": -1.4648287296295166, "logp_accuracy": 0.550000011920929, "loss": 0.5986, "objective": 0.5826771855354309, "ranking_simple": 0.550000011920929, "rejected_logps": -101.42745971679688, "rejected_rewards": -0.748247504234314, "reward_accuracy": 0.6875, "step": 220 }, { "chosen_logps": -99.81163787841797, "chosen_rewards": -0.42688456177711487, "epoch": 0.6357428773251708, "grad_norm": 11.823543045006824, "learning_rate": 3.511574983399599e-07, "log_diff_policy": 3.882382392883301, "logits": -1.356088399887085, "logp_accuracy": 0.5916666388511658, "loss": 0.5795, "objective": 0.5695621967315674, "ranking_simple": 0.5916666388511658, "rejected_logps": -103.69401550292969, "rejected_rewards": -0.8101120591163635, "reward_accuracy": 0.7041666507720947, "step": 225 }, { "chosen_logps": -98.75402069091797, "chosen_rewards": -0.5289642214775085, "epoch": 0.6498704968212856, "grad_norm": 12.384550673986663, "learning_rate": 3.276971449796223e-07, "log_diff_policy": 3.189291477203369, "logits": -1.4445135593414307, "logp_accuracy": 0.550000011920929, "loss": 0.6056, "objective": 0.6145560145378113, "ranking_simple": 0.550000011920929, "rejected_logps": -101.94329071044922, "rejected_rewards": -0.8118120431900024, "reward_accuracy": 0.6625000238418579, "step": 230 }, { "chosen_logps": -100.55665588378906, "chosen_rewards": -0.4955964684486389, "epoch": 0.6639981163174005, "grad_norm": 11.935195296010395, "learning_rate": 3.046597764938481e-07, "log_diff_policy": 5.328332424163818, "logits": -1.4324222803115845, "logp_accuracy": 0.6083333492279053, "loss": 0.5993, "objective": 0.5901253819465637, "ranking_simple": 0.6083333492279053, "rejected_logps": -105.88497924804688, "rejected_rewards": -0.821084201335907, "reward_accuracy": 0.6958333253860474, "step": 235 }, { "chosen_logps": -98.99813079833984, "chosen_rewards": -0.4493686258792877, "epoch": 0.6781257358135154, "grad_norm": 14.08286875676345, "learning_rate": 2.8210194713078404e-07, "log_diff_policy": 2.7355611324310303, "logits": -1.379309892654419, "logp_accuracy": 0.5708333253860474, "loss": 0.6072, "objective": 0.6106851100921631, "ranking_simple": 0.5708333253860474, "rejected_logps": -101.73368072509766, "rejected_rewards": -0.7426342368125916, "reward_accuracy": 0.6708333492279053, "step": 240 }, { "chosen_logps": -98.74683380126953, "chosen_rewards": -0.36279481649398804, "epoch": 0.6922533553096303, "grad_norm": 11.505971149373092, "learning_rate": 2.600790339218926e-07, "log_diff_policy": 4.0673909187316895, "logits": -1.4577367305755615, "logp_accuracy": 0.5625, "loss": 0.5975, "objective": 0.5751992464065552, "ranking_simple": 0.5625, "rejected_logps": -102.8142318725586, "rejected_rewards": -0.728181779384613, "reward_accuracy": 0.7250000238418579, "step": 245 }, { "chosen_logps": -97.71234893798828, "chosen_rewards": -0.34638115763664246, "epoch": 0.7063809748057452, "grad_norm": 11.594437077167564, "learning_rate": 2.3864510073732915e-07, "log_diff_policy": 2.69929575920105, "logits": -1.3644834756851196, "logp_accuracy": 0.5458333492279053, "loss": 0.5876, "objective": 0.5611613988876343, "ranking_simple": 0.5458333492279053, "rejected_logps": -100.41165161132812, "rejected_rewards": -0.7614852786064148, "reward_accuracy": 0.7083333134651184, "step": 250 }, { "epoch": 0.7063809748057452, "eval_chosen_logps": -99.82035827636719, "eval_chosen_rewards": -0.566480278968811, "eval_log_diff_policy": 2.69968581199646, "eval_logits": -1.3214609622955322, "eval_logp_accuracy": 0.5446860194206238, "eval_loss": 0.6542993187904358, "eval_objective": 0.6504490971565247, "eval_ranking_simple": 0.5446860194206238, "eval_rejected_logps": -102.5200424194336, "eval_rejected_rewards": -0.7540581226348877, "eval_reward_accuracy": 0.6171497702598572, "eval_runtime": 580.2016, "eval_samples_per_second": 17.12, "eval_steps_per_second": 0.714, "step": 250 }, { "chosen_logps": -98.32154846191406, "chosen_rewards": -0.4018653333187103, "epoch": 0.7205085943018601, "grad_norm": 12.212780646567717, "learning_rate": 2.1785276556498678e-07, "log_diff_policy": 4.165632724761963, "logits": -1.4318310022354126, "logp_accuracy": 0.574999988079071, "loss": 0.5952, "objective": 0.6162487268447876, "ranking_simple": 0.574999988079071, "rejected_logps": -102.4871826171875, "rejected_rewards": -0.6852520108222961, "reward_accuracy": 0.6499999761581421, "step": 255 }, { "chosen_logps": -98.31494140625, "chosen_rewards": -0.39665502309799194, "epoch": 0.7346362137979751, "grad_norm": 12.485927390030215, "learning_rate": 1.9775307133902806e-07, "log_diff_policy": 2.1429994106292725, "logits": -1.3612717390060425, "logp_accuracy": 0.5708333253860474, "loss": 0.605, "objective": 0.6075990796089172, "ranking_simple": 0.5708333253860474, "rejected_logps": -100.45793151855469, "rejected_rewards": -0.6959177851676941, "reward_accuracy": 0.6416666507720947, "step": 260 }, { "chosen_logps": -96.93352508544922, "chosen_rewards": -0.4520092010498047, "epoch": 0.7487638332940899, "grad_norm": 12.390709218829844, "learning_rate": 1.783953606350005e-07, "log_diff_policy": 2.5331625938415527, "logits": -1.3640118837356567, "logp_accuracy": 0.5416666865348816, "loss": 0.5989, "objective": 0.6267301440238953, "ranking_simple": 0.5416666865348816, "rejected_logps": -99.46669006347656, "rejected_rewards": -0.7122800350189209, "reward_accuracy": 0.6416666507720947, "step": 265 }, { "chosen_logps": -99.09416961669922, "chosen_rewards": -0.469031423330307, "epoch": 0.7628914527902049, "grad_norm": 13.216199579795669, "learning_rate": 1.5982715453915079e-07, "log_diff_policy": 3.8577356338500977, "logits": -1.4884754419326782, "logp_accuracy": 0.5874999761581421, "loss": 0.5841, "objective": 0.6081915497779846, "ranking_simple": 0.5874999761581421, "rejected_logps": -102.95191192626953, "rejected_rewards": -0.7843472957611084, "reward_accuracy": 0.6791666746139526, "step": 270 }, { "chosen_logps": -100.04994201660156, "chosen_rewards": -0.4179188311100006, "epoch": 0.7770190722863197, "grad_norm": 11.947579077894389, "learning_rate": 1.4209403598929708e-07, "log_diff_policy": 3.1525373458862305, "logits": -1.424578309059143, "logp_accuracy": 0.5291666388511658, "loss": 0.5992, "objective": 0.6024218797683716, "ranking_simple": 0.5291666388511658, "rejected_logps": -103.2024917602539, "rejected_rewards": -0.7272025942802429, "reward_accuracy": 0.6708333492279053, "step": 275 }, { "chosen_logps": -100.48685455322266, "chosen_rewards": -0.4961775243282318, "epoch": 0.7911466917824347, "grad_norm": 12.469165432741715, "learning_rate": 1.2523953787364722e-07, "log_diff_policy": 1.6572238206863403, "logits": -1.3310333490371704, "logp_accuracy": 0.5458333492279053, "loss": 0.588, "objective": 0.6101647615432739, "ranking_simple": 0.5458333492279053, "rejected_logps": -102.14408111572266, "rejected_rewards": -0.7782385349273682, "reward_accuracy": 0.6875, "step": 280 }, { "chosen_logps": -98.08577728271484, "chosen_rewards": -0.48601558804512024, "epoch": 0.8052743112785495, "grad_norm": 12.31765364922954, "learning_rate": 1.0930503616226495e-07, "log_diff_policy": 3.3555397987365723, "logits": -1.4751113653182983, "logp_accuracy": 0.5916666388511658, "loss": 0.6025, "objective": 0.6169189214706421, "ranking_simple": 0.5916666388511658, "rejected_logps": -101.44131469726562, "rejected_rewards": -0.7556989789009094, "reward_accuracy": 0.6708333492279053, "step": 285 }, { "chosen_logps": -100.12785339355469, "chosen_rewards": -0.5353098511695862, "epoch": 0.8194019307746645, "grad_norm": 12.570256325871375, "learning_rate": 9.432964833353946e-08, "log_diff_policy": 3.7209763526916504, "logits": -1.4481749534606934, "logp_accuracy": 0.574999988079071, "loss": 0.5859, "objective": 0.5787585973739624, "ranking_simple": 0.574999988079071, "rejected_logps": -103.84882354736328, "rejected_rewards": -0.8981534242630005, "reward_accuracy": 0.6916666626930237, "step": 290 }, { "chosen_logps": -97.55231475830078, "chosen_rewards": -0.4495824873447418, "epoch": 0.8335295502707794, "grad_norm": 12.980085764300096, "learning_rate": 8.035013734500557e-08, "log_diff_policy": 6.113182544708252, "logits": -1.4768887758255005, "logp_accuracy": 0.6000000238418579, "loss": 0.5855, "objective": 0.5812606811523438, "ranking_simple": 0.6000000238418579, "rejected_logps": -103.66551208496094, "rejected_rewards": -0.8177840113639832, "reward_accuracy": 0.7041666507720947, "step": 295 }, { "chosen_logps": -100.5435562133789, "chosen_rewards": -0.47136637568473816, "epoch": 0.8476571697668943, "grad_norm": 11.431502808997152, "learning_rate": 6.740082138425962e-08, "log_diff_policy": 4.680129051208496, "logits": -1.441601037979126, "logp_accuracy": 0.5791666507720947, "loss": 0.5705, "objective": 0.542003333568573, "ranking_simple": 0.5791666507720947, "rejected_logps": -105.22367095947266, "rejected_rewards": -0.9355214238166809, "reward_accuracy": 0.7416666746139526, "step": 300 }, { "epoch": 0.8476571697668943, "eval_chosen_logps": -100.62620544433594, "eval_chosen_rewards": -0.6470655202865601, "eval_log_diff_policy": 2.7566163539886475, "eval_logits": -1.3272062540054321, "eval_logp_accuracy": 0.5446860194206238, "eval_loss": 0.6553571224212646, "eval_objective": 0.6504380106925964, "eval_ranking_simple": 0.5446860194206238, "eval_rejected_logps": -103.38282012939453, "eval_rejected_rewards": -0.8403363823890686, "eval_reward_accuracy": 0.6189613342285156, "eval_runtime": 577.7358, "eval_samples_per_second": 17.193, "eval_steps_per_second": 0.717, "step": 300 }, { "chosen_logps": -99.48948669433594, "chosen_rewards": -0.5248011350631714, "epoch": 0.8617847892630092, "grad_norm": 13.252372937407342, "learning_rate": 5.551348962151964e-08, "log_diff_policy": 2.7193262577056885, "logits": -1.4033145904541016, "logp_accuracy": 0.5625, "loss": 0.6042, "objective": 0.617713987827301, "ranking_simple": 0.5625, "rejected_logps": -102.20880889892578, "rejected_rewards": -0.8076351284980774, "reward_accuracy": 0.6541666388511658, "step": 305 }, { "chosen_logps": -98.88239288330078, "chosen_rewards": -0.5424375534057617, "epoch": 0.8759124087591241, "grad_norm": 11.513198778387665, "learning_rate": 4.471732417065144e-08, "log_diff_policy": 5.485601902008057, "logits": -1.4526277780532837, "logp_accuracy": 0.612500011920929, "loss": 0.5898, "objective": 0.5874204039573669, "ranking_simple": 0.612500011920929, "rejected_logps": -104.36799621582031, "rejected_rewards": -0.8886787295341492, "reward_accuracy": 0.6916666626930237, "step": 310 }, { "chosen_logps": -95.83468627929688, "chosen_rewards": -0.3822983205318451, "epoch": 0.890040028255239, "grad_norm": 12.210262508024565, "learning_rate": 3.503882845023387e-08, "log_diff_policy": 4.727508068084717, "logits": -1.4003602266311646, "logp_accuracy": 0.6000000238418579, "loss": 0.584, "objective": 0.5880329608917236, "ranking_simple": 0.6000000238418579, "rejected_logps": -100.56217956542969, "rejected_rewards": -0.7324024438858032, "reward_accuracy": 0.6916666626930237, "step": 315 }, { "chosen_logps": -100.13945770263672, "chosen_rewards": -0.40757814049720764, "epoch": 0.9041676477513539, "grad_norm": 11.629225422410343, "learning_rate": 2.65017621205339e-08, "log_diff_policy": 1.5231914520263672, "logits": -1.4287190437316895, "logp_accuracy": 0.5375000238418579, "loss": 0.5795, "objective": 0.5747238993644714, "ranking_simple": 0.5375000238418579, "rejected_logps": -101.66265106201172, "rejected_rewards": -0.7921539545059204, "reward_accuracy": 0.7124999761581421, "step": 320 }, { "chosen_logps": -98.47615814208984, "chosen_rewards": -0.4354442059993744, "epoch": 0.9182952672474688, "grad_norm": 11.50196158969457, "learning_rate": 1.9127082756109138e-08, "log_diff_policy": 4.035438060760498, "logits": -1.4835073947906494, "logp_accuracy": 0.5916666388511658, "loss": 0.5836, "objective": 0.5947951078414917, "ranking_simple": 0.5916666388511658, "rejected_logps": -102.51158905029297, "rejected_rewards": -0.7917211055755615, "reward_accuracy": 0.6791666746139526, "step": 325 }, { "chosen_logps": -98.91030883789062, "chosen_rewards": -0.5017069578170776, "epoch": 0.9324228867435838, "grad_norm": 11.552117625756214, "learning_rate": 1.293289439722961e-08, "log_diff_policy": 6.066530704498291, "logits": -1.376121163368225, "logp_accuracy": 0.6041666865348816, "loss": 0.5902, "objective": 0.6051034331321716, "ranking_simple": 0.6041666865348816, "rejected_logps": -104.97685241699219, "rejected_rewards": -0.7912607789039612, "reward_accuracy": 0.7041666507720947, "step": 330 }, { "chosen_logps": -100.66015625, "chosen_rewards": -0.5485278367996216, "epoch": 0.9465505062396986, "grad_norm": 11.501179850402595, "learning_rate": 7.934403106416243e-09, "log_diff_policy": 3.832066774368286, "logits": -1.4517931938171387, "logp_accuracy": 0.5583333373069763, "loss": 0.5689, "objective": 0.5892359614372253, "ranking_simple": 0.5583333373069763, "rejected_logps": -104.49221801757812, "rejected_rewards": -0.8999612331390381, "reward_accuracy": 0.7041666507720947, "step": 335 }, { "chosen_logps": -99.37032318115234, "chosen_rewards": -0.5224529504776001, "epoch": 0.9606781257358136, "grad_norm": 12.734967304317747, "learning_rate": 4.143879639202541e-09, "log_diff_policy": 1.4650262594223022, "logits": -1.4669277667999268, "logp_accuracy": 0.5375000238418579, "loss": 0.5857, "objective": 0.6081200838088989, "ranking_simple": 0.5375000238418579, "rejected_logps": -100.83534240722656, "rejected_rewards": -0.8256459832191467, "reward_accuracy": 0.637499988079071, "step": 340 }, { "chosen_logps": -98.70734405517578, "chosen_rewards": -0.5224943161010742, "epoch": 0.9748057452319284, "grad_norm": 12.0257535848254, "learning_rate": 1.5706293207561893e-09, "log_diff_policy": 4.583549976348877, "logits": -1.3987572193145752, "logp_accuracy": 0.5708333253860474, "loss": 0.585, "objective": 0.5911051034927368, "ranking_simple": 0.5708333253860474, "rejected_logps": -103.2908935546875, "rejected_rewards": -0.8825021982192993, "reward_accuracy": 0.6875, "step": 345 }, { "chosen_logps": -99.29621124267578, "chosen_rewards": -0.4423524737358093, "epoch": 0.9889333647280433, "grad_norm": 12.354645024534335, "learning_rate": 2.209692023126819e-10, "log_diff_policy": 2.835632801055908, "logits": -1.471450686454773, "logp_accuracy": 0.5, "loss": 0.5864, "objective": 0.5840578079223633, "ranking_simple": 0.5, "rejected_logps": -102.13184356689453, "rejected_rewards": -0.7969415187835693, "reward_accuracy": 0.75, "step": 350 }, { "epoch": 0.9889333647280433, "eval_chosen_logps": -100.49518585205078, "eval_chosen_rewards": -0.6339634656906128, "eval_log_diff_policy": 2.7547223567962646, "eval_logits": -1.327553391456604, "eval_logp_accuracy": 0.54347825050354, "eval_loss": 0.6549956798553467, "eval_objective": 0.6503260731697083, "eval_ranking_simple": 0.54347825050354, "eval_rejected_logps": -103.24991607666016, "eval_rejected_rewards": -0.8270449638366699, "eval_reward_accuracy": 0.6183574795722961, "eval_runtime": 582.9627, "eval_samples_per_second": 17.039, "eval_steps_per_second": 0.71, "step": 350 }, { "epoch": 0.9974099364257123, "step": 353, "total_flos": 0.0, "train_loss": 0.6203088365263034, "train_runtime": 17039.3075, "train_samples_per_second": 5.981, "train_steps_per_second": 0.021 } ], "logging_steps": 5, "max_steps": 353, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }