{ "best_metric": 0.5423553586006165, "best_model_checkpoint": "./qwen2.5-0.5b/qwen2.5-0.5b-expo-DPO-ES-TRY/checkpoint-371", "epoch": 2.9976381672177608, "eval_steps": 53, "global_step": 528, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_loss": 0.6931471824645996, "dpo_wo_beta": -0.6931471824645996, "epoch": 0.005668398677373642, "grad_norm": 13.433600669124935, "learning_rate": 9.433962264150944e-08, "logits": -1.3874311447143555, "logps": -88.43561553955078, "loss": 0.6931, "objective": 0.6931471824645996, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.5208333134651184, "regularize": 0.6931471824645996, "step": 1 }, { "dpo_loss": 0.693236768245697, "dpo_wo_beta": -0.6993356347084045, "epoch": 0.02834199338686821, "grad_norm": 13.640653628388394, "learning_rate": 4.716981132075472e-07, "logits": -1.4090652465820312, "logps": -84.34337615966797, "loss": 0.693, "objective": 0.693236768245697, "ranking_idealized": 0.6197916865348816, "ranking_idealized_expo": 0.546875, "ranking_simple": 0.546875, "regularize": 0.693236768245697, "step": 5 }, { "dpo_loss": 0.6845630407333374, "dpo_wo_beta": -0.7111619710922241, "epoch": 0.05668398677373642, "grad_norm": 12.626074407134174, "learning_rate": 9.433962264150944e-07, "logits": -1.4784893989562988, "logps": -81.94055938720703, "loss": 0.6892, "objective": 0.6845630407333374, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.512499988079071, "regularize": 0.6845630407333374, "step": 10 }, { "dpo_loss": 0.6825469136238098, "dpo_wo_beta": -0.8259204626083374, "epoch": 0.08502598016060463, "grad_norm": 12.374180595083178, "learning_rate": 1.4150943396226415e-06, "logits": -1.4932299852371216, "logps": -81.52880096435547, "loss": 0.6814, "objective": 0.6825469136238098, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.512499988079071, "regularize": 0.6825469136238098, "step": 15 }, { "dpo_loss": 0.6950914263725281, "dpo_wo_beta": -1.2390469312667847, "epoch": 0.11336797354747284, "grad_norm": 14.839934392200913, "learning_rate": 1.8867924528301889e-06, "logits": -1.5371100902557373, "logps": -82.72624969482422, "loss": 0.6711, "objective": 0.6950914263725281, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.5249999761581421, "regularize": 0.6950914263725281, "step": 20 }, { "dpo_loss": 0.6556071043014526, "dpo_wo_beta": -1.110619068145752, "epoch": 0.14170996693434104, "grad_norm": 12.89805052529156, "learning_rate": 2.358490566037736e-06, "logits": -1.6399922370910645, "logps": -81.59695434570312, "loss": 0.6589, "objective": 0.6556071043014526, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.5249999761581421, "regularize": 0.6556071043014526, "step": 25 }, { "dpo_loss": 0.6518108248710632, "dpo_wo_beta": -1.2506839036941528, "epoch": 0.17005196032120926, "grad_norm": 12.64998937636519, "learning_rate": 2.830188679245283e-06, "logits": -1.6404598951339722, "logps": -83.20111846923828, "loss": 0.6451, "objective": 0.6518108248710632, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.5666666626930237, "regularize": 0.6518108248710632, "step": 30 }, { "dpo_loss": 0.6226770877838135, "dpo_wo_beta": -1.394917368888855, "epoch": 0.19839395370807747, "grad_norm": 13.760162421635227, "learning_rate": 3.30188679245283e-06, "logits": -1.6237396001815796, "logps": -87.80964660644531, "loss": 0.6189, "objective": 0.6226770877838135, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.5583333373069763, "regularize": 0.6226770877838135, "step": 35 }, { "dpo_loss": 0.5924390554428101, "dpo_wo_beta": -1.422450304031372, "epoch": 0.22673594709494568, "grad_norm": 16.810886476613117, "learning_rate": 3.7735849056603777e-06, "logits": -1.620682954788208, "logps": -91.93690490722656, "loss": 0.6076, "objective": 0.5924390554428101, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.5708333253860474, "regularize": 0.5924390554428101, "step": 40 }, { "dpo_loss": 0.573756217956543, "dpo_wo_beta": -1.3691534996032715, "epoch": 0.25507794048181387, "grad_norm": 13.798774501924722, "learning_rate": 4.245283018867925e-06, "logits": -1.7814558744430542, "logps": -92.24474334716797, "loss": 0.5989, "objective": 0.573756217956543, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.5666666626930237, "regularize": 0.573756217956543, "step": 45 }, { "dpo_loss": 0.5726417899131775, "dpo_wo_beta": -1.3605374097824097, "epoch": 0.2834199338686821, "grad_norm": 12.568473894025988, "learning_rate": 4.716981132075472e-06, "logits": -1.808895468711853, "logps": -90.65751647949219, "loss": 0.5954, "objective": 0.5726417899131775, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.42500001192092896, "ranking_simple": 0.574999988079071, "regularize": 0.5726417899131775, "step": 50 }, { "epoch": 0.300425129900803, "eval_dpo_loss": 0.7112604975700378, "eval_dpo_wo_beta": -2.2659413814544678, "eval_logits": -1.892814040184021, "eval_logps": -101.36742401123047, "eval_loss": 0.6816489100456238, "eval_objective": 0.7112604975700378, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5237603187561035, "eval_regularize": 0.7112604975700378, "eval_runtime": 211.6587, "eval_samples_per_second": 27.355, "eval_steps_per_second": 1.143, "step": 53 }, { "dpo_loss": 0.5827316045761108, "dpo_wo_beta": -1.6213361024856567, "epoch": 0.3117619272555503, "grad_norm": 14.442715913160086, "learning_rate": 4.999781286194085e-06, "logits": -1.8762638568878174, "logps": -93.41423797607422, "loss": 0.5721, "objective": 0.5827316045761108, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.6041666865348816, "regularize": 0.5827316045761108, "step": 55 }, { "dpo_loss": 0.5553872585296631, "dpo_wo_beta": -1.6468366384506226, "epoch": 0.3401039206424185, "grad_norm": 13.845514282811145, "learning_rate": 4.997321195347154e-06, "logits": -1.8914529085159302, "logps": -90.59642028808594, "loss": 0.5756, "objective": 0.5553872585296631, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.5791666507720947, "regularize": 0.5553872585296631, "step": 60 }, { "dpo_loss": 0.5302771329879761, "dpo_wo_beta": -1.3166770935058594, "epoch": 0.3684459140292867, "grad_norm": 10.846857687148022, "learning_rate": 4.992130320438411e-06, "logits": -1.8399535417556763, "logps": -86.60197448730469, "loss": 0.5586, "objective": 0.5302771329879761, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.6208333373069763, "regularize": 0.5302771329879761, "step": 65 }, { "dpo_loss": 0.5711485743522644, "dpo_wo_beta": -1.7437169551849365, "epoch": 0.39678790741615494, "grad_norm": 13.787840238803502, "learning_rate": 4.984214337613357e-06, "logits": -1.8178967237472534, "logps": -91.10688781738281, "loss": 0.5701, "objective": 0.5711485743522644, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.5874999761581421, "regularize": 0.5711485743522644, "step": 70 }, { "dpo_loss": 0.523643434047699, "dpo_wo_beta": -1.669514536857605, "epoch": 0.42512990080302315, "grad_norm": 13.192298437287352, "learning_rate": 4.97358190288299e-06, "logits": -1.8182169198989868, "logps": -94.8000717163086, "loss": 0.5205, "objective": 0.523643434047699, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.6583333611488342, "regularize": 0.523643434047699, "step": 75 }, { "dpo_loss": 0.51079261302948, "dpo_wo_beta": -1.7271808385849, "epoch": 0.45347189418989137, "grad_norm": 15.151373786996814, "learning_rate": 4.9602446426585845e-06, "logits": -1.8920824527740479, "logps": -93.58238220214844, "loss": 0.5285, "objective": 0.51079261302948, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.6458333134651184, "regularize": 0.51079261302948, "step": 80 }, { "dpo_loss": 0.5066012144088745, "dpo_wo_beta": -1.5956443548202515, "epoch": 0.4818138875767596, "grad_norm": 12.328960275584794, "learning_rate": 4.944217141038379e-06, "logits": -1.8741406202316284, "logps": -87.06742858886719, "loss": 0.5202, "objective": 0.5066012144088745, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.6208333373069763, "regularize": 0.5066012144088745, "step": 85 }, { "dpo_loss": 0.5358369946479797, "dpo_wo_beta": -1.9357556104660034, "epoch": 0.5101558809636277, "grad_norm": 12.694483590051824, "learning_rate": 4.925516923860083e-06, "logits": -1.7968534231185913, "logps": -86.77802276611328, "loss": 0.4858, "objective": 0.5358369946479797, "ranking_idealized": 0.5458333492279053, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.5874999761581421, "regularize": 0.5358369946479797, "step": 90 }, { "dpo_loss": 0.4783257842063904, "dpo_wo_beta": -1.9098786115646362, "epoch": 0.538497874350496, "grad_norm": 14.474706973531484, "learning_rate": 4.904164439536626e-06, "logits": -1.8568389415740967, "logps": -88.12813568115234, "loss": 0.4865, "objective": 0.4783257842063904, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.6916666626930237, "regularize": 0.4783257842063904, "step": 95 }, { "dpo_loss": 0.4654810130596161, "dpo_wo_beta": -1.9254087209701538, "epoch": 0.5668398677373642, "grad_norm": 13.577084707122001, "learning_rate": 4.880183036696123e-06, "logits": -1.938937783241272, "logps": -92.29436492919922, "loss": 0.5016, "objective": 0.4654810130596161, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.6875, "regularize": 0.4654810130596161, "step": 100 }, { "dpo_loss": 0.4374677240848541, "dpo_wo_beta": -1.4267934560775757, "epoch": 0.5951818611242324, "grad_norm": 11.14545328639218, "learning_rate": 4.853598938650487e-06, "logits": -1.8158982992172241, "logps": -90.21449279785156, "loss": 0.4618, "objective": 0.4374677240848541, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.6666666865348816, "regularize": 0.4374677240848541, "step": 105 }, { "epoch": 0.600850259801606, "eval_dpo_loss": 0.6936022639274597, "eval_dpo_wo_beta": -2.462427854537964, "eval_logits": -1.9007418155670166, "eval_logps": -94.35714721679688, "eval_loss": 0.6912521123886108, "eval_objective": 0.6936022639274597, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5351239442825317, "eval_regularize": 0.6936022639274597, "eval_runtime": 210.2297, "eval_samples_per_second": 27.541, "eval_steps_per_second": 1.151, "step": 106 }, { "dpo_loss": 0.47933149337768555, "dpo_wo_beta": -1.9683055877685547, "epoch": 0.6235238545111006, "grad_norm": 12.39392340166307, "learning_rate": 4.824441214720629e-06, "logits": -1.9334439039230347, "logps": -87.35523223876953, "loss": 0.4633, "objective": 0.47933149337768555, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4541666805744171, "ranking_simple": 0.6625000238418579, "regularize": 0.47933149337768555, "step": 110 }, { "dpo_loss": 0.4749464690685272, "dpo_wo_beta": -1.7375919818878174, "epoch": 0.6518658478979689, "grad_norm": 12.612865651893962, "learning_rate": 4.7927417484495756e-06, "logits": -1.9057692289352417, "logps": -87.68991088867188, "loss": 0.4712, "objective": 0.4749464690685272, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.6333333253860474, "regularize": 0.4749464690685272, "step": 115 }, { "dpo_loss": 0.4848935306072235, "dpo_wo_beta": -1.9273093938827515, "epoch": 0.680207841284837, "grad_norm": 13.836239066838136, "learning_rate": 4.758535202738287e-06, "logits": -1.8775906562805176, "logps": -87.8878173828125, "loss": 0.4641, "objective": 0.4848935306072235, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.6625000238418579, "regularize": 0.4848935306072235, "step": 120 }, { "dpo_loss": 0.4785127639770508, "dpo_wo_beta": -1.814666748046875, "epoch": 0.7085498346717053, "grad_norm": 12.105170057238437, "learning_rate": 4.721858981942284e-06, "logits": -1.8346068859100342, "logps": -86.40522766113281, "loss": 0.4801, "objective": 0.4785127639770508, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.6875, "regularize": 0.4785127639770508, "step": 125 }, { "dpo_loss": 0.4548089802265167, "dpo_wo_beta": -1.4164987802505493, "epoch": 0.7368918280585735, "grad_norm": 11.895980627109102, "learning_rate": 4.682753190970533e-06, "logits": -1.9488608837127686, "logps": -79.42195129394531, "loss": 0.4538, "objective": 0.4548089802265167, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.44999998807907104, "ranking_simple": 0.6291666626930237, "regularize": 0.4548089802265167, "step": 130 }, { "dpo_loss": 0.49760884046554565, "dpo_wo_beta": -1.994195818901062, "epoch": 0.7652338214454416, "grad_norm": 12.298776298341995, "learning_rate": 4.641260591431315e-06, "logits": -1.9813282489776611, "logps": -82.40634155273438, "loss": 0.4433, "objective": 0.49760884046554565, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.6583333611488342, "regularize": 0.49760884046554565, "step": 135 }, { "dpo_loss": 0.41459351778030396, "dpo_wo_beta": -1.187635064125061, "epoch": 0.7935758148323099, "grad_norm": 12.618720178096575, "learning_rate": 4.597426554873037e-06, "logits": -1.97609281539917, "logps": -83.44467163085938, "loss": 0.4236, "objective": 0.41459351778030396, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.6791666746139526, "regularize": 0.41459351778030396, "step": 140 }, { "dpo_loss": 0.4073801636695862, "dpo_wo_beta": -1.311059832572937, "epoch": 0.821917808219178, "grad_norm": 14.417917904409194, "learning_rate": 4.551299013171111e-06, "logits": -2.0718839168548584, "logps": -84.2674560546875, "loss": 0.4215, "objective": 0.4073801636695862, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.6916666626930237, "regularize": 0.4073801636695862, "step": 145 }, { "dpo_loss": 0.4207518398761749, "dpo_wo_beta": -1.50857675075531, "epoch": 0.8502598016060463, "grad_norm": 11.543599868064442, "learning_rate": 4.502928406115152e-06, "logits": -2.0730583667755127, "logps": -82.68958282470703, "loss": 0.4276, "objective": 0.4207518398761749, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.7208333611488342, "regularize": 0.4207518398761749, "step": 150 }, { "dpo_loss": 0.3847941756248474, "dpo_wo_beta": -1.4449684619903564, "epoch": 0.8786017949929145, "grad_norm": 12.08771803065001, "learning_rate": 4.452367626253805e-06, "logits": -2.0991933345794678, "logps": -85.211181640625, "loss": 0.3986, "objective": 0.3847941756248474, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.7250000238418579, "regularize": 0.3847941756248474, "step": 155 }, { "epoch": 0.9012753897024091, "eval_dpo_loss": 0.7214789390563965, "eval_dpo_wo_beta": -3.1229145526885986, "eval_logits": -2.1450352668762207, "eval_logps": -95.60012817382812, "eval_loss": 0.7013870477676392, "eval_objective": 0.7214789390563965, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5351239442825317, "eval_regularize": 0.7214789390563965, "eval_runtime": 210.3593, "eval_samples_per_second": 27.524, "eval_steps_per_second": 1.15, "step": 159 }, { "dpo_loss": 0.4162478744983673, "dpo_wo_beta": -1.6461573839187622, "epoch": 0.9069437883797827, "grad_norm": 12.82345397067452, "learning_rate": 4.399671961057523e-06, "logits": -2.0759384632110596, "logps": -89.25846862792969, "loss": 0.4236, "objective": 0.4162478744983673, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.699999988079071, "regularize": 0.4162478744983673, "step": 160 }, { "dpo_loss": 0.41358453035354614, "dpo_wo_beta": -1.648630976676941, "epoch": 0.9352857817666509, "grad_norm": 12.860537676624453, "learning_rate": 4.3448990324625244e-06, "logits": -2.024477481842041, "logps": -88.03329467773438, "loss": 0.4026, "objective": 0.41358453035354614, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.7333333492279053, "regularize": 0.41358453035354614, "step": 165 }, { "dpo_loss": 0.378000408411026, "dpo_wo_beta": -1.2966532707214355, "epoch": 0.9636277751535192, "grad_norm": 11.533711130228069, "learning_rate": 4.288108733862064e-06, "logits": -2.042527437210083, "logps": -90.26854705810547, "loss": 0.3925, "objective": 0.378000408411026, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.7166666388511658, "regularize": 0.378000408411026, "step": 170 }, { "dpo_loss": 0.3764660954475403, "dpo_wo_beta": -1.3978971242904663, "epoch": 0.9919697685403873, "grad_norm": 12.165192869157089, "learning_rate": 4.229363164613874e-06, "logits": -2.0610477924346924, "logps": -89.8354721069336, "loss": 0.3793, "objective": 0.3764660954475403, "ranking_idealized": 0.6541666388511658, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.7916666865348816, "regularize": 0.3764660954475403, "step": 175 }, { "dpo_loss": 0.27626773715019226, "dpo_wo_beta": -0.8504549860954285, "epoch": 1.0203117619272555, "grad_norm": 10.141692447282386, "learning_rate": 4.168726562135432e-06, "logits": -2.2514243125915527, "logps": -90.8476333618164, "loss": 0.2852, "objective": 0.27626773715019226, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8083333373069763, "regularize": 0.27626773715019226, "step": 180 }, { "dpo_loss": 0.23696589469909668, "dpo_wo_beta": -0.6947117447853088, "epoch": 1.0486537553141237, "grad_norm": 13.78702272812957, "learning_rate": 4.106265231661292e-06, "logits": -2.158977746963501, "logps": -95.00120544433594, "loss": 0.2429, "objective": 0.23696589469909668, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8083333373069763, "regularize": 0.23696589469909668, "step": 185 }, { "dpo_loss": 0.26388806104660034, "dpo_wo_beta": -0.9112051725387573, "epoch": 1.076995748700992, "grad_norm": 14.740228375586371, "learning_rate": 4.042047473739278e-06, "logits": -2.1533920764923096, "logps": -101.71949768066406, "loss": 0.2517, "objective": 0.26388806104660034, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5541666746139526, "ranking_simple": 0.8416666388511658, "regularize": 0.26388806104660034, "step": 190 }, { "dpo_loss": 0.2244579941034317, "dpo_wo_beta": -0.6430780291557312, "epoch": 1.10533774208786, "grad_norm": 10.169064121599527, "learning_rate": 3.976143509544843e-06, "logits": -2.1589295864105225, "logps": -96.5248031616211, "loss": 0.2467, "objective": 0.2244579941034317, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5666666626930237, "ranking_simple": 0.8083333373069763, "regularize": 0.2244579941034317, "step": 195 }, { "dpo_loss": 0.24179764091968536, "dpo_wo_beta": -0.6332272291183472, "epoch": 1.1336797354747283, "grad_norm": 9.444774343787891, "learning_rate": 3.908625404095242e-06, "logits": -2.2753493785858154, "logps": -91.93312072753906, "loss": 0.2563, "objective": 0.24179764091968536, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8458333611488342, "regularize": 0.24179764091968536, "step": 200 }, { "dpo_loss": 0.25683078169822693, "dpo_wo_beta": -0.8531176447868347, "epoch": 1.1620217288615966, "grad_norm": 9.240319326762517, "learning_rate": 3.839566987447492e-06, "logits": -2.2432618141174316, "logps": -91.3159408569336, "loss": 0.2584, "objective": 0.25683078169822693, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8166666626930237, "regularize": 0.25683078169822693, "step": 205 }, { "dpo_loss": 0.24292893707752228, "dpo_wo_beta": -0.8205318450927734, "epoch": 1.1903637222484649, "grad_norm": 9.283856100785183, "learning_rate": 3.7690437739662928e-06, "logits": -2.2361652851104736, "logps": -90.6613998413086, "loss": 0.2551, "objective": 0.24292893707752228, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5, "ranking_simple": 0.800000011920929, "regularize": 0.24292893707752228, "step": 210 }, { "epoch": 1.201700519603212, "eval_dpo_loss": 0.7525234222412109, "eval_dpo_wo_beta": -3.7749528884887695, "eval_logits": -2.267778158187866, "eval_logps": -98.14269256591797, "eval_loss": 0.7350714206695557, "eval_objective": 0.7525234222412109, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5371900796890259, "eval_regularize": 0.7525234222412109, "eval_runtime": 210.8898, "eval_samples_per_second": 27.455, "eval_steps_per_second": 1.148, "step": 212 }, { "dpo_loss": 0.289533793926239, "dpo_wo_beta": -0.8810125589370728, "epoch": 1.2187057156353331, "grad_norm": 10.72372972136692, "learning_rate": 3.697132879750174e-06, "logits": -2.1757090091705322, "logps": -93.64250183105469, "loss": 0.2578, "objective": 0.289533793926239, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.7875000238418579, "regularize": 0.289533793926239, "step": 215 }, { "dpo_loss": 0.25134381651878357, "dpo_wo_beta": -0.8703542947769165, "epoch": 1.2470477090222012, "grad_norm": 12.940604838816247, "learning_rate": 3.6239129383061764e-06, "logits": -2.121750593185425, "logps": -94.44015502929688, "loss": 0.2676, "objective": 0.25134381651878357, "ranking_idealized": 0.6541666388511658, "ranking_idealized_expo": 0.6041666865348816, "ranking_simple": 0.8208333253860474, "regularize": 0.25134381651878357, "step": 220 }, { "dpo_loss": 0.23937886953353882, "dpo_wo_beta": -0.7396827936172485, "epoch": 1.2753897024090695, "grad_norm": 9.645711793319885, "learning_rate": 3.5494640145652647e-06, "logits": -2.0901684761047363, "logps": -94.10260772705078, "loss": 0.2637, "objective": 0.23937886953353882, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8541666865348816, "regularize": 0.23937886953353882, "step": 225 }, { "dpo_loss": 0.2818019688129425, "dpo_wo_beta": -1.1170729398727417, "epoch": 1.3037316957959377, "grad_norm": 8.80210598601974, "learning_rate": 3.4738675173325008e-06, "logits": -1.9860222339630127, "logps": -92.9978256225586, "loss": 0.2776, "objective": 0.2818019688129425, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.7749999761581421, "regularize": 0.2818019688129425, "step": 230 }, { "dpo_loss": 0.22621506452560425, "dpo_wo_beta": -0.35843732953071594, "epoch": 1.3320736891828058, "grad_norm": 9.267612473930496, "learning_rate": 3.397206110267713e-06, "logits": -2.1131467819213867, "logps": -87.49403381347656, "loss": 0.2618, "objective": 0.22621506452560425, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8333333134651184, "regularize": 0.22621506452560425, "step": 235 }, { "dpo_loss": 0.23632274568080902, "dpo_wo_beta": -0.6697984933853149, "epoch": 1.360415682569674, "grad_norm": 10.68594080832048, "learning_rate": 3.3195636214939943e-06, "logits": -2.130047559738159, "logps": -91.7619857788086, "loss": 0.2584, "objective": 0.23632274568080902, "ranking_idealized": 0.5541666746139526, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8125, "regularize": 0.23632274568080902, "step": 240 }, { "dpo_loss": 0.2982023358345032, "dpo_wo_beta": -1.1124054193496704, "epoch": 1.3887576759565423, "grad_norm": 10.330360151122868, "learning_rate": 3.2410249519328848e-06, "logits": -2.1718757152557373, "logps": -93.45353698730469, "loss": 0.2692, "objective": 0.2982023358345032, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.8041666746139526, "regularize": 0.2982023358345032, "step": 245 }, { "dpo_loss": 0.2403133064508438, "dpo_wo_beta": -0.7000442147254944, "epoch": 1.4170996693434104, "grad_norm": 13.026578288520353, "learning_rate": 3.1616759824664543e-06, "logits": -2.145325183868408, "logps": -94.18195343017578, "loss": 0.269, "objective": 0.2403133064508438, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.47083333134651184, "ranking_simple": 0.824999988079071, "regularize": 0.2403133064508438, "step": 250 }, { "dpo_loss": 0.23977436125278473, "dpo_wo_beta": -0.5784927010536194, "epoch": 1.4454416627302786, "grad_norm": 10.959901566104394, "learning_rate": 3.081603480027826e-06, "logits": -2.108074426651001, "logps": -94.5383529663086, "loss": 0.2625, "objective": 0.23977436125278473, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.8374999761581421, "regularize": 0.23977436125278473, "step": 255 }, { "dpo_loss": 0.25297579169273376, "dpo_wo_beta": -0.7996426820755005, "epoch": 1.473783656117147, "grad_norm": 9.578050078679867, "learning_rate": 3.0008950027228035e-06, "logits": -2.1828908920288086, "logps": -92.77781677246094, "loss": 0.232, "objective": 0.25297579169273376, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.8041666746139526, "regularize": 0.25297579169273376, "step": 260 }, { "dpo_loss": 0.2830916941165924, "dpo_wo_beta": -1.124144434928894, "epoch": 1.5021256495040152, "grad_norm": 11.27765707111355, "learning_rate": 2.9196388040863695e-06, "logits": -2.1150081157684326, "logps": -95.04662322998047, "loss": 0.2623, "objective": 0.2830916941165924, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8041666746139526, "regularize": 0.2830916941165924, "step": 265 }, { "epoch": 1.5021256495040152, "eval_dpo_loss": 0.7739136815071106, "eval_dpo_wo_beta": -4.163427829742432, "eval_logits": -2.1478331089019775, "eval_logps": -100.8313217163086, "eval_loss": 0.7400166392326355, "eval_objective": 0.7739136815071106, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.53925621509552, "eval_regularize": 0.7739136815071106, "eval_runtime": 210.8657, "eval_samples_per_second": 27.458, "eval_steps_per_second": 1.148, "step": 265 }, { "dpo_loss": 0.2930367887020111, "dpo_wo_beta": -1.3651045560836792, "epoch": 1.5304676428908834, "grad_norm": 11.715215816813723, "learning_rate": 2.8379237365787426e-06, "logits": -2.035703182220459, "logps": -97.7331771850586, "loss": 0.253, "objective": 0.2930367887020111, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.7916666865348816, "regularize": 0.2930367887020111, "step": 270 }, { "dpo_loss": 0.24886849522590637, "dpo_wo_beta": -0.8069366216659546, "epoch": 1.5588096362777515, "grad_norm": 8.958944325794365, "learning_rate": 2.7558391544265127e-06, "logits": -1.9700883626937866, "logps": -97.53855895996094, "loss": 0.2491, "objective": 0.24886849522590637, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8125, "regularize": 0.24886849522590637, "step": 275 }, { "dpo_loss": 0.22936613857746124, "dpo_wo_beta": -0.6120084524154663, "epoch": 1.5871516296646198, "grad_norm": 10.814739938498821, "learning_rate": 2.6734748159151104e-06, "logits": -1.9118597507476807, "logps": -98.06639099121094, "loss": 0.2491, "objective": 0.22936613857746124, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.8374999761581421, "regularize": 0.22936613857746124, "step": 280 }, { "dpo_loss": 0.22401383519172668, "dpo_wo_beta": -0.5180224776268005, "epoch": 1.615493623051488, "grad_norm": 11.270657822712987, "learning_rate": 2.5909207852394363e-06, "logits": -1.9585484266281128, "logps": -100.70836639404297, "loss": 0.2348, "objective": 0.22401383519172668, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.8291666507720947, "regularize": 0.22401383519172668, "step": 285 }, { "dpo_loss": 0.2646006941795349, "dpo_wo_beta": -0.7763135433197021, "epoch": 1.643835616438356, "grad_norm": 10.585292794409252, "learning_rate": 2.508267334019988e-06, "logits": -1.9566444158554077, "logps": -97.0122299194336, "loss": 0.2532, "objective": 0.2646006941795349, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5625, "ranking_simple": 0.8041666746139526, "regularize": 0.2646006941795349, "step": 290 }, { "dpo_loss": 0.23113909363746643, "dpo_wo_beta": -0.6497251987457275, "epoch": 1.6721776098252243, "grad_norm": 11.90240881956814, "learning_rate": 2.4256048425921693e-06, "logits": -1.8574607372283936, "logps": -94.91531372070312, "loss": 0.2476, "objective": 0.23113909363746643, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8333333134651184, "regularize": 0.23113909363746643, "step": 295 }, { "dpo_loss": 0.22116926312446594, "dpo_wo_beta": -0.6268281936645508, "epoch": 1.7005196032120926, "grad_norm": 11.745161783871675, "learning_rate": 2.3430237011767166e-06, "logits": -1.895004153251648, "logps": -97.79885864257812, "loss": 0.2266, "objective": 0.22116926312446594, "ranking_idealized": 0.6583333611488342, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.8333333134651184, "regularize": 0.22116926312446594, "step": 300 }, { "dpo_loss": 0.24756571650505066, "dpo_wo_beta": -0.9131773114204407, "epoch": 1.7288615965989607, "grad_norm": 12.299641904512029, "learning_rate": 2.2606142110393248e-06, "logits": -1.8061485290527344, "logps": -96.69060516357422, "loss": 0.2379, "objective": 0.24756571650505066, "ranking_idealized": 0.5625, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8291666507720947, "regularize": 0.24756571650505066, "step": 305 }, { "dpo_loss": 0.2321903556585312, "dpo_wo_beta": -0.6867564916610718, "epoch": 1.7572035899858292, "grad_norm": 13.489735935272718, "learning_rate": 2.1784664857475356e-06, "logits": -1.8388514518737793, "logps": -95.04447937011719, "loss": 0.2456, "objective": 0.2321903556585312, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.824999988079071, "regularize": 0.2321903556585312, "step": 310 }, { "dpo_loss": 0.2901044189929962, "dpo_wo_beta": -1.1286156177520752, "epoch": 1.7855455833726972, "grad_norm": 10.887596324980125, "learning_rate": 2.096670352632873e-06, "logits": -1.75984525680542, "logps": -94.63612365722656, "loss": 0.2571, "objective": 0.2901044189929962, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8083333373069763, "regularize": 0.2901044189929962, "step": 315 }, { "epoch": 1.8025507794048181, "eval_dpo_loss": 0.7664583325386047, "eval_dpo_wo_beta": -4.09501838684082, "eval_logits": -1.9888346195220947, "eval_logps": -102.3712158203125, "eval_loss": 0.7400712966918945, "eval_objective": 0.7664583325386047, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.53925621509552, "eval_regularize": 0.7664583325386047, "eval_runtime": 210.274, "eval_samples_per_second": 27.535, "eval_steps_per_second": 1.151, "step": 318 }, { "dpo_loss": 0.2219768464565277, "dpo_wo_beta": -0.47742757201194763, "epoch": 1.8138875767595655, "grad_norm": 11.029480506309918, "learning_rate": 2.01531525456598e-06, "logits": -1.9175788164138794, "logps": -99.74655151367188, "loss": 0.2404, "objective": 0.2219768464565277, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5, "ranking_simple": 0.824999988079071, "regularize": 0.2219768464565277, "step": 320 }, { "dpo_loss": 0.24908211827278137, "dpo_wo_beta": -0.8014059066772461, "epoch": 1.8422295701464337, "grad_norm": 12.92850322071669, "learning_rate": 1.93449015215215e-06, "logits": -2.0084919929504395, "logps": -101.09780883789062, "loss": 0.2586, "objective": 0.24908211827278137, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.8333333134651184, "regularize": 0.24908211827278137, "step": 325 }, { "dpo_loss": 0.1984507441520691, "dpo_wo_beta": -0.3766098618507385, "epoch": 1.8705715635333018, "grad_norm": 10.415606016359964, "learning_rate": 1.8542834264542091e-06, "logits": -1.851909875869751, "logps": -94.5366439819336, "loss": 0.2496, "objective": 0.1984507441520691, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8291666507720947, "regularize": 0.1984507441520691, "step": 330 }, { "dpo_loss": 0.26707762479782104, "dpo_wo_beta": -0.9339324831962585, "epoch": 1.89891355692017, "grad_norm": 10.078352873471246, "learning_rate": 1.7747827823491253e-06, "logits": -1.9827288389205933, "logps": -94.26249694824219, "loss": 0.2463, "objective": 0.26707762479782104, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.8125, "regularize": 0.26707762479782104, "step": 335 }, { "dpo_loss": 0.2447831928730011, "dpo_wo_beta": -0.7387041449546814, "epoch": 1.9272555503070383, "grad_norm": 10.88136655004607, "learning_rate": 1.6960751526240122e-06, "logits": -1.9671465158462524, "logps": -98.63937377929688, "loss": 0.2399, "objective": 0.2447831928730011, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8583333492279053, "regularize": 0.2447831928730011, "step": 340 }, { "dpo_loss": 0.2123527079820633, "dpo_wo_beta": -0.5544185638427734, "epoch": 1.9555975436939064, "grad_norm": 11.18260747105762, "learning_rate": 1.6182466029163974e-06, "logits": -1.9572845697402954, "logps": -100.18721008300781, "loss": 0.2211, "objective": 0.2123527079820633, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8208333253860474, "regularize": 0.2123527079820633, "step": 345 }, { "dpo_loss": 0.2570362389087677, "dpo_wo_beta": -0.7474013566970825, "epoch": 1.9839395370807746, "grad_norm": 11.061918116138507, "learning_rate": 1.541382237602721e-06, "logits": -1.8960832357406616, "logps": -101.65901947021484, "loss": 0.2316, "objective": 0.2570362389087677, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.7916666865348816, "regularize": 0.2570362389087677, "step": 350 }, { "dpo_loss": 0.19961656630039215, "dpo_wo_beta": -0.5642960667610168, "epoch": 2.012281530467643, "grad_norm": 7.569515164252156, "learning_rate": 1.465566106737942e-06, "logits": -1.8380100727081299, "logps": -102.71571350097656, "loss": 0.2103, "objective": 0.19961656630039215, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.8374999761581421, "regularize": 0.19961656630039215, "step": 355 }, { "dpo_loss": 0.11018560826778412, "dpo_wo_beta": -0.12253165245056152, "epoch": 2.040623523854511, "grad_norm": 6.632276986432463, "learning_rate": 1.3908811141480408e-06, "logits": -1.867693543434143, "logps": -103.06665802001953, "loss": 0.118, "objective": 0.11018560826778412, "ranking_idealized": 0.6625000238418579, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.9041666388511658, "regularize": 0.11018560826778412, "step": 360 }, { "dpo_loss": 0.12077057361602783, "dpo_wo_beta": -0.197490856051445, "epoch": 2.0689655172413794, "grad_norm": 10.213186193965676, "learning_rate": 1.3174089267758983e-06, "logits": -1.8255099058151245, "logps": -110.3724136352539, "loss": 0.118, "objective": 0.12077057361602783, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8666666746139526, "regularize": 0.12077057361602783, "step": 365 }, { "dpo_loss": 0.1337815225124359, "dpo_wo_beta": -0.27523547410964966, "epoch": 2.0973075106282475, "grad_norm": 9.926730675582434, "learning_rate": 1.245229885379699e-06, "logits": -1.7588540315628052, "logps": -111.99506378173828, "loss": 0.1227, "objective": 0.1337815225124359, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8916666507720947, "regularize": 0.1337815225124359, "step": 370 }, { "epoch": 2.1029759093056213, "eval_dpo_loss": 0.9223728179931641, "eval_dpo_wo_beta": -6.4510064125061035, "eval_logits": -1.8644566535949707, "eval_logps": -122.00161743164062, "eval_loss": 0.8844180107116699, "eval_objective": 0.9223728179931641, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5423553586006165, "eval_regularize": 0.9223728179931641, "eval_runtime": 210.7356, "eval_samples_per_second": 27.475, "eval_steps_per_second": 1.148, "step": 371 }, { "dpo_loss": 0.10664375871419907, "dpo_wo_beta": -0.2532973289489746, "epoch": 2.1256495040151155, "grad_norm": 9.740007111179482, "learning_rate": 1.1744229166814889e-06, "logits": -1.696647047996521, "logps": -118.39366149902344, "loss": 0.1103, "objective": 0.10664375871419907, "ranking_idealized": 0.675000011920929, "ranking_idealized_expo": 0.5916666388511658, "ranking_simple": 0.925000011920929, "regularize": 0.10664375871419907, "step": 375 }, { "dpo_loss": 0.12854978442192078, "dpo_wo_beta": -0.27664583921432495, "epoch": 2.153991497401984, "grad_norm": 9.699256456859702, "learning_rate": 1.1050654470619602e-06, "logits": -1.700494647026062, "logps": -114.1063232421875, "loss": 0.1208, "objective": 0.12854978442192078, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8999999761581421, "regularize": 0.12854978442192078, "step": 380 }, { "dpo_loss": 0.10418140888214111, "dpo_wo_beta": -0.09889766573905945, "epoch": 2.182333490788852, "grad_norm": 9.620361843085416, "learning_rate": 1.0372333178958462e-06, "logits": -1.8633235692977905, "logps": -110.55794525146484, "loss": 0.1244, "objective": 0.10418140888214111, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.9125000238418579, "regularize": 0.10418140888214111, "step": 385 }, { "dpo_loss": 0.12462247163057327, "dpo_wo_beta": -0.2658768594264984, "epoch": 2.21067548417572, "grad_norm": 11.000881222201947, "learning_rate": 9.710007026204896e-07, "logits": -1.7877620458602905, "logps": -112.08268737792969, "loss": 0.1204, "objective": 0.12462247163057327, "ranking_idealized": 0.5249999761581421, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8833333253860474, "regularize": 0.12462247163057327, "step": 390 }, { "dpo_loss": 0.11183874309062958, "dpo_wo_beta": -0.3540593981742859, "epoch": 2.2390174775625886, "grad_norm": 8.717110295390793, "learning_rate": 9.064400256282757e-07, "logits": -1.8010636568069458, "logps": -110.48490142822266, "loss": 0.1248, "objective": 0.11183874309062958, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.9041666388511658, "regularize": 0.11183874309062958, "step": 395 }, { "dpo_loss": 0.12893003225326538, "dpo_wo_beta": -0.3680768311023712, "epoch": 2.2673594709494567, "grad_norm": 9.562073048936949, "learning_rate": 8.436218830716259e-07, "logits": -1.8909595012664795, "logps": -111.70219421386719, "loss": 0.1193, "objective": 0.12893003225326538, "ranking_idealized": 0.6625000238418579, "ranking_idealized_expo": 0.5708333253860474, "ranking_simple": 0.9166666865348816, "regularize": 0.12893003225326538, "step": 400 }, { "dpo_loss": 0.13196416199207306, "dpo_wo_beta": -0.17852090299129486, "epoch": 2.295701464336325, "grad_norm": 9.166021194752298, "learning_rate": 7.826149656671386e-07, "logits": -1.9320632219314575, "logps": -108.1246566772461, "loss": 0.1267, "objective": 0.13196416199207306, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.8958333134651184, "regularize": 0.13196416199207306, "step": 405 }, { "dpo_loss": 0.11071384698152542, "dpo_wo_beta": -0.1424117088317871, "epoch": 2.324043457723193, "grad_norm": 8.918983804471582, "learning_rate": 7.234859835833022e-07, "logits": -1.8304682970046997, "logps": -111.2301025390625, "loss": 0.112, "objective": 0.11071384698152542, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.9041666388511658, "regularize": 0.11071384698152542, "step": 410 }, { "dpo_loss": 0.1223960742354393, "dpo_wo_beta": -0.1956464648246765, "epoch": 2.3523854511100613, "grad_norm": 9.386393866562546, "learning_rate": 6.662995934939007e-07, "logits": -1.8708041906356812, "logps": -111.06449890136719, "loss": 0.1155, "objective": 0.1223960742354393, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.9083333611488342, "regularize": 0.1223960742354393, "step": 415 }, { "dpo_loss": 0.12930770218372345, "dpo_wo_beta": -0.21560731530189514, "epoch": 2.3807274444969297, "grad_norm": 11.0131183307354, "learning_rate": 6.111183278768956e-07, "logits": -1.860797643661499, "logps": -113.08780670166016, "loss": 0.133, "objective": 0.12930770218372345, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.9208333492279053, "regularize": 0.12930770218372345, "step": 420 }, { "epoch": 2.403401039206424, "eval_dpo_loss": 0.8785684108734131, "eval_dpo_wo_beta": -5.887755870819092, "eval_logits": -2.0276894569396973, "eval_logps": -117.1216812133789, "eval_loss": 0.8447906374931335, "eval_objective": 0.8785684108734131, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5413222908973694, "eval_regularize": 0.8785684108734131, "eval_runtime": 209.8564, "eval_samples_per_second": 27.59, "eval_steps_per_second": 1.153, "step": 424 }, { "dpo_loss": 0.117975153028965, "dpo_wo_beta": -0.1884605884552002, "epoch": 2.413793103448276, "grad_norm": 11.036168833651558, "learning_rate": 5.580025266360764e-07, "logits": -1.7822004556655884, "logps": -114.43038177490234, "loss": 0.1465, "objective": 0.117975153028965, "ranking_idealized": 0.6666666865348816, "ranking_idealized_expo": 0.5625, "ranking_simple": 0.9375, "regularize": 0.117975153028965, "step": 425 }, { "dpo_loss": 0.1465020477771759, "dpo_wo_beta": -0.2595965266227722, "epoch": 2.442135096835144, "grad_norm": 10.595070818850646, "learning_rate": 5.070102711202606e-07, "logits": -1.8692681789398193, "logps": -110.2762680053711, "loss": 0.1276, "objective": 0.1465020477771759, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8958333134651184, "regularize": 0.1465020477771759, "step": 430 }, { "dpo_loss": 0.09775053709745407, "dpo_wo_beta": -0.12755917012691498, "epoch": 2.4704770902220123, "grad_norm": 9.393206692367766, "learning_rate": 4.581973206121948e-07, "logits": -1.8968538045883179, "logps": -112.28767395019531, "loss": 0.1175, "objective": 0.09775053709745407, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.9083333611488342, "regularize": 0.09775053709745407, "step": 435 }, { "dpo_loss": 0.14228057861328125, "dpo_wo_beta": -0.3639788329601288, "epoch": 2.4988190836088804, "grad_norm": 8.020134663378592, "learning_rate": 4.116170513565942e-07, "logits": -1.8666160106658936, "logps": -109.18843078613281, "loss": 0.1167, "objective": 0.14228057861328125, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.875, "regularize": 0.14228057861328125, "step": 440 }, { "dpo_loss": 0.13583588600158691, "dpo_wo_beta": -0.2074100226163864, "epoch": 2.527161076995749, "grad_norm": 9.224367796824264, "learning_rate": 3.6732039819400686e-07, "logits": -1.8071045875549316, "logps": -107.2675552368164, "loss": 0.1319, "objective": 0.13583588600158691, "ranking_idealized": 0.5625, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8791666626930237, "regularize": 0.13583588600158691, "step": 445 }, { "dpo_loss": 0.17114870250225067, "dpo_wo_beta": -0.43270742893218994, "epoch": 2.555503070382617, "grad_norm": 11.265861710797749, "learning_rate": 3.253557988643072e-07, "logits": -1.9256045818328857, "logps": -111.20384216308594, "loss": 0.1288, "objective": 0.17114870250225067, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.862500011920929, "regularize": 0.17114870250225067, "step": 450 }, { "dpo_loss": 0.10827689617872238, "dpo_wo_beta": -0.1751028150320053, "epoch": 2.583845063769485, "grad_norm": 9.605136286662574, "learning_rate": 2.8576914104074425e-07, "logits": -1.9289051294326782, "logps": -109.37706756591797, "loss": 0.1168, "objective": 0.10827689617872238, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.9125000238418579, "regularize": 0.10827689617872238, "step": 455 }, { "dpo_loss": 0.11124877631664276, "dpo_wo_beta": -0.28054580092430115, "epoch": 2.6121870571563535, "grad_norm": 9.957466667064367, "learning_rate": 2.486037121524448e-07, "logits": -1.93342924118042, "logps": -113.2356948852539, "loss": 0.1169, "objective": 0.11124877631664276, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9375, "regularize": 0.11124877631664276, "step": 460 }, { "dpo_loss": 0.12714476883411407, "dpo_wo_beta": -0.22146105766296387, "epoch": 2.6405290505432215, "grad_norm": 10.04326854921629, "learning_rate": 2.13900152050239e-07, "logits": -1.8874350786209106, "logps": -108.94982147216797, "loss": 0.1189, "objective": 0.12714476883411407, "ranking_idealized": 0.5458333492279053, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.862500011920929, "regularize": 0.12714476883411407, "step": 465 }, { "dpo_loss": 0.12151040881872177, "dpo_wo_beta": -0.26416900753974915, "epoch": 2.66887104393009, "grad_norm": 8.777820527737605, "learning_rate": 1.8169640856758652e-07, "logits": -1.9314534664154053, "logps": -112.75170135498047, "loss": 0.1254, "objective": 0.12151040881872177, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5541666746139526, "ranking_simple": 0.9083333611488342, "regularize": 0.12151040881872177, "step": 470 }, { "dpo_loss": 0.12749101221561432, "dpo_wo_beta": -0.2816121280193329, "epoch": 2.697213037316958, "grad_norm": 9.221778751171357, "learning_rate": 1.5202769602517514e-07, "logits": -1.8307260274887085, "logps": -109.39693450927734, "loss": 0.1211, "objective": 0.12749101221561432, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.8999999761581421, "regularize": 0.12749101221561432, "step": 475 }, { "epoch": 2.708549834671705, "eval_dpo_loss": 0.8738968372344971, "eval_dpo_wo_beta": -5.815241813659668, "eval_logits": -2.0271613597869873, "eval_logps": -116.42301177978516, "eval_loss": 0.8371492624282837, "eval_objective": 0.8738968372344971, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5402892827987671, "eval_regularize": 0.8738968372344971, "eval_runtime": 211.9437, "eval_samples_per_second": 27.319, "eval_steps_per_second": 1.142, "step": 477 }, { "dpo_loss": 0.13781045377254486, "dpo_wo_beta": -0.2485995590686798, "epoch": 2.725555030703826, "grad_norm": 10.971551462649595, "learning_rate": 1.2492645672457838e-07, "logits": -1.9437103271484375, "logps": -108.93817901611328, "loss": 0.1267, "objective": 0.13781045377254486, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8999999761581421, "regularize": 0.13781045377254486, "step": 480 }, { "dpo_loss": 0.11082082241773605, "dpo_wo_beta": -0.10876031965017319, "epoch": 2.753897024090694, "grad_norm": 10.884940640535042, "learning_rate": 1.004223254730749e-07, "logits": -1.7556992769241333, "logps": -114.1142807006836, "loss": 0.1222, "objective": 0.11082082241773605, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.9041666388511658, "regularize": 0.11082082241773605, "step": 485 }, { "dpo_loss": 0.09154360741376877, "dpo_wo_beta": -0.05899694189429283, "epoch": 2.7822390174775626, "grad_norm": 11.015982469457516, "learning_rate": 7.854209717842231e-08, "logits": -1.8848822116851807, "logps": -110.15470886230469, "loss": 0.1058, "objective": 0.09154360741376877, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.47083333134651184, "ranking_simple": 0.9333333373069763, "regularize": 0.09154360741376877, "step": 490 }, { "dpo_loss": 0.10964310169219971, "dpo_wo_beta": -0.07648710906505585, "epoch": 2.8105810108644307, "grad_norm": 10.079416267782939, "learning_rate": 5.930969754901844e-08, "logits": -1.8575230836868286, "logps": -108.52234649658203, "loss": 0.1192, "objective": 0.10964310169219971, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8666666746139526, "regularize": 0.10964310169219971, "step": 495 }, { "dpo_loss": 0.09479068219661713, "dpo_wo_beta": -0.03411731496453285, "epoch": 2.838923004251299, "grad_norm": 9.84080114767598, "learning_rate": 4.2746156931490756e-08, "logits": -1.8439643383026123, "logps": -109.77281188964844, "loss": 0.1213, "objective": 0.09479068219661713, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.925000011920929, "regularize": 0.09479068219661713, "step": 500 }, { "dpo_loss": 0.12725140154361725, "dpo_wo_beta": -0.18973813951015472, "epoch": 2.8672649976381672, "grad_norm": 9.973754192936779, "learning_rate": 2.8869587314321324e-08, "logits": -1.8574442863464355, "logps": -110.32710266113281, "loss": 0.132, "objective": 0.12725140154361725, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.8999999761581421, "regularize": 0.12725140154361725, "step": 505 }, { "dpo_loss": 0.10469380766153336, "dpo_wo_beta": -0.1985001415014267, "epoch": 2.8956069910250353, "grad_norm": 8.936464383287202, "learning_rate": 1.7695162522652352e-08, "logits": -1.8629390001296997, "logps": -113.56767272949219, "loss": 0.1218, "objective": 0.10469380766153336, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8916666507720947, "regularize": 0.10469380766153336, "step": 510 }, { "dpo_loss": 0.11260154843330383, "dpo_wo_beta": -0.15691885352134705, "epoch": 2.9239489844119038, "grad_norm": 9.442300088571939, "learning_rate": 9.235101625932885e-09, "logits": -1.946829915046692, "logps": -108.54016876220703, "loss": 0.1258, "objective": 0.11260154843330383, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.925000011920929, "regularize": 0.11260154843330383, "step": 515 }, { "dpo_loss": 0.12230218201875687, "dpo_wo_beta": -0.10489177703857422, "epoch": 2.952290977798772, "grad_norm": 9.279898048101137, "learning_rate": 3.4986555765434415e-09, "logits": -1.8482831716537476, "logps": -114.20655059814453, "loss": 0.1228, "objective": 0.12230218201875687, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8708333373069763, "regularize": 0.12230218201875687, "step": 520 }, { "dpo_loss": 0.13335375487804413, "dpo_wo_beta": -0.35261282324790955, "epoch": 2.9806329711856403, "grad_norm": 11.094809681697281, "learning_rate": 4.920970940180958e-10, "logits": -1.876869797706604, "logps": -111.03084564208984, "loss": 0.1235, "objective": 0.13335375487804413, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8958333134651184, "regularize": 0.13335375487804413, "step": 525 }, { "epoch": 2.9976381672177608, "step": 528, "total_flos": 0.0, "train_loss": 0.023984534440167023, "train_runtime": 3310.3799, "train_samples_per_second": 46.039, "train_steps_per_second": 0.159 } ], "logging_steps": 5, "max_steps": 528, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 53, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }