{ "best_metric": 0.5869565010070801, "best_model_checkpoint": "./qwen2.5-0.5b/qwen2.5-0.5b-expo-DPO-ES-TRY/checkpoint-583", "epoch": 2.9976381672177608, "eval_steps": 53, "global_step": 1056, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_loss": 0.6931471824645996, "dpo_wo_beta": -0.6931471824645996, "epoch": 0.005668398677373642, "grad_norm": 13.433600669124935, "learning_rate": 9.433962264150944e-08, "logits": -1.3874311447143555, "logps": -88.43561553955078, "loss": 0.6931, "objective": 0.6931471824645996, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.5208333134651184, "regularize": 0.6931471824645996, "step": 1 }, { "dpo_loss": 0.693236768245697, "dpo_wo_beta": -0.6993356347084045, "epoch": 0.02834199338686821, "grad_norm": 13.640653628388394, "learning_rate": 4.716981132075472e-07, "logits": -1.4090652465820312, "logps": -84.34337615966797, "loss": 0.693, "objective": 0.693236768245697, "ranking_idealized": 0.6197916865348816, "ranking_idealized_expo": 0.546875, "ranking_simple": 0.546875, "regularize": 0.693236768245697, "step": 5 }, { "dpo_loss": 0.6845630407333374, "dpo_wo_beta": -0.7111619710922241, "epoch": 0.05668398677373642, "grad_norm": 12.626074407134174, "learning_rate": 9.433962264150944e-07, "logits": -1.4784893989562988, "logps": -81.94055938720703, "loss": 0.6892, "objective": 0.6845630407333374, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.512499988079071, "regularize": 0.6845630407333374, "step": 10 }, { "dpo_loss": 0.6825469136238098, "dpo_wo_beta": -0.8259204626083374, "epoch": 0.08502598016060463, "grad_norm": 12.374180595083178, "learning_rate": 1.4150943396226415e-06, "logits": -1.4932299852371216, "logps": -81.52880096435547, "loss": 0.6814, "objective": 0.6825469136238098, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.512499988079071, "regularize": 0.6825469136238098, "step": 15 }, { "dpo_loss": 0.6950914263725281, "dpo_wo_beta": -1.2390469312667847, "epoch": 0.11336797354747284, "grad_norm": 14.839934392200913, "learning_rate": 1.8867924528301889e-06, "logits": -1.5371100902557373, "logps": -82.72624969482422, "loss": 0.6711, "objective": 0.6950914263725281, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.5249999761581421, "regularize": 0.6950914263725281, "step": 20 }, { "dpo_loss": 0.6556071043014526, "dpo_wo_beta": -1.110619068145752, "epoch": 0.14170996693434104, "grad_norm": 12.89805052529156, "learning_rate": 2.358490566037736e-06, "logits": -1.6399922370910645, "logps": -81.59695434570312, "loss": 0.6589, "objective": 0.6556071043014526, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.5249999761581421, "regularize": 0.6556071043014526, "step": 25 }, { "dpo_loss": 0.6518108248710632, "dpo_wo_beta": -1.2506839036941528, "epoch": 0.17005196032120926, "grad_norm": 12.64998937636519, "learning_rate": 2.830188679245283e-06, "logits": -1.6404598951339722, "logps": -83.20111846923828, "loss": 0.6451, "objective": 0.6518108248710632, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.5666666626930237, "regularize": 0.6518108248710632, "step": 30 }, { "dpo_loss": 0.6226770877838135, "dpo_wo_beta": -1.394917368888855, "epoch": 0.19839395370807747, "grad_norm": 13.760162421635227, "learning_rate": 3.30188679245283e-06, "logits": -1.6237396001815796, "logps": -87.80964660644531, "loss": 0.6189, "objective": 0.6226770877838135, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.5583333373069763, "regularize": 0.6226770877838135, "step": 35 }, { "dpo_loss": 0.5924390554428101, "dpo_wo_beta": -1.422450304031372, "epoch": 0.22673594709494568, "grad_norm": 16.810886476613117, "learning_rate": 3.7735849056603777e-06, "logits": -1.620682954788208, "logps": -91.93690490722656, "loss": 0.6076, "objective": 0.5924390554428101, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.5708333253860474, "regularize": 0.5924390554428101, "step": 40 }, { "dpo_loss": 0.573756217956543, "dpo_wo_beta": -1.3691534996032715, "epoch": 0.25507794048181387, "grad_norm": 13.798774501924722, "learning_rate": 4.245283018867925e-06, "logits": -1.7814558744430542, "logps": -92.24474334716797, "loss": 0.5989, "objective": 0.573756217956543, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.5666666626930237, "regularize": 0.573756217956543, "step": 45 }, { "dpo_loss": 0.5726417899131775, "dpo_wo_beta": -1.3605374097824097, "epoch": 0.2834199338686821, "grad_norm": 12.568473894025988, "learning_rate": 4.716981132075472e-06, "logits": -1.808895468711853, "logps": -90.65751647949219, "loss": 0.5954, "objective": 0.5726417899131775, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.42500001192092896, "ranking_simple": 0.574999988079071, "regularize": 0.5726417899131775, "step": 50 }, { "epoch": 0.300425129900803, "eval_dpo_loss": 0.7112604975700378, "eval_dpo_wo_beta": -2.2659413814544678, "eval_logits": -1.892814040184021, "eval_logps": -101.36742401123047, "eval_loss": 0.6816489100456238, "eval_objective": 0.7112604975700378, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5237603187561035, "eval_regularize": 0.7112604975700378, "eval_runtime": 211.6587, "eval_samples_per_second": 27.355, "eval_steps_per_second": 1.143, "step": 53 }, { "dpo_loss": 0.5827316045761108, "dpo_wo_beta": -1.6213361024856567, "epoch": 0.3117619272555503, "grad_norm": 14.442715913160086, "learning_rate": 4.999781286194085e-06, "logits": -1.8762638568878174, "logps": -93.41423797607422, "loss": 0.5721, "objective": 0.5827316045761108, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.6041666865348816, "regularize": 0.5827316045761108, "step": 55 }, { "dpo_loss": 0.5553872585296631, "dpo_wo_beta": -1.6468366384506226, "epoch": 0.3401039206424185, "grad_norm": 13.845514282811145, "learning_rate": 4.997321195347154e-06, "logits": -1.8914529085159302, "logps": -90.59642028808594, "loss": 0.5756, "objective": 0.5553872585296631, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.5791666507720947, "regularize": 0.5553872585296631, "step": 60 }, { "dpo_loss": 0.5302771329879761, "dpo_wo_beta": -1.3166770935058594, "epoch": 0.3684459140292867, "grad_norm": 10.846857687148022, "learning_rate": 4.992130320438411e-06, "logits": -1.8399535417556763, "logps": -86.60197448730469, "loss": 0.5586, "objective": 0.5302771329879761, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.6208333373069763, "regularize": 0.5302771329879761, "step": 65 }, { "dpo_loss": 0.5711485743522644, "dpo_wo_beta": -1.7437169551849365, "epoch": 0.39678790741615494, "grad_norm": 13.787840238803502, "learning_rate": 4.984214337613357e-06, "logits": -1.8178967237472534, "logps": -91.10688781738281, "loss": 0.5701, "objective": 0.5711485743522644, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.5874999761581421, "regularize": 0.5711485743522644, "step": 70 }, { "dpo_loss": 0.523643434047699, "dpo_wo_beta": -1.669514536857605, "epoch": 0.42512990080302315, "grad_norm": 13.192298437287352, "learning_rate": 4.97358190288299e-06, "logits": -1.8182169198989868, "logps": -94.8000717163086, "loss": 0.5205, "objective": 0.523643434047699, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.6583333611488342, "regularize": 0.523643434047699, "step": 75 }, { "dpo_loss": 0.51079261302948, "dpo_wo_beta": -1.7271808385849, "epoch": 0.45347189418989137, "grad_norm": 15.151373786996814, "learning_rate": 4.9602446426585845e-06, "logits": -1.8920824527740479, "logps": -93.58238220214844, "loss": 0.5285, "objective": 0.51079261302948, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.6458333134651184, "regularize": 0.51079261302948, "step": 80 }, { "dpo_loss": 0.5066012144088745, "dpo_wo_beta": -1.5956443548202515, "epoch": 0.4818138875767596, "grad_norm": 12.328960275584794, "learning_rate": 4.944217141038379e-06, "logits": -1.8741406202316284, "logps": -87.06742858886719, "loss": 0.5202, "objective": 0.5066012144088745, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.6208333373069763, "regularize": 0.5066012144088745, "step": 85 }, { "dpo_loss": 0.5358369946479797, "dpo_wo_beta": -1.9357556104660034, "epoch": 0.5101558809636277, "grad_norm": 12.694483590051824, "learning_rate": 4.925516923860083e-06, "logits": -1.7968534231185913, "logps": -86.77802276611328, "loss": 0.4858, "objective": 0.5358369946479797, "ranking_idealized": 0.5458333492279053, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.5874999761581421, "regularize": 0.5358369946479797, "step": 90 }, { "dpo_loss": 0.4783257842063904, "dpo_wo_beta": -1.9098786115646362, "epoch": 0.538497874350496, "grad_norm": 14.474706973531484, "learning_rate": 4.904164439536626e-06, "logits": -1.8568389415740967, "logps": -88.12813568115234, "loss": 0.4865, "objective": 0.4783257842063904, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.6916666626930237, "regularize": 0.4783257842063904, "step": 95 }, { "dpo_loss": 0.4654810130596161, "dpo_wo_beta": -1.9254087209701538, "epoch": 0.5668398677373642, "grad_norm": 13.577084707122001, "learning_rate": 4.880183036696123e-06, "logits": -1.938937783241272, "logps": -92.29436492919922, "loss": 0.5016, "objective": 0.4654810130596161, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.6875, "regularize": 0.4654810130596161, "step": 100 }, { "dpo_loss": 0.4374677240848541, "dpo_wo_beta": -1.4267934560775757, "epoch": 0.5951818611242324, "grad_norm": 11.14545328639218, "learning_rate": 4.853598938650487e-06, "logits": -1.8158982992172241, "logps": -90.21449279785156, "loss": 0.4618, "objective": 0.4374677240848541, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.6666666865348816, "regularize": 0.4374677240848541, "step": 105 }, { "epoch": 0.600850259801606, "eval_dpo_loss": 0.6936022639274597, "eval_dpo_wo_beta": -2.462427854537964, "eval_logits": -1.9007418155670166, "eval_logps": -94.35714721679688, "eval_loss": 0.6912521123886108, "eval_objective": 0.6936022639274597, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5351239442825317, "eval_regularize": 0.6936022639274597, "eval_runtime": 210.2297, "eval_samples_per_second": 27.541, "eval_steps_per_second": 1.151, "step": 106 }, { "dpo_loss": 0.47933149337768555, "dpo_wo_beta": -1.9683055877685547, "epoch": 0.6235238545111006, "grad_norm": 12.39392340166307, "learning_rate": 4.824441214720629e-06, "logits": -1.9334439039230347, "logps": -87.35523223876953, "loss": 0.4633, "objective": 0.47933149337768555, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4541666805744171, "ranking_simple": 0.6625000238418579, "regularize": 0.47933149337768555, "step": 110 }, { "dpo_loss": 0.4749464690685272, "dpo_wo_beta": -1.7375919818878174, "epoch": 0.6518658478979689, "grad_norm": 12.612865651893962, "learning_rate": 4.7927417484495756e-06, "logits": -1.9057692289352417, "logps": -87.68991088867188, "loss": 0.4712, "objective": 0.4749464690685272, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.6333333253860474, "regularize": 0.4749464690685272, "step": 115 }, { "dpo_loss": 0.4848935306072235, "dpo_wo_beta": -1.9273093938827515, "epoch": 0.680207841284837, "grad_norm": 13.836239066838136, "learning_rate": 4.758535202738287e-06, "logits": -1.8775906562805176, "logps": -87.8878173828125, "loss": 0.4641, "objective": 0.4848935306072235, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.6625000238418579, "regularize": 0.4848935306072235, "step": 120 }, { "dpo_loss": 0.4785127639770508, "dpo_wo_beta": -1.814666748046875, "epoch": 0.7085498346717053, "grad_norm": 12.105170057238437, "learning_rate": 4.721858981942284e-06, "logits": -1.8346068859100342, "logps": -86.40522766113281, "loss": 0.4801, "objective": 0.4785127639770508, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.6875, "regularize": 0.4785127639770508, "step": 125 }, { "dpo_loss": 0.4548089802265167, "dpo_wo_beta": -1.4164987802505493, "epoch": 0.7368918280585735, "grad_norm": 11.895980627109102, "learning_rate": 4.682753190970533e-06, "logits": -1.9488608837127686, "logps": -79.42195129394531, "loss": 0.4538, "objective": 0.4548089802265167, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.44999998807907104, "ranking_simple": 0.6291666626930237, "regularize": 0.4548089802265167, "step": 130 }, { "dpo_loss": 0.49760884046554565, "dpo_wo_beta": -1.994195818901062, "epoch": 0.7652338214454416, "grad_norm": 12.298776298341995, "learning_rate": 4.641260591431315e-06, "logits": -1.9813282489776611, "logps": -82.40634155273438, "loss": 0.4433, "objective": 0.49760884046554565, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.6583333611488342, "regularize": 0.49760884046554565, "step": 135 }, { "dpo_loss": 0.41459351778030396, "dpo_wo_beta": -1.187635064125061, "epoch": 0.7935758148323099, "grad_norm": 12.618720178096575, "learning_rate": 4.597426554873037e-06, "logits": -1.97609281539917, "logps": -83.44467163085938, "loss": 0.4236, "objective": 0.41459351778030396, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.6791666746139526, "regularize": 0.41459351778030396, "step": 140 }, { "dpo_loss": 0.4073801636695862, "dpo_wo_beta": -1.311059832572937, "epoch": 0.821917808219178, "grad_norm": 14.417917904409194, "learning_rate": 4.551299013171111e-06, "logits": -2.0718839168548584, "logps": -84.2674560546875, "loss": 0.4215, "objective": 0.4073801636695862, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.6916666626930237, "regularize": 0.4073801636695862, "step": 145 }, { "dpo_loss": 0.4207518398761749, "dpo_wo_beta": -1.50857675075531, "epoch": 0.8502598016060463, "grad_norm": 11.543599868064442, "learning_rate": 4.502928406115152e-06, "logits": -2.0730583667755127, "logps": -82.68958282470703, "loss": 0.4276, "objective": 0.4207518398761749, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.7208333611488342, "regularize": 0.4207518398761749, "step": 150 }, { "dpo_loss": 0.3847941756248474, "dpo_wo_beta": -1.4449684619903564, "epoch": 0.8786017949929145, "grad_norm": 12.08771803065001, "learning_rate": 4.452367626253805e-06, "logits": -2.0991933345794678, "logps": -85.211181640625, "loss": 0.3986, "objective": 0.3847941756248474, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.7250000238418579, "regularize": 0.3847941756248474, "step": 155 }, { "epoch": 0.9012753897024091, "eval_dpo_loss": 0.7214789390563965, "eval_dpo_wo_beta": -3.1229145526885986, "eval_logits": -2.1450352668762207, "eval_logps": -95.60012817382812, "eval_loss": 0.7013870477676392, "eval_objective": 0.7214789390563965, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5351239442825317, "eval_regularize": 0.7214789390563965, "eval_runtime": 210.3593, "eval_samples_per_second": 27.524, "eval_steps_per_second": 1.15, "step": 159 }, { "dpo_loss": 0.4162478744983673, "dpo_wo_beta": -1.6461573839187622, "epoch": 0.9069437883797827, "grad_norm": 12.82345397067452, "learning_rate": 4.399671961057523e-06, "logits": -2.0759384632110596, "logps": -89.25846862792969, "loss": 0.4236, "objective": 0.4162478744983673, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.699999988079071, "regularize": 0.4162478744983673, "step": 160 }, { "dpo_loss": 0.41358453035354614, "dpo_wo_beta": -1.648630976676941, "epoch": 0.9352857817666509, "grad_norm": 12.860537676624453, "learning_rate": 4.3448990324625244e-06, "logits": -2.024477481842041, "logps": -88.03329467773438, "loss": 0.4026, "objective": 0.41358453035354614, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.7333333492279053, "regularize": 0.41358453035354614, "step": 165 }, { "dpo_loss": 0.378000408411026, "dpo_wo_beta": -1.2966532707214355, "epoch": 0.9636277751535192, "grad_norm": 11.533711130228069, "learning_rate": 4.288108733862064e-06, "logits": -2.042527437210083, "logps": -90.26854705810547, "loss": 0.3925, "objective": 0.378000408411026, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.7166666388511658, "regularize": 0.378000408411026, "step": 170 }, { "dpo_loss": 0.3764660954475403, "dpo_wo_beta": -1.3978971242904663, "epoch": 0.9919697685403873, "grad_norm": 12.165192869157089, "learning_rate": 4.229363164613874e-06, "logits": -2.0610477924346924, "logps": -89.8354721069336, "loss": 0.3793, "objective": 0.3764660954475403, "ranking_idealized": 0.6541666388511658, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.7916666865348816, "regularize": 0.3764660954475403, "step": 175 }, { "dpo_loss": 0.27626773715019226, "dpo_wo_beta": -0.8504549860954285, "epoch": 1.0203117619272555, "grad_norm": 10.141692447282386, "learning_rate": 4.168726562135432e-06, "logits": -2.2514243125915527, "logps": -90.8476333618164, "loss": 0.2852, "objective": 0.27626773715019226, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8083333373069763, "regularize": 0.27626773715019226, "step": 180 }, { "dpo_loss": 0.23696589469909668, "dpo_wo_beta": -0.6947117447853088, "epoch": 1.0486537553141237, "grad_norm": 13.78702272812957, "learning_rate": 4.106265231661292e-06, "logits": -2.158977746963501, "logps": -95.00120544433594, "loss": 0.2429, "objective": 0.23696589469909668, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8083333373069763, "regularize": 0.23696589469909668, "step": 185 }, { "dpo_loss": 0.26388806104660034, "dpo_wo_beta": -0.9112051725387573, "epoch": 1.076995748700992, "grad_norm": 14.740228375586371, "learning_rate": 4.042047473739278e-06, "logits": -2.1533920764923096, "logps": -101.71949768066406, "loss": 0.2517, "objective": 0.26388806104660034, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5541666746139526, "ranking_simple": 0.8416666388511658, "regularize": 0.26388806104660034, "step": 190 }, { "dpo_loss": 0.2244579941034317, "dpo_wo_beta": -0.6430780291557312, "epoch": 1.10533774208786, "grad_norm": 10.169064121599527, "learning_rate": 3.976143509544843e-06, "logits": -2.1589295864105225, "logps": -96.5248031616211, "loss": 0.2467, "objective": 0.2244579941034317, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5666666626930237, "ranking_simple": 0.8083333373069763, "regularize": 0.2244579941034317, "step": 195 }, { "dpo_loss": 0.24179764091968536, "dpo_wo_beta": -0.6332272291183472, "epoch": 1.1336797354747283, "grad_norm": 9.444774343787891, "learning_rate": 3.908625404095242e-06, "logits": -2.2753493785858154, "logps": -91.93312072753906, "loss": 0.2563, "objective": 0.24179764091968536, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8458333611488342, "regularize": 0.24179764091968536, "step": 200 }, { "dpo_loss": 0.25683078169822693, "dpo_wo_beta": -0.8531176447868347, "epoch": 1.1620217288615966, "grad_norm": 9.240319326762517, "learning_rate": 3.839566987447492e-06, "logits": -2.2432618141174316, "logps": -91.3159408569336, "loss": 0.2584, "objective": 0.25683078169822693, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8166666626930237, "regularize": 0.25683078169822693, "step": 205 }, { "dpo_loss": 0.24292893707752228, "dpo_wo_beta": -0.8205318450927734, "epoch": 1.1903637222484649, "grad_norm": 9.283856100785183, "learning_rate": 3.7690437739662928e-06, "logits": -2.2361652851104736, "logps": -90.6613998413086, "loss": 0.2551, "objective": 0.24292893707752228, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5, "ranking_simple": 0.800000011920929, "regularize": 0.24292893707752228, "step": 210 }, { "epoch": 1.201700519603212, "eval_dpo_loss": 0.7525234222412109, "eval_dpo_wo_beta": -3.7749528884887695, "eval_logits": -2.267778158187866, "eval_logps": -98.14269256591797, "eval_loss": 0.7350714206695557, "eval_objective": 0.7525234222412109, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5371900796890259, "eval_regularize": 0.7525234222412109, "eval_runtime": 210.8898, "eval_samples_per_second": 27.455, "eval_steps_per_second": 1.148, "step": 212 }, { "dpo_loss": 0.289533793926239, "dpo_wo_beta": -0.8810125589370728, "epoch": 1.2187057156353331, "grad_norm": 10.72372972136692, "learning_rate": 3.697132879750174e-06, "logits": -2.1757090091705322, "logps": -93.64250183105469, "loss": 0.2578, "objective": 0.289533793926239, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.7875000238418579, "regularize": 0.289533793926239, "step": 215 }, { "dpo_loss": 0.25134381651878357, "dpo_wo_beta": -0.8703542947769165, "epoch": 1.2470477090222012, "grad_norm": 12.940604838816247, "learning_rate": 3.6239129383061764e-06, "logits": -2.121750593185425, "logps": -94.44015502929688, "loss": 0.2676, "objective": 0.25134381651878357, "ranking_idealized": 0.6541666388511658, "ranking_idealized_expo": 0.6041666865348816, "ranking_simple": 0.8208333253860474, "regularize": 0.25134381651878357, "step": 220 }, { "dpo_loss": 0.23937886953353882, "dpo_wo_beta": -0.7396827936172485, "epoch": 1.2753897024090695, "grad_norm": 9.645711793319885, "learning_rate": 3.5494640145652647e-06, "logits": -2.0901684761047363, "logps": -94.10260772705078, "loss": 0.2637, "objective": 0.23937886953353882, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8541666865348816, "regularize": 0.23937886953353882, "step": 225 }, { "dpo_loss": 0.2818019688129425, "dpo_wo_beta": -1.1170729398727417, "epoch": 1.3037316957959377, "grad_norm": 8.80210598601974, "learning_rate": 3.4738675173325008e-06, "logits": -1.9860222339630127, "logps": -92.9978256225586, "loss": 0.2776, "objective": 0.2818019688129425, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.7749999761581421, "regularize": 0.2818019688129425, "step": 230 }, { "dpo_loss": 0.22621506452560425, "dpo_wo_beta": -0.35843732953071594, "epoch": 1.3320736891828058, "grad_norm": 9.267612473930496, "learning_rate": 3.397206110267713e-06, "logits": -2.1131467819213867, "logps": -87.49403381347656, "loss": 0.2618, "objective": 0.22621506452560425, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8333333134651184, "regularize": 0.22621506452560425, "step": 235 }, { "dpo_loss": 0.23632274568080902, "dpo_wo_beta": -0.6697984933853149, "epoch": 1.360415682569674, "grad_norm": 10.68594080832048, "learning_rate": 3.3195636214939943e-06, "logits": -2.130047559738159, "logps": -91.7619857788086, "loss": 0.2584, "objective": 0.23632274568080902, "ranking_idealized": 0.5541666746139526, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8125, "regularize": 0.23632274568080902, "step": 240 }, { "dpo_loss": 0.2982023358345032, "dpo_wo_beta": -1.1124054193496704, "epoch": 1.3887576759565423, "grad_norm": 10.330360151122868, "learning_rate": 3.2410249519328848e-06, "logits": -2.1718757152557373, "logps": -93.45353698730469, "loss": 0.2692, "objective": 0.2982023358345032, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.8041666746139526, "regularize": 0.2982023358345032, "step": 245 }, { "dpo_loss": 0.2403133064508438, "dpo_wo_beta": -0.7000442147254944, "epoch": 1.4170996693434104, "grad_norm": 13.026578288520353, "learning_rate": 3.1616759824664543e-06, "logits": -2.145325183868408, "logps": -94.18195343017578, "loss": 0.269, "objective": 0.2403133064508438, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.47083333134651184, "ranking_simple": 0.824999988079071, "regularize": 0.2403133064508438, "step": 250 }, { "dpo_loss": 0.23977436125278473, "dpo_wo_beta": -0.5784927010536194, "epoch": 1.4454416627302786, "grad_norm": 10.959901566104394, "learning_rate": 3.081603480027826e-06, "logits": -2.108074426651001, "logps": -94.5383529663086, "loss": 0.2625, "objective": 0.23977436125278473, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5458333492279053, "ranking_simple": 0.8374999761581421, "regularize": 0.23977436125278473, "step": 255 }, { "dpo_loss": 0.25297579169273376, "dpo_wo_beta": -0.7996426820755005, "epoch": 1.473783656117147, "grad_norm": 9.578050078679867, "learning_rate": 3.0008950027228035e-06, "logits": -2.1828908920288086, "logps": -92.77781677246094, "loss": 0.232, "objective": 0.25297579169273376, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.8041666746139526, "regularize": 0.25297579169273376, "step": 260 }, { "dpo_loss": 0.2830916941165924, "dpo_wo_beta": -1.124144434928894, "epoch": 1.5021256495040152, "grad_norm": 11.27765707111355, "learning_rate": 2.9196388040863695e-06, "logits": -2.1150081157684326, "logps": -95.04662322998047, "loss": 0.2623, "objective": 0.2830916941165924, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8041666746139526, "regularize": 0.2830916941165924, "step": 265 }, { "epoch": 1.5021256495040152, "eval_dpo_loss": 0.7739136815071106, "eval_dpo_wo_beta": -4.163427829742432, "eval_logits": -2.1478331089019775, "eval_logps": -100.8313217163086, "eval_loss": 0.7400166392326355, "eval_objective": 0.7739136815071106, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.53925621509552, "eval_regularize": 0.7739136815071106, "eval_runtime": 210.8657, "eval_samples_per_second": 27.458, "eval_steps_per_second": 1.148, "step": 265 }, { "dpo_loss": 0.2930367887020111, "dpo_wo_beta": -1.3651045560836792, "epoch": 1.5304676428908834, "grad_norm": 11.715215816813723, "learning_rate": 2.8379237365787426e-06, "logits": -2.035703182220459, "logps": -97.7331771850586, "loss": 0.253, "objective": 0.2930367887020111, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.7916666865348816, "regularize": 0.2930367887020111, "step": 270 }, { "dpo_loss": 0.24886849522590637, "dpo_wo_beta": -0.8069366216659546, "epoch": 1.5588096362777515, "grad_norm": 8.958944325794365, "learning_rate": 2.7558391544265127e-06, "logits": -1.9700883626937866, "logps": -97.53855895996094, "loss": 0.2491, "objective": 0.24886849522590637, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8125, "regularize": 0.24886849522590637, "step": 275 }, { "dpo_loss": 0.22936613857746124, "dpo_wo_beta": -0.6120084524154663, "epoch": 1.5871516296646198, "grad_norm": 10.814739938498821, "learning_rate": 2.6734748159151104e-06, "logits": -1.9118597507476807, "logps": -98.06639099121094, "loss": 0.2491, "objective": 0.22936613857746124, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.8374999761581421, "regularize": 0.22936613857746124, "step": 280 }, { "dpo_loss": 0.22401383519172668, "dpo_wo_beta": -0.5180224776268005, "epoch": 1.615493623051488, "grad_norm": 11.270657822712987, "learning_rate": 2.5909207852394363e-06, "logits": -1.9585484266281128, "logps": -100.70836639404297, "loss": 0.2348, "objective": 0.22401383519172668, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.8291666507720947, "regularize": 0.22401383519172668, "step": 285 }, { "dpo_loss": 0.2646006941795349, "dpo_wo_beta": -0.7763135433197021, "epoch": 1.643835616438356, "grad_norm": 10.585292794409252, "learning_rate": 2.508267334019988e-06, "logits": -1.9566444158554077, "logps": -97.0122299194336, "loss": 0.2532, "objective": 0.2646006941795349, "ranking_idealized": 0.612500011920929, "ranking_idealized_expo": 0.5625, "ranking_simple": 0.8041666746139526, "regularize": 0.2646006941795349, "step": 290 }, { "dpo_loss": 0.23113909363746643, "dpo_wo_beta": -0.6497251987457275, "epoch": 1.6721776098252243, "grad_norm": 11.90240881956814, "learning_rate": 2.4256048425921693e-06, "logits": -1.8574607372283936, "logps": -94.91531372070312, "loss": 0.2476, "objective": 0.23113909363746643, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8333333134651184, "regularize": 0.23113909363746643, "step": 295 }, { "dpo_loss": 0.22116926312446594, "dpo_wo_beta": -0.6268281936645508, "epoch": 1.7005196032120926, "grad_norm": 11.745161783871675, "learning_rate": 2.3430237011767166e-06, "logits": -1.895004153251648, "logps": -97.79885864257812, "loss": 0.2266, "objective": 0.22116926312446594, "ranking_idealized": 0.6583333611488342, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.8333333134651184, "regularize": 0.22116926312446594, "step": 300 }, { "dpo_loss": 0.24756571650505066, "dpo_wo_beta": -0.9131773114204407, "epoch": 1.7288615965989607, "grad_norm": 12.299641904512029, "learning_rate": 2.2606142110393248e-06, "logits": -1.8061485290527344, "logps": -96.69060516357422, "loss": 0.2379, "objective": 0.24756571650505066, "ranking_idealized": 0.5625, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8291666507720947, "regularize": 0.24756571650505066, "step": 305 }, { "dpo_loss": 0.2321903556585312, "dpo_wo_beta": -0.6867564916610718, "epoch": 1.7572035899858292, "grad_norm": 13.489735935272718, "learning_rate": 2.1784664857475356e-06, "logits": -1.8388514518737793, "logps": -95.04447937011719, "loss": 0.2456, "objective": 0.2321903556585312, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.824999988079071, "regularize": 0.2321903556585312, "step": 310 }, { "dpo_loss": 0.2901044189929962, "dpo_wo_beta": -1.1286156177520752, "epoch": 1.7855455833726972, "grad_norm": 10.887596324980125, "learning_rate": 2.096670352632873e-06, "logits": -1.75984525680542, "logps": -94.63612365722656, "loss": 0.2571, "objective": 0.2901044189929962, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8083333373069763, "regularize": 0.2901044189929962, "step": 315 }, { "epoch": 1.8025507794048181, "eval_dpo_loss": 0.7664583325386047, "eval_dpo_wo_beta": -4.09501838684082, "eval_logits": -1.9888346195220947, "eval_logps": -102.3712158203125, "eval_loss": 0.7400712966918945, "eval_objective": 0.7664583325386047, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.53925621509552, "eval_regularize": 0.7664583325386047, "eval_runtime": 210.274, "eval_samples_per_second": 27.535, "eval_steps_per_second": 1.151, "step": 318 }, { "dpo_loss": 0.2219768464565277, "dpo_wo_beta": -0.47742757201194763, "epoch": 1.8138875767595655, "grad_norm": 11.029480506309918, "learning_rate": 2.01531525456598e-06, "logits": -1.9175788164138794, "logps": -99.74655151367188, "loss": 0.2404, "objective": 0.2219768464565277, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5, "ranking_simple": 0.824999988079071, "regularize": 0.2219768464565277, "step": 320 }, { "dpo_loss": 0.24908211827278137, "dpo_wo_beta": -0.8014059066772461, "epoch": 1.8422295701464337, "grad_norm": 12.92850322071669, "learning_rate": 1.93449015215215e-06, "logits": -2.0084919929504395, "logps": -101.09780883789062, "loss": 0.2586, "objective": 0.24908211827278137, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.8333333134651184, "regularize": 0.24908211827278137, "step": 325 }, { "dpo_loss": 0.1984507441520691, "dpo_wo_beta": -0.3766098618507385, "epoch": 1.8705715635333018, "grad_norm": 10.415606016359964, "learning_rate": 1.8542834264542091e-06, "logits": -1.851909875869751, "logps": -94.5366439819336, "loss": 0.2496, "objective": 0.1984507441520691, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8291666507720947, "regularize": 0.1984507441520691, "step": 330 }, { "dpo_loss": 0.26707762479782104, "dpo_wo_beta": -0.9339324831962585, "epoch": 1.89891355692017, "grad_norm": 10.078352873471246, "learning_rate": 1.7747827823491253e-06, "logits": -1.9827288389205933, "logps": -94.26249694824219, "loss": 0.2463, "objective": 0.26707762479782104, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.8125, "regularize": 0.26707762479782104, "step": 335 }, { "dpo_loss": 0.2447831928730011, "dpo_wo_beta": -0.7387041449546814, "epoch": 1.9272555503070383, "grad_norm": 10.88136655004607, "learning_rate": 1.6960751526240122e-06, "logits": -1.9671465158462524, "logps": -98.63937377929688, "loss": 0.2399, "objective": 0.2447831928730011, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8583333492279053, "regularize": 0.2447831928730011, "step": 340 }, { "dpo_loss": 0.2123527079820633, "dpo_wo_beta": -0.5544185638427734, "epoch": 1.9555975436939064, "grad_norm": 11.18260747105762, "learning_rate": 1.6182466029163974e-06, "logits": -1.9572845697402954, "logps": -100.18721008300781, "loss": 0.2211, "objective": 0.2123527079820633, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8208333253860474, "regularize": 0.2123527079820633, "step": 345 }, { "dpo_loss": 0.2570362389087677, "dpo_wo_beta": -0.7474013566970825, "epoch": 1.9839395370807746, "grad_norm": 11.061918116138507, "learning_rate": 1.541382237602721e-06, "logits": -1.8960832357406616, "logps": -101.65901947021484, "loss": 0.2316, "objective": 0.2570362389087677, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.7916666865348816, "regularize": 0.2570362389087677, "step": 350 }, { "dpo_loss": 0.19961656630039215, "dpo_wo_beta": -0.5642960667610168, "epoch": 2.012281530467643, "grad_norm": 7.569515164252156, "learning_rate": 1.465566106737942e-06, "logits": -1.8380100727081299, "logps": -102.71571350097656, "loss": 0.2103, "objective": 0.19961656630039215, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.8374999761581421, "regularize": 0.19961656630039215, "step": 355 }, { "dpo_loss": 0.11018560826778412, "dpo_wo_beta": -0.12253165245056152, "epoch": 2.040623523854511, "grad_norm": 6.632276986432463, "learning_rate": 1.3908811141480408e-06, "logits": -1.867693543434143, "logps": -103.06665802001953, "loss": 0.118, "objective": 0.11018560826778412, "ranking_idealized": 0.6625000238418579, "ranking_idealized_expo": 0.5375000238418579, "ranking_simple": 0.9041666388511658, "regularize": 0.11018560826778412, "step": 360 }, { "dpo_loss": 0.12077057361602783, "dpo_wo_beta": -0.197490856051445, "epoch": 2.0689655172413794, "grad_norm": 10.213186193965676, "learning_rate": 1.3174089267758983e-06, "logits": -1.8255099058151245, "logps": -110.3724136352539, "loss": 0.118, "objective": 0.12077057361602783, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8666666746139526, "regularize": 0.12077057361602783, "step": 365 }, { "dpo_loss": 0.1337815225124359, "dpo_wo_beta": -0.27523547410964966, "epoch": 2.0973075106282475, "grad_norm": 9.926730675582434, "learning_rate": 1.245229885379699e-06, "logits": -1.7588540315628052, "logps": -111.99506378173828, "loss": 0.1227, "objective": 0.1337815225124359, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.8916666507720947, "regularize": 0.1337815225124359, "step": 370 }, { "epoch": 2.1029759093056213, "eval_dpo_loss": 0.9223728179931641, "eval_dpo_wo_beta": -6.4510064125061035, "eval_logits": -1.8644566535949707, "eval_logps": -122.00161743164062, "eval_loss": 0.8844180107116699, "eval_objective": 0.9223728179931641, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5423553586006165, "eval_regularize": 0.9223728179931641, "eval_runtime": 210.7356, "eval_samples_per_second": 27.475, "eval_steps_per_second": 1.148, "step": 371 }, { "dpo_loss": 0.10664375871419907, "dpo_wo_beta": -0.2532973289489746, "epoch": 2.1256495040151155, "grad_norm": 9.740007111179482, "learning_rate": 1.1744229166814889e-06, "logits": -1.696647047996521, "logps": -118.39366149902344, "loss": 0.1103, "objective": 0.10664375871419907, "ranking_idealized": 0.675000011920929, "ranking_idealized_expo": 0.5916666388511658, "ranking_simple": 0.925000011920929, "regularize": 0.10664375871419907, "step": 375 }, { "dpo_loss": 0.12854978442192078, "dpo_wo_beta": -0.27664583921432495, "epoch": 2.153991497401984, "grad_norm": 9.699256456859702, "learning_rate": 1.1050654470619602e-06, "logits": -1.700494647026062, "logps": -114.1063232421875, "loss": 0.1208, "objective": 0.12854978442192078, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8999999761581421, "regularize": 0.12854978442192078, "step": 380 }, { "dpo_loss": 0.10418140888214111, "dpo_wo_beta": -0.09889766573905945, "epoch": 2.182333490788852, "grad_norm": 9.620361843085416, "learning_rate": 1.0372333178958462e-06, "logits": -1.8633235692977905, "logps": -110.55794525146484, "loss": 0.1244, "objective": 0.10418140888214111, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5041666626930237, "ranking_simple": 0.9125000238418579, "regularize": 0.10418140888214111, "step": 385 }, { "dpo_loss": 0.12462247163057327, "dpo_wo_beta": -0.2658768594264984, "epoch": 2.21067548417572, "grad_norm": 11.000881222201947, "learning_rate": 9.710007026204896e-07, "logits": -1.7877620458602905, "logps": -112.08268737792969, "loss": 0.1204, "objective": 0.12462247163057327, "ranking_idealized": 0.5249999761581421, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8833333253860474, "regularize": 0.12462247163057327, "step": 390 }, { "dpo_loss": 0.11183874309062958, "dpo_wo_beta": -0.3540593981742859, "epoch": 2.2390174775625886, "grad_norm": 8.717110295390793, "learning_rate": 9.064400256282757e-07, "logits": -1.8010636568069458, "logps": -110.48490142822266, "loss": 0.1248, "objective": 0.11183874309062958, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.9041666388511658, "regularize": 0.11183874309062958, "step": 395 }, { "dpo_loss": 0.12893003225326538, "dpo_wo_beta": -0.3680768311023712, "epoch": 2.2673594709494567, "grad_norm": 9.562073048936949, "learning_rate": 8.436218830716259e-07, "logits": -1.8909595012664795, "logps": -111.70219421386719, "loss": 0.1193, "objective": 0.12893003225326538, "ranking_idealized": 0.6625000238418579, "ranking_idealized_expo": 0.5708333253860474, "ranking_simple": 0.9166666865348816, "regularize": 0.12893003225326538, "step": 400 }, { "dpo_loss": 0.13196416199207306, "dpo_wo_beta": -0.17852090299129486, "epoch": 2.295701464336325, "grad_norm": 9.166021194752298, "learning_rate": 7.826149656671386e-07, "logits": -1.9320632219314575, "logps": -108.1246566772461, "loss": 0.1267, "objective": 0.13196416199207306, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.8958333134651184, "regularize": 0.13196416199207306, "step": 405 }, { "dpo_loss": 0.11071384698152542, "dpo_wo_beta": -0.1424117088317871, "epoch": 2.324043457723193, "grad_norm": 8.918983804471582, "learning_rate": 7.234859835833022e-07, "logits": -1.8304682970046997, "logps": -111.2301025390625, "loss": 0.112, "objective": 0.11071384698152542, "ranking_idealized": 0.5708333253860474, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.9041666388511658, "regularize": 0.11071384698152542, "step": 410 }, { "dpo_loss": 0.1223960742354393, "dpo_wo_beta": -0.1956464648246765, "epoch": 2.3523854511100613, "grad_norm": 9.386393866562546, "learning_rate": 6.662995934939007e-07, "logits": -1.8708041906356812, "logps": -111.06449890136719, "loss": 0.1155, "objective": 0.1223960742354393, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.5291666388511658, "ranking_simple": 0.9083333611488342, "regularize": 0.1223960742354393, "step": 415 }, { "dpo_loss": 0.12930770218372345, "dpo_wo_beta": -0.21560731530189514, "epoch": 2.3807274444969297, "grad_norm": 11.0131183307354, "learning_rate": 6.111183278768956e-07, "logits": -1.860797643661499, "logps": -113.08780670166016, "loss": 0.133, "objective": 0.12930770218372345, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.9208333492279053, "regularize": 0.12930770218372345, "step": 420 }, { "epoch": 2.403401039206424, "eval_dpo_loss": 0.8785684108734131, "eval_dpo_wo_beta": -5.887755870819092, "eval_logits": -2.0276894569396973, "eval_logps": -117.1216812133789, "eval_loss": 0.8447906374931335, "eval_objective": 0.8785684108734131, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5413222908973694, "eval_regularize": 0.8785684108734131, "eval_runtime": 209.8564, "eval_samples_per_second": 27.59, "eval_steps_per_second": 1.153, "step": 424 }, { "dpo_loss": 0.117975153028965, "dpo_wo_beta": -0.1884605884552002, "epoch": 2.413793103448276, "grad_norm": 11.036168833651558, "learning_rate": 5.580025266360764e-07, "logits": -1.7822004556655884, "logps": -114.43038177490234, "loss": 0.1465, "objective": 0.117975153028965, "ranking_idealized": 0.6666666865348816, "ranking_idealized_expo": 0.5625, "ranking_simple": 0.9375, "regularize": 0.117975153028965, "step": 425 }, { "dpo_loss": 0.1465020477771759, "dpo_wo_beta": -0.2595965266227722, "epoch": 2.442135096835144, "grad_norm": 10.595070818850646, "learning_rate": 5.070102711202606e-07, "logits": -1.8692681789398193, "logps": -110.2762680053711, "loss": 0.1276, "objective": 0.1465020477771759, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8958333134651184, "regularize": 0.1465020477771759, "step": 430 }, { "dpo_loss": 0.09775053709745407, "dpo_wo_beta": -0.12755917012691498, "epoch": 2.4704770902220123, "grad_norm": 9.393206692367766, "learning_rate": 4.581973206121948e-07, "logits": -1.8968538045883179, "logps": -112.28767395019531, "loss": 0.1175, "objective": 0.09775053709745407, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.512499988079071, "ranking_simple": 0.9083333611488342, "regularize": 0.09775053709745407, "step": 435 }, { "dpo_loss": 0.14228057861328125, "dpo_wo_beta": -0.3639788329601288, "epoch": 2.4988190836088804, "grad_norm": 8.020134663378592, "learning_rate": 4.116170513565942e-07, "logits": -1.8666160106658936, "logps": -109.18843078613281, "loss": 0.1167, "objective": 0.14228057861328125, "ranking_idealized": 0.574999988079071, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.875, "regularize": 0.14228057861328125, "step": 440 }, { "dpo_loss": 0.13583588600158691, "dpo_wo_beta": -0.2074100226163864, "epoch": 2.527161076995749, "grad_norm": 9.224367796824264, "learning_rate": 3.6732039819400686e-07, "logits": -1.8071045875549316, "logps": -107.2675552368164, "loss": 0.1319, "objective": 0.13583588600158691, "ranking_idealized": 0.5625, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8791666626930237, "regularize": 0.13583588600158691, "step": 445 }, { "dpo_loss": 0.17114870250225067, "dpo_wo_beta": -0.43270742893218994, "epoch": 2.555503070382617, "grad_norm": 11.265861710797749, "learning_rate": 3.253557988643072e-07, "logits": -1.9256045818328857, "logps": -111.20384216308594, "loss": 0.1288, "objective": 0.17114870250225067, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.862500011920929, "regularize": 0.17114870250225067, "step": 450 }, { "dpo_loss": 0.10827689617872238, "dpo_wo_beta": -0.1751028150320053, "epoch": 2.583845063769485, "grad_norm": 9.605136286662574, "learning_rate": 2.8576914104074425e-07, "logits": -1.9289051294326782, "logps": -109.37706756591797, "loss": 0.1168, "objective": 0.10827689617872238, "ranking_idealized": 0.5958333611488342, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.9125000238418579, "regularize": 0.10827689617872238, "step": 455 }, { "dpo_loss": 0.11124877631664276, "dpo_wo_beta": -0.28054580092430115, "epoch": 2.6121870571563535, "grad_norm": 9.957466667064367, "learning_rate": 2.486037121524448e-07, "logits": -1.93342924118042, "logps": -113.2356948852539, "loss": 0.1169, "objective": 0.11124877631664276, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9375, "regularize": 0.11124877631664276, "step": 460 }, { "dpo_loss": 0.12714476883411407, "dpo_wo_beta": -0.22146105766296387, "epoch": 2.6405290505432215, "grad_norm": 10.04326854921629, "learning_rate": 2.13900152050239e-07, "logits": -1.8874350786209106, "logps": -108.94982147216797, "loss": 0.1189, "objective": 0.12714476883411407, "ranking_idealized": 0.5458333492279053, "ranking_idealized_expo": 0.4791666567325592, "ranking_simple": 0.862500011920929, "regularize": 0.12714476883411407, "step": 465 }, { "dpo_loss": 0.12151040881872177, "dpo_wo_beta": -0.26416900753974915, "epoch": 2.66887104393009, "grad_norm": 8.777820527737605, "learning_rate": 1.8169640856758652e-07, "logits": -1.9314534664154053, "logps": -112.75170135498047, "loss": 0.1254, "objective": 0.12151040881872177, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5541666746139526, "ranking_simple": 0.9083333611488342, "regularize": 0.12151040881872177, "step": 470 }, { "dpo_loss": 0.12749101221561432, "dpo_wo_beta": -0.2816121280193329, "epoch": 2.697213037316958, "grad_norm": 9.221778751171357, "learning_rate": 1.5202769602517514e-07, "logits": -1.8307260274887085, "logps": -109.39693450927734, "loss": 0.1211, "objective": 0.12749101221561432, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.4958333373069763, "ranking_simple": 0.8999999761581421, "regularize": 0.12749101221561432, "step": 475 }, { "epoch": 2.708549834671705, "eval_dpo_loss": 0.8738968372344971, "eval_dpo_wo_beta": -5.815241813659668, "eval_logits": -2.0271613597869873, "eval_logps": -116.42301177978516, "eval_loss": 0.8371492624282837, "eval_objective": 0.8738968372344971, "eval_ranking_idealized": 0.5888429880142212, "eval_ranking_idealized_expo": 0.5092975497245789, "eval_ranking_simple": 0.5402892827987671, "eval_regularize": 0.8738968372344971, "eval_runtime": 211.9437, "eval_samples_per_second": 27.319, "eval_steps_per_second": 1.142, "step": 477 }, { "dpo_loss": 0.13781045377254486, "dpo_wo_beta": -0.2485995590686798, "epoch": 2.725555030703826, "grad_norm": 10.971551462649595, "learning_rate": 1.2492645672457838e-07, "logits": -1.9437103271484375, "logps": -108.93817901611328, "loss": 0.1267, "objective": 0.13781045377254486, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8999999761581421, "regularize": 0.13781045377254486, "step": 480 }, { "dpo_loss": 0.11082082241773605, "dpo_wo_beta": -0.10876031965017319, "epoch": 2.753897024090694, "grad_norm": 10.884940640535042, "learning_rate": 1.004223254730749e-07, "logits": -1.7556992769241333, "logps": -114.1142807006836, "loss": 0.1222, "objective": 0.11082082241773605, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.9041666388511658, "regularize": 0.11082082241773605, "step": 485 }, { "dpo_loss": 0.09154360741376877, "dpo_wo_beta": -0.05899694189429283, "epoch": 2.7822390174775626, "grad_norm": 11.015982469457516, "learning_rate": 7.854209717842231e-08, "logits": -1.8848822116851807, "logps": -110.15470886230469, "loss": 0.1058, "objective": 0.09154360741376877, "ranking_idealized": 0.5874999761581421, "ranking_idealized_expo": 0.47083333134651184, "ranking_simple": 0.9333333373069763, "regularize": 0.09154360741376877, "step": 490 }, { "dpo_loss": 0.10964310169219971, "dpo_wo_beta": -0.07648710906505585, "epoch": 2.8105810108644307, "grad_norm": 10.079416267782939, "learning_rate": 5.930969754901844e-08, "logits": -1.8575230836868286, "logps": -108.52234649658203, "loss": 0.1192, "objective": 0.10964310169219971, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8666666746139526, "regularize": 0.10964310169219971, "step": 495 }, { "dpo_loss": 0.09479068219661713, "dpo_wo_beta": -0.03411731496453285, "epoch": 2.838923004251299, "grad_norm": 9.84080114767598, "learning_rate": 4.2746156931490756e-08, "logits": -1.8439643383026123, "logps": -109.77281188964844, "loss": 0.1213, "objective": 0.09479068219661713, "ranking_idealized": 0.6208333373069763, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.925000011920929, "regularize": 0.09479068219661713, "step": 500 }, { "dpo_loss": 0.12725140154361725, "dpo_wo_beta": -0.18973813951015472, "epoch": 2.8672649976381672, "grad_norm": 9.973754192936779, "learning_rate": 2.8869587314321324e-08, "logits": -1.8574442863464355, "logps": -110.32710266113281, "loss": 0.132, "objective": 0.12725140154361725, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5208333134651184, "ranking_simple": 0.8999999761581421, "regularize": 0.12725140154361725, "step": 505 }, { "dpo_loss": 0.10469380766153336, "dpo_wo_beta": -0.1985001415014267, "epoch": 2.8956069910250353, "grad_norm": 8.936464383287202, "learning_rate": 1.7695162522652352e-08, "logits": -1.8629390001296997, "logps": -113.56767272949219, "loss": 0.1218, "objective": 0.10469380766153336, "ranking_idealized": 0.6041666865348816, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8916666507720947, "regularize": 0.10469380766153336, "step": 510 }, { "dpo_loss": 0.11260154843330383, "dpo_wo_beta": -0.15691885352134705, "epoch": 2.9239489844119038, "grad_norm": 9.442300088571939, "learning_rate": 9.235101625932885e-09, "logits": -1.946829915046692, "logps": -108.54016876220703, "loss": 0.1258, "objective": 0.11260154843330383, "ranking_idealized": 0.5791666507720947, "ranking_idealized_expo": 0.48750001192092896, "ranking_simple": 0.925000011920929, "regularize": 0.11260154843330383, "step": 515 }, { "dpo_loss": 0.12230218201875687, "dpo_wo_beta": -0.10489177703857422, "epoch": 2.952290977798772, "grad_norm": 9.279898048101137, "learning_rate": 3.4986555765434415e-09, "logits": -1.8482831716537476, "logps": -114.20655059814453, "loss": 0.1228, "objective": 0.12230218201875687, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8708333373069763, "regularize": 0.12230218201875687, "step": 520 }, { "dpo_loss": 0.13335375487804413, "dpo_wo_beta": -0.35261282324790955, "epoch": 2.9806329711856403, "grad_norm": 11.094809681697281, "learning_rate": 4.920970940180958e-10, "logits": -1.876869797706604, "logps": -111.03084564208984, "loss": 0.1235, "objective": 0.13335375487804413, "ranking_idealized": 0.6291666626930237, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.8958333134651184, "regularize": 0.13335375487804413, "step": 525 }, { "dpo_loss": 0.08052093535661697, "dpo_wo_beta": -0.004379949066787958, "epoch": 1.5044874822862542, "grad_norm": 9.015683090612498, "learning_rate": 2.9196388040863695e-06, "logits": -1.833287239074707, "logps": -114.10733795166016, "loss": 0.0858, "objective": 0.08052093535661697, "ranking_idealized": 0.6666666865348816, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.9375, "regularize": 0.08052093535661697, "step": 530 }, { "epoch": 1.5044874822862542, "eval_dpo_loss": 0.8753401041030884, "eval_dpo_wo_beta": -5.922874450683594, "eval_logits": -2.4529590606689453, "eval_logps": -118.25288391113281, "eval_loss": 0.8505071997642517, "eval_objective": 0.8753401041030884, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5683229565620422, "eval_regularize": 0.8753401041030884, "eval_runtime": 344.4459, "eval_samples_per_second": 16.81, "eval_steps_per_second": 1.402, "step": 530 }, { "dpo_loss": 0.06306228041648865, "dpo_wo_beta": -0.08629266172647476, "epoch": 1.5186584789796882, "grad_norm": 12.634106279395441, "learning_rate": 2.8788330549198512e-06, "logits": -1.8371531963348389, "logps": -115.510009765625, "loss": 0.0766, "objective": 0.06306228041648865, "ranking_idealized": 0.550000011920929, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.9166666865348816, "regularize": 0.06306228041648865, "step": 535 }, { "dpo_loss": 0.13102607429027557, "dpo_wo_beta": -0.41934680938720703, "epoch": 1.5328294756731222, "grad_norm": 14.583212522491863, "learning_rate": 2.8379237365787426e-06, "logits": -1.7649121284484863, "logps": -127.44564819335938, "loss": 0.0928, "objective": 0.13102607429027557, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8999999761581421, "regularize": 0.13102607429027557, "step": 540 }, { "dpo_loss": 0.12010473757982254, "dpo_wo_beta": -0.2892196476459503, "epoch": 1.5470004723665565, "grad_norm": 18.74636858022955, "learning_rate": 2.7969220332622004e-06, "logits": -1.7403244972229004, "logps": -126.89453125, "loss": 0.1114, "objective": 0.12010473757982254, "ranking_idealized": 0.6499999761581421, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.925000011920929, "regularize": 0.12010473757982254, "step": 545 }, { "dpo_loss": 0.05550822243094444, "dpo_wo_beta": -0.0006199590279720724, "epoch": 1.5611714690599907, "grad_norm": 16.65748534309405, "learning_rate": 2.7558391544265127e-06, "logits": -1.7434070110321045, "logps": -120.79520416259766, "loss": 0.0872, "objective": 0.05550822243094444, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.9750000238418579, "regularize": 0.05550822243094444, "step": 550 }, { "dpo_loss": 0.12240471690893173, "dpo_wo_beta": -0.22643856704235077, "epoch": 1.5753424657534247, "grad_norm": 12.27653222683965, "learning_rate": 2.714686331720543e-06, "logits": -1.8163702487945557, "logps": -123.55280303955078, "loss": 0.103, "objective": 0.12240471690893173, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.875, "regularize": 0.12240471690893173, "step": 555 }, { "dpo_loss": 0.1260891556739807, "dpo_wo_beta": -0.384922057390213, "epoch": 1.5895134624468588, "grad_norm": 14.054554594902122, "learning_rate": 2.6734748159151104e-06, "logits": -1.8347235918045044, "logps": -115.50324249267578, "loss": 0.1171, "objective": 0.1260891556739807, "ranking_idealized": 0.5333333611488342, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8916666507720947, "regularize": 0.1260891556739807, "step": 560 }, { "dpo_loss": 0.11428937315940857, "dpo_wo_beta": -0.142462819814682, "epoch": 1.6036844591402928, "grad_norm": 8.888993907970832, "learning_rate": 2.632215873827142e-06, "logits": -1.9027162790298462, "logps": -117.14366912841797, "loss": 0.1142, "objective": 0.11428937315940857, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.4583333432674408, "ranking_simple": 0.8916666507720947, "regularize": 0.11428937315940857, "step": 565 }, { "dpo_loss": 0.14770367741584778, "dpo_wo_beta": -0.5861695408821106, "epoch": 1.6178554558337268, "grad_norm": 9.396256315509223, "learning_rate": 2.5909207852394363e-06, "logits": -2.088587760925293, "logps": -113.1082992553711, "loss": 0.1098, "objective": 0.14770367741584778, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9083333611488342, "regularize": 0.14770367741584778, "step": 570 }, { "dpo_loss": 0.1413314789533615, "dpo_wo_beta": -0.3796103298664093, "epoch": 1.632026452527161, "grad_norm": 16.325704474226345, "learning_rate": 2.5496008398168844e-06, "logits": -1.9472541809082031, "logps": -117.12679290771484, "loss": 0.1404, "objective": 0.1413314789533615, "ranking_idealized": 0.6583333611488342, "ranking_idealized_expo": 0.6166666746139526, "ranking_simple": 0.9166666865348816, "regularize": 0.1413314789533615, "step": 575 }, { "dpo_loss": 0.1715042144060135, "dpo_wo_beta": -0.48858124017715454, "epoch": 1.6461974492205953, "grad_norm": 11.97898567349363, "learning_rate": 2.508267334019988e-06, "logits": -1.935112476348877, "logps": -107.23208618164062, "loss": 0.1274, "objective": 0.1715042144060135, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8999999761581421, "regularize": 0.1715042144060135, "step": 580 }, { "epoch": 1.6547000472366555, "eval_dpo_loss": 0.8264312148094177, "eval_dpo_wo_beta": -5.2847466468811035, "eval_logits": -2.4379913806915283, "eval_logps": -119.59071350097656, "eval_loss": 0.808626651763916, "eval_objective": 0.8264312148094177, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5869565010070801, "eval_regularize": 0.8264312148094177, "eval_runtime": 355.6593, "eval_samples_per_second": 16.28, "eval_steps_per_second": 1.358, "step": 583 }, { "dpo_loss": 0.1341490000486374, "dpo_wo_beta": -0.5180007815361023, "epoch": 1.6603684459140293, "grad_norm": 15.27561747473412, "learning_rate": 2.46693156801652e-06, "logits": -1.8945667743682861, "logps": -110.8204116821289, "loss": 0.1227, "objective": 0.1341490000486374, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.9083333611488342, "regularize": 0.1341490000486374, "step": 585 }, { "dpo_loss": 0.13649246096611023, "dpo_wo_beta": -0.5407892465591431, "epoch": 1.6745394426074633, "grad_norm": 8.066407586221112, "learning_rate": 2.4256048425921693e-06, "logits": -1.8402847051620483, "logps": -117.5975112915039, "loss": 0.1263, "objective": 0.13649246096611023, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.46666666865348816, "ranking_simple": 0.9083333611488342, "regularize": 0.13649246096611023, "step": 590 }, { "dpo_loss": 0.14796380698680878, "dpo_wo_beta": -0.5909832119941711, "epoch": 1.6887104393008974, "grad_norm": 17.4246056389601, "learning_rate": 2.384298456061023e-06, "logits": -1.8368481397628784, "logps": -117.76559448242188, "loss": 0.1544, "objective": 0.14796380698680878, "ranking_idealized": 0.675000011920929, "ranking_idealized_expo": 0.5916666388511658, "ranking_simple": 0.8916666507720947, "regularize": 0.14796380698680878, "step": 595 }, { "dpo_loss": 0.15043622255325317, "dpo_wo_beta": -0.6607655882835388, "epoch": 1.7028814359943316, "grad_norm": 18.447927668502214, "learning_rate": 2.3430237011767166e-06, "logits": -1.839497447013855, "logps": -112.93590545654297, "loss": 0.15, "objective": 0.15043622255325317, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.875, "regularize": 0.15043622255325317, "step": 600 }, { "dpo_loss": 0.11465544998645782, "dpo_wo_beta": -0.34068503975868225, "epoch": 1.7170524326877659, "grad_norm": 12.561303443006217, "learning_rate": 2.30179186204511e-06, "logits": -1.7405670881271362, "logps": -112.9049072265625, "loss": 0.1278, "objective": 0.11465544998645782, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8999999761581421, "regularize": 0.11465544998645782, "step": 605 }, { "dpo_loss": 0.1477740854024887, "dpo_wo_beta": -0.4079127609729767, "epoch": 1.7312234293811999, "grad_norm": 17.631548474702864, "learning_rate": 2.2606142110393248e-06, "logits": -1.7865701913833618, "logps": -109.32853698730469, "loss": 0.1331, "objective": 0.1477740854024887, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.875, "regularize": 0.1477740854024887, "step": 610 }, { "dpo_loss": 0.18279190361499786, "dpo_wo_beta": -0.8598226308822632, "epoch": 1.745394426074634, "grad_norm": 10.88386046876842, "learning_rate": 2.2195020057179897e-06, "logits": -1.7821184396743774, "logps": -111.59290313720703, "loss": 0.136, "objective": 0.18279190361499786, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.875, "regularize": 0.18279190361499786, "step": 615 }, { "dpo_loss": 0.1030309647321701, "dpo_wo_beta": -0.05606275424361229, "epoch": 1.759565422768068, "grad_norm": 12.949368950505932, "learning_rate": 2.1784664857475356e-06, "logits": -1.780458688735962, "logps": -108.14535522460938, "loss": 0.1303, "objective": 0.1030309647321701, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9666666388511658, "regularize": 0.1030309647321701, "step": 620 }, { "dpo_loss": 0.158640518784523, "dpo_wo_beta": -0.4262932240962982, "epoch": 1.7737364194615022, "grad_norm": 15.36682532028932, "learning_rate": 2.1375188698293855e-06, "logits": -1.657003402709961, "logps": -110.36738586425781, "loss": 0.161, "objective": 0.158640518784523, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.9083333611488342, "regularize": 0.158640518784523, "step": 625 }, { "dpo_loss": 0.21892648935317993, "dpo_wo_beta": -0.9036411046981812, "epoch": 1.7879074161549362, "grad_norm": 14.959909753098716, "learning_rate": 2.096670352632873e-06, "logits": -1.7339377403259277, "logps": -107.9156723022461, "loss": 0.1673, "objective": 0.21892648935317993, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.7833333611488342, "regularize": 0.21892648935317993, "step": 630 }, { "dpo_loss": 0.14205454289913177, "dpo_wo_beta": -0.47540512681007385, "epoch": 1.8020784128483704, "grad_norm": 22.25702396524442, "learning_rate": 2.0559321017347286e-06, "logits": -2.000821352005005, "logps": -119.52375793457031, "loss": 0.1614, "objective": 0.14205454289913177, "ranking_idealized": 0.675000011920929, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.8999999761581421, "regularize": 0.14205454289913177, "step": 635 }, { "epoch": 1.8049126121870571, "eval_dpo_loss": 0.8243346214294434, "eval_dpo_wo_beta": -5.281310081481934, "eval_logits": -2.4850430488586426, "eval_logps": -117.85846710205078, "eval_loss": 0.8209081292152405, "eval_objective": 0.8243346214294434, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5817805528640747, "eval_regularize": 0.8243346214294434, "eval_runtime": 351.818, "eval_samples_per_second": 16.457, "eval_steps_per_second": 1.373, "step": 636 }, { "dpo_loss": 0.13119691610336304, "dpo_wo_beta": -0.36706313490867615, "epoch": 1.8162494095418045, "grad_norm": 20.631621209893076, "learning_rate": 2.01531525456598e-06, "logits": -1.8479942083358765, "logps": -111.6270523071289, "loss": 0.1417, "objective": 0.13119691610336304, "ranking_idealized": 0.5249999761581421, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8583333492279053, "regularize": 0.13119691610336304, "step": 640 }, { "dpo_loss": 0.1571781039237976, "dpo_wo_beta": -0.5080724954605103, "epoch": 1.8304204062352385, "grad_norm": 20.508347622723388, "learning_rate": 1.974830915367086e-06, "logits": -1.9629262685775757, "logps": -122.33467102050781, "loss": 0.1732, "objective": 0.1571781039237976, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.875, "regularize": 0.1571781039237976, "step": 645 }, { "dpo_loss": 0.18711452186107635, "dpo_wo_beta": -0.5334885120391846, "epoch": 1.8445914029286725, "grad_norm": 17.78377983289135, "learning_rate": 1.93449015215215e-06, "logits": -1.960700511932373, "logps": -116.45585632324219, "loss": 0.1873, "objective": 0.18711452186107635, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5, "ranking_simple": 0.875, "regularize": 0.18711452186107635, "step": 650 }, { "dpo_loss": 0.14089132845401764, "dpo_wo_beta": -0.257717490196228, "epoch": 1.8587623996221068, "grad_norm": 17.423684978791957, "learning_rate": 1.8943039936830347e-06, "logits": -1.7539128065109253, "logps": -105.96385192871094, "loss": 0.1703, "objective": 0.14089132845401764, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.46666666865348816, "ranking_simple": 0.8416666388511658, "regularize": 0.14089132845401764, "step": 655 }, { "dpo_loss": 0.12644439935684204, "dpo_wo_beta": -0.18994450569152832, "epoch": 1.872933396315541, "grad_norm": 14.114136095930363, "learning_rate": 1.8542834264542091e-06, "logits": -1.8740805387496948, "logps": -112.58109283447266, "loss": 0.1679, "objective": 0.12644439935684204, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8833333253860474, "regularize": 0.12644439935684204, "step": 660 }, { "dpo_loss": 0.1596754938364029, "dpo_wo_beta": -0.5118387341499329, "epoch": 1.887104393008975, "grad_norm": 17.02657500840854, "learning_rate": 1.814439391689151e-06, "logits": -2.012057065963745, "logps": -106.6546401977539, "loss": 0.1748, "objective": 0.1596754938364029, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8583333492279053, "regularize": 0.1596754938364029, "step": 665 }, { "dpo_loss": 0.1905670315027237, "dpo_wo_beta": -0.7486369609832764, "epoch": 1.901275389702409, "grad_norm": 14.652072743525036, "learning_rate": 1.7747827823491253e-06, "logits": -1.7807596921920776, "logps": -107.37975311279297, "loss": 0.1695, "objective": 0.1905670315027237, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8666666746139526, "regularize": 0.1905670315027237, "step": 670 }, { "dpo_loss": 0.14669080078601837, "dpo_wo_beta": -0.3554477095603943, "epoch": 1.915446386395843, "grad_norm": 15.076428386388748, "learning_rate": 1.7353244401551566e-06, "logits": -1.8374218940734863, "logps": -107.03987884521484, "loss": 0.1555, "objective": 0.14669080078601837, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8999999761581421, "regularize": 0.14669080078601837, "step": 675 }, { "dpo_loss": 0.15277433395385742, "dpo_wo_beta": -0.5579003095626831, "epoch": 1.9296173830892773, "grad_norm": 14.063016020977683, "learning_rate": 1.6960751526240122e-06, "logits": -1.9377697706222534, "logps": -118.40213775634766, "loss": 0.1487, "objective": 0.15277433395385742, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8833333253860474, "regularize": 0.15277433395385742, "step": 680 }, { "dpo_loss": 0.1468452513217926, "dpo_wo_beta": -0.34248843789100647, "epoch": 1.9437883797827114, "grad_norm": 20.037932580562952, "learning_rate": 1.6570456501189996e-06, "logits": -1.822561502456665, "logps": -116.04502868652344, "loss": 0.1616, "objective": 0.1468452513217926, "ranking_idealized": 0.5083333253860474, "ranking_idealized_expo": 0.46666666865348816, "ranking_simple": 0.8999999761581421, "regularize": 0.1468452513217926, "step": 685 }, { "epoch": 1.9551251771374587, "eval_dpo_loss": 0.857575535774231, "eval_dpo_wo_beta": -5.723405361175537, "eval_logits": -2.465576410293579, "eval_logps": -119.32209777832031, "eval_loss": 0.8382942080497742, "eval_objective": 0.857575535774231, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5797101259231567, "eval_regularize": 0.857575535774231, "eval_runtime": 359.3448, "eval_samples_per_second": 16.113, "eval_steps_per_second": 1.344, "step": 689 }, { "dpo_loss": 0.11434569954872131, "dpo_wo_beta": -0.045675624161958694, "epoch": 1.9579593764761456, "grad_norm": 13.437704226981893, "learning_rate": 1.6182466029163974e-06, "logits": -1.8100759983062744, "logps": -115.22444915771484, "loss": 0.1309, "objective": 0.11434569954872131, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.8999999761581421, "regularize": 0.11434569954872131, "step": 690 }, { "dpo_loss": 0.19163179397583008, "dpo_wo_beta": -0.46952199935913086, "epoch": 1.9721303731695796, "grad_norm": 20.21550526964092, "learning_rate": 1.5796886182883053e-06, "logits": -1.829925775527954, "logps": -114.886962890625, "loss": 0.1753, "objective": 0.19163179397583008, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.4416666626930237, "ranking_simple": 0.8166666626930237, "regularize": 0.19163179397583008, "step": 695 }, { "dpo_loss": 0.18629121780395508, "dpo_wo_beta": -0.7477880120277405, "epoch": 1.9863013698630136, "grad_norm": 14.017066914132515, "learning_rate": 1.541382237602721e-06, "logits": -1.7919304370880127, "logps": -115.88526916503906, "loss": 0.176, "objective": 0.18629121780395508, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.8833333253860474, "regularize": 0.18629121780395508, "step": 700 }, { "dpo_loss": 0.20123900473117828, "dpo_wo_beta": -0.6124710440635681, "epoch": 2.0004723665564477, "grad_norm": 21.887245138739928, "learning_rate": 1.5033379334416376e-06, "logits": -1.786551594734192, "logps": -114.37857055664062, "loss": 0.2015, "objective": 0.20123900473117828, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.8333333134651184, "regularize": 0.20123900473117828, "step": 705 }, { "dpo_loss": 0.08176574856042862, "dpo_wo_beta": -0.0886942520737648, "epoch": 2.0146433632498817, "grad_norm": 12.07978325252889, "learning_rate": 1.465566106737942e-06, "logits": -1.8880345821380615, "logps": -114.7125244140625, "loss": 0.1005, "objective": 0.08176574856042862, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.925000011920929, "regularize": 0.08176574856042862, "step": 710 }, { "dpo_loss": 0.07800193130970001, "dpo_wo_beta": -0.004813884384930134, "epoch": 2.028814359943316, "grad_norm": 10.892350166600437, "learning_rate": 1.4280770839319073e-06, "logits": -1.8223975896835327, "logps": -109.677001953125, "loss": 0.1064, "objective": 0.07800193130970001, "ranking_idealized": 0.6916666626930237, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.949999988079071, "regularize": 0.07800193130970001, "step": 715 }, { "dpo_loss": 0.07249681651592255, "dpo_wo_beta": -0.03800208121538162, "epoch": 2.04298535663675, "grad_norm": 10.585775059236884, "learning_rate": 1.3908811141480408e-06, "logits": -1.7804607152938843, "logps": -119.00810241699219, "loss": 0.1011, "objective": 0.07249681651592255, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.949999988079071, "regularize": 0.07249681651592255, "step": 720 }, { "dpo_loss": 0.10758433490991592, "dpo_wo_beta": -0.4225389361381531, "epoch": 2.057156353330184, "grad_norm": 16.634608237356296, "learning_rate": 1.353988366393083e-06, "logits": -1.8837405443191528, "logps": -122.06383514404297, "loss": 0.1068, "objective": 0.10758433490991592, "ranking_idealized": 0.6499999761581421, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.925000011920929, "regularize": 0.10758433490991592, "step": 725 }, { "dpo_loss": 0.16440601646900177, "dpo_wo_beta": -0.5969924926757812, "epoch": 2.0713273500236182, "grad_norm": 12.618281047842604, "learning_rate": 1.3174089267758983e-06, "logits": -1.7312003374099731, "logps": -118.56900024414062, "loss": 0.107, "objective": 0.16440601646900177, "ranking_idealized": 0.5166666507720947, "ranking_idealized_expo": 0.4333333373069763, "ranking_simple": 0.8583333492279053, "regularize": 0.16440601646900177, "step": 730 }, { "dpo_loss": 0.15623989701271057, "dpo_wo_beta": -0.9556598663330078, "epoch": 2.0854983467170523, "grad_norm": 16.05614648021729, "learning_rate": 1.2811527957500344e-06, "logits": -1.6055046319961548, "logps": -127.75144958496094, "loss": 0.1095, "objective": 0.15623989701271057, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.46666666865348816, "ranking_simple": 0.949999988079071, "regularize": 0.15623989701271057, "step": 735 }, { "dpo_loss": 0.11614324897527695, "dpo_wo_beta": -0.22688980400562286, "epoch": 2.0996693434104867, "grad_norm": 14.923960342956232, "learning_rate": 1.245229885379699e-06, "logits": -1.688416838645935, "logps": -123.08829498291016, "loss": 0.1063, "objective": 0.11614324897527695, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.949999988079071, "regularize": 0.11614324897527695, "step": 740 }, { "epoch": 2.10533774208786, "eval_dpo_loss": 0.9823706746101379, "eval_dpo_wo_beta": -7.331023216247559, "eval_logits": -2.2711641788482666, "eval_logps": -133.36373901367188, "eval_loss": 0.9485942721366882, "eval_objective": 0.9823706746101379, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5517598390579224, "eval_regularize": 0.9823706746101379, "eval_runtime": 364.9494, "eval_samples_per_second": 15.865, "eval_steps_per_second": 1.323, "step": 742 }, { "dpo_loss": 0.11820446699857712, "dpo_wo_beta": -0.5452965497970581, "epoch": 2.1138403401039207, "grad_norm": 14.405167306826879, "learning_rate": 1.2096500166298992e-06, "logits": -1.4971224069595337, "logps": -132.55892944335938, "loss": 0.1031, "objective": 0.11820446699857712, "ranking_idealized": 0.6916666626930237, "ranking_idealized_expo": 0.6166666746139526, "ranking_simple": 0.925000011920929, "regularize": 0.11820446699857712, "step": 745 }, { "dpo_loss": 0.08092837035655975, "dpo_wo_beta": -0.21483030915260315, "epoch": 2.1280113367973548, "grad_norm": 12.035987152428243, "learning_rate": 1.1744229166814889e-06, "logits": -1.698511004447937, "logps": -129.5937042236328, "loss": 0.0957, "objective": 0.08092837035655975, "ranking_idealized": 0.6666666865348816, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.9583333134651184, "regularize": 0.08092837035655975, "step": 750 }, { "dpo_loss": 0.09838299453258514, "dpo_wo_beta": -0.330010324716568, "epoch": 2.142182333490789, "grad_norm": 15.638085348810199, "learning_rate": 1.1395582162718524e-06, "logits": -1.6223360300064087, "logps": -128.86538696289062, "loss": 0.1147, "objective": 0.09838299453258514, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.925000011920929, "regularize": 0.09838299453258514, "step": 755 }, { "dpo_loss": 0.10984232276678085, "dpo_wo_beta": -0.2521561086177826, "epoch": 2.156353330184223, "grad_norm": 18.280317761955644, "learning_rate": 1.1050654470619602e-06, "logits": -1.6547772884368896, "logps": -118.33650970458984, "loss": 0.1127, "objective": 0.10984232276678085, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.9166666865348816, "regularize": 0.10984232276678085, "step": 760 }, { "dpo_loss": 0.11440528929233551, "dpo_wo_beta": -0.2931906580924988, "epoch": 2.170524326877657, "grad_norm": 12.536707104746414, "learning_rate": 1.0709540390305061e-06, "logits": -1.692717432975769, "logps": -118.69541931152344, "loss": 0.1215, "objective": 0.11440528929233551, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9333333373069763, "regularize": 0.11440528929233551, "step": 765 }, { "dpo_loss": 0.06226298585534096, "dpo_wo_beta": -0.023008961230516434, "epoch": 2.1846953235710913, "grad_norm": 12.278836680753214, "learning_rate": 1.0372333178958462e-06, "logits": -1.8234201669692993, "logps": -122.00631713867188, "loss": 0.1046, "objective": 0.06226298585534096, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.9583333134651184, "regularize": 0.06226298585534096, "step": 770 }, { "dpo_loss": 0.13250760734081268, "dpo_wo_beta": -0.465701699256897, "epoch": 2.1988663202645253, "grad_norm": 18.99094104963921, "learning_rate": 1.0039125025664392e-06, "logits": -1.7803070545196533, "logps": -124.71762084960938, "loss": 0.1111, "objective": 0.13250760734081268, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.875, "regularize": 0.13250760734081268, "step": 775 }, { "dpo_loss": 0.13606421649456024, "dpo_wo_beta": -0.6162300705909729, "epoch": 2.2130373169579594, "grad_norm": 16.36095800479351, "learning_rate": 9.710007026204896e-07, "logits": -1.7376734018325806, "logps": -122.50430297851562, "loss": 0.1045, "objective": 0.13606421649456024, "ranking_idealized": 0.5, "ranking_idealized_expo": 0.4333333373069763, "ranking_simple": 0.8999999761581421, "regularize": 0.13606421649456024, "step": 780 }, { "dpo_loss": 0.11954029649496078, "dpo_wo_beta": -0.5017859935760498, "epoch": 2.2272083136513934, "grad_norm": 14.897356026126795, "learning_rate": 9.385069158154805e-07, "logits": -1.7108873128890991, "logps": -119.73731994628906, "loss": 0.1223, "objective": 0.11954029649496078, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.4583333432674408, "ranking_simple": 0.8916666507720947, "regularize": 0.11954029649496078, "step": 785 }, { "dpo_loss": 0.0877259224653244, "dpo_wo_beta": -0.3332770764827728, "epoch": 2.2413793103448274, "grad_norm": 14.08354106572375, "learning_rate": 9.064400256282757e-07, "logits": -1.7486475706100464, "logps": -122.89460754394531, "loss": 0.1063, "objective": 0.0877259224653244, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5666666626930237, "ranking_simple": 0.949999988079071, "regularize": 0.0877259224653244, "step": 790 }, { "dpo_loss": 0.09928978979587555, "dpo_wo_beta": -0.10460276901721954, "epoch": 2.255550307038262, "grad_norm": 15.557463813809257, "learning_rate": 8.74808798826467e-07, "logits": -1.8421998023986816, "logps": -120.40747833251953, "loss": 0.1017, "objective": 0.09928978979587555, "ranking_idealized": 0.699999988079071, "ranking_idealized_expo": 0.6000000238418579, "ranking_simple": 0.9416666626930237, "regularize": 0.09928978979587555, "step": 795 }, { "epoch": 2.255550307038262, "eval_dpo_loss": 0.8903655409812927, "eval_dpo_wo_beta": -6.205545902252197, "eval_logits": -2.4489927291870117, "eval_logps": -123.57449340820312, "eval_loss": 0.8711386919021606, "eval_objective": 0.8903655409812927, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5683229565620422, "eval_regularize": 0.8903655409812927, "eval_runtime": 347.0103, "eval_samples_per_second": 16.685, "eval_steps_per_second": 1.392, "step": 795 }, { "dpo_loss": 0.1340600550174713, "dpo_wo_beta": -0.5811701416969299, "epoch": 2.269721303731696, "grad_norm": 13.668309173603514, "learning_rate": 8.436218830716259e-07, "logits": -1.8454309701919556, "logps": -120.35511779785156, "loss": 0.1264, "objective": 0.1340600550174713, "ranking_idealized": 0.6499999761581421, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.925000011920929, "regularize": 0.1340600550174713, "step": 800 }, { "dpo_loss": 0.10816308110952377, "dpo_wo_beta": -0.1633211225271225, "epoch": 2.28389230042513, "grad_norm": 13.957508503091352, "learning_rate": 8.1288780455512e-07, "logits": -1.8811193704605103, "logps": -119.9056625366211, "loss": 0.1133, "objective": 0.10816308110952377, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.9166666865348816, "regularize": 0.10816308110952377, "step": 805 }, { "dpo_loss": 0.1715787798166275, "dpo_wo_beta": -0.7547404766082764, "epoch": 2.298063297118564, "grad_norm": 12.765274097098457, "learning_rate": 7.826149656671386e-07, "logits": -1.9726245403289795, "logps": -114.58699798583984, "loss": 0.1146, "objective": 0.1715787798166275, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.9166666865348816, "regularize": 0.1715787798166275, "step": 810 }, { "dpo_loss": 0.09882104396820068, "dpo_wo_beta": -0.18077202141284943, "epoch": 2.312234293811998, "grad_norm": 11.818365173405272, "learning_rate": 7.528116426995605e-07, "logits": -1.7834192514419556, "logps": -118.5341796875, "loss": 0.0843, "objective": 0.09882104396820068, "ranking_idealized": 0.550000011920929, "ranking_idealized_expo": 0.4333333373069763, "ranking_simple": 0.8916666507720947, "regularize": 0.09882104396820068, "step": 815 }, { "dpo_loss": 0.1070082038640976, "dpo_wo_beta": -0.44529280066490173, "epoch": 2.3264052905054324, "grad_norm": 13.918285230217345, "learning_rate": 7.234859835833022e-07, "logits": -1.8069401979446411, "logps": -123.11463928222656, "loss": 0.1153, "objective": 0.1070082038640976, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.949999988079071, "regularize": 0.1070082038640976, "step": 820 }, { "dpo_loss": 0.14794430136680603, "dpo_wo_beta": -0.5907248258590698, "epoch": 2.3405762871988665, "grad_norm": 14.849580151366643, "learning_rate": 6.94646005660749e-07, "logits": -1.808493971824646, "logps": -116.64714050292969, "loss": 0.1107, "objective": 0.14794430136680603, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.9083333611488342, "regularize": 0.14794430136680603, "step": 825 }, { "dpo_loss": 0.08306514471769333, "dpo_wo_beta": -0.10000230371952057, "epoch": 2.3547472838923005, "grad_norm": 12.857101782602014, "learning_rate": 6.662995934939007e-07, "logits": -1.7857582569122314, "logps": -123.92733764648438, "loss": 0.1063, "objective": 0.08306514471769333, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.9583333134651184, "regularize": 0.08306514471769333, "step": 830 }, { "dpo_loss": 0.11827471107244492, "dpo_wo_beta": -0.40130358934402466, "epoch": 2.3689182805857345, "grad_norm": 13.825829745811577, "learning_rate": 6.384544967088063e-07, "logits": -1.8356945514678955, "logps": -122.48320770263672, "loss": 0.124, "objective": 0.11827471107244492, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.4583333432674408, "ranking_simple": 0.9333333373069763, "regularize": 0.11827471107244492, "step": 835 }, { "dpo_loss": 0.14169135689735413, "dpo_wo_beta": -0.3359481692314148, "epoch": 2.3830892772791685, "grad_norm": 18.243146391325514, "learning_rate": 6.111183278768956e-07, "logits": -1.8658840656280518, "logps": -123.42705535888672, "loss": 0.1321, "objective": 0.14169135689735413, "ranking_idealized": 0.5, "ranking_idealized_expo": 0.44999998807907104, "ranking_simple": 0.9083333611488342, "regularize": 0.14169135689735413, "step": 840 }, { "dpo_loss": 0.09381429105997086, "dpo_wo_beta": -0.14171645045280457, "epoch": 2.3972602739726026, "grad_norm": 24.500088347031785, "learning_rate": 5.842985604337769e-07, "logits": -1.7731019258499146, "logps": -125.81861877441406, "loss": 0.1225, "objective": 0.09381429105997086, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8999999761581421, "regularize": 0.09381429105997086, "step": 845 }, { "epoch": 2.4057628719886632, "eval_dpo_loss": 0.9035148620605469, "eval_dpo_wo_beta": -6.352902889251709, "eval_logits": -2.4742591381073, "eval_logps": -124.53355407714844, "eval_loss": 0.882164478302002, "eval_objective": 0.9035148620605469, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5569358468055725, "eval_regularize": 0.9035148620605469, "eval_runtime": 366.478, "eval_samples_per_second": 15.799, "eval_steps_per_second": 1.318, "step": 848 }, { "dpo_loss": 0.10245585441589355, "dpo_wo_beta": -0.2030431628227234, "epoch": 2.413793103448276, "grad_norm": 22.363955547911008, "learning_rate": 5.580025266360764e-07, "logits": -1.7464776039123535, "logps": -122.80999755859375, "loss": 0.1449, "objective": 0.10245585441589355, "ranking_idealized": 0.6666666865348816, "ranking_idealized_expo": 0.5625, "ranking_simple": 0.9583333134651184, "regularize": 0.10245585441589355, "step": 850 }, { "dpo_loss": 0.14077231287956238, "dpo_wo_beta": -0.15207929909229279, "epoch": 2.42796410014171, "grad_norm": 14.848409925173506, "learning_rate": 5.322374155568688e-07, "logits": -1.8929237127304077, "logps": -115.12696838378906, "loss": 0.1124, "objective": 0.14077231287956238, "ranking_idealized": 0.5666666626930237, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.9166666865348816, "regularize": 0.14077231287956238, "step": 855 }, { "dpo_loss": 0.1414971649646759, "dpo_wo_beta": -0.5974557995796204, "epoch": 2.442135096835144, "grad_norm": 11.710235822935351, "learning_rate": 5.070102711202606e-07, "logits": -1.7974507808685303, "logps": -121.47347259521484, "loss": 0.1083, "objective": 0.1414971649646759, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.9333333373069763, "regularize": 0.1414971649646759, "step": 860 }, { "dpo_loss": 0.09845638275146484, "dpo_wo_beta": -0.1998511403799057, "epoch": 2.4563060935285783, "grad_norm": 15.827446562598988, "learning_rate": 4.823279901756498e-07, "logits": -1.816353440284729, "logps": -122.7919692993164, "loss": 0.1063, "objective": 0.09845638275146484, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.9083333611488342, "regularize": 0.09845638275146484, "step": 865 }, { "dpo_loss": 0.11614971607923508, "dpo_wo_beta": -0.34934201836586, "epoch": 2.4704770902220123, "grad_norm": 15.618833316654502, "learning_rate": 4.581973206121948e-07, "logits": -1.9400283098220825, "logps": -119.73554992675781, "loss": 0.1153, "objective": 0.11614971607923508, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.925000011920929, "regularize": 0.11614971607923508, "step": 870 }, { "dpo_loss": 0.2065262645483017, "dpo_wo_beta": -1.1751881837844849, "epoch": 2.4846480869154464, "grad_norm": 17.223610041291963, "learning_rate": 4.3462485951401126e-07, "logits": -1.7437902688980103, "logps": -120.61251831054688, "loss": 0.1238, "objective": 0.2065262645483017, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.574999988079071, "ranking_simple": 0.8666666746139526, "regularize": 0.2065262645483017, "step": 875 }, { "dpo_loss": 0.13995474576950073, "dpo_wo_beta": -0.27328214049339294, "epoch": 2.4988190836088804, "grad_norm": 11.937037662011573, "learning_rate": 4.116170513565942e-07, "logits": -1.9172199964523315, "logps": -115.04023742675781, "loss": 0.1093, "objective": 0.13995474576950073, "ranking_idealized": 0.5249999761581421, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8999999761581421, "regularize": 0.13995474576950073, "step": 880 }, { "dpo_loss": 0.13740381598472595, "dpo_wo_beta": -0.38727322220802307, "epoch": 2.5129900803023144, "grad_norm": 16.35266837824131, "learning_rate": 3.891801862449629e-07, "logits": -1.8533929586410522, "logps": -115.91497039794922, "loss": 0.1385, "objective": 0.13740381598472595, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.8916666507720947, "regularize": 0.13740381598472595, "step": 885 }, { "dpo_loss": 0.11444827914237976, "dpo_wo_beta": -0.31205496191978455, "epoch": 2.527161076995749, "grad_norm": 12.141683375579714, "learning_rate": 3.6732039819400686e-07, "logits": -1.6747931241989136, "logps": -116.25071716308594, "loss": 0.1173, "objective": 0.11444827914237976, "ranking_idealized": 0.5166666507720947, "ranking_idealized_expo": 0.4416666626930237, "ranking_simple": 0.9083333611488342, "regularize": 0.11444827914237976, "step": 890 }, { "dpo_loss": 0.21558310091495514, "dpo_wo_beta": -0.7720097303390503, "epoch": 2.541332073689183, "grad_norm": 17.5258932616616, "learning_rate": 3.46043663451511e-07, "logits": -1.938331127166748, "logps": -121.76246643066406, "loss": 0.1324, "objective": 0.21558310091495514, "ranking_idealized": 0.550000011920929, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8333333134651184, "regularize": 0.21558310091495514, "step": 895 }, { "dpo_loss": 0.1236240565776825, "dpo_wo_beta": -0.21698738634586334, "epoch": 2.555503070382617, "grad_norm": 16.69855766001795, "learning_rate": 3.253557988643072e-07, "logits": -1.8755207061767578, "logps": -119.14775848388672, "loss": 0.1157, "objective": 0.1236240565776825, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.9083333611488342, "regularize": 0.1236240565776825, "step": 900 }, { "epoch": 2.558337269721304, "eval_dpo_loss": 0.894111156463623, "eval_dpo_wo_beta": -6.213596343994141, "eval_logits": -2.4885809421539307, "eval_logps": -124.45829010009766, "eval_loss": 0.8717960715293884, "eval_objective": 0.894111156463623, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5621117949485779, "eval_regularize": 0.894111156463623, "eval_runtime": 342.6574, "eval_samples_per_second": 16.897, "eval_steps_per_second": 1.41, "step": 901 }, { "dpo_loss": 0.09204068034887314, "dpo_wo_beta": -0.17730669677257538, "epoch": 2.569674067076051, "grad_norm": 14.161016786369684, "learning_rate": 3.052624602880064e-07, "logits": -1.8424724340438843, "logps": -117.90782928466797, "loss": 0.0986, "objective": 0.09204068034887314, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5333333611488342, "ranking_simple": 0.925000011920929, "regularize": 0.09204068034887314, "step": 905 }, { "dpo_loss": 0.11361932754516602, "dpo_wo_beta": -0.38244467973709106, "epoch": 2.583845063769485, "grad_norm": 12.421803606538058, "learning_rate": 2.8576914104074425e-07, "logits": -2.0089211463928223, "logps": -116.39904022216797, "loss": 0.1122, "objective": 0.11361932754516602, "ranking_idealized": 0.5916666388511658, "ranking_idealized_expo": 0.4583333432674408, "ranking_simple": 0.9166666865348816, "regularize": 0.11361932754516602, "step": 910 }, { "dpo_loss": 0.13471105694770813, "dpo_wo_beta": -0.5014829039573669, "epoch": 2.5980160604629194, "grad_norm": 15.12060209950672, "learning_rate": 2.6688117040136463e-07, "logits": -1.981037974357605, "logps": -121.86930084228516, "loss": 0.1153, "objective": 0.13471105694770813, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.949999988079071, "regularize": 0.13471105694770813, "step": 915 }, { "dpo_loss": 0.08700807392597198, "dpo_wo_beta": -0.1876840889453888, "epoch": 2.6121870571563535, "grad_norm": 17.811975214160903, "learning_rate": 2.486037121524448e-07, "logits": -1.898934245109558, "logps": -120.13009643554688, "loss": 0.114, "objective": 0.08700807392597198, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.9416666626930237, "regularize": 0.08700807392597198, "step": 920 }, { "dpo_loss": 0.0918925479054451, "dpo_wo_beta": -0.14420188963413239, "epoch": 2.6263580538497875, "grad_norm": 16.6284598820075, "learning_rate": 2.3094176316856982e-07, "logits": -1.8268101215362549, "logps": -121.2860107421875, "loss": 0.1064, "objective": 0.0918925479054451, "ranking_idealized": 0.550000011920929, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.9083333611488342, "regularize": 0.0918925479054451, "step": 925 }, { "dpo_loss": 0.15886452794075012, "dpo_wo_beta": -0.7213179469108582, "epoch": 2.6405290505432215, "grad_norm": 15.32952011074324, "learning_rate": 2.13900152050239e-07, "logits": -1.9606980085372925, "logps": -110.05363464355469, "loss": 0.1179, "objective": 0.15886452794075012, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8999999761581421, "regularize": 0.15886452794075012, "step": 930 }, { "dpo_loss": 0.13784056901931763, "dpo_wo_beta": -0.5006576776504517, "epoch": 2.6547000472366555, "grad_norm": 14.658353796887713, "learning_rate": 1.9748353780377234e-07, "logits": -1.9395031929016113, "logps": -119.9269790649414, "loss": 0.1254, "objective": 0.13784056901931763, "ranking_idealized": 0.6583333611488342, "ranking_idealized_expo": 0.6000000238418579, "ranking_simple": 0.8916666507720947, "regularize": 0.13784056901931763, "step": 935 }, { "dpo_loss": 0.12213913351297379, "dpo_wo_beta": -0.43640393018722534, "epoch": 2.66887104393009, "grad_norm": 14.844912954939305, "learning_rate": 1.8169640856758652e-07, "logits": -1.952646017074585, "logps": -121.75474548339844, "loss": 0.1117, "objective": 0.12213913351297379, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.9166666865348816, "regularize": 0.12213913351297379, "step": 940 }, { "dpo_loss": 0.1136295348405838, "dpo_wo_beta": -0.17752434313297272, "epoch": 2.6830420406235236, "grad_norm": 13.220834082246482, "learning_rate": 1.6654308038518057e-07, "logits": -1.7970060110092163, "logps": -117.90103149414062, "loss": 0.1054, "objective": 0.1136295348405838, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.9083333611488342, "regularize": 0.1136295348405838, "step": 945 }, { "dpo_loss": 0.16729401051998138, "dpo_wo_beta": -0.7473469972610474, "epoch": 2.697213037316958, "grad_norm": 14.369145851061829, "learning_rate": 1.5202769602517514e-07, "logits": -1.8816020488739014, "logps": -115.14582061767578, "loss": 0.1387, "objective": 0.16729401051998138, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8999999761581421, "regularize": 0.16729401051998138, "step": 950 }, { "epoch": 2.708549834671705, "eval_dpo_loss": 0.8891981840133667, "eval_dpo_wo_beta": -6.16098690032959, "eval_logits": -2.508572816848755, "eval_logps": -123.41434478759766, "eval_loss": 0.8687644004821777, "eval_objective": 0.8891981840133667, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5579710006713867, "eval_regularize": 0.8891981840133667, "eval_runtime": 379.9179, "eval_samples_per_second": 15.24, "eval_steps_per_second": 1.271, "step": 954 }, { "dpo_loss": 0.16232462227344513, "dpo_wo_beta": -0.5988053679466248, "epoch": 2.711384034010392, "grad_norm": 18.96223814657741, "learning_rate": 1.381542238487188e-07, "logits": -1.8838648796081543, "logps": -119.06771087646484, "loss": 0.1298, "objective": 0.16232462227344513, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8999999761581421, "regularize": 0.16232462227344513, "step": 955 }, { "dpo_loss": 0.14441066980361938, "dpo_wo_beta": -0.4333815276622772, "epoch": 2.725555030703826, "grad_norm": 13.716940458471806, "learning_rate": 1.2492645672457838e-07, "logits": -2.032045364379883, "logps": -113.05756378173828, "loss": 0.1189, "objective": 0.14441066980361938, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5083333253860474, "ranking_simple": 0.8999999761581421, "regularize": 0.14441066980361938, "step": 960 }, { "dpo_loss": 0.0981813296675682, "dpo_wo_beta": -0.19802381098270416, "epoch": 2.73972602739726, "grad_norm": 15.207158850260772, "learning_rate": 1.1234801099220787e-07, "logits": -1.7988998889923096, "logps": -122.7618408203125, "loss": 0.1111, "objective": 0.0981813296675682, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5666666626930237, "ranking_simple": 0.9333333373069763, "regularize": 0.0981813296675682, "step": 965 }, { "dpo_loss": 0.1318301260471344, "dpo_wo_beta": -0.2933843731880188, "epoch": 2.753897024090694, "grad_norm": 19.758130540339153, "learning_rate": 1.004223254730749e-07, "logits": -1.7169368267059326, "logps": -120.43925476074219, "loss": 0.1278, "objective": 0.1318301260471344, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.4833333194255829, "ranking_simple": 0.8916666507720947, "regularize": 0.1318301260471344, "step": 970 }, { "dpo_loss": 0.058168552815914154, "dpo_wo_beta": -8.394511678488925e-05, "epoch": 2.7680680207841286, "grad_norm": 12.662905511630496, "learning_rate": 8.915266053052374e-08, "logits": -1.902711033821106, "logps": -116.11229705810547, "loss": 0.0999, "objective": 0.058168552815914154, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.5, "ranking_simple": 0.925000011920929, "regularize": 0.058168552815914154, "step": 975 }, { "dpo_loss": 0.10343047231435776, "dpo_wo_beta": -0.17137210071086884, "epoch": 2.7822390174775626, "grad_norm": 18.09899373584344, "learning_rate": 7.854209717842231e-08, "logits": -1.8915067911148071, "logps": -118.23885345458984, "loss": 0.1108, "objective": 0.10343047231435776, "ranking_idealized": 0.5416666865348816, "ranking_idealized_expo": 0.4416666626930237, "ranking_simple": 0.949999988079071, "regularize": 0.10343047231435776, "step": 980 }, { "dpo_loss": 0.11380515992641449, "dpo_wo_beta": -0.2935677468776703, "epoch": 2.7964100141709967, "grad_norm": 15.185634936609162, "learning_rate": 6.859353623884569e-08, "logits": -1.851272463798523, "logps": -114.0352783203125, "loss": 0.1005, "objective": 0.11380515992641449, "ranking_idealized": 0.625, "ranking_idealized_expo": 0.5166666507720947, "ranking_simple": 0.8833333253860474, "regularize": 0.11380515992641449, "step": 985 }, { "dpo_loss": 0.10726428776979446, "dpo_wo_beta": -0.24069656431674957, "epoch": 2.8105810108644307, "grad_norm": 16.660819641239655, "learning_rate": 5.930969754901844e-08, "logits": -1.8860033750534058, "logps": -116.69475555419922, "loss": 0.1285, "objective": 0.10726428776979446, "ranking_idealized": 0.49166667461395264, "ranking_idealized_expo": 0.4333333373069763, "ranking_simple": 0.8833333253860474, "regularize": 0.10726428776979446, "step": 990 }, { "dpo_loss": 0.12048947066068649, "dpo_wo_beta": -0.4086553454399109, "epoch": 2.8247520075578647, "grad_norm": 18.459518556479477, "learning_rate": 5.069311921774039e-08, "logits": -1.8980218172073364, "logps": -120.89018249511719, "loss": 0.1282, "objective": 0.12048947066068649, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.925000011920929, "regularize": 0.12048947066068649, "step": 995 }, { "dpo_loss": 0.08448319137096405, "dpo_wo_beta": -0.17254652082920074, "epoch": 2.838923004251299, "grad_norm": 13.073827319036173, "learning_rate": 4.2746156931490756e-08, "logits": -1.8106515407562256, "logps": -113.39408874511719, "loss": 0.1237, "objective": 0.08448319137096405, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.9333333373069763, "regularize": 0.08448319137096405, "step": 1000 }, { "dpo_loss": 0.12625885009765625, "dpo_wo_beta": -0.28751522302627563, "epoch": 2.853094000944733, "grad_norm": 11.352338165734487, "learning_rate": 3.547098331040916e-08, "logits": -1.8715885877609253, "logps": -115.85346984863281, "loss": 0.1219, "objective": 0.12625885009765625, "ranking_idealized": 0.5833333134651184, "ranking_idealized_expo": 0.4749999940395355, "ranking_simple": 0.8833333253860474, "regularize": 0.12625885009765625, "step": 1005 }, { "epoch": 2.8587623996221065, "eval_dpo_loss": 0.8881875872612, "eval_dpo_wo_beta": -6.153732776641846, "eval_logits": -2.5127134323120117, "eval_logps": -123.14542388916016, "eval_loss": 0.868183434009552, "eval_objective": 0.8881875872612, "eval_ranking_idealized": 0.6045548915863037, "eval_ranking_idealized_expo": 0.5279502868652344, "eval_ranking_simple": 0.5600414276123047, "eval_regularize": 0.8881875872612, "eval_runtime": 375.0227, "eval_samples_per_second": 15.439, "eval_steps_per_second": 1.288, "step": 1007 }, { "dpo_loss": 0.12578138709068298, "dpo_wo_beta": -0.4055772125720978, "epoch": 2.8672649976381672, "grad_norm": 13.654143333766829, "learning_rate": 2.8869587314321324e-08, "logits": -1.871021032333374, "logps": -118.54988861083984, "loss": 0.1301, "objective": 0.12578138709068298, "ranking_idealized": 0.6499999761581421, "ranking_idealized_expo": 0.5666666626930237, "ranking_simple": 0.9166666865348816, "regularize": 0.12578138709068298, "step": 1010 }, { "dpo_loss": 0.11141829192638397, "dpo_wo_beta": -0.452913373708725, "epoch": 2.8814359943316012, "grad_norm": 14.00967768494451, "learning_rate": 2.2943773698977935e-08, "logits": -1.8538991212844849, "logps": -119.40221405029297, "loss": 0.1157, "objective": 0.11141829192638397, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5, "ranking_simple": 0.9416666626930237, "regularize": 0.11141829192638397, "step": 1015 }, { "dpo_loss": 0.10787668824195862, "dpo_wo_beta": -0.35029396414756775, "epoch": 2.8956069910250353, "grad_norm": 12.590503217808422, "learning_rate": 1.7695162522652352e-08, "logits": -1.9000986814498901, "logps": -122.90519714355469, "loss": 0.1076, "objective": 0.10787668824195862, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.5, "ranking_simple": 0.8999999761581421, "regularize": 0.10787668824195862, "step": 1020 }, { "dpo_loss": 0.11394120752811432, "dpo_wo_beta": -0.36261746287345886, "epoch": 2.9097779877184697, "grad_norm": 10.606906345600125, "learning_rate": 1.3125188703233815e-08, "logits": -1.8986002206802368, "logps": -116.403564453125, "loss": 0.1089, "objective": 0.11394120752811432, "ranking_idealized": 0.5583333373069763, "ranking_idealized_expo": 0.44999998807907104, "ranking_simple": 0.9333333373069763, "regularize": 0.11394120752811432, "step": 1025 }, { "dpo_loss": 0.0906638652086258, "dpo_wo_beta": -0.1376449316740036, "epoch": 2.9239489844119038, "grad_norm": 12.750602018189479, "learning_rate": 9.235101625932885e-09, "logits": -2.033400058746338, "logps": -113.65220642089844, "loss": 0.1197, "objective": 0.0906638652086258, "ranking_idealized": 0.6000000238418579, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.925000011920929, "regularize": 0.0906638652086258, "step": 1030 }, { "dpo_loss": 0.10265343636274338, "dpo_wo_beta": -0.1061137467622757, "epoch": 2.938119981105338, "grad_norm": 15.296605965797069, "learning_rate": 6.025964801714412e-09, "logits": -1.8468897342681885, "logps": -119.85134887695312, "loss": 0.1049, "objective": 0.10265343636274338, "ranking_idealized": 0.6083333492279053, "ranking_idealized_expo": 0.49166667461395264, "ranking_simple": 0.8916666507720947, "regularize": 0.10265343636274338, "step": 1035 }, { "dpo_loss": 0.09396873414516449, "dpo_wo_beta": -0.1912200003862381, "epoch": 2.952290977798772, "grad_norm": 15.90972962002085, "learning_rate": 3.4986555765434415e-09, "logits": -1.8800926208496094, "logps": -122.51961517333984, "loss": 0.0994, "objective": 0.09396873414516449, "ranking_idealized": 0.6499999761581421, "ranking_idealized_expo": 0.5583333373069763, "ranking_simple": 0.9083333611488342, "regularize": 0.09396873414516449, "step": 1040 }, { "dpo_loss": 0.14692950248718262, "dpo_wo_beta": -0.6586350798606873, "epoch": 2.966461974492206, "grad_norm": 8.0573279067109, "learning_rate": 1.6538648915270794e-09, "logits": -1.8756026029586792, "logps": -119.65303039550781, "loss": 0.1082, "objective": 0.14692950248718262, "ranking_idealized": 0.6416666507720947, "ranking_idealized_expo": 0.5416666865348816, "ranking_simple": 0.9166666865348816, "regularize": 0.14692950248718262, "step": 1045 }, { "dpo_loss": 0.07982174307107925, "dpo_wo_beta": -0.0584401935338974, "epoch": 2.9806329711856403, "grad_norm": 9.88886611903832, "learning_rate": 4.920970940180958e-10, "logits": -1.912126898765564, "logps": -116.61032104492188, "loss": 0.0891, "objective": 0.07982174307107925, "ranking_idealized": 0.6166666746139526, "ranking_idealized_expo": 0.5249999761581421, "ranking_simple": 0.925000011920929, "regularize": 0.07982174307107925, "step": 1050 }, { "dpo_loss": 0.07663024961948395, "dpo_wo_beta": -0.015355088748037815, "epoch": 2.9948039678790743, "grad_norm": 13.025122561532541, "learning_rate": 1.3669799732163314e-11, "logits": -1.775391697883606, "logps": -116.977294921875, "loss": 0.0869, "objective": 0.07663024961948395, "ranking_idealized": 0.6333333253860474, "ranking_idealized_expo": 0.550000011920929, "ranking_simple": 0.8999999761581421, "regularize": 0.07663024961948395, "step": 1055 }, { "epoch": 2.9976381672177608, "step": 1056, "total_flos": 0.0, "train_loss": 0.022545777056648425, "train_runtime": 4386.5835, "train_samples_per_second": 34.744, "train_steps_per_second": 0.241 } ], "logging_steps": 5, "max_steps": 1056, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 53, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 5, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }