{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 4167, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 4.796163069544364e-10, "logps/chosen": -195.92408752441406, "logps/rejected": -219.64862060546875, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.5502853989601135, "losses/total": 0.6931471824645996, "ref_logps/chosen": -195.92408752441406, "ref_logps/rejected": -219.64862060546875, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 9.592326139088728e-10, "logps/chosen": -227.07376098632812, "logps/rejected": -208.024169921875, "loss": 0.6931, "losses/dpo": 0.6931471824645996, "losses/sft": 0.5461169481277466, "losses/total": 0.6931471824645996, "ref_logps/chosen": -227.07376098632812, "ref_logps/rejected": -208.024169921875, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.4388489208633094e-09, "logps/chosen": -209.070068359375, "logps/rejected": -238.21185302734375, "loss": 0.6861, "losses/dpo": 0.6420527100563049, "losses/sft": 0.5804787874221802, "losses/total": 0.6420527100563049, "ref_logps/chosen": -209.22369384765625, "ref_logps/rejected": -238.2115936279297, "rewards/accuracies": 0.53125, "rewards/chosen": 0.01536126434803009, "rewards/margins": 0.015388300642371178, "rewards/rejected": -2.703676000237465e-05, "step": 3 }, { "epoch": 0.0, "learning_rate": 1.9184652278177457e-09, "logps/chosen": -212.27862548828125, "logps/rejected": -207.04708862304688, "loss": 0.6896, "losses/dpo": 0.6863952279090881, "losses/sft": 0.5078216195106506, "losses/total": 0.6863952279090881, "ref_logps/chosen": -212.33580017089844, "ref_logps/rejected": -207.01995849609375, "rewards/accuracies": 0.5, "rewards/chosen": 0.005714787170290947, "rewards/margins": 0.008426044136285782, "rewards/rejected": -0.002711254870519042, "step": 4 }, { "epoch": 0.0, "learning_rate": 2.398081534772182e-09, "logps/chosen": -217.98707580566406, "logps/rejected": -224.05078125, "loss": 0.7004, "losses/dpo": 0.7808195352554321, "losses/sft": 1.2608217000961304, "losses/total": 0.7808195352554321, "ref_logps/chosen": -217.8271484375, "ref_logps/rejected": -224.02053833007812, "rewards/accuracies": 0.4375, "rewards/chosen": -0.01599106937646866, "rewards/margins": -0.01296453457325697, "rewards/rejected": -0.0030265338718891144, "step": 5 }, { "epoch": 0.0, "learning_rate": 2.8776978417266188e-09, "logps/chosen": -226.57473754882812, "logps/rejected": -223.18605041503906, "loss": 0.6967, "losses/dpo": 0.8085763454437256, "losses/sft": 0.63048255443573, "losses/total": 0.8085763454437256, "ref_logps/chosen": -226.53131103515625, "ref_logps/rejected": -223.19110107421875, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004343712702393532, "rewards/margins": -0.0048466213047504425, "rewards/rejected": 0.0005029086023569107, "step": 6 }, { "epoch": 0.0, "learning_rate": 3.357314148681055e-09, "logps/chosen": -223.2445068359375, "logps/rejected": -229.1051025390625, "loss": 0.6901, "losses/dpo": 0.683900773525238, "losses/sft": 0.42638689279556274, "losses/total": 0.683900773525238, "ref_logps/chosen": -223.25167846679688, "ref_logps/rejected": -229.0334014892578, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0007191309705376625, "rewards/margins": 0.007889331318438053, "rewards/rejected": -0.0071702017448842525, "step": 7 }, { "epoch": 0.0, "learning_rate": 3.836930455635491e-09, "logps/chosen": -183.9506378173828, "logps/rejected": -225.83067321777344, "loss": 0.695, "losses/dpo": 0.69858717918396, "losses/sft": 0.4367382526397705, "losses/total": 0.69858717918396, "ref_logps/chosen": -183.9214324951172, "ref_logps/rejected": -225.82583618164062, "rewards/accuracies": 0.40625, "rewards/chosen": -0.0029202429577708244, "rewards/margins": -0.002434477675706148, "rewards/rejected": -0.0004857662133872509, "step": 8 }, { "epoch": 0.0, "learning_rate": 4.316546762589928e-09, "logps/chosen": -209.6141357421875, "logps/rejected": -261.406982421875, "loss": 0.7054, "losses/dpo": 0.6413358449935913, "losses/sft": 0.70503830909729, "losses/total": 0.6413358449935913, "ref_logps/chosen": -209.62454223632812, "ref_logps/rejected": -261.6472473144531, "rewards/accuracies": 0.4375, "rewards/chosen": 0.0010393736883997917, "rewards/margins": -0.0229833722114563, "rewards/rejected": 0.024022744968533516, "step": 9 }, { "epoch": 0.0, "learning_rate": 4.796163069544364e-09, "logps/chosen": -239.19415283203125, "logps/rejected": -209.92398071289062, "loss": 0.7037, "losses/dpo": 0.6840277314186096, "losses/sft": 0.6743116974830627, "losses/total": 0.6840277314186096, "ref_logps/chosen": -239.11460876464844, "ref_logps/rejected": -210.03622436523438, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0079556405544281, "rewards/margins": -0.0191812701523304, "rewards/rejected": 0.011225629597902298, "step": 10 }, { "epoch": 0.0, "learning_rate": 5.275779376498801e-09, "logps/chosen": -226.1039276123047, "logps/rejected": -203.7198486328125, "loss": 0.6879, "losses/dpo": 0.689361035823822, "losses/sft": 0.6275054812431335, "losses/total": 0.689361035823822, "ref_logps/chosen": -226.0528564453125, "ref_logps/rejected": -203.54913330078125, "rewards/accuracies": 0.6875, "rewards/chosen": -0.005106884986162186, "rewards/margins": 0.011965863406658173, "rewards/rejected": -0.01707274839282036, "step": 11 }, { "epoch": 0.0, "learning_rate": 5.7553956834532375e-09, "logps/chosen": -238.27603149414062, "logps/rejected": -240.8157196044922, "loss": 0.6832, "losses/dpo": 0.6698487401008606, "losses/sft": 0.4733908176422119, "losses/total": 0.6698487401008606, "ref_logps/chosen": -238.43630981445312, "ref_logps/rejected": -240.76112365722656, "rewards/accuracies": 0.625, "rewards/chosen": 0.01602914184331894, "rewards/margins": 0.021489525213837624, "rewards/rejected": -0.0054603805765509605, "step": 12 }, { "epoch": 0.0, "learning_rate": 6.2350119904076734e-09, "logps/chosen": -223.08953857421875, "logps/rejected": -228.435302734375, "loss": 0.6964, "losses/dpo": 0.7008641362190247, "losses/sft": 0.4583137035369873, "losses/total": 0.7008641362190247, "ref_logps/chosen": -223.0375213623047, "ref_logps/rejected": -228.4265899658203, "rewards/accuracies": 0.53125, "rewards/chosen": -0.005202519707381725, "rewards/margins": -0.004330779425799847, "rewards/rejected": -0.0008717416785657406, "step": 13 }, { "epoch": 0.0, "learning_rate": 6.71462829736211e-09, "logps/chosen": -268.0576477050781, "logps/rejected": -231.2375946044922, "loss": 0.7002, "losses/dpo": 0.7136955261230469, "losses/sft": 0.4998738765716553, "losses/total": 0.7136955261230469, "ref_logps/chosen": -268.08642578125, "ref_logps/rejected": -231.38967895507812, "rewards/accuracies": 0.5, "rewards/chosen": 0.0028741597197949886, "rewards/margins": -0.01233379915356636, "rewards/rejected": 0.015207958407700062, "step": 14 }, { "epoch": 0.0, "learning_rate": 7.194244604316546e-09, "logps/chosen": -172.2254638671875, "logps/rejected": -217.13748168945312, "loss": 0.6951, "losses/dpo": 0.7789156436920166, "losses/sft": 0.463234007358551, "losses/total": 0.7789156436920166, "ref_logps/chosen": -172.21835327148438, "ref_logps/rejected": -217.15478515625, "rewards/accuracies": 0.5, "rewards/chosen": -0.0007107567507773638, "rewards/margins": -0.002442675642669201, "rewards/rejected": 0.0017319191247224808, "step": 15 }, { "epoch": 0.0, "learning_rate": 7.673860911270983e-09, "logps/chosen": -222.18521118164062, "logps/rejected": -231.84127807617188, "loss": 0.6839, "losses/dpo": 0.6667888760566711, "losses/sft": 0.5182288885116577, "losses/total": 0.6667888760566711, "ref_logps/chosen": -222.25341796875, "ref_logps/rejected": -231.70138549804688, "rewards/accuracies": 0.625, "rewards/chosen": 0.006821859162300825, "rewards/margins": 0.020810546353459358, "rewards/rejected": -0.013988685794174671, "step": 16 }, { "epoch": 0.0, "learning_rate": 8.15347721822542e-09, "logps/chosen": -235.65390014648438, "logps/rejected": -222.43429565429688, "loss": 0.6865, "losses/dpo": 0.6585410833358765, "losses/sft": 0.5655278563499451, "losses/total": 0.6585410833358765, "ref_logps/chosen": -235.76455688476562, "ref_logps/rejected": -222.39419555664062, "rewards/accuracies": 0.5, "rewards/chosen": 0.011065607890486717, "rewards/margins": 0.015074377879500389, "rewards/rejected": -0.004008769057691097, "step": 17 }, { "epoch": 0.0, "learning_rate": 8.633093525179856e-09, "logps/chosen": -241.33787536621094, "logps/rejected": -234.54632568359375, "loss": 0.6797, "losses/dpo": 0.6333146095275879, "losses/sft": 0.6201242208480835, "losses/total": 0.6333146095275879, "ref_logps/chosen": -241.516357421875, "ref_logps/rejected": -234.44100952148438, "rewards/accuracies": 0.59375, "rewards/chosen": 0.017847366631031036, "rewards/margins": 0.028378620743751526, "rewards/rejected": -0.010531256906688213, "step": 18 }, { "epoch": 0.0, "learning_rate": 9.112709832134293e-09, "logps/chosen": -224.79901123046875, "logps/rejected": -221.52340698242188, "loss": 0.6753, "losses/dpo": 0.6575611233711243, "losses/sft": 0.5702490210533142, "losses/total": 0.6575611233711243, "ref_logps/chosen": -225.04833984375, "ref_logps/rejected": -221.396728515625, "rewards/accuracies": 0.71875, "rewards/chosen": 0.024935007095336914, "rewards/margins": 0.03760318458080292, "rewards/rejected": -0.012668181210756302, "step": 19 }, { "epoch": 0.0, "learning_rate": 9.592326139088728e-09, "logps/chosen": -248.1580810546875, "logps/rejected": -216.6111602783203, "loss": 0.6855, "losses/dpo": 0.6101447939872742, "losses/sft": 0.58149653673172, "losses/total": 0.6101447939872742, "ref_logps/chosen": -248.23043823242188, "ref_logps/rejected": -216.5164031982422, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0072329165413975716, "rewards/margins": 0.016706980764865875, "rewards/rejected": -0.009474064223468304, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.0071942446043165e-08, "logps/chosen": -211.05523681640625, "logps/rejected": -222.06698608398438, "loss": 0.6913, "losses/dpo": 0.7278509140014648, "losses/sft": 0.7115186452865601, "losses/total": 0.7278509140014648, "ref_logps/chosen": -211.1478271484375, "ref_logps/rejected": -222.10845947265625, "rewards/accuracies": 0.46875, "rewards/chosen": 0.0092597845941782, "rewards/margins": 0.005111610051244497, "rewards/rejected": 0.004148174077272415, "step": 21 }, { "epoch": 0.01, "learning_rate": 1.0551558752997602e-08, "logps/chosen": -186.68423461914062, "logps/rejected": -190.4842529296875, "loss": 0.7052, "losses/dpo": 0.8078962564468384, "losses/sft": 0.6679735779762268, "losses/total": 0.8078962564468384, "ref_logps/chosen": -186.53273010253906, "ref_logps/rejected": -190.55789184570312, "rewards/accuracies": 0.40625, "rewards/chosen": -0.01514910813421011, "rewards/margins": -0.022512998431921005, "rewards/rejected": 0.00736389122903347, "step": 22 }, { "epoch": 0.01, "learning_rate": 1.1031175059952037e-08, "logps/chosen": -256.63458251953125, "logps/rejected": -242.77621459960938, "loss": 0.6842, "losses/dpo": 0.6175757646560669, "losses/sft": 1.0776233673095703, "losses/total": 0.6175757646560669, "ref_logps/chosen": -256.7547912597656, "ref_logps/rejected": -242.70587158203125, "rewards/accuracies": 0.625, "rewards/chosen": 0.012022221460938454, "rewards/margins": 0.019056284800171852, "rewards/rejected": -0.007034063804894686, "step": 23 }, { "epoch": 0.01, "learning_rate": 1.1510791366906475e-08, "logps/chosen": -245.29763793945312, "logps/rejected": -224.24526977539062, "loss": 0.6748, "losses/dpo": 0.5607553720474243, "losses/sft": 0.7223778367042542, "losses/total": 0.5607553720474243, "ref_logps/chosen": -245.56512451171875, "ref_logps/rejected": -224.11923217773438, "rewards/accuracies": 0.65625, "rewards/chosen": 0.026748700067400932, "rewards/margins": 0.03935111314058304, "rewards/rejected": -0.012602413073182106, "step": 24 }, { "epoch": 0.01, "learning_rate": 1.1990407673860912e-08, "logps/chosen": -215.9717254638672, "logps/rejected": -203.93508911132812, "loss": 0.6952, "losses/dpo": 0.645915687084198, "losses/sft": 0.5546073317527771, "losses/total": 0.645915687084198, "ref_logps/chosen": -215.82223510742188, "ref_logps/rejected": -203.81454467773438, "rewards/accuracies": 0.53125, "rewards/chosen": -0.014948928728699684, "rewards/margins": -0.0028944136574864388, "rewards/rejected": -0.012054515071213245, "step": 25 }, { "epoch": 0.01, "learning_rate": 1.2470023980815347e-08, "logps/chosen": -224.75469970703125, "logps/rejected": -219.81045532226562, "loss": 0.6974, "losses/dpo": 0.7912802696228027, "losses/sft": 0.6783121228218079, "losses/total": 0.7912802696228027, "ref_logps/chosen": -224.6754150390625, "ref_logps/rejected": -219.7965545654297, "rewards/accuracies": 0.4375, "rewards/chosen": -0.007928472012281418, "rewards/margins": -0.006536121480166912, "rewards/rejected": -0.001392353093251586, "step": 26 }, { "epoch": 0.01, "learning_rate": 1.2949640287769784e-08, "logps/chosen": -222.8507080078125, "logps/rejected": -221.41018676757812, "loss": 0.696, "losses/dpo": 0.677315354347229, "losses/sft": 0.7283089756965637, "losses/total": 0.677315354347229, "ref_logps/chosen": -222.86117553710938, "ref_logps/rejected": -221.4599609375, "rewards/accuracies": 0.5, "rewards/chosen": 0.0010487977415323257, "rewards/margins": -0.003926068544387817, "rewards/rejected": 0.004974866285920143, "step": 27 }, { "epoch": 0.01, "learning_rate": 1.342925659472422e-08, "logps/chosen": -197.4875946044922, "logps/rejected": -228.24612426757812, "loss": 0.6947, "losses/dpo": 0.6872913241386414, "losses/sft": 0.44340550899505615, "losses/total": 0.6872913241386414, "ref_logps/chosen": -197.37225341796875, "ref_logps/rejected": -228.1461181640625, "rewards/accuracies": 0.5, "rewards/chosen": -0.011532868258655071, "rewards/margins": -0.001532461028546095, "rewards/rejected": -0.010000407695770264, "step": 28 }, { "epoch": 0.01, "learning_rate": 1.3908872901678655e-08, "logps/chosen": -243.61630249023438, "logps/rejected": -224.24642944335938, "loss": 0.6978, "losses/dpo": 0.7187615633010864, "losses/sft": 0.6433253288269043, "losses/total": 0.7187615633010864, "ref_logps/chosen": -243.58395385742188, "ref_logps/rejected": -224.29566955566406, "rewards/accuracies": 0.46875, "rewards/chosen": -0.00323600135743618, "rewards/margins": -0.008159088902175426, "rewards/rejected": 0.004923087544739246, "step": 29 }, { "epoch": 0.01, "learning_rate": 1.4388489208633092e-08, "logps/chosen": -219.4247283935547, "logps/rejected": -218.03570556640625, "loss": 0.6954, "losses/dpo": 0.809138298034668, "losses/sft": 1.0508685111999512, "losses/total": 0.809138298034668, "ref_logps/chosen": -219.2953338623047, "ref_logps/rejected": -217.93710327148438, "rewards/accuracies": 0.5, "rewards/chosen": -0.012938402593135834, "rewards/margins": -0.0030781980603933334, "rewards/rejected": -0.009860205464065075, "step": 30 }, { "epoch": 0.01, "learning_rate": 1.4868105515587529e-08, "logps/chosen": -185.1855926513672, "logps/rejected": -217.99859619140625, "loss": 0.7077, "losses/dpo": 0.713720440864563, "losses/sft": 0.5114263892173767, "losses/total": 0.713720440864563, "ref_logps/chosen": -185.0439910888672, "ref_logps/rejected": -218.1269989013672, "rewards/accuracies": 0.375, "rewards/chosen": -0.014160919934511185, "rewards/margins": -0.027000270783901215, "rewards/rejected": 0.012839353643357754, "step": 31 }, { "epoch": 0.01, "learning_rate": 1.5347721822541966e-08, "logps/chosen": -233.45773315429688, "logps/rejected": -217.12294006347656, "loss": 0.6859, "losses/dpo": 0.7353055477142334, "losses/sft": 0.5802497267723083, "losses/total": 0.7353055477142334, "ref_logps/chosen": -233.4857635498047, "ref_logps/rejected": -216.99427795410156, "rewards/accuracies": 0.625, "rewards/chosen": 0.0028023591730743647, "rewards/margins": 0.015670321881771088, "rewards/rejected": -0.012867962941527367, "step": 32 }, { "epoch": 0.01, "learning_rate": 1.58273381294964e-08, "logps/chosen": -201.7538604736328, "logps/rejected": -213.19798278808594, "loss": 0.7034, "losses/dpo": 0.7221289873123169, "losses/sft": 0.69758141040802, "losses/total": 0.7221289873123169, "ref_logps/chosen": -201.49549865722656, "ref_logps/rejected": -213.12582397460938, "rewards/accuracies": 0.375, "rewards/chosen": -0.025837333872914314, "rewards/margins": -0.01862039417028427, "rewards/rejected": -0.007216935977339745, "step": 33 }, { "epoch": 0.01, "learning_rate": 1.630695443645084e-08, "logps/chosen": -214.36306762695312, "logps/rejected": -226.4154052734375, "loss": 0.694, "losses/dpo": 0.7063605189323425, "losses/sft": 0.4379614591598511, "losses/total": 0.7063605189323425, "ref_logps/chosen": -214.29627990722656, "ref_logps/rejected": -226.3500213623047, "rewards/accuracies": 0.5, "rewards/chosen": -0.0066780271008610725, "rewards/margins": -0.00013813376426696777, "rewards/rejected": -0.00653989240527153, "step": 34 }, { "epoch": 0.01, "learning_rate": 1.6786570743405277e-08, "logps/chosen": -197.10842895507812, "logps/rejected": -208.14047241210938, "loss": 0.6826, "losses/dpo": 0.7181844711303711, "losses/sft": 0.5392345786094666, "losses/total": 0.7181844711303711, "ref_logps/chosen": -197.16732788085938, "ref_logps/rejected": -207.9783935546875, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005890691187232733, "rewards/margins": 0.022097958251833916, "rewards/rejected": -0.016207266598939896, "step": 35 }, { "epoch": 0.01, "learning_rate": 1.7266187050359713e-08, "logps/chosen": -205.97183227539062, "logps/rejected": -212.31854248046875, "loss": 0.6977, "losses/dpo": 0.7123667597770691, "losses/sft": 0.5793213844299316, "losses/total": 0.7123667597770691, "ref_logps/chosen": -206.0484161376953, "ref_logps/rejected": -212.47242736816406, "rewards/accuracies": 0.46875, "rewards/chosen": 0.007658963557332754, "rewards/margins": -0.007729287259280682, "rewards/rejected": 0.015388250350952148, "step": 36 }, { "epoch": 0.01, "learning_rate": 1.7745803357314148e-08, "logps/chosen": -207.560546875, "logps/rejected": -216.02842712402344, "loss": 0.6854, "losses/dpo": 0.6512342691421509, "losses/sft": 0.6457773447036743, "losses/total": 0.6512342691421509, "ref_logps/chosen": -207.574462890625, "ref_logps/rejected": -215.8651123046875, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0013919173507019877, "rewards/margins": 0.017723828554153442, "rewards/rejected": -0.016331912949681282, "step": 37 }, { "epoch": 0.01, "learning_rate": 1.8225419664268586e-08, "logps/chosen": -189.58447265625, "logps/rejected": -206.37522888183594, "loss": 0.6901, "losses/dpo": 0.69176185131073, "losses/sft": 0.48817819356918335, "losses/total": 0.69176185131073, "ref_logps/chosen": -189.5853271484375, "ref_logps/rejected": -206.30523681640625, "rewards/accuracies": 0.5, "rewards/chosen": 8.381949737668037e-05, "rewards/margins": 0.007083646021783352, "rewards/rejected": -0.006999826990067959, "step": 38 }, { "epoch": 0.01, "learning_rate": 1.870503597122302e-08, "logps/chosen": -211.13250732421875, "logps/rejected": -218.1571044921875, "loss": 0.6957, "losses/dpo": 0.5950769186019897, "losses/sft": 0.8146270513534546, "losses/total": 0.5950769186019897, "ref_logps/chosen": -211.0177001953125, "ref_logps/rejected": -218.08157348632812, "rewards/accuracies": 0.46875, "rewards/chosen": -0.01147910300642252, "rewards/margins": -0.003924371208995581, "rewards/rejected": -0.007554733660072088, "step": 39 }, { "epoch": 0.01, "learning_rate": 1.9184652278177456e-08, "logps/chosen": -235.50601196289062, "logps/rejected": -222.30746459960938, "loss": 0.6906, "losses/dpo": 0.6827842593193054, "losses/sft": 0.5948323011398315, "losses/total": 0.6827842593193054, "ref_logps/chosen": -235.40170288085938, "ref_logps/rejected": -222.13043212890625, "rewards/accuracies": 0.5625, "rewards/chosen": -0.01042894646525383, "rewards/margins": 0.007272547576576471, "rewards/rejected": -0.017701495438814163, "step": 40 }, { "epoch": 0.01, "learning_rate": 1.9664268585131895e-08, "logps/chosen": -189.27490234375, "logps/rejected": -217.73687744140625, "loss": 0.6795, "losses/dpo": 0.6725283861160278, "losses/sft": 0.4705185294151306, "losses/total": 0.6725283861160278, "ref_logps/chosen": -189.46490478515625, "ref_logps/rejected": -217.64382934570312, "rewards/accuracies": 0.625, "rewards/chosen": 0.019000083208084106, "rewards/margins": 0.02830422669649124, "rewards/rejected": -0.00930414255708456, "step": 41 }, { "epoch": 0.01, "learning_rate": 2.014388489208633e-08, "logps/chosen": -193.93833923339844, "logps/rejected": -213.3574676513672, "loss": 0.6977, "losses/dpo": 0.6685161590576172, "losses/sft": 0.38773113489151, "losses/total": 0.6685161590576172, "ref_logps/chosen": -193.80227661132812, "ref_logps/rejected": -213.29908752441406, "rewards/accuracies": 0.40625, "rewards/chosen": -0.013606108725070953, "rewards/margins": -0.007766854017972946, "rewards/rejected": -0.005839252844452858, "step": 42 }, { "epoch": 0.01, "learning_rate": 2.0623501199040765e-08, "logps/chosen": -252.6197052001953, "logps/rejected": -226.04995727539062, "loss": 0.6955, "losses/dpo": 0.7268983125686646, "losses/sft": 0.5932801365852356, "losses/total": 0.7268983125686646, "ref_logps/chosen": -252.68643188476562, "ref_logps/rejected": -226.15179443359375, "rewards/accuracies": 0.4375, "rewards/chosen": 0.006670677103102207, "rewards/margins": -0.003512322437018156, "rewards/rejected": 0.010183000937104225, "step": 43 }, { "epoch": 0.01, "learning_rate": 2.1103117505995203e-08, "logps/chosen": -235.77630615234375, "logps/rejected": -244.09078979492188, "loss": 0.6979, "losses/dpo": 0.7177029252052307, "losses/sft": 0.5535863041877747, "losses/total": 0.7177029252052307, "ref_logps/chosen": -235.81887817382812, "ref_logps/rejected": -244.2152557373047, "rewards/accuracies": 0.40625, "rewards/chosen": 0.0042563192546367645, "rewards/margins": -0.008189273066818714, "rewards/rejected": 0.012445593252778053, "step": 44 }, { "epoch": 0.01, "learning_rate": 2.1582733812949638e-08, "logps/chosen": -197.5068817138672, "logps/rejected": -216.68121337890625, "loss": 0.6864, "losses/dpo": 0.6721041202545166, "losses/sft": 0.4915233850479126, "losses/total": 0.6721041202545166, "ref_logps/chosen": -197.61639404296875, "ref_logps/rejected": -216.64016723632812, "rewards/accuracies": 0.59375, "rewards/chosen": 0.010950400494039059, "rewards/margins": 0.015054966323077679, "rewards/rejected": -0.004104567226022482, "step": 45 }, { "epoch": 0.01, "learning_rate": 2.2062350119904073e-08, "logps/chosen": -183.44943237304688, "logps/rejected": -213.3627166748047, "loss": 0.696, "losses/dpo": 0.6813101172447205, "losses/sft": 0.6016968488693237, "losses/total": 0.6813101172447205, "ref_logps/chosen": -183.44973754882812, "ref_logps/rejected": -213.40567016601562, "rewards/accuracies": 0.53125, "rewards/chosen": 3.295857459306717e-05, "rewards/margins": -0.004262986592948437, "rewards/rejected": 0.004295945633202791, "step": 46 }, { "epoch": 0.01, "learning_rate": 2.254196642685851e-08, "logps/chosen": -214.40921020507812, "logps/rejected": -202.41232299804688, "loss": 0.6866, "losses/dpo": 0.6039983630180359, "losses/sft": 0.5450809001922607, "losses/total": 0.6039983630180359, "ref_logps/chosen": -214.48484802246094, "ref_logps/rejected": -202.34368896484375, "rewards/accuracies": 0.59375, "rewards/chosen": 0.007562707643955946, "rewards/margins": 0.014427297748625278, "rewards/rejected": -0.0068645901046693325, "step": 47 }, { "epoch": 0.01, "learning_rate": 2.302158273381295e-08, "logps/chosen": -246.068115234375, "logps/rejected": -240.49893188476562, "loss": 0.6827, "losses/dpo": 0.703313410282135, "losses/sft": 0.6548411846160889, "losses/total": 0.703313410282135, "ref_logps/chosen": -246.12924194335938, "ref_logps/rejected": -240.32579040527344, "rewards/accuracies": 0.59375, "rewards/chosen": 0.006113573908805847, "rewards/margins": 0.02342676743865013, "rewards/rejected": -0.017313193529844284, "step": 48 }, { "epoch": 0.01, "learning_rate": 2.3501199040767385e-08, "logps/chosen": -228.2919921875, "logps/rejected": -230.7275390625, "loss": 0.6823, "losses/dpo": 0.755733072757721, "losses/sft": 0.8291958570480347, "losses/total": 0.755733072757721, "ref_logps/chosen": -228.39027404785156, "ref_logps/rejected": -230.58786010742188, "rewards/accuracies": 0.59375, "rewards/chosen": 0.009829278104007244, "rewards/margins": 0.023797960951924324, "rewards/rejected": -0.013968681916594505, "step": 49 }, { "epoch": 0.01, "learning_rate": 2.3980815347721823e-08, "logps/chosen": -228.61349487304688, "logps/rejected": -247.74160766601562, "loss": 0.6887, "losses/dpo": 0.6716427206993103, "losses/sft": 0.7147995233535767, "losses/total": 0.6716427206993103, "ref_logps/chosen": -228.46707153320312, "ref_logps/rejected": -247.49159240722656, "rewards/accuracies": 0.59375, "rewards/chosen": -0.014642138034105301, "rewards/margins": 0.010360199958086014, "rewards/rejected": -0.025002337992191315, "step": 50 }, { "epoch": 0.01, "learning_rate": 2.446043165467626e-08, "logps/chosen": -242.04779052734375, "logps/rejected": -227.67979431152344, "loss": 0.6766, "losses/dpo": 0.6276530027389526, "losses/sft": 0.512235701084137, "losses/total": 0.6276530027389526, "ref_logps/chosen": -242.1258544921875, "ref_logps/rejected": -227.40536499023438, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007807434536516666, "rewards/margins": 0.03525057062506676, "rewards/rejected": -0.027443135157227516, "step": 51 }, { "epoch": 0.01, "learning_rate": 2.4940047961630694e-08, "logps/chosen": -222.39056396484375, "logps/rejected": -246.11322021484375, "loss": 0.6886, "losses/dpo": 0.710288405418396, "losses/sft": 0.8502694964408875, "losses/total": 0.710288405418396, "ref_logps/chosen": -222.41888427734375, "ref_logps/rejected": -246.0380859375, "rewards/accuracies": 0.53125, "rewards/chosen": 0.002831632737070322, "rewards/margins": 0.010343475267291069, "rewards/rejected": -0.00751184206455946, "step": 52 }, { "epoch": 0.01, "learning_rate": 2.541966426858513e-08, "logps/chosen": -223.98715209960938, "logps/rejected": -232.4593505859375, "loss": 0.6869, "losses/dpo": 0.7377870082855225, "losses/sft": 0.5255107879638672, "losses/total": 0.7377870082855225, "ref_logps/chosen": -224.12554931640625, "ref_logps/rejected": -232.45608520507812, "rewards/accuracies": 0.59375, "rewards/chosen": 0.013841211795806885, "rewards/margins": 0.014165878295898438, "rewards/rejected": -0.0003246662672609091, "step": 53 }, { "epoch": 0.01, "learning_rate": 2.5899280575539567e-08, "logps/chosen": -198.27920532226562, "logps/rejected": -231.909912109375, "loss": 0.6919, "losses/dpo": 0.6897822022438049, "losses/sft": 0.5177935361862183, "losses/total": 0.6897822022438049, "ref_logps/chosen": -198.12969970703125, "ref_logps/rejected": -231.72625732421875, "rewards/accuracies": 0.40625, "rewards/chosen": -0.014949977397918701, "rewards/margins": 0.0034154660534113646, "rewards/rejected": -0.018365442752838135, "step": 54 }, { "epoch": 0.01, "learning_rate": 2.6378896882494006e-08, "logps/chosen": -237.87985229492188, "logps/rejected": -243.81155395507812, "loss": 0.6966, "losses/dpo": 0.7317380905151367, "losses/sft": 0.549078106880188, "losses/total": 0.7317380905151367, "ref_logps/chosen": -237.7666473388672, "ref_logps/rejected": -243.7567138671875, "rewards/accuracies": 0.40625, "rewards/chosen": -0.01131841354072094, "rewards/margins": -0.005834410898387432, "rewards/rejected": -0.0054840026423335075, "step": 55 }, { "epoch": 0.01, "learning_rate": 2.685851318944844e-08, "logps/chosen": -211.71875, "logps/rejected": -241.13546752929688, "loss": 0.6837, "losses/dpo": 0.6768530011177063, "losses/sft": 0.7404103875160217, "losses/total": 0.6768530011177063, "ref_logps/chosen": -211.79290771484375, "ref_logps/rejected": -241.0068817138672, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007415605243295431, "rewards/margins": 0.02027425915002823, "rewards/rejected": -0.012858652509748936, "step": 56 }, { "epoch": 0.01, "learning_rate": 2.733812949640288e-08, "logps/chosen": -209.9762420654297, "logps/rejected": -195.5013885498047, "loss": 0.6789, "losses/dpo": 0.639587938785553, "losses/sft": 0.6233201622962952, "losses/total": 0.639587938785553, "ref_logps/chosen": -209.92486572265625, "ref_logps/rejected": -195.1511688232422, "rewards/accuracies": 0.625, "rewards/chosen": -0.005138188600540161, "rewards/margins": 0.029883116483688354, "rewards/rejected": -0.035021305084228516, "step": 57 }, { "epoch": 0.01, "learning_rate": 2.781774580335731e-08, "logps/chosen": -277.1181640625, "logps/rejected": -223.09933471679688, "loss": 0.6874, "losses/dpo": 0.6940524578094482, "losses/sft": 0.6153870820999146, "losses/total": 0.6940524578094482, "ref_logps/chosen": -277.1900634765625, "ref_logps/rejected": -223.04290771484375, "rewards/accuracies": 0.6875, "rewards/chosen": 0.007188671268522739, "rewards/margins": 0.012833625078201294, "rewards/rejected": -0.005644953344017267, "step": 58 }, { "epoch": 0.01, "learning_rate": 2.8297362110311752e-08, "logps/chosen": -214.34803771972656, "logps/rejected": -256.10711669921875, "loss": 0.6777, "losses/dpo": 0.5825490355491638, "losses/sft": 0.5152746438980103, "losses/total": 0.5825490355491638, "ref_logps/chosen": -214.44253540039062, "ref_logps/rejected": -255.87142944335938, "rewards/accuracies": 0.71875, "rewards/chosen": 0.009448528289794922, "rewards/margins": 0.03301580250263214, "rewards/rejected": -0.02356727048754692, "step": 59 }, { "epoch": 0.01, "learning_rate": 2.8776978417266184e-08, "logps/chosen": -217.6726837158203, "logps/rejected": -229.48915100097656, "loss": 0.6838, "losses/dpo": 0.7698537707328796, "losses/sft": 0.5255835056304932, "losses/total": 0.7698537707328796, "ref_logps/chosen": -217.64987182617188, "ref_logps/rejected": -229.2611083984375, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0022796811535954475, "rewards/margins": 0.020525163039565086, "rewards/rejected": -0.02280484512448311, "step": 60 }, { "epoch": 0.01, "learning_rate": 2.9256594724220623e-08, "logps/chosen": -193.41453552246094, "logps/rejected": -230.86428833007812, "loss": 0.701, "losses/dpo": 0.6977383494377136, "losses/sft": 0.5149680376052856, "losses/total": 0.6977383494377136, "ref_logps/chosen": -193.3206787109375, "ref_logps/rejected": -230.91612243652344, "rewards/accuracies": 0.5, "rewards/chosen": -0.009385532699525356, "rewards/margins": -0.014568096026778221, "rewards/rejected": 0.0051825642585754395, "step": 61 }, { "epoch": 0.01, "learning_rate": 2.9736211031175058e-08, "logps/chosen": -194.19955444335938, "logps/rejected": -225.0723419189453, "loss": 0.6848, "losses/dpo": 0.7079966068267822, "losses/sft": 0.325137197971344, "losses/total": 0.7079966068267822, "ref_logps/chosen": -194.28680419921875, "ref_logps/rejected": -224.9665069580078, "rewards/accuracies": 0.5625, "rewards/chosen": 0.008726870641112328, "rewards/margins": 0.019311178475618362, "rewards/rejected": -0.01058430690318346, "step": 62 }, { "epoch": 0.02, "learning_rate": 3.0215827338129496e-08, "logps/chosen": -229.43360900878906, "logps/rejected": -248.010498046875, "loss": 0.6873, "losses/dpo": 0.6740909814834595, "losses/sft": 0.7446208000183105, "losses/total": 0.6740909814834595, "ref_logps/chosen": -229.43751525878906, "ref_logps/rejected": -247.87669372558594, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00039091240614652634, "rewards/margins": 0.0137720238417387, "rewards/rejected": -0.01338111236691475, "step": 63 }, { "epoch": 0.02, "learning_rate": 3.069544364508393e-08, "logps/chosen": -206.550537109375, "logps/rejected": -215.46652221679688, "loss": 0.683, "losses/dpo": 0.6350373029708862, "losses/sft": 0.6437579989433289, "losses/total": 0.6350373029708862, "ref_logps/chosen": -206.61911010742188, "ref_logps/rejected": -215.31381225585938, "rewards/accuracies": 0.625, "rewards/chosen": 0.0068579381331801414, "rewards/margins": 0.02212909795343876, "rewards/rejected": -0.015271165408194065, "step": 64 }, { "epoch": 0.02, "learning_rate": 3.1175059952038366e-08, "logps/chosen": -243.44110107421875, "logps/rejected": -217.99044799804688, "loss": 0.6835, "losses/dpo": 0.703578531742096, "losses/sft": 0.7763662934303284, "losses/total": 0.703578531742096, "ref_logps/chosen": -243.49514770507812, "ref_logps/rejected": -217.83792114257812, "rewards/accuracies": 0.625, "rewards/chosen": 0.005403417628258467, "rewards/margins": 0.02065562643110752, "rewards/rejected": -0.015252209268510342, "step": 65 }, { "epoch": 0.02, "learning_rate": 3.16546762589928e-08, "logps/chosen": -254.1915283203125, "logps/rejected": -233.11886596679688, "loss": 0.6788, "losses/dpo": 0.7148997783660889, "losses/sft": 0.5967684984207153, "losses/total": 0.7148997783660889, "ref_logps/chosen": -254.19125366210938, "ref_logps/rejected": -232.80633544921875, "rewards/accuracies": 0.59375, "rewards/chosen": -2.572033554315567e-05, "rewards/margins": 0.031224101781845093, "rewards/rejected": -0.031249821186065674, "step": 66 }, { "epoch": 0.02, "learning_rate": 3.213429256594724e-08, "logps/chosen": -202.1600799560547, "logps/rejected": -221.42327880859375, "loss": 0.6927, "losses/dpo": 0.7298646569252014, "losses/sft": 0.5785337686538696, "losses/total": 0.7298646569252014, "ref_logps/chosen": -202.04751586914062, "ref_logps/rejected": -221.28628540039062, "rewards/accuracies": 0.625, "rewards/chosen": -0.011255128309130669, "rewards/margins": 0.002444516634568572, "rewards/rejected": -0.013699645176529884, "step": 67 }, { "epoch": 0.02, "learning_rate": 3.261390887290168e-08, "logps/chosen": -221.13748168945312, "logps/rejected": -234.50753784179688, "loss": 0.673, "losses/dpo": 0.6859936714172363, "losses/sft": 0.5644641518592834, "losses/total": 0.6859936714172363, "ref_logps/chosen": -221.21273803710938, "ref_logps/rejected": -234.15638732910156, "rewards/accuracies": 0.625, "rewards/chosen": 0.00752527778968215, "rewards/margins": 0.04264016076922417, "rewards/rejected": -0.03511488437652588, "step": 68 }, { "epoch": 0.02, "learning_rate": 3.309352517985611e-08, "logps/chosen": -230.84068298339844, "logps/rejected": -245.29965209960938, "loss": 0.6777, "losses/dpo": 0.6829096078872681, "losses/sft": 0.4659244418144226, "losses/total": 0.6829096078872681, "ref_logps/chosen": -230.90447998046875, "ref_logps/rejected": -245.0372314453125, "rewards/accuracies": 0.625, "rewards/chosen": 0.006380748935043812, "rewards/margins": 0.03262268379330635, "rewards/rejected": -0.026241932064294815, "step": 69 }, { "epoch": 0.02, "learning_rate": 3.3573141486810555e-08, "logps/chosen": -251.9886932373047, "logps/rejected": -229.9258575439453, "loss": 0.6847, "losses/dpo": 0.6709324717521667, "losses/sft": 0.6153463125228882, "losses/total": 0.6709324717521667, "ref_logps/chosen": -251.97955322265625, "ref_logps/rejected": -229.73423767089844, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0009126723743975163, "rewards/margins": 0.01824953593313694, "rewards/rejected": -0.01916220784187317, "step": 70 }, { "epoch": 0.02, "learning_rate": 3.4052757793764983e-08, "logps/chosen": -238.90435791015625, "logps/rejected": -232.46437072753906, "loss": 0.6663, "losses/dpo": 0.6291494369506836, "losses/sft": 0.7627854943275452, "losses/total": 0.6291494369506836, "ref_logps/chosen": -239.00155639648438, "ref_logps/rejected": -231.9947967529297, "rewards/accuracies": 0.625, "rewards/chosen": 0.009718602523207664, "rewards/margins": 0.056674763560295105, "rewards/rejected": -0.04695615917444229, "step": 71 }, { "epoch": 0.02, "learning_rate": 3.4532374100719425e-08, "logps/chosen": -215.9368896484375, "logps/rejected": -206.9634246826172, "loss": 0.6731, "losses/dpo": 0.729106068611145, "losses/sft": 0.7982183694839478, "losses/total": 0.729106068611145, "ref_logps/chosen": -216.05760192871094, "ref_logps/rejected": -206.6676483154297, "rewards/accuracies": 0.75, "rewards/chosen": 0.012069761753082275, "rewards/margins": 0.04164770990610123, "rewards/rejected": -0.02957794815301895, "step": 72 }, { "epoch": 0.02, "learning_rate": 3.501199040767386e-08, "logps/chosen": -261.9966125488281, "logps/rejected": -239.4683837890625, "loss": 0.6677, "losses/dpo": 0.6332001090049744, "losses/sft": 0.5268372893333435, "losses/total": 0.6332001090049744, "ref_logps/chosen": -262.0168762207031, "ref_logps/rejected": -238.9566650390625, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0020241618622094393, "rewards/margins": 0.05319586023688316, "rewards/rejected": -0.05117169767618179, "step": 73 }, { "epoch": 0.02, "learning_rate": 3.5491606714628295e-08, "logps/chosen": -214.4657440185547, "logps/rejected": -204.89346313476562, "loss": 0.6801, "losses/dpo": 0.6749098896980286, "losses/sft": 0.37936365604400635, "losses/total": 0.6749098896980286, "ref_logps/chosen": -214.4757080078125, "ref_logps/rejected": -204.62625122070312, "rewards/accuracies": 0.6875, "rewards/chosen": 0.000995695125311613, "rewards/margins": 0.02771618217229843, "rewards/rejected": -0.02672048844397068, "step": 74 }, { "epoch": 0.02, "learning_rate": 3.597122302158273e-08, "logps/chosen": -259.1119079589844, "logps/rejected": -244.95089721679688, "loss": 0.6848, "losses/dpo": 0.6784316301345825, "losses/sft": 0.6817365884780884, "losses/total": 0.6784316301345825, "ref_logps/chosen": -258.9774475097656, "ref_logps/rejected": -244.63723754882812, "rewards/accuracies": 0.65625, "rewards/chosen": -0.013446971774101257, "rewards/margins": 0.017918363213539124, "rewards/rejected": -0.03136533498764038, "step": 75 }, { "epoch": 0.02, "learning_rate": 3.645083932853717e-08, "logps/chosen": -266.10565185546875, "logps/rejected": -259.83770751953125, "loss": 0.6808, "losses/dpo": 0.6278535723686218, "losses/sft": 0.5645913481712341, "losses/total": 0.6278535723686218, "ref_logps/chosen": -266.0711669921875, "ref_logps/rejected": -259.5297546386719, "rewards/accuracies": 0.5, "rewards/chosen": -0.0034498097375035286, "rewards/margins": 0.02734585851430893, "rewards/rejected": -0.030795671045780182, "step": 76 }, { "epoch": 0.02, "learning_rate": 3.69304556354916e-08, "logps/chosen": -228.33583068847656, "logps/rejected": -231.5210723876953, "loss": 0.6626, "losses/dpo": 0.6252620220184326, "losses/sft": 0.5419886112213135, "losses/total": 0.6252620220184326, "ref_logps/chosen": -228.40411376953125, "ref_logps/rejected": -230.94354248046875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006828535348176956, "rewards/margins": 0.064580537378788, "rewards/rejected": -0.05775200203061104, "step": 77 }, { "epoch": 0.02, "learning_rate": 3.741007194244604e-08, "logps/chosen": -250.23226928710938, "logps/rejected": -220.04351806640625, "loss": 0.667, "losses/dpo": 0.6530924439430237, "losses/sft": 0.49439704418182373, "losses/total": 0.6530924439430237, "ref_logps/chosen": -250.24057006835938, "ref_logps/rejected": -219.5004425048828, "rewards/accuracies": 0.6875, "rewards/chosen": 0.000829547643661499, "rewards/margins": 0.05513736978173256, "rewards/rejected": -0.05430781841278076, "step": 78 }, { "epoch": 0.02, "learning_rate": 3.788968824940048e-08, "logps/chosen": -202.3976287841797, "logps/rejected": -210.40658569335938, "loss": 0.666, "losses/dpo": 0.5825504064559937, "losses/sft": 0.5924239754676819, "losses/total": 0.5825504064559937, "ref_logps/chosen": -202.3585662841797, "ref_logps/rejected": -209.80018615722656, "rewards/accuracies": 0.75, "rewards/chosen": -0.003906333353370428, "rewards/margins": 0.056735992431640625, "rewards/rejected": -0.06064232811331749, "step": 79 }, { "epoch": 0.02, "learning_rate": 3.836930455635491e-08, "logps/chosen": -234.4703826904297, "logps/rejected": -203.76760864257812, "loss": 0.6725, "losses/dpo": 0.6825305819511414, "losses/sft": 0.7381433248519897, "losses/total": 0.6825305819511414, "ref_logps/chosen": -234.48983764648438, "ref_logps/rejected": -203.35232543945312, "rewards/accuracies": 0.71875, "rewards/chosen": 0.0019444981589913368, "rewards/margins": 0.04347212612628937, "rewards/rejected": -0.041527628898620605, "step": 80 }, { "epoch": 0.02, "learning_rate": 3.8848920863309354e-08, "logps/chosen": -235.16729736328125, "logps/rejected": -221.92327880859375, "loss": 0.6686, "losses/dpo": 0.6495444178581238, "losses/sft": 0.4707401990890503, "losses/total": 0.6495444178581238, "ref_logps/chosen": -235.24484252929688, "ref_logps/rejected": -221.48414611816406, "rewards/accuracies": 0.8125, "rewards/chosen": 0.007754446007311344, "rewards/margins": 0.05166778713464737, "rewards/rejected": -0.0439133420586586, "step": 81 }, { "epoch": 0.02, "learning_rate": 3.932853717026379e-08, "logps/chosen": -182.38058471679688, "logps/rejected": -189.5399169921875, "loss": 0.6726, "losses/dpo": 0.6635864973068237, "losses/sft": 0.9190435409545898, "losses/total": 0.6635864973068237, "ref_logps/chosen": -182.4701385498047, "ref_logps/rejected": -189.20066833496094, "rewards/accuracies": 0.6875, "rewards/chosen": 0.008954541757702827, "rewards/margins": 0.042877860367298126, "rewards/rejected": -0.03392331674695015, "step": 82 }, { "epoch": 0.02, "learning_rate": 3.9808153477218224e-08, "logps/chosen": -221.51907348632812, "logps/rejected": -223.6055450439453, "loss": 0.6677, "losses/dpo": 0.6766473650932312, "losses/sft": 0.5655488967895508, "losses/total": 0.6766473650932312, "ref_logps/chosen": -221.6224365234375, "ref_logps/rejected": -223.17088317871094, "rewards/accuracies": 0.71875, "rewards/chosen": 0.010334836319088936, "rewards/margins": 0.05380362272262573, "rewards/rejected": -0.043468788266181946, "step": 83 }, { "epoch": 0.02, "learning_rate": 4.028776978417266e-08, "logps/chosen": -196.75469970703125, "logps/rejected": -218.39703369140625, "loss": 0.6748, "losses/dpo": 0.7024767994880676, "losses/sft": 0.5372939109802246, "losses/total": 0.7024767994880676, "ref_logps/chosen": -196.75013732910156, "ref_logps/rejected": -218.00607299804688, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00045683979988098145, "rewards/margins": 0.03863915801048279, "rewards/rejected": -0.03909599781036377, "step": 84 }, { "epoch": 0.02, "learning_rate": 4.07673860911271e-08, "logps/chosen": -224.00885009765625, "logps/rejected": -204.05685424804688, "loss": 0.6626, "losses/dpo": 0.6893184781074524, "losses/sft": 0.41095682978630066, "losses/total": 0.6893184781074524, "ref_logps/chosen": -224.0706787109375, "ref_logps/rejected": -203.48275756835938, "rewards/accuracies": 0.71875, "rewards/chosen": 0.006182373035699129, "rewards/margins": 0.06359381973743439, "rewards/rejected": -0.057411447167396545, "step": 85 }, { "epoch": 0.02, "learning_rate": 4.124700239808153e-08, "logps/chosen": -218.33792114257812, "logps/rejected": -228.09213256835938, "loss": 0.6651, "losses/dpo": 0.7062862515449524, "losses/sft": 1.0745428800582886, "losses/total": 0.7062862515449524, "ref_logps/chosen": -218.41900634765625, "ref_logps/rejected": -227.58541870117188, "rewards/accuracies": 0.71875, "rewards/chosen": 0.008108992129564285, "rewards/margins": 0.05878119915723801, "rewards/rejected": -0.05067221075296402, "step": 86 }, { "epoch": 0.02, "learning_rate": 4.172661870503597e-08, "logps/chosen": -238.63790893554688, "logps/rejected": -235.0852813720703, "loss": 0.6567, "losses/dpo": 0.7122781872749329, "losses/sft": 1.1831244230270386, "losses/total": 0.7122781872749329, "ref_logps/chosen": -238.71925354003906, "ref_logps/rejected": -234.3883514404297, "rewards/accuracies": 0.6875, "rewards/chosen": 0.00813357811421156, "rewards/margins": 0.07782801985740662, "rewards/rejected": -0.06969444453716278, "step": 87 }, { "epoch": 0.02, "learning_rate": 4.2206235011990406e-08, "logps/chosen": -220.25213623046875, "logps/rejected": -223.21102905273438, "loss": 0.6672, "losses/dpo": 0.6325732469558716, "losses/sft": 0.34492382407188416, "losses/total": 0.6325732469558716, "ref_logps/chosen": -220.35528564453125, "ref_logps/rejected": -222.7693328857422, "rewards/accuracies": 0.71875, "rewards/chosen": 0.010313677601516247, "rewards/margins": 0.054484713822603226, "rewards/rejected": -0.044171035289764404, "step": 88 }, { "epoch": 0.02, "learning_rate": 4.268585131894484e-08, "logps/chosen": -214.237060546875, "logps/rejected": -223.29531860351562, "loss": 0.6618, "losses/dpo": 0.6884236335754395, "losses/sft": 0.5875878930091858, "losses/total": 0.6884236335754395, "ref_logps/chosen": -214.28042602539062, "ref_logps/rejected": -222.6896514892578, "rewards/accuracies": 0.8125, "rewards/chosen": 0.004338555037975311, "rewards/margins": 0.06490659713745117, "rewards/rejected": -0.06056804955005646, "step": 89 }, { "epoch": 0.02, "learning_rate": 4.3165467625899276e-08, "logps/chosen": -230.1966552734375, "logps/rejected": -238.5352325439453, "loss": 0.6606, "losses/dpo": 0.5695153474807739, "losses/sft": 0.7184792757034302, "losses/total": 0.5695153474807739, "ref_logps/chosen": -230.25885009765625, "ref_logps/rejected": -237.91612243652344, "rewards/accuracies": 0.75, "rewards/chosen": 0.006221175193786621, "rewards/margins": 0.0681314617395401, "rewards/rejected": -0.06191028282046318, "step": 90 }, { "epoch": 0.02, "learning_rate": 4.364508393285372e-08, "logps/chosen": -246.60525512695312, "logps/rejected": -236.91293334960938, "loss": 0.6636, "losses/dpo": 0.6956757307052612, "losses/sft": 0.4732869863510132, "losses/total": 0.6956757307052612, "ref_logps/chosen": -246.51275634765625, "ref_logps/rejected": -236.20443725585938, "rewards/accuracies": 0.78125, "rewards/chosen": -0.009248383343219757, "rewards/margins": 0.06160058453679085, "rewards/rejected": -0.0708489716053009, "step": 91 }, { "epoch": 0.02, "learning_rate": 4.4124700239808146e-08, "logps/chosen": -290.2433776855469, "logps/rejected": -231.0909423828125, "loss": 0.6438, "losses/dpo": 0.5556160807609558, "losses/sft": 0.6453263163566589, "losses/total": 0.5556160807609558, "ref_logps/chosen": -290.3253173828125, "ref_logps/rejected": -230.14089965820312, "rewards/accuracies": 0.90625, "rewards/chosen": 0.008199069648981094, "rewards/margins": 0.10320346057415009, "rewards/rejected": -0.09500439465045929, "step": 92 }, { "epoch": 0.02, "learning_rate": 4.460431654676259e-08, "logps/chosen": -201.18263244628906, "logps/rejected": -192.68325805664062, "loss": 0.6538, "losses/dpo": 0.7058287858963013, "losses/sft": 0.5775987505912781, "losses/total": 0.7058287858963013, "ref_logps/chosen": -201.2510528564453, "ref_logps/rejected": -191.93739318847656, "rewards/accuracies": 0.875, "rewards/chosen": 0.0068434663116931915, "rewards/margins": 0.08142982423305511, "rewards/rejected": -0.07458636164665222, "step": 93 }, { "epoch": 0.02, "learning_rate": 4.508393285371702e-08, "logps/chosen": -215.68978881835938, "logps/rejected": -191.0592498779297, "loss": 0.6611, "losses/dpo": 0.7627333998680115, "losses/sft": 0.5959816575050354, "losses/total": 0.7627333998680115, "ref_logps/chosen": -215.589599609375, "ref_logps/rejected": -190.28652954101562, "rewards/accuracies": 0.78125, "rewards/chosen": -0.010018523782491684, "rewards/margins": 0.06725325435400009, "rewards/rejected": -0.07727178931236267, "step": 94 }, { "epoch": 0.02, "learning_rate": 4.556354916067146e-08, "logps/chosen": -231.95541381835938, "logps/rejected": -235.90548706054688, "loss": 0.6478, "losses/dpo": 0.6672568321228027, "losses/sft": 0.7504868507385254, "losses/total": 0.6672568321228027, "ref_logps/chosen": -232.01397705078125, "ref_logps/rejected": -235.01254272460938, "rewards/accuracies": 0.84375, "rewards/chosen": 0.005856760777533054, "rewards/margins": 0.0951523631811142, "rewards/rejected": -0.08929560333490372, "step": 95 }, { "epoch": 0.02, "learning_rate": 4.60431654676259e-08, "logps/chosen": -176.11436462402344, "logps/rejected": -208.17636108398438, "loss": 0.6608, "losses/dpo": 0.6574640870094299, "losses/sft": 0.70021653175354, "losses/total": 0.6574640870094299, "ref_logps/chosen": -175.9232635498047, "ref_logps/rejected": -207.31365966796875, "rewards/accuracies": 0.84375, "rewards/chosen": -0.019109362736344337, "rewards/margins": 0.06715996563434601, "rewards/rejected": -0.0862693265080452, "step": 96 }, { "epoch": 0.02, "learning_rate": 4.6522781774580335e-08, "logps/chosen": -223.65008544921875, "logps/rejected": -221.29864501953125, "loss": 0.6361, "losses/dpo": 0.6395202279090881, "losses/sft": 0.682570219039917, "losses/total": 0.6395202279090881, "ref_logps/chosen": -223.68798828125, "ref_logps/rejected": -220.13406372070312, "rewards/accuracies": 0.875, "rewards/chosen": 0.0037914300337433815, "rewards/margins": 0.12025120854377747, "rewards/rejected": -0.11645978689193726, "step": 97 }, { "epoch": 0.02, "learning_rate": 4.700239808153477e-08, "logps/chosen": -239.09445190429688, "logps/rejected": -262.7323913574219, "loss": 0.6275, "losses/dpo": 0.5726674795150757, "losses/sft": 0.42880779504776, "losses/total": 0.5726674795150757, "ref_logps/chosen": -239.1866912841797, "ref_logps/rejected": -261.43853759765625, "rewards/accuracies": 0.9375, "rewards/chosen": 0.009223569184541702, "rewards/margins": 0.13860660791397095, "rewards/rejected": -0.12938302755355835, "step": 98 }, { "epoch": 0.02, "learning_rate": 4.7482014388489205e-08, "logps/chosen": -224.66326904296875, "logps/rejected": -222.6821746826172, "loss": 0.6271, "losses/dpo": 0.5273476839065552, "losses/sft": 0.6606442332267761, "losses/total": 0.5273476839065552, "ref_logps/chosen": -224.69557189941406, "ref_logps/rejected": -221.31292724609375, "rewards/accuracies": 0.90625, "rewards/chosen": 0.0032282350584864616, "rewards/margins": 0.14015483856201172, "rewards/rejected": -0.13692662119865417, "step": 99 }, { "epoch": 0.02, "learning_rate": 4.796163069544365e-08, "logps/chosen": -256.29400634765625, "logps/rejected": -263.64190673828125, "loss": 0.6154, "losses/dpo": 0.6874278783798218, "losses/sft": 1.047175407409668, "losses/total": 0.6874278783798218, "ref_logps/chosen": -256.34320068359375, "ref_logps/rejected": -262.03314208984375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.004917502403259277, "rewards/margins": 0.16579538583755493, "rewards/rejected": -0.16087786853313446, "step": 100 }, { "epoch": 0.02, "learning_rate": 4.8441247002398075e-08, "logps/chosen": -204.30284118652344, "logps/rejected": -203.50537109375, "loss": 0.6481, "losses/dpo": 0.724174439907074, "losses/sft": 0.6152711510658264, "losses/total": 0.724174439907074, "ref_logps/chosen": -204.19024658203125, "ref_logps/rejected": -202.43545532226562, "rewards/accuracies": 0.78125, "rewards/chosen": -0.011260945349931717, "rewards/margins": 0.09573039412498474, "rewards/rejected": -0.10699134320020676, "step": 101 }, { "epoch": 0.02, "learning_rate": 4.892086330935252e-08, "logps/chosen": -243.09100341796875, "logps/rejected": -211.80116271972656, "loss": 0.628, "losses/dpo": 0.5647795796394348, "losses/sft": 0.6304104328155518, "losses/total": 0.5647795796394348, "ref_logps/chosen": -243.12547302246094, "ref_logps/rejected": -210.45474243164062, "rewards/accuracies": 0.875, "rewards/chosen": 0.003447231836616993, "rewards/margins": 0.13808992505073547, "rewards/rejected": -0.13464269042015076, "step": 102 }, { "epoch": 0.02, "learning_rate": 4.940047961630695e-08, "logps/chosen": -253.7047119140625, "logps/rejected": -262.74969482421875, "loss": 0.619, "losses/dpo": 0.5743977427482605, "losses/sft": 1.0119160413742065, "losses/total": 0.5743977427482605, "ref_logps/chosen": -253.6084442138672, "ref_logps/rejected": -261.08245849609375, "rewards/accuracies": 0.9375, "rewards/chosen": -0.009626149199903011, "rewards/margins": 0.15709610283374786, "rewards/rejected": -0.16672223806381226, "step": 103 }, { "epoch": 0.02, "learning_rate": 4.988009592326139e-08, "logps/chosen": -215.8086700439453, "logps/rejected": -210.15597534179688, "loss": 0.6279, "losses/dpo": 0.6108782291412354, "losses/sft": 0.4697265625, "losses/total": 0.6108782291412354, "ref_logps/chosen": -215.93472290039062, "ref_logps/rejected": -208.90725708007812, "rewards/accuracies": 0.9375, "rewards/chosen": 0.012608631514012814, "rewards/margins": 0.13747981190681458, "rewards/rejected": -0.12487118691205978, "step": 104 }, { "epoch": 0.03, "learning_rate": 5.035971223021582e-08, "logps/chosen": -242.61610412597656, "logps/rejected": -253.43980407714844, "loss": 0.613, "losses/dpo": 0.5792238712310791, "losses/sft": 0.8260404467582703, "losses/total": 0.5792238712310791, "ref_logps/chosen": -242.67819213867188, "ref_logps/rejected": -251.78785705566406, "rewards/accuracies": 0.9375, "rewards/chosen": 0.006210315972566605, "rewards/margins": 0.17140522599220276, "rewards/rejected": -0.16519491374492645, "step": 105 }, { "epoch": 0.03, "learning_rate": 5.083932853717026e-08, "logps/chosen": -232.74618530273438, "logps/rejected": -234.0415802001953, "loss": 0.615, "losses/dpo": 0.5301788449287415, "losses/sft": 0.5701903700828552, "losses/total": 0.5301788449287415, "ref_logps/chosen": -232.73141479492188, "ref_logps/rejected": -232.35609436035156, "rewards/accuracies": 1.0, "rewards/chosen": -0.0014767824904993176, "rewards/margins": 0.16707123816013336, "rewards/rejected": -0.16854801774024963, "step": 106 }, { "epoch": 0.03, "learning_rate": 5.13189448441247e-08, "logps/chosen": -227.49609375, "logps/rejected": -190.36961364746094, "loss": 0.6387, "losses/dpo": 0.6278862953186035, "losses/sft": 0.5480640530586243, "losses/total": 0.6278862953186035, "ref_logps/chosen": -227.37045288085938, "ref_logps/rejected": -189.0908966064453, "rewards/accuracies": 0.9375, "rewards/chosen": -0.012565476819872856, "rewards/margins": 0.11530736088752747, "rewards/rejected": -0.12787283957004547, "step": 107 }, { "epoch": 0.03, "learning_rate": 5.1798561151079134e-08, "logps/chosen": -204.54864501953125, "logps/rejected": -177.87818908691406, "loss": 0.6259, "losses/dpo": 0.6303272843360901, "losses/sft": 0.7500016689300537, "losses/total": 0.6303272843360901, "ref_logps/chosen": -204.59323120117188, "ref_logps/rejected": -176.50558471679688, "rewards/accuracies": 0.9375, "rewards/chosen": 0.00445692241191864, "rewards/margins": 0.14171674847602844, "rewards/rejected": -0.1372598260641098, "step": 108 }, { "epoch": 0.03, "learning_rate": 5.227817745803357e-08, "logps/chosen": -213.63211059570312, "logps/rejected": -190.08001708984375, "loss": 0.6119, "losses/dpo": 0.5622921586036682, "losses/sft": 0.585150420665741, "losses/total": 0.5622921586036682, "ref_logps/chosen": -213.843505859375, "ref_logps/rejected": -188.55072021484375, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02113887295126915, "rewards/margins": 0.17407014966011047, "rewards/rejected": -0.15293127298355103, "step": 109 }, { "epoch": 0.03, "learning_rate": 5.275779376498801e-08, "logps/chosen": -210.03286743164062, "logps/rejected": -236.89462280273438, "loss": 0.6064, "losses/dpo": 0.6311395764350891, "losses/sft": 0.6113638877868652, "losses/total": 0.6311395764350891, "ref_logps/chosen": -210.1173553466797, "ref_logps/rejected": -235.11578369140625, "rewards/accuracies": 1.0, "rewards/chosen": 0.008448552340269089, "rewards/margins": 0.18633267283439636, "rewards/rejected": -0.17788410186767578, "step": 110 }, { "epoch": 0.03, "learning_rate": 5.3237410071942446e-08, "logps/chosen": -221.70140075683594, "logps/rejected": -210.56275939941406, "loss": 0.6269, "losses/dpo": 0.5947102308273315, "losses/sft": 0.4928923547267914, "losses/total": 0.5947102308273315, "ref_logps/chosen": -221.8306427001953, "ref_logps/rejected": -209.2947540283203, "rewards/accuracies": 1.0, "rewards/chosen": 0.012922502122819424, "rewards/margins": 0.13972342014312744, "rewards/rejected": -0.12680092453956604, "step": 111 }, { "epoch": 0.03, "learning_rate": 5.371702637889688e-08, "logps/chosen": -225.876220703125, "logps/rejected": -217.99693298339844, "loss": 0.614, "losses/dpo": 0.6604347229003906, "losses/sft": 0.5047462582588196, "losses/total": 0.6604347229003906, "ref_logps/chosen": -225.87158203125, "ref_logps/rejected": -216.30276489257812, "rewards/accuracies": 0.9375, "rewards/chosen": -0.0004660901613533497, "rewards/margins": 0.1689515858888626, "rewards/rejected": -0.16941766440868378, "step": 112 }, { "epoch": 0.03, "learning_rate": 5.419664268585131e-08, "logps/chosen": -224.54583740234375, "logps/rejected": -213.4689178466797, "loss": 0.5933, "losses/dpo": 0.5958684086799622, "losses/sft": 0.6051914095878601, "losses/total": 0.5958684086799622, "ref_logps/chosen": -224.67520141601562, "ref_logps/rejected": -211.43191528320312, "rewards/accuracies": 0.96875, "rewards/chosen": 0.01293634157627821, "rewards/margins": 0.21663714945316315, "rewards/rejected": -0.20370081067085266, "step": 113 }, { "epoch": 0.03, "learning_rate": 5.467625899280576e-08, "logps/chosen": -220.74310302734375, "logps/rejected": -242.31326293945312, "loss": 0.6001, "losses/dpo": 0.5577617287635803, "losses/sft": 1.0377243757247925, "losses/total": 0.5577617287635803, "ref_logps/chosen": -220.80511474609375, "ref_logps/rejected": -240.375, "rewards/accuracies": 0.96875, "rewards/chosen": 0.00620079692453146, "rewards/margins": 0.20002779364585876, "rewards/rejected": -0.19382698833942413, "step": 114 }, { "epoch": 0.03, "learning_rate": 5.515587529976019e-08, "logps/chosen": -220.1526336669922, "logps/rejected": -229.90171813964844, "loss": 0.6173, "losses/dpo": 0.523753821849823, "losses/sft": 0.626242995262146, "losses/total": 0.523753821849823, "ref_logps/chosen": -220.0028839111328, "ref_logps/rejected": -228.13558959960938, "rewards/accuracies": 0.9375, "rewards/chosen": -0.014975928701460361, "rewards/margins": 0.16163590550422668, "rewards/rejected": -0.17661184072494507, "step": 115 }, { "epoch": 0.03, "learning_rate": 5.563549160671462e-08, "logps/chosen": -259.07635498046875, "logps/rejected": -248.85873413085938, "loss": 0.5996, "losses/dpo": 0.45944952964782715, "losses/sft": 0.8736971020698547, "losses/total": 0.45944952964782715, "ref_logps/chosen": -259.09130859375, "ref_logps/rejected": -246.85626220703125, "rewards/accuracies": 0.96875, "rewards/chosen": 0.0014987594913691282, "rewards/margins": 0.20174674689769745, "rewards/rejected": -0.20024797320365906, "step": 116 }, { "epoch": 0.03, "learning_rate": 5.6115107913669057e-08, "logps/chosen": -259.10107421875, "logps/rejected": -239.64251708984375, "loss": 0.597, "losses/dpo": 0.5534869432449341, "losses/sft": 0.5331185460090637, "losses/total": 0.5534869432449341, "ref_logps/chosen": -259.2799377441406, "ref_logps/rejected": -237.70675659179688, "rewards/accuracies": 0.90625, "rewards/chosen": 0.017888564616441727, "rewards/margins": 0.2114638239145279, "rewards/rejected": -0.19357527792453766, "step": 117 }, { "epoch": 0.03, "learning_rate": 5.6594724220623505e-08, "logps/chosen": -203.77171325683594, "logps/rejected": -236.67185974121094, "loss": 0.5964, "losses/dpo": 0.554344892501831, "losses/sft": 0.7114688158035278, "losses/total": 0.554344892501831, "ref_logps/chosen": -203.78138732910156, "ref_logps/rejected": -234.59951782226562, "rewards/accuracies": 0.96875, "rewards/chosen": 0.0009669188875705004, "rewards/margins": 0.20820273458957672, "rewards/rejected": -0.20723581314086914, "step": 118 }, { "epoch": 0.03, "learning_rate": 5.7074340527577933e-08, "logps/chosen": -232.32839965820312, "logps/rejected": -240.80862426757812, "loss": 0.5864, "losses/dpo": 0.517087459564209, "losses/sft": 0.5935673713684082, "losses/total": 0.517087459564209, "ref_logps/chosen": -232.40748596191406, "ref_logps/rejected": -238.5836944580078, "rewards/accuracies": 1.0, "rewards/chosen": 0.007908107712864876, "rewards/margins": 0.23040199279785156, "rewards/rejected": -0.22249388694763184, "step": 119 }, { "epoch": 0.03, "learning_rate": 5.755395683453237e-08, "logps/chosen": -224.00338745117188, "logps/rejected": -207.595703125, "loss": 0.6215, "losses/dpo": 0.6393631100654602, "losses/sft": 0.8860717415809631, "losses/total": 0.6393631100654602, "ref_logps/chosen": -224.07943725585938, "ref_logps/rejected": -206.12306213378906, "rewards/accuracies": 0.84375, "rewards/chosen": 0.007605741731822491, "rewards/margins": 0.15486843883991241, "rewards/rejected": -0.14726269245147705, "step": 120 }, { "epoch": 0.03, "learning_rate": 5.8033573141486804e-08, "logps/chosen": -281.1446533203125, "logps/rejected": -246.57183837890625, "loss": 0.5821, "losses/dpo": 0.55320805311203, "losses/sft": 0.8629291653633118, "losses/total": 0.55320805311203, "ref_logps/chosen": -281.03179931640625, "ref_logps/rejected": -244.0305633544922, "rewards/accuracies": 0.96875, "rewards/chosen": -0.011285996064543724, "rewards/margins": 0.24284033477306366, "rewards/rejected": -0.25412631034851074, "step": 121 }, { "epoch": 0.03, "learning_rate": 5.8513189448441245e-08, "logps/chosen": -255.19601440429688, "logps/rejected": -240.8336181640625, "loss": 0.578, "losses/dpo": 0.5487377643585205, "losses/sft": 0.5952659249305725, "losses/total": 0.5487377643585205, "ref_logps/chosen": -255.3051300048828, "ref_logps/rejected": -238.44924926757812, "rewards/accuracies": 0.96875, "rewards/chosen": 0.01091008260846138, "rewards/margins": 0.24934732913970947, "rewards/rejected": -0.238437220454216, "step": 122 }, { "epoch": 0.03, "learning_rate": 5.899280575539568e-08, "logps/chosen": -194.41046142578125, "logps/rejected": -216.09231567382812, "loss": 0.5878, "losses/dpo": 0.5547345876693726, "losses/sft": 0.5987753868103027, "losses/total": 0.5547345876693726, "ref_logps/chosen": -194.54786682128906, "ref_logps/rejected": -213.9281768798828, "rewards/accuracies": 1.0, "rewards/chosen": 0.013739919289946556, "rewards/margins": 0.23015521466732025, "rewards/rejected": -0.21641528606414795, "step": 123 }, { "epoch": 0.03, "learning_rate": 5.9472422062350115e-08, "logps/chosen": -190.55589294433594, "logps/rejected": -210.86700439453125, "loss": 0.5927, "losses/dpo": 0.6202155351638794, "losses/sft": 0.37918543815612793, "losses/total": 0.6202155351638794, "ref_logps/chosen": -190.67518615722656, "ref_logps/rejected": -208.7951202392578, "rewards/accuracies": 0.9375, "rewards/chosen": 0.011928766965866089, "rewards/margins": 0.21911835670471191, "rewards/rejected": -0.20718958973884583, "step": 124 }, { "epoch": 0.03, "learning_rate": 5.995203836930456e-08, "logps/chosen": -230.08262634277344, "logps/rejected": -224.68768310546875, "loss": 0.5826, "losses/dpo": 0.4934700131416321, "losses/sft": 0.8639508485794067, "losses/total": 0.4934700131416321, "ref_logps/chosen": -230.2122039794922, "ref_logps/rejected": -222.42637634277344, "rewards/accuracies": 1.0, "rewards/chosen": 0.012957701459527016, "rewards/margins": 0.2390868067741394, "rewards/rejected": -0.22612909972667694, "step": 125 }, { "epoch": 0.03, "learning_rate": 6.043165467625899e-08, "logps/chosen": -240.8319091796875, "logps/rejected": -231.2616424560547, "loss": 0.5932, "losses/dpo": 0.5129846334457397, "losses/sft": 0.557536780834198, "losses/total": 0.5129846334457397, "ref_logps/chosen": -240.9114990234375, "ref_logps/rejected": -229.16856384277344, "rewards/accuracies": 1.0, "rewards/chosen": 0.007960165850818157, "rewards/margins": 0.2172689139842987, "rewards/rejected": -0.20930875837802887, "step": 126 }, { "epoch": 0.03, "learning_rate": 6.091127098321342e-08, "logps/chosen": -241.32818603515625, "logps/rejected": -238.5933380126953, "loss": 0.563, "losses/dpo": 0.5073736906051636, "losses/sft": 0.4011218249797821, "losses/total": 0.5073736906051636, "ref_logps/chosen": -241.45579528808594, "ref_logps/rejected": -235.84808349609375, "rewards/accuracies": 0.96875, "rewards/chosen": 0.01276168879121542, "rewards/margins": 0.28728699684143066, "rewards/rejected": -0.2745252847671509, "step": 127 }, { "epoch": 0.03, "learning_rate": 6.139088729016786e-08, "logps/chosen": -219.16171264648438, "logps/rejected": -211.08648681640625, "loss": 0.585, "losses/dpo": 0.4836885929107666, "losses/sft": 0.6268772482872009, "losses/total": 0.4836885929107666, "ref_logps/chosen": -219.37652587890625, "ref_logps/rejected": -208.9334259033203, "rewards/accuracies": 0.9375, "rewards/chosen": 0.021481646224856377, "rewards/margins": 0.2367861419916153, "rewards/rejected": -0.21530449390411377, "step": 128 }, { "epoch": 0.03, "learning_rate": 6.18705035971223e-08, "logps/chosen": -251.58872985839844, "logps/rejected": -238.37400817871094, "loss": 0.5666, "losses/dpo": 0.57139652967453, "losses/sft": 0.5971519947052002, "losses/total": 0.57139652967453, "ref_logps/chosen": -251.90267944335938, "ref_logps/rejected": -235.91831970214844, "rewards/accuracies": 1.0, "rewards/chosen": 0.03139495849609375, "rewards/margins": 0.276966392993927, "rewards/rejected": -0.24557143449783325, "step": 129 }, { "epoch": 0.03, "learning_rate": 6.235011990407673e-08, "logps/chosen": -254.15118408203125, "logps/rejected": -239.63816833496094, "loss": 0.5631, "losses/dpo": 0.5143429040908813, "losses/sft": 0.6372097730636597, "losses/total": 0.5143429040908813, "ref_logps/chosen": -254.25064086914062, "ref_logps/rejected": -236.8759307861328, "rewards/accuracies": 1.0, "rewards/chosen": 0.009947370737791061, "rewards/margins": 0.286171555519104, "rewards/rejected": -0.2762242257595062, "step": 130 }, { "epoch": 0.03, "learning_rate": 6.282973621103117e-08, "logps/chosen": -249.85348510742188, "logps/rejected": -248.95416259765625, "loss": 0.568, "losses/dpo": 0.6448809504508972, "losses/sft": 0.5814533829689026, "losses/total": 0.6448809504508972, "ref_logps/chosen": -249.74673461914062, "ref_logps/rejected": -246.10333251953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.010677620768547058, "rewards/margins": 0.2744046449661255, "rewards/rejected": -0.28508228063583374, "step": 131 }, { "epoch": 0.03, "learning_rate": 6.33093525179856e-08, "logps/chosen": -204.58445739746094, "logps/rejected": -220.04588317871094, "loss": 0.5718, "losses/dpo": 0.5374763607978821, "losses/sft": 0.5410602688789368, "losses/total": 0.5374763607978821, "ref_logps/chosen": -204.56832885742188, "ref_logps/rejected": -217.31979370117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.0016148656141012907, "rewards/margins": 0.2709934711456299, "rewards/rejected": -0.27260833978652954, "step": 132 }, { "epoch": 0.03, "learning_rate": 6.378896882494004e-08, "logps/chosen": -224.90740966796875, "logps/rejected": -242.2696533203125, "loss": 0.5552, "losses/dpo": 0.5421150922775269, "losses/sft": 0.4503932297229767, "losses/total": 0.5421150922775269, "ref_logps/chosen": -224.98577880859375, "ref_logps/rejected": -239.27362060546875, "rewards/accuracies": 1.0, "rewards/chosen": 0.007833730429410934, "rewards/margins": 0.30743759870529175, "rewards/rejected": -0.2996038794517517, "step": 133 }, { "epoch": 0.03, "learning_rate": 6.426858513189449e-08, "logps/chosen": -253.99005126953125, "logps/rejected": -239.46484375, "loss": 0.5366, "losses/dpo": 0.5538178086280823, "losses/sft": 0.824885368347168, "losses/total": 0.5538178086280823, "ref_logps/chosen": -254.0891876220703, "ref_logps/rejected": -236.03817749023438, "rewards/accuracies": 0.96875, "rewards/chosen": 0.009912002831697464, "rewards/margins": 0.3525787591934204, "rewards/rejected": -0.34266674518585205, "step": 134 }, { "epoch": 0.03, "learning_rate": 6.474820143884891e-08, "logps/chosen": -228.73342895507812, "logps/rejected": -235.98605346679688, "loss": 0.5496, "losses/dpo": 0.38726377487182617, "losses/sft": 0.5315933227539062, "losses/total": 0.38726377487182617, "ref_logps/chosen": -228.7466583251953, "ref_logps/rejected": -232.80435180664062, "rewards/accuracies": 1.0, "rewards/chosen": 0.001321011921390891, "rewards/margins": 0.3194921612739563, "rewards/rejected": -0.3181711435317993, "step": 135 }, { "epoch": 0.03, "learning_rate": 6.522781774580336e-08, "logps/chosen": -251.00169372558594, "logps/rejected": -229.4866485595703, "loss": 0.5319, "losses/dpo": 0.4270710051059723, "losses/sft": 0.6829850673675537, "losses/total": 0.4270710051059723, "ref_logps/chosen": -251.15858459472656, "ref_logps/rejected": -225.99087524414062, "rewards/accuracies": 1.0, "rewards/chosen": 0.015687216073274612, "rewards/margins": 0.36526548862457275, "rewards/rejected": -0.34957826137542725, "step": 136 }, { "epoch": 0.03, "learning_rate": 6.57074340527578e-08, "logps/chosen": -193.30799865722656, "logps/rejected": -226.9827117919922, "loss": 0.5192, "losses/dpo": 0.31424787640571594, "losses/sft": 0.4330032467842102, "losses/total": 0.31424787640571594, "ref_logps/chosen": -193.5615234375, "ref_logps/rejected": -223.2628173828125, "rewards/accuracies": 1.0, "rewards/chosen": 0.02535414882004261, "rewards/margins": 0.39734458923339844, "rewards/rejected": -0.371990442276001, "step": 137 }, { "epoch": 0.03, "learning_rate": 6.618705035971223e-08, "logps/chosen": -240.975830078125, "logps/rejected": -250.36318969726562, "loss": 0.4706, "losses/dpo": 0.3900696337223053, "losses/sft": 1.3063476085662842, "losses/total": 0.3900696337223053, "ref_logps/chosen": -241.01507568359375, "ref_logps/rejected": -245.0226287841797, "rewards/accuracies": 1.0, "rewards/chosen": 0.003923529293388128, "rewards/margins": 0.5379785299301147, "rewards/rejected": -0.5340549349784851, "step": 138 }, { "epoch": 0.03, "learning_rate": 6.666666666666665e-08, "logps/chosen": -209.58358764648438, "logps/rejected": -242.96783447265625, "loss": 0.4771, "losses/dpo": 0.351431667804718, "losses/sft": 0.503008246421814, "losses/total": 0.351431667804718, "ref_logps/chosen": -209.81173706054688, "ref_logps/rejected": -238.02001953125, "rewards/accuracies": 1.0, "rewards/chosen": 0.02281363308429718, "rewards/margins": 0.5175934433937073, "rewards/rejected": -0.4947797954082489, "step": 139 }, { "epoch": 0.03, "learning_rate": 6.714628297362111e-08, "logps/chosen": -250.36178588867188, "logps/rejected": -229.40565490722656, "loss": 0.5077, "losses/dpo": 0.4748551845550537, "losses/sft": 0.4364233911037445, "losses/total": 0.4748551845550537, "ref_logps/chosen": -250.61740112304688, "ref_logps/rejected": -225.33978271484375, "rewards/accuracies": 1.0, "rewards/chosen": 0.025559978559613228, "rewards/margins": 0.43214479088783264, "rewards/rejected": -0.40658482909202576, "step": 140 }, { "epoch": 0.03, "learning_rate": 6.762589928057554e-08, "logps/chosen": -283.74761962890625, "logps/rejected": -267.6520080566406, "loss": 0.4716, "losses/dpo": 0.3645535111427307, "losses/sft": 0.47487184405326843, "losses/total": 0.3645535111427307, "ref_logps/chosen": -283.96527099609375, "ref_logps/rejected": -262.576171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.021764738485217094, "rewards/margins": 0.5293512344360352, "rewards/rejected": -0.5075865387916565, "step": 141 }, { "epoch": 0.03, "learning_rate": 6.810551558752997e-08, "logps/chosen": -218.98651123046875, "logps/rejected": -204.3387451171875, "loss": 0.5283, "losses/dpo": 0.3698941171169281, "losses/sft": 0.5333694815635681, "losses/total": 0.3698941171169281, "ref_logps/chosen": -219.0262451171875, "ref_logps/rejected": -200.59173583984375, "rewards/accuracies": 1.0, "rewards/chosen": 0.003973162267357111, "rewards/margins": 0.3786737620830536, "rewards/rejected": -0.3747005760669708, "step": 142 }, { "epoch": 0.03, "learning_rate": 6.858513189448441e-08, "logps/chosen": -260.3921813964844, "logps/rejected": -242.12799072265625, "loss": 0.4729, "losses/dpo": 0.23844937980175018, "losses/sft": 0.5297333598136902, "losses/total": 0.23844937980175018, "ref_logps/chosen": -260.65667724609375, "ref_logps/rejected": -237.07037353515625, "rewards/accuracies": 1.0, "rewards/chosen": 0.026448413729667664, "rewards/margins": 0.5322088003158569, "rewards/rejected": -0.5057603716850281, "step": 143 }, { "epoch": 0.03, "learning_rate": 6.906474820143885e-08, "logps/chosen": -212.8799285888672, "logps/rejected": -213.90850830078125, "loss": 0.4972, "losses/dpo": 0.49711883068084717, "losses/sft": 1.0842598676681519, "losses/total": 0.49711883068084717, "ref_logps/chosen": -213.0482940673828, "ref_logps/rejected": -209.44252014160156, "rewards/accuracies": 1.0, "rewards/chosen": 0.016836978495121002, "rewards/margins": 0.463434636592865, "rewards/rejected": -0.4465976655483246, "step": 144 }, { "epoch": 0.03, "learning_rate": 6.954436450839328e-08, "logps/chosen": -228.1583251953125, "logps/rejected": -202.0294189453125, "loss": 0.489, "losses/dpo": 0.6090012192726135, "losses/sft": 0.6343166828155518, "losses/total": 0.6090012192726135, "ref_logps/chosen": -228.55526733398438, "ref_logps/rejected": -197.59042358398438, "rewards/accuracies": 1.0, "rewards/chosen": 0.03969307616353035, "rewards/margins": 0.48359158635139465, "rewards/rejected": -0.4438985288143158, "step": 145 }, { "epoch": 0.04, "learning_rate": 7.002398081534772e-08, "logps/chosen": -243.2750244140625, "logps/rejected": -226.78466796875, "loss": 0.4604, "losses/dpo": 0.37663325667381287, "losses/sft": 0.6886533498764038, "losses/total": 0.37663325667381287, "ref_logps/chosen": -243.46734619140625, "ref_logps/rejected": -221.2547607421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.019232774153351784, "rewards/margins": 0.5722224712371826, "rewards/rejected": -0.5529897212982178, "step": 146 }, { "epoch": 0.04, "learning_rate": 7.050359712230215e-08, "logps/chosen": -205.27255249023438, "logps/rejected": -236.2374267578125, "loss": 0.4531, "losses/dpo": 0.5428012013435364, "losses/sft": 0.6872162818908691, "losses/total": 0.5428012013435364, "ref_logps/chosen": -205.50714111328125, "ref_logps/rejected": -230.61001586914062, "rewards/accuracies": 1.0, "rewards/chosen": 0.023457497358322144, "rewards/margins": 0.586200475692749, "rewards/rejected": -0.5627429485321045, "step": 147 }, { "epoch": 0.04, "learning_rate": 7.098321342925659e-08, "logps/chosen": -238.6186065673828, "logps/rejected": -262.08013916015625, "loss": 0.4416, "losses/dpo": 0.4858938455581665, "losses/sft": 0.5867885947227478, "losses/total": 0.4858938455581665, "ref_logps/chosen": -238.8809814453125, "ref_logps/rejected": -256.1026916503906, "rewards/accuracies": 1.0, "rewards/chosen": 0.026239095255732536, "rewards/margins": 0.6239868402481079, "rewards/rejected": -0.5977477431297302, "step": 148 }, { "epoch": 0.04, "learning_rate": 7.146282973621103e-08, "logps/chosen": -193.42080688476562, "logps/rejected": -228.41664123535156, "loss": 0.4427, "losses/dpo": 0.3993741273880005, "losses/sft": 0.6345066428184509, "losses/total": 0.3993741273880005, "ref_logps/chosen": -193.56613159179688, "ref_logps/rejected": -222.3511505126953, "rewards/accuracies": 1.0, "rewards/chosen": 0.014531494118273258, "rewards/margins": 0.6210784316062927, "rewards/rejected": -0.606546938419342, "step": 149 }, { "epoch": 0.04, "learning_rate": 7.194244604316546e-08, "logps/chosen": -192.58712768554688, "logps/rejected": -206.43911743164062, "loss": 0.4732, "losses/dpo": 0.41879451274871826, "losses/sft": 0.6147765517234802, "losses/total": 0.41879451274871826, "ref_logps/chosen": -192.87936401367188, "ref_logps/rejected": -201.4298095703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.02922241762280464, "rewards/margins": 0.5301570892333984, "rewards/rejected": -0.5009346604347229, "step": 150 }, { "epoch": 0.04, "learning_rate": 7.24220623501199e-08, "logps/chosen": -181.3852996826172, "logps/rejected": -216.98526000976562, "loss": 0.456, "losses/dpo": 0.22797900438308716, "losses/sft": 0.5964255332946777, "losses/total": 0.22797900438308716, "ref_logps/chosen": -181.74496459960938, "ref_logps/rejected": -211.5086669921875, "rewards/accuracies": 1.0, "rewards/chosen": 0.035966020077466965, "rewards/margins": 0.5836249589920044, "rewards/rejected": -0.5476588606834412, "step": 151 }, { "epoch": 0.04, "learning_rate": 7.290167865707434e-08, "logps/chosen": -235.81585693359375, "logps/rejected": -223.6782684326172, "loss": 0.4798, "losses/dpo": 0.32353663444519043, "losses/sft": 0.7560741305351257, "losses/total": 0.32353663444519043, "ref_logps/chosen": -235.79327392578125, "ref_logps/rejected": -218.49974060058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.002259305212646723, "rewards/margins": 0.5155926942825317, "rewards/rejected": -0.5178520083427429, "step": 152 }, { "epoch": 0.04, "learning_rate": 7.338129496402877e-08, "logps/chosen": -230.74676513671875, "logps/rejected": -223.94960021972656, "loss": 0.4886, "losses/dpo": 0.44595906138420105, "losses/sft": 0.6417722105979919, "losses/total": 0.44595906138420105, "ref_logps/chosen": -230.88400268554688, "ref_logps/rejected": -219.23880004882812, "rewards/accuracies": 1.0, "rewards/chosen": 0.01372277457267046, "rewards/margins": 0.48480355739593506, "rewards/rejected": -0.4710807800292969, "step": 153 }, { "epoch": 0.04, "learning_rate": 7.38609112709832e-08, "logps/chosen": -204.35372924804688, "logps/rejected": -207.905029296875, "loss": 0.4879, "losses/dpo": 0.442070335149765, "losses/sft": 0.5434986352920532, "losses/total": 0.442070335149765, "ref_logps/chosen": -204.54403686523438, "ref_logps/rejected": -203.23611450195312, "rewards/accuracies": 1.0, "rewards/chosen": 0.019031014293432236, "rewards/margins": 0.4859224259853363, "rewards/rejected": -0.46689140796661377, "step": 154 }, { "epoch": 0.04, "learning_rate": 7.434052757793766e-08, "logps/chosen": -261.3990478515625, "logps/rejected": -248.65965270996094, "loss": 0.4433, "losses/dpo": 0.26182231307029724, "losses/sft": 1.3745352029800415, "losses/total": 0.26182231307029724, "ref_logps/chosen": -261.54754638671875, "ref_logps/rejected": -242.56747436523438, "rewards/accuracies": 1.0, "rewards/chosen": 0.014849376864731312, "rewards/margins": 0.6240652799606323, "rewards/rejected": -0.6092159748077393, "step": 155 }, { "epoch": 0.04, "learning_rate": 7.482014388489208e-08, "logps/chosen": -226.79238891601562, "logps/rejected": -219.4222869873047, "loss": 0.4548, "losses/dpo": 0.34624168276786804, "losses/sft": 0.66420578956604, "losses/total": 0.34624168276786804, "ref_logps/chosen": -227.10745239257812, "ref_logps/rejected": -213.93824768066406, "rewards/accuracies": 1.0, "rewards/chosen": 0.03150506317615509, "rewards/margins": 0.5799087882041931, "rewards/rejected": -0.5484037399291992, "step": 156 }, { "epoch": 0.04, "learning_rate": 7.529976019184651e-08, "logps/chosen": -236.8594207763672, "logps/rejected": -250.58924865722656, "loss": 0.422, "losses/dpo": 0.2633001208305359, "losses/sft": 0.4614797830581665, "losses/total": 0.2633001208305359, "ref_logps/chosen": -237.05105590820312, "ref_logps/rejected": -243.76055908203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.01916416734457016, "rewards/margins": 0.7020345330238342, "rewards/rejected": -0.6828703284263611, "step": 157 }, { "epoch": 0.04, "learning_rate": 7.577937649880095e-08, "logps/chosen": -168.65951538085938, "logps/rejected": -200.8760223388672, "loss": 0.4791, "losses/dpo": 0.346417635679245, "losses/sft": 0.4693962037563324, "losses/total": 0.346417635679245, "ref_logps/chosen": -168.9166259765625, "ref_logps/rejected": -195.99452209472656, "rewards/accuracies": 1.0, "rewards/chosen": 0.025710148736834526, "rewards/margins": 0.5138606429100037, "rewards/rejected": -0.4881504774093628, "step": 158 }, { "epoch": 0.04, "learning_rate": 7.62589928057554e-08, "logps/chosen": -191.00173950195312, "logps/rejected": -206.39700317382812, "loss": 0.4857, "losses/dpo": 0.4611247181892395, "losses/sft": 0.7016743421554565, "losses/total": 0.4611247181892395, "ref_logps/chosen": -191.20986938476562, "ref_logps/rejected": -201.5801239013672, "rewards/accuracies": 0.96875, "rewards/chosen": 0.02081288769841194, "rewards/margins": 0.5025008320808411, "rewards/rejected": -0.4816879630088806, "step": 159 }, { "epoch": 0.04, "learning_rate": 7.673860911270982e-08, "logps/chosen": -245.38214111328125, "logps/rejected": -216.4864501953125, "loss": 0.4736, "losses/dpo": 0.2825773358345032, "losses/sft": 0.531553328037262, "losses/total": 0.2825773358345032, "ref_logps/chosen": -245.3043975830078, "ref_logps/rejected": -211.04684448242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.0077738650143146515, "rewards/margins": 0.5361867547035217, "rewards/rejected": -0.543960690498352, "step": 160 }, { "epoch": 0.04, "learning_rate": 7.721822541966427e-08, "logps/chosen": -242.36448669433594, "logps/rejected": -196.6497802734375, "loss": 0.4547, "losses/dpo": 0.3855081796646118, "losses/sft": 0.7669775485992432, "losses/total": 0.3855081796646118, "ref_logps/chosen": -242.39566040039062, "ref_logps/rejected": -190.70297241210938, "rewards/accuracies": 1.0, "rewards/chosen": 0.0031159287318587303, "rewards/margins": 0.5977975130081177, "rewards/rejected": -0.5946815609931946, "step": 161 }, { "epoch": 0.04, "learning_rate": 7.769784172661871e-08, "logps/chosen": -217.03802490234375, "logps/rejected": -226.6476287841797, "loss": 0.4273, "losses/dpo": 0.2480551302433014, "losses/sft": 0.49929606914520264, "losses/total": 0.2480551302433014, "ref_logps/chosen": -217.00184631347656, "ref_logps/rejected": -219.80032348632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.003617870155721903, "rewards/margins": 0.6811143755912781, "rewards/rejected": -0.68473219871521, "step": 162 }, { "epoch": 0.04, "learning_rate": 7.817745803357314e-08, "logps/chosen": -267.05535888671875, "logps/rejected": -220.09698486328125, "loss": 0.4339, "losses/dpo": 0.2881166636943817, "losses/sft": 0.4456699788570404, "losses/total": 0.2881166636943817, "ref_logps/chosen": -267.2043151855469, "ref_logps/rejected": -213.75601196289062, "rewards/accuracies": 1.0, "rewards/chosen": 0.014896642416715622, "rewards/margins": 0.6489920616149902, "rewards/rejected": -0.6340954303741455, "step": 163 }, { "epoch": 0.04, "learning_rate": 7.865707434052758e-08, "logps/chosen": -213.67564392089844, "logps/rejected": -238.876220703125, "loss": 0.4538, "losses/dpo": 0.44649404287338257, "losses/sft": 0.5693972110748291, "losses/total": 0.44649404287338257, "ref_logps/chosen": -213.84527587890625, "ref_logps/rejected": -233.19027709960938, "rewards/accuracies": 1.0, "rewards/chosen": 0.01696460321545601, "rewards/margins": 0.5855571031570435, "rewards/rejected": -0.5685924887657166, "step": 164 }, { "epoch": 0.04, "learning_rate": 7.913669064748201e-08, "logps/chosen": -235.9366912841797, "logps/rejected": -228.9468994140625, "loss": 0.4479, "losses/dpo": 0.31202855706214905, "losses/sft": 0.36799705028533936, "losses/total": 0.31202855706214905, "ref_logps/chosen": -236.299072265625, "ref_logps/rejected": -223.13232421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.036238111555576324, "rewards/margins": 0.6176968812942505, "rewards/rejected": -0.5814588069915771, "step": 165 }, { "epoch": 0.04, "learning_rate": 7.961630695443645e-08, "logps/chosen": -201.08615112304688, "logps/rejected": -208.87533569335938, "loss": 0.4518, "losses/dpo": 0.26655736565589905, "losses/sft": 0.47408461570739746, "losses/total": 0.26655736565589905, "ref_logps/chosen": -201.35403442382812, "ref_logps/rejected": -203.13987731933594, "rewards/accuracies": 1.0, "rewards/chosen": 0.026787303388118744, "rewards/margins": 0.6003329753875732, "rewards/rejected": -0.5735456943511963, "step": 166 }, { "epoch": 0.04, "learning_rate": 8.009592326139089e-08, "logps/chosen": -222.4311981201172, "logps/rejected": -209.09725952148438, "loss": 0.4379, "losses/dpo": 0.49538183212280273, "losses/sft": 0.6766504645347595, "losses/total": 0.49538183212280273, "ref_logps/chosen": -222.51824951171875, "ref_logps/rejected": -202.86154174804688, "rewards/accuracies": 1.0, "rewards/chosen": 0.008703896775841713, "rewards/margins": 0.632276713848114, "rewards/rejected": -0.623572826385498, "step": 167 }, { "epoch": 0.04, "learning_rate": 8.057553956834532e-08, "logps/chosen": -209.48341369628906, "logps/rejected": -241.56101989746094, "loss": 0.3857, "losses/dpo": 0.2880609333515167, "losses/sft": 0.521109938621521, "losses/total": 0.2880609333515167, "ref_logps/chosen": -209.6242218017578, "ref_logps/rejected": -233.605712890625, "rewards/accuracies": 1.0, "rewards/chosen": 0.014080840162932873, "rewards/margins": 0.809609055519104, "rewards/rejected": -0.7955282926559448, "step": 168 }, { "epoch": 0.04, "learning_rate": 8.105515587529975e-08, "logps/chosen": -204.18807983398438, "logps/rejected": -232.31402587890625, "loss": 0.4496, "losses/dpo": 0.3397159278392792, "losses/sft": 0.4283502399921417, "losses/total": 0.3397159278392792, "ref_logps/chosen": -204.03765869140625, "ref_logps/rejected": -225.94406127929688, "rewards/accuracies": 1.0, "rewards/chosen": -0.01504288986325264, "rewards/margins": 0.6219532489776611, "rewards/rejected": -0.6369961500167847, "step": 169 }, { "epoch": 0.04, "learning_rate": 8.15347721822542e-08, "logps/chosen": -257.28607177734375, "logps/rejected": -249.08868408203125, "loss": 0.3717, "losses/dpo": 0.2828756868839264, "losses/sft": 0.6457157731056213, "losses/total": 0.2828756868839264, "ref_logps/chosen": -257.5560302734375, "ref_logps/rejected": -240.50570678710938, "rewards/accuracies": 1.0, "rewards/chosen": 0.02699587494134903, "rewards/margins": 0.8852931261062622, "rewards/rejected": -0.8582972288131714, "step": 170 }, { "epoch": 0.04, "learning_rate": 8.201438848920863e-08, "logps/chosen": -221.77471923828125, "logps/rejected": -199.5004425048828, "loss": 0.4479, "losses/dpo": 0.28124141693115234, "losses/sft": 0.8437215685844421, "losses/total": 0.28124141693115234, "ref_logps/chosen": -222.19483947753906, "ref_logps/rejected": -193.8284912109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04201015084981918, "rewards/margins": 0.6092057824134827, "rewards/rejected": -0.5671956539154053, "step": 171 }, { "epoch": 0.04, "learning_rate": 8.249400479616306e-08, "logps/chosen": -202.1851348876953, "logps/rejected": -204.84190368652344, "loss": 0.4034, "losses/dpo": 0.4848230481147766, "losses/sft": 0.5287148356437683, "losses/total": 0.4848230481147766, "ref_logps/chosen": -202.4793243408203, "ref_logps/rejected": -197.52857971191406, "rewards/accuracies": 1.0, "rewards/chosen": 0.02941962704062462, "rewards/margins": 0.7607525587081909, "rewards/rejected": -0.7313328981399536, "step": 172 }, { "epoch": 0.04, "learning_rate": 8.29736211031175e-08, "logps/chosen": -247.38229370117188, "logps/rejected": -259.6751403808594, "loss": 0.3585, "losses/dpo": 0.24494299292564392, "losses/sft": 0.5852348208427429, "losses/total": 0.24494299292564392, "ref_logps/chosen": -247.68505859375, "ref_logps/rejected": -251.13937377929688, "rewards/accuracies": 1.0, "rewards/chosen": 0.03027374856173992, "rewards/margins": 0.8838493824005127, "rewards/rejected": -0.8535757064819336, "step": 173 }, { "epoch": 0.04, "learning_rate": 8.345323741007194e-08, "logps/chosen": -238.55978393554688, "logps/rejected": -227.37693786621094, "loss": 0.406, "losses/dpo": 0.2659766376018524, "losses/sft": 0.5226985812187195, "losses/total": 0.2659766376018524, "ref_logps/chosen": -238.600341796875, "ref_logps/rejected": -220.03358459472656, "rewards/accuracies": 1.0, "rewards/chosen": 0.004054749384522438, "rewards/margins": 0.7383896112442017, "rewards/rejected": -0.7343348860740662, "step": 174 }, { "epoch": 0.04, "learning_rate": 8.393285371702637e-08, "logps/chosen": -219.17108154296875, "logps/rejected": -236.41549682617188, "loss": 0.3791, "losses/dpo": 0.3012677729129791, "losses/sft": 0.8752253651618958, "losses/total": 0.3012677729129791, "ref_logps/chosen": -219.362548828125, "ref_logps/rejected": -228.1414031982422, "rewards/accuracies": 1.0, "rewards/chosen": 0.019145360216498375, "rewards/margins": 0.8465538024902344, "rewards/rejected": -0.8274084329605103, "step": 175 }, { "epoch": 0.04, "learning_rate": 8.441247002398081e-08, "logps/chosen": -235.0795440673828, "logps/rejected": -258.5284118652344, "loss": 0.3624, "losses/dpo": 0.2158084660768509, "losses/sft": 0.4969451427459717, "losses/total": 0.2158084660768509, "ref_logps/chosen": -235.51336669921875, "ref_logps/rejected": -250.18255615234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04338214546442032, "rewards/margins": 0.8779664039611816, "rewards/rejected": -0.8345842361450195, "step": 176 }, { "epoch": 0.04, "learning_rate": 8.489208633093525e-08, "logps/chosen": -222.04013061523438, "logps/rejected": -226.20907592773438, "loss": 0.4347, "losses/dpo": 0.19211876392364502, "losses/sft": 0.5897559523582458, "losses/total": 0.19211876392364502, "ref_logps/chosen": -222.14268493652344, "ref_logps/rejected": -219.82115173339844, "rewards/accuracies": 1.0, "rewards/chosen": 0.010255686938762665, "rewards/margins": 0.6490480303764343, "rewards/rejected": -0.6387923359870911, "step": 177 }, { "epoch": 0.04, "learning_rate": 8.537170263788968e-08, "logps/chosen": -242.70521545410156, "logps/rejected": -212.92996215820312, "loss": 0.4013, "losses/dpo": 0.2633001208305359, "losses/sft": 0.600238561630249, "losses/total": 0.2633001208305359, "ref_logps/chosen": -243.05284118652344, "ref_logps/rejected": -205.40423583984375, "rewards/accuracies": 1.0, "rewards/chosen": 0.034763164818286896, "rewards/margins": 0.7873361110687256, "rewards/rejected": -0.7525730133056641, "step": 178 }, { "epoch": 0.04, "learning_rate": 8.585131894484412e-08, "logps/chosen": -194.2042694091797, "logps/rejected": -197.32470703125, "loss": 0.4493, "losses/dpo": 0.27401307225227356, "losses/sft": 0.6115163564682007, "losses/total": 0.27401307225227356, "ref_logps/chosen": -194.37911987304688, "ref_logps/rejected": -191.28070068359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.017485041171312332, "rewards/margins": 0.6218861937522888, "rewards/rejected": -0.604401171207428, "step": 179 }, { "epoch": 0.04, "learning_rate": 8.633093525179855e-08, "logps/chosen": -242.6411590576172, "logps/rejected": -246.32977294921875, "loss": 0.3639, "losses/dpo": 0.5233930349349976, "losses/sft": 0.5708298683166504, "losses/total": 0.5233930349349976, "ref_logps/chosen": -242.9275360107422, "ref_logps/rejected": -237.82749938964844, "rewards/accuracies": 1.0, "rewards/chosen": 0.028638841584324837, "rewards/margins": 0.8788664937019348, "rewards/rejected": -0.8502277135848999, "step": 180 }, { "epoch": 0.04, "learning_rate": 8.6810551558753e-08, "logps/chosen": -207.65545654296875, "logps/rejected": -222.50799560546875, "loss": 0.4023, "losses/dpo": 0.3714944124221802, "losses/sft": 0.5726811289787292, "losses/total": 0.3714944124221802, "ref_logps/chosen": -207.9318389892578, "ref_logps/rejected": -215.01327514648438, "rewards/accuracies": 1.0, "rewards/chosen": 0.027638256549835205, "rewards/margins": 0.7771090269088745, "rewards/rejected": -0.7494708299636841, "step": 181 }, { "epoch": 0.04, "learning_rate": 8.729016786570744e-08, "logps/chosen": -227.26058959960938, "logps/rejected": -223.83265686035156, "loss": 0.3837, "losses/dpo": 0.30894795060157776, "losses/sft": 0.6398345232009888, "losses/total": 0.30894795060157776, "ref_logps/chosen": -227.26824951171875, "ref_logps/rejected": -215.65768432617188, "rewards/accuracies": 1.0, "rewards/chosen": 0.0007682684808969498, "rewards/margins": 0.8182668089866638, "rewards/rejected": -0.817498505115509, "step": 182 }, { "epoch": 0.04, "learning_rate": 8.776978417266186e-08, "logps/chosen": -193.38510131835938, "logps/rejected": -219.8938446044922, "loss": 0.4037, "losses/dpo": 0.38029298186302185, "losses/sft": 0.4916646182537079, "losses/total": 0.38029298186302185, "ref_logps/chosen": -193.46670532226562, "ref_logps/rejected": -212.3253936767578, "rewards/accuracies": 1.0, "rewards/chosen": 0.00816110335290432, "rewards/margins": 0.7650065422058105, "rewards/rejected": -0.7568454742431641, "step": 183 }, { "epoch": 0.04, "learning_rate": 8.824940047961629e-08, "logps/chosen": -226.2332000732422, "logps/rejected": -220.88882446289062, "loss": 0.3842, "losses/dpo": 0.2774001359939575, "losses/sft": 0.5384929180145264, "losses/total": 0.2774001359939575, "ref_logps/chosen": -226.04373168945312, "ref_logps/rejected": -212.43682861328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.018946174532175064, "rewards/margins": 0.8262524008750916, "rewards/rejected": -0.8451985120773315, "step": 184 }, { "epoch": 0.04, "learning_rate": 8.872901678657075e-08, "logps/chosen": -230.3717041015625, "logps/rejected": -245.202392578125, "loss": 0.3833, "losses/dpo": 0.35195106267929077, "losses/sft": 0.43244168162345886, "losses/total": 0.35195106267929077, "ref_logps/chosen": -230.7655792236328, "ref_logps/rejected": -237.25778198242188, "rewards/accuracies": 1.0, "rewards/chosen": 0.03938842564821243, "rewards/margins": 0.8338524699211121, "rewards/rejected": -0.7944639921188354, "step": 185 }, { "epoch": 0.04, "learning_rate": 8.920863309352518e-08, "logps/chosen": -223.658203125, "logps/rejected": -240.16725158691406, "loss": 0.3764, "losses/dpo": 0.2433108687400818, "losses/sft": 0.6713700890541077, "losses/total": 0.2433108687400818, "ref_logps/chosen": -223.86622619628906, "ref_logps/rejected": -231.764892578125, "rewards/accuracies": 1.0, "rewards/chosen": 0.020803088322281837, "rewards/margins": 0.8610395789146423, "rewards/rejected": -0.840236485004425, "step": 186 }, { "epoch": 0.04, "learning_rate": 8.96882494004796e-08, "logps/chosen": -273.52996826171875, "logps/rejected": -247.30767822265625, "loss": 0.3661, "losses/dpo": 0.33201053738594055, "losses/sft": 0.475113183259964, "losses/total": 0.33201053738594055, "ref_logps/chosen": -273.5435791015625, "ref_logps/rejected": -238.57008361816406, "rewards/accuracies": 1.0, "rewards/chosen": 0.0013591269962489605, "rewards/margins": 0.8751175403594971, "rewards/rejected": -0.8737584352493286, "step": 187 }, { "epoch": 0.05, "learning_rate": 9.016786570743405e-08, "logps/chosen": -217.7066192626953, "logps/rejected": -216.87327575683594, "loss": 0.3941, "losses/dpo": 0.35187047719955444, "losses/sft": 0.6103029847145081, "losses/total": 0.35187047719955444, "ref_logps/chosen": -218.1029510498047, "ref_logps/rejected": -209.6170654296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.03963134437799454, "rewards/margins": 0.765252411365509, "rewards/rejected": -0.7256210446357727, "step": 188 }, { "epoch": 0.05, "learning_rate": 9.064748201438849e-08, "logps/chosen": -232.5751495361328, "logps/rejected": -254.92428588867188, "loss": 0.3461, "losses/dpo": 0.1761164367198944, "losses/sft": 0.4545741081237793, "losses/total": 0.1761164367198944, "ref_logps/chosen": -233.01556396484375, "ref_logps/rejected": -245.850341796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04404080659151077, "rewards/margins": 0.9514366984367371, "rewards/rejected": -0.9073958396911621, "step": 189 }, { "epoch": 0.05, "learning_rate": 9.112709832134292e-08, "logps/chosen": -250.6006317138672, "logps/rejected": -210.67047119140625, "loss": 0.4165, "losses/dpo": 0.34267657995224, "losses/sft": 0.5372596383094788, "losses/total": 0.34267657995224, "ref_logps/chosen": -250.49679565429688, "ref_logps/rejected": -203.3780517578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.010384434834122658, "rewards/margins": 0.7188563346862793, "rewards/rejected": -0.7292407751083374, "step": 190 }, { "epoch": 0.05, "learning_rate": 9.160671462829736e-08, "logps/chosen": -186.06597900390625, "logps/rejected": -200.10763549804688, "loss": 0.4073, "losses/dpo": 0.4131956696510315, "losses/sft": 0.552838921546936, "losses/total": 0.4131956696510315, "ref_logps/chosen": -186.40545654296875, "ref_logps/rejected": -192.9406280517578, "rewards/accuracies": 1.0, "rewards/chosen": 0.033945947885513306, "rewards/margins": 0.7506473064422607, "rewards/rejected": -0.7167013883590698, "step": 191 }, { "epoch": 0.05, "learning_rate": 9.20863309352518e-08, "logps/chosen": -240.46640014648438, "logps/rejected": -241.91751098632812, "loss": 0.3633, "losses/dpo": 0.36958375573158264, "losses/sft": 1.0251681804656982, "losses/total": 0.36958375573158264, "ref_logps/chosen": -240.61965942382812, "ref_logps/rejected": -233.0818328857422, "rewards/accuracies": 1.0, "rewards/chosen": 0.015326136723160744, "rewards/margins": 0.8988915085792542, "rewards/rejected": -0.883565366268158, "step": 192 }, { "epoch": 0.05, "learning_rate": 9.256594724220623e-08, "logps/chosen": -209.25131225585938, "logps/rejected": -230.64532470703125, "loss": 0.3627, "losses/dpo": 0.20286937057971954, "losses/sft": 0.884830117225647, "losses/total": 0.20286937057971954, "ref_logps/chosen": -209.41981506347656, "ref_logps/rejected": -221.68104553222656, "rewards/accuracies": 1.0, "rewards/chosen": 0.016850460320711136, "rewards/margins": 0.9132784605026245, "rewards/rejected": -0.8964279890060425, "step": 193 }, { "epoch": 0.05, "learning_rate": 9.304556354916067e-08, "logps/chosen": -227.1307830810547, "logps/rejected": -230.69834899902344, "loss": 0.3724, "losses/dpo": 0.455837607383728, "losses/sft": 0.942908763885498, "losses/total": 0.455837607383728, "ref_logps/chosen": -227.3841552734375, "ref_logps/rejected": -222.07052612304688, "rewards/accuracies": 1.0, "rewards/chosen": 0.02533712424337864, "rewards/margins": 0.8881167769432068, "rewards/rejected": -0.8627796173095703, "step": 194 }, { "epoch": 0.05, "learning_rate": 9.35251798561151e-08, "logps/chosen": -221.44178771972656, "logps/rejected": -222.1568145751953, "loss": 0.3581, "losses/dpo": 0.18321651220321655, "losses/sft": 0.6136723160743713, "losses/total": 0.18321651220321655, "ref_logps/chosen": -221.62576293945312, "ref_logps/rejected": -213.15029907226562, "rewards/accuracies": 1.0, "rewards/chosen": 0.018398582935333252, "rewards/margins": 0.9190487861633301, "rewards/rejected": -0.900650143623352, "step": 195 }, { "epoch": 0.05, "learning_rate": 9.400479616306954e-08, "logps/chosen": -220.36016845703125, "logps/rejected": -228.9633331298828, "loss": 0.3555, "losses/dpo": 0.14697615802288055, "losses/sft": 0.4441142976284027, "losses/total": 0.14697615802288055, "ref_logps/chosen": -220.84481811523438, "ref_logps/rejected": -219.8927001953125, "rewards/accuracies": 1.0, "rewards/chosen": 0.048464737832546234, "rewards/margins": 0.9555299282073975, "rewards/rejected": -0.9070650935173035, "step": 196 }, { "epoch": 0.05, "learning_rate": 9.448441247002398e-08, "logps/chosen": -239.2434844970703, "logps/rejected": -229.86941528320312, "loss": 0.3521, "losses/dpo": 0.19149450957775116, "losses/sft": 1.0581660270690918, "losses/total": 0.19149450957775116, "ref_logps/chosen": -239.4163360595703, "ref_logps/rejected": -220.57455444335938, "rewards/accuracies": 1.0, "rewards/chosen": 0.017284099012613297, "rewards/margins": 0.9467689394950867, "rewards/rejected": -0.9294848442077637, "step": 197 }, { "epoch": 0.05, "learning_rate": 9.496402877697841e-08, "logps/chosen": -241.75332641601562, "logps/rejected": -217.72772216796875, "loss": 0.3418, "losses/dpo": 0.2833196222782135, "losses/sft": 0.7586278915405273, "losses/total": 0.2833196222782135, "ref_logps/chosen": -242.0028076171875, "ref_logps/rejected": -207.90016174316406, "rewards/accuracies": 1.0, "rewards/chosen": 0.02494763769209385, "rewards/margins": 1.007705569267273, "rewards/rejected": -0.9827579259872437, "step": 198 }, { "epoch": 0.05, "learning_rate": 9.544364508393284e-08, "logps/chosen": -242.94949340820312, "logps/rejected": -237.4647216796875, "loss": 0.3238, "losses/dpo": 0.13909044861793518, "losses/sft": 0.8134393692016602, "losses/total": 0.13909044861793518, "ref_logps/chosen": -243.28785705566406, "ref_logps/rejected": -227.24188232421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.03383728116750717, "rewards/margins": 1.0561233758926392, "rewards/rejected": -1.022286057472229, "step": 199 }, { "epoch": 0.05, "learning_rate": 9.59232613908873e-08, "logps/chosen": -217.0780029296875, "logps/rejected": -219.11093139648438, "loss": 0.3425, "losses/dpo": 0.2219257801771164, "losses/sft": 0.593511700630188, "losses/total": 0.2219257801771164, "ref_logps/chosen": -217.22821044921875, "ref_logps/rejected": -209.29637145996094, "rewards/accuracies": 1.0, "rewards/chosen": 0.01502010878175497, "rewards/margins": 0.996475338935852, "rewards/rejected": -0.9814552068710327, "step": 200 }, { "epoch": 0.05, "learning_rate": 9.640287769784172e-08, "logps/chosen": -274.6129455566406, "logps/rejected": -270.43499755859375, "loss": 0.2715, "losses/dpo": 0.20327992737293243, "losses/sft": 0.6704562902450562, "losses/total": 0.20327992737293243, "ref_logps/chosen": -275.1683349609375, "ref_logps/rejected": -257.513916015625, "rewards/accuracies": 1.0, "rewards/chosen": 0.05553843080997467, "rewards/margins": 1.3476428985595703, "rewards/rejected": -1.2921043634414673, "step": 201 }, { "epoch": 0.05, "learning_rate": 9.688249400479615e-08, "logps/chosen": -233.83792114257812, "logps/rejected": -229.2107391357422, "loss": 0.3175, "losses/dpo": 0.24937023222446442, "losses/sft": 0.7387675046920776, "losses/total": 0.24937023222446442, "ref_logps/chosen": -234.09283447265625, "ref_logps/rejected": -218.52481079101562, "rewards/accuracies": 1.0, "rewards/chosen": 0.02549074962735176, "rewards/margins": 1.0940827131271362, "rewards/rejected": -1.0685920715332031, "step": 202 }, { "epoch": 0.05, "learning_rate": 9.73621103117506e-08, "logps/chosen": -226.2960968017578, "logps/rejected": -223.65606689453125, "loss": 0.3214, "losses/dpo": 0.09139002114534378, "losses/sft": 0.4482300579547882, "losses/total": 0.09139002114534378, "ref_logps/chosen": -226.54232788085938, "ref_logps/rejected": -212.80947875976562, "rewards/accuracies": 1.0, "rewards/chosen": 0.024625658988952637, "rewards/margins": 1.109284520149231, "rewards/rejected": -1.0846588611602783, "step": 203 }, { "epoch": 0.05, "learning_rate": 9.784172661870503e-08, "logps/chosen": -274.698974609375, "logps/rejected": -248.207275390625, "loss": 0.3008, "losses/dpo": 0.11939084529876709, "losses/sft": 0.6152705550193787, "losses/total": 0.11939084529876709, "ref_logps/chosen": -275.0128173828125, "ref_logps/rejected": -237.04241943359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.03138342499732971, "rewards/margins": 1.1478716135025024, "rewards/rejected": -1.1164882183074951, "step": 204 }, { "epoch": 0.05, "learning_rate": 9.832134292565946e-08, "logps/chosen": -218.98333740234375, "logps/rejected": -247.60409545898438, "loss": 0.2574, "losses/dpo": 0.28062474727630615, "losses/sft": 0.9158276915550232, "losses/total": 0.28062474727630615, "ref_logps/chosen": -219.5443115234375, "ref_logps/rejected": -234.92526245117188, "rewards/accuracies": 1.0, "rewards/chosen": 0.05609691143035889, "rewards/margins": 1.3239829540252686, "rewards/rejected": -1.2678859233856201, "step": 205 }, { "epoch": 0.05, "learning_rate": 9.88009592326139e-08, "logps/chosen": -240.84780883789062, "logps/rejected": -245.1466064453125, "loss": 0.2599, "losses/dpo": 0.13181482255458832, "losses/sft": 0.5563780665397644, "losses/total": 0.13181482255458832, "ref_logps/chosen": -241.25314331054688, "ref_logps/rejected": -232.15618896484375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04053521156311035, "rewards/margins": 1.3395764827728271, "rewards/rejected": -1.2990412712097168, "step": 206 }, { "epoch": 0.05, "learning_rate": 9.928057553956835e-08, "logps/chosen": -196.827880859375, "logps/rejected": -207.542236328125, "loss": 0.3248, "losses/dpo": 0.08891167491674423, "losses/sft": 0.7091540694236755, "losses/total": 0.08891167491674423, "ref_logps/chosen": -197.33180236816406, "ref_logps/rejected": -197.25160217285156, "rewards/accuracies": 1.0, "rewards/chosen": 0.05039016529917717, "rewards/margins": 1.079453468322754, "rewards/rejected": -1.02906334400177, "step": 207 }, { "epoch": 0.05, "learning_rate": 9.976019184652277e-08, "logps/chosen": -227.93136596679688, "logps/rejected": -261.5827941894531, "loss": 0.2554, "losses/dpo": 0.07127783447504044, "losses/sft": 1.4309806823730469, "losses/total": 0.07127783447504044, "ref_logps/chosen": -228.4580841064453, "ref_logps/rejected": -248.03472900390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.052673377096652985, "rewards/margins": 1.4074796438217163, "rewards/rejected": -1.3548063039779663, "step": 208 }, { "epoch": 0.05, "learning_rate": 1.0023980815347722e-07, "logps/chosen": -217.70066833496094, "logps/rejected": -236.0228271484375, "loss": 0.2684, "losses/dpo": 0.09333795309066772, "losses/sft": 0.4584328234195709, "losses/total": 0.09333795309066772, "ref_logps/chosen": -218.12655639648438, "ref_logps/rejected": -223.6395263671875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04258891940116882, "rewards/margins": 1.280918836593628, "rewards/rejected": -1.2383298873901367, "step": 209 }, { "epoch": 0.05, "learning_rate": 1.0071942446043164e-07, "logps/chosen": -263.4556579589844, "logps/rejected": -253.89059448242188, "loss": 0.2295, "losses/dpo": 0.10712315887212753, "losses/sft": 0.6619054675102234, "losses/total": 0.10712315887212753, "ref_logps/chosen": -263.77593994140625, "ref_logps/rejected": -237.8572540283203, "rewards/accuracies": 1.0, "rewards/chosen": 0.032029055058956146, "rewards/margins": 1.6353625059127808, "rewards/rejected": -1.6033332347869873, "step": 210 }, { "epoch": 0.05, "learning_rate": 1.0119904076738607e-07, "logps/chosen": -210.25755310058594, "logps/rejected": -245.30389404296875, "loss": 0.2226, "losses/dpo": 0.30797189474105835, "losses/sft": 0.4067469835281372, "losses/total": 0.30797189474105835, "ref_logps/chosen": -210.47483825683594, "ref_logps/rejected": -229.63401794433594, "rewards/accuracies": 1.0, "rewards/chosen": 0.021729109808802605, "rewards/margins": 1.5887155532836914, "rewards/rejected": -1.566986322402954, "step": 211 }, { "epoch": 0.05, "learning_rate": 1.0167865707434051e-07, "logps/chosen": -219.8419952392578, "logps/rejected": -272.05889892578125, "loss": 0.2235, "losses/dpo": 0.05805809050798416, "losses/sft": 0.4558568000793457, "losses/total": 0.05805809050798416, "ref_logps/chosen": -220.29470825195312, "ref_logps/rejected": -256.40521240234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04526923596858978, "rewards/margins": 1.6106393337249756, "rewards/rejected": -1.5653700828552246, "step": 212 }, { "epoch": 0.05, "learning_rate": 1.0215827338129497e-07, "logps/chosen": -247.03778076171875, "logps/rejected": -247.4205322265625, "loss": 0.2569, "losses/dpo": 0.0915185883641243, "losses/sft": 0.5127957463264465, "losses/total": 0.0915185883641243, "ref_logps/chosen": -247.08212280273438, "ref_logps/rejected": -233.51007080078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0044339969754219055, "rewards/margins": 1.3954812288284302, "rewards/rejected": -1.3910472393035889, "step": 213 }, { "epoch": 0.05, "learning_rate": 1.026378896882494e-07, "logps/chosen": -238.06983947753906, "logps/rejected": -250.04368591308594, "loss": 0.1967, "losses/dpo": 0.11154404282569885, "losses/sft": 0.5051796436309814, "losses/total": 0.11154404282569885, "ref_logps/chosen": -238.55023193359375, "ref_logps/rejected": -233.2794647216797, "rewards/accuracies": 1.0, "rewards/chosen": 0.048038627952337265, "rewards/margins": 1.7244594097137451, "rewards/rejected": -1.6764206886291504, "step": 214 }, { "epoch": 0.05, "learning_rate": 1.0311750599520384e-07, "logps/chosen": -236.73477172851562, "logps/rejected": -252.6602783203125, "loss": 0.2465, "losses/dpo": 0.09819323569536209, "losses/sft": 0.4876343607902527, "losses/total": 0.09819323569536209, "ref_logps/chosen": -236.9515380859375, "ref_logps/rejected": -237.5775604248047, "rewards/accuracies": 1.0, "rewards/chosen": 0.021676478907465935, "rewards/margins": 1.5299477577209473, "rewards/rejected": -1.5082714557647705, "step": 215 }, { "epoch": 0.05, "learning_rate": 1.0359712230215827e-07, "logps/chosen": -240.68331909179688, "logps/rejected": -239.08612060546875, "loss": 0.2451, "losses/dpo": 0.22843794524669647, "losses/sft": 0.5559660792350769, "losses/total": 0.22843794524669647, "ref_logps/chosen": -240.8357696533203, "ref_logps/rejected": -223.3365478515625, "rewards/accuracies": 1.0, "rewards/chosen": 0.015245556831359863, "rewards/margins": 1.5902040004730225, "rewards/rejected": -1.5749584436416626, "step": 216 }, { "epoch": 0.05, "learning_rate": 1.040767386091127e-07, "logps/chosen": -239.19288635253906, "logps/rejected": -252.50865173339844, "loss": 0.2051, "losses/dpo": 0.05378540977835655, "losses/sft": 0.8310717940330505, "losses/total": 0.05378540977835655, "ref_logps/chosen": -239.7662811279297, "ref_logps/rejected": -236.21826171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.05733942240476608, "rewards/margins": 1.6863760948181152, "rewards/rejected": -1.6290366649627686, "step": 217 }, { "epoch": 0.05, "learning_rate": 1.0455635491606714e-07, "logps/chosen": -220.36972045898438, "logps/rejected": -234.966552734375, "loss": 0.2661, "losses/dpo": 0.12528866529464722, "losses/sft": 0.6879314184188843, "losses/total": 0.12528866529464722, "ref_logps/chosen": -220.39599609375, "ref_logps/rejected": -221.03062438964844, "rewards/accuracies": 1.0, "rewards/chosen": 0.0026308856904506683, "rewards/margins": 1.3962241411209106, "rewards/rejected": -1.3935933113098145, "step": 218 }, { "epoch": 0.05, "learning_rate": 1.0503597122302157e-07, "logps/chosen": -250.04861450195312, "logps/rejected": -233.6830596923828, "loss": 0.2233, "losses/dpo": 0.1359192132949829, "losses/sft": 0.4207455515861511, "losses/total": 0.1359192132949829, "ref_logps/chosen": -249.92410278320312, "ref_logps/rejected": -216.16021728515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.012450747191905975, "rewards/margins": 1.7398335933685303, "rewards/rejected": -1.7522841691970825, "step": 219 }, { "epoch": 0.05, "learning_rate": 1.0551558752997602e-07, "logps/chosen": -220.05734252929688, "logps/rejected": -222.31741333007812, "loss": 0.254, "losses/dpo": 0.11638454347848892, "losses/sft": 0.5441364049911499, "losses/total": 0.11638454347848892, "ref_logps/chosen": -220.2730712890625, "ref_logps/rejected": -208.0173797607422, "rewards/accuracies": 1.0, "rewards/chosen": 0.021573692560195923, "rewards/margins": 1.4515776634216309, "rewards/rejected": -1.4300038814544678, "step": 220 }, { "epoch": 0.05, "learning_rate": 1.0599520383693046e-07, "logps/chosen": -201.37457275390625, "logps/rejected": -241.8056640625, "loss": 0.2178, "losses/dpo": 0.12505275011062622, "losses/sft": 0.5648071765899658, "losses/total": 0.12505275011062622, "ref_logps/chosen": -201.62884521484375, "ref_logps/rejected": -225.13507080078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.02542823739349842, "rewards/margins": 1.6924874782562256, "rewards/rejected": -1.6670591831207275, "step": 221 }, { "epoch": 0.05, "learning_rate": 1.0647482014388489e-07, "logps/chosen": -215.7587432861328, "logps/rejected": -228.73834228515625, "loss": 0.2507, "losses/dpo": 0.2281184196472168, "losses/sft": 1.031870722770691, "losses/total": 0.2281184196472168, "ref_logps/chosen": -215.8619842529297, "ref_logps/rejected": -212.6431427001953, "rewards/accuracies": 1.0, "rewards/chosen": 0.010325947776436806, "rewards/margins": 1.6198456287384033, "rewards/rejected": -1.6095197200775146, "step": 222 }, { "epoch": 0.05, "learning_rate": 1.0695443645083932e-07, "logps/chosen": -224.14341735839844, "logps/rejected": -233.33746337890625, "loss": 0.2259, "losses/dpo": 0.2690639793872833, "losses/sft": 0.6383207440376282, "losses/total": 0.2690639793872833, "ref_logps/chosen": -224.22401428222656, "ref_logps/rejected": -217.31866455078125, "rewards/accuracies": 1.0, "rewards/chosen": 0.008058324456214905, "rewards/margins": 1.609938144683838, "rewards/rejected": -1.6018798351287842, "step": 223 }, { "epoch": 0.05, "learning_rate": 1.0743405275779376e-07, "logps/chosen": -219.28176879882812, "logps/rejected": -229.7862548828125, "loss": 0.2174, "losses/dpo": 0.0610390268266201, "losses/sft": 0.6868208050727844, "losses/total": 0.0610390268266201, "ref_logps/chosen": -219.51568603515625, "ref_logps/rejected": -213.06964111328125, "rewards/accuracies": 1.0, "rewards/chosen": 0.023390166461467743, "rewards/margins": 1.695051908493042, "rewards/rejected": -1.6716618537902832, "step": 224 }, { "epoch": 0.05, "learning_rate": 1.0791366906474819e-07, "logps/chosen": -206.95535278320312, "logps/rejected": -231.06216430664062, "loss": 0.1947, "losses/dpo": 0.0464896596968174, "losses/sft": 0.7198383808135986, "losses/total": 0.0464896596968174, "ref_logps/chosen": -206.9734649658203, "ref_logps/rejected": -212.33511352539062, "rewards/accuracies": 1.0, "rewards/chosen": 0.001811920665204525, "rewards/margins": 1.874516487121582, "rewards/rejected": -1.8727045059204102, "step": 225 }, { "epoch": 0.05, "learning_rate": 1.0839328537170262e-07, "logps/chosen": -237.09774780273438, "logps/rejected": -246.2694549560547, "loss": 0.2021, "losses/dpo": 0.12921489775180817, "losses/sft": 0.8869383335113525, "losses/total": 0.12921489775180817, "ref_logps/chosen": -237.37158203125, "ref_logps/rejected": -229.0869903564453, "rewards/accuracies": 1.0, "rewards/chosen": 0.0273844413459301, "rewards/margins": 1.7456321716308594, "rewards/rejected": -1.718247652053833, "step": 226 }, { "epoch": 0.05, "learning_rate": 1.0887290167865706e-07, "logps/chosen": -199.92694091796875, "logps/rejected": -212.70372009277344, "loss": 0.2509, "losses/dpo": 0.0549764558672905, "losses/sft": 0.5903735160827637, "losses/total": 0.0549764558672905, "ref_logps/chosen": -200.051513671875, "ref_logps/rejected": -197.81881713867188, "rewards/accuracies": 1.0, "rewards/chosen": 0.012456987984478474, "rewards/margins": 1.5009478330612183, "rewards/rejected": -1.4884908199310303, "step": 227 }, { "epoch": 0.05, "learning_rate": 1.0935251798561152e-07, "logps/chosen": -250.7592010498047, "logps/rejected": -277.6080627441406, "loss": 0.1993, "losses/dpo": 0.13763967156410217, "losses/sft": 0.4960741400718689, "losses/total": 0.13763967156410217, "ref_logps/chosen": -250.8712921142578, "ref_logps/rejected": -258.84185791015625, "rewards/accuracies": 1.0, "rewards/chosen": 0.011207936331629753, "rewards/margins": 1.8878273963928223, "rewards/rejected": -1.876619577407837, "step": 228 }, { "epoch": 0.05, "learning_rate": 1.0983213429256594e-07, "logps/chosen": -213.3785400390625, "logps/rejected": -237.85723876953125, "loss": 0.184, "losses/dpo": 0.04404279589653015, "losses/sft": 0.5654621720314026, "losses/total": 0.04404279589653015, "ref_logps/chosen": -213.86907958984375, "ref_logps/rejected": -218.54385375976562, "rewards/accuracies": 1.0, "rewards/chosen": 0.04905291646718979, "rewards/margins": 1.980391502380371, "rewards/rejected": -1.9313386678695679, "step": 229 }, { "epoch": 0.06, "learning_rate": 1.1031175059952039e-07, "logps/chosen": -195.09136962890625, "logps/rejected": -235.92642211914062, "loss": 0.2328, "losses/dpo": 0.025025898590683937, "losses/sft": 0.5515609979629517, "losses/total": 0.025025898590683937, "ref_logps/chosen": -195.67239379882812, "ref_logps/rejected": -219.16543579101562, "rewards/accuracies": 1.0, "rewards/chosen": 0.05810001865029335, "rewards/margins": 1.7341996431350708, "rewards/rejected": -1.6760995388031006, "step": 230 }, { "epoch": 0.06, "learning_rate": 1.1079136690647481e-07, "logps/chosen": -239.84906005859375, "logps/rejected": -249.80401611328125, "loss": 0.2122, "losses/dpo": 0.08717817813158035, "losses/sft": 0.6128903031349182, "losses/total": 0.08717817813158035, "ref_logps/chosen": -239.94076538085938, "ref_logps/rejected": -231.85360717773438, "rewards/accuracies": 1.0, "rewards/chosen": 0.009171070531010628, "rewards/margins": 1.8042125701904297, "rewards/rejected": -1.795041561126709, "step": 231 }, { "epoch": 0.06, "learning_rate": 1.1127098321342924e-07, "logps/chosen": -267.49884033203125, "logps/rejected": -247.8172607421875, "loss": 0.2019, "losses/dpo": 0.03939082473516464, "losses/sft": 0.6280481219291687, "losses/total": 0.03939082473516464, "ref_logps/chosen": -267.9692687988281, "ref_logps/rejected": -230.1329345703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0470414012670517, "rewards/margins": 1.815474510192871, "rewards/rejected": -1.7684330940246582, "step": 232 }, { "epoch": 0.06, "learning_rate": 1.1175059952038368e-07, "logps/chosen": -204.5781707763672, "logps/rejected": -226.89431762695312, "loss": 0.1953, "losses/dpo": 0.016600485891103745, "losses/sft": 0.6097224354743958, "losses/total": 0.016600485891103745, "ref_logps/chosen": -204.89779663085938, "ref_logps/rejected": -208.0856170654297, "rewards/accuracies": 1.0, "rewards/chosen": 0.03196091577410698, "rewards/margins": 1.9128283262252808, "rewards/rejected": -1.8808673620224, "step": 233 }, { "epoch": 0.06, "learning_rate": 1.1223021582733811e-07, "logps/chosen": -203.90447998046875, "logps/rejected": -261.55291748046875, "loss": 0.1667, "losses/dpo": 0.015096748247742653, "losses/sft": 0.5876286625862122, "losses/total": 0.015096748247742653, "ref_logps/chosen": -204.36703491210938, "ref_logps/rejected": -240.69320678710938, "rewards/accuracies": 1.0, "rewards/chosen": 0.046252600848674774, "rewards/margins": 2.1322240829467773, "rewards/rejected": -2.0859713554382324, "step": 234 }, { "epoch": 0.06, "learning_rate": 1.1270983213429257e-07, "logps/chosen": -206.56825256347656, "logps/rejected": -231.18893432617188, "loss": 0.23, "losses/dpo": 0.037412263453006744, "losses/sft": 0.9610894918441772, "losses/total": 0.037412263453006744, "ref_logps/chosen": -207.05496215820312, "ref_logps/rejected": -214.53009033203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.04867132008075714, "rewards/margins": 1.7145538330078125, "rewards/rejected": -1.6658825874328613, "step": 235 }, { "epoch": 0.06, "learning_rate": 1.1318944844124701e-07, "logps/chosen": -179.19595336914062, "logps/rejected": -234.2972869873047, "loss": 0.1958, "losses/dpo": 0.03263027220964432, "losses/sft": 0.6495882868766785, "losses/total": 0.03263027220964432, "ref_logps/chosen": -179.71826171875, "ref_logps/rejected": -217.06600952148438, "rewards/accuracies": 1.0, "rewards/chosen": 0.05223208665847778, "rewards/margins": 1.7753580808639526, "rewards/rejected": -1.72312593460083, "step": 236 }, { "epoch": 0.06, "learning_rate": 1.1366906474820144e-07, "logps/chosen": -197.49266052246094, "logps/rejected": -242.28729248046875, "loss": 0.203, "losses/dpo": 0.1039901152253151, "losses/sft": 0.48082953691482544, "losses/total": 0.1039901152253151, "ref_logps/chosen": -198.28839111328125, "ref_logps/rejected": -224.46435546875, "rewards/accuracies": 1.0, "rewards/chosen": 0.07957437634468079, "rewards/margins": 1.8618675470352173, "rewards/rejected": -1.7822932004928589, "step": 237 }, { "epoch": 0.06, "learning_rate": 1.1414868105515587e-07, "logps/chosen": -243.9394073486328, "logps/rejected": -252.04908752441406, "loss": 0.1809, "losses/dpo": 0.11835617572069168, "losses/sft": 0.7168901562690735, "losses/total": 0.11835617572069168, "ref_logps/chosen": -244.1352081298828, "ref_logps/rejected": -232.05056762695312, "rewards/accuracies": 1.0, "rewards/chosen": 0.0195817518979311, "rewards/margins": 2.0194332599639893, "rewards/rejected": -1.9998513460159302, "step": 238 }, { "epoch": 0.06, "learning_rate": 1.1462829736211031e-07, "logps/chosen": -208.0064239501953, "logps/rejected": -233.71966552734375, "loss": 0.1798, "losses/dpo": 0.04415931552648544, "losses/sft": 0.6786404848098755, "losses/total": 0.04415931552648544, "ref_logps/chosen": -208.08840942382812, "ref_logps/rejected": -214.10919189453125, "rewards/accuracies": 1.0, "rewards/chosen": 0.008198339492082596, "rewards/margins": 1.9692459106445312, "rewards/rejected": -1.9610475301742554, "step": 239 }, { "epoch": 0.06, "learning_rate": 1.1510791366906474e-07, "logps/chosen": -247.7860107421875, "logps/rejected": -220.1297607421875, "loss": 0.2386, "losses/dpo": 0.019229823723435402, "losses/sft": 0.9114503264427185, "losses/total": 0.019229823723435402, "ref_logps/chosen": -248.03659057617188, "ref_logps/rejected": -201.97738647460938, "rewards/accuracies": 1.0, "rewards/chosen": 0.02505486272275448, "rewards/margins": 1.840293288230896, "rewards/rejected": -1.8152384757995605, "step": 240 }, { "epoch": 0.06, "learning_rate": 1.1558752997601917e-07, "logps/chosen": -208.36439514160156, "logps/rejected": -244.80667114257812, "loss": 0.1586, "losses/dpo": 0.06705638766288757, "losses/sft": 0.47430500388145447, "losses/total": 0.06705638766288757, "ref_logps/chosen": -208.7392578125, "ref_logps/rejected": -223.17860412597656, "rewards/accuracies": 1.0, "rewards/chosen": 0.03748580813407898, "rewards/margins": 2.2002921104431152, "rewards/rejected": -2.162806272506714, "step": 241 }, { "epoch": 0.06, "learning_rate": 1.1606714628297361e-07, "logps/chosen": -197.80120849609375, "logps/rejected": -208.0428009033203, "loss": 0.209, "losses/dpo": 0.015375918708741665, "losses/sft": 0.9647651314735413, "losses/total": 0.015375918708741665, "ref_logps/chosen": -197.8456573486328, "ref_logps/rejected": -190.08546447753906, "rewards/accuracies": 1.0, "rewards/chosen": 0.00444638729095459, "rewards/margins": 1.8001772165298462, "rewards/rejected": -1.795730710029602, "step": 242 }, { "epoch": 0.06, "learning_rate": 1.1654676258992806e-07, "logps/chosen": -212.95559692382812, "logps/rejected": -251.90478515625, "loss": 0.152, "losses/dpo": 0.1386520266532898, "losses/sft": 0.8128570914268494, "losses/total": 0.1386520266532898, "ref_logps/chosen": -213.35818481445312, "ref_logps/rejected": -230.97349548339844, "rewards/accuracies": 1.0, "rewards/chosen": 0.04025895893573761, "rewards/margins": 2.133387565612793, "rewards/rejected": -2.0931286811828613, "step": 243 }, { "epoch": 0.06, "learning_rate": 1.1702637889688249e-07, "logps/chosen": -198.12408447265625, "logps/rejected": -220.28515625, "loss": 0.2285, "losses/dpo": 0.10131841152906418, "losses/sft": 0.4872863292694092, "losses/total": 0.10131841152906418, "ref_logps/chosen": -198.6854705810547, "ref_logps/rejected": -203.4904022216797, "rewards/accuracies": 1.0, "rewards/chosen": 0.05613837391138077, "rewards/margins": 1.7356138229370117, "rewards/rejected": -1.6794755458831787, "step": 244 }, { "epoch": 0.06, "learning_rate": 1.1750599520383693e-07, "logps/chosen": -241.6875, "logps/rejected": -282.34710693359375, "loss": 0.1371, "losses/dpo": 0.004096281249076128, "losses/sft": 0.7009930610656738, "losses/total": 0.004096281249076128, "ref_logps/chosen": -241.6943817138672, "ref_logps/rejected": -258.0897216796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.0006893370300531387, "rewards/margins": 2.4264278411865234, "rewards/rejected": -2.4257383346557617, "step": 245 }, { "epoch": 0.06, "learning_rate": 1.1798561151079136e-07, "logps/chosen": -258.6807556152344, "logps/rejected": -249.41653442382812, "loss": 0.1512, "losses/dpo": 0.02100592851638794, "losses/sft": 0.6796557903289795, "losses/total": 0.02100592851638794, "ref_logps/chosen": -258.6533203125, "ref_logps/rejected": -225.43289184570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.002744009718298912, "rewards/margins": 2.3956189155578613, "rewards/rejected": -2.398362874984741, "step": 246 }, { "epoch": 0.06, "learning_rate": 1.1846522781774579e-07, "logps/chosen": -208.83255004882812, "logps/rejected": -264.26507568359375, "loss": 0.1722, "losses/dpo": 0.03806552290916443, "losses/sft": 1.2501940727233887, "losses/total": 0.03806552290916443, "ref_logps/chosen": -209.5966339111328, "ref_logps/rejected": -244.28672790527344, "rewards/accuracies": 1.0, "rewards/chosen": 0.07640688866376877, "rewards/margins": 2.074241876602173, "rewards/rejected": -1.9978350400924683, "step": 247 }, { "epoch": 0.06, "learning_rate": 1.1894484412470023e-07, "logps/chosen": -238.46328735351562, "logps/rejected": -236.26951599121094, "loss": 0.1736, "losses/dpo": 0.006278574001044035, "losses/sft": 1.1346555948257446, "losses/total": 0.006278574001044035, "ref_logps/chosen": -238.72210693359375, "ref_logps/rejected": -215.45144653320312, "rewards/accuracies": 1.0, "rewards/chosen": 0.025882381945848465, "rewards/margins": 2.107689380645752, "rewards/rejected": -2.0818071365356445, "step": 248 }, { "epoch": 0.06, "learning_rate": 1.1942446043165466e-07, "logps/chosen": -202.90170288085938, "logps/rejected": -247.02334594726562, "loss": 0.1748, "losses/dpo": 0.015097576193511486, "losses/sft": 1.098059058189392, "losses/total": 0.015097576193511486, "ref_logps/chosen": -203.10633850097656, "ref_logps/rejected": -226.19180297851562, "rewards/accuracies": 1.0, "rewards/chosen": 0.020463382825255394, "rewards/margins": 2.1036171913146973, "rewards/rejected": -2.08315372467041, "step": 249 }, { "epoch": 0.06, "learning_rate": 1.1990407673860913e-07, "logps/chosen": -227.35427856445312, "logps/rejected": -265.84283447265625, "loss": 0.124, "losses/dpo": 0.029737897217273712, "losses/sft": 0.5628466606140137, "losses/total": 0.029737897217273712, "ref_logps/chosen": -227.76785278320312, "ref_logps/rejected": -241.2743682861328, "rewards/accuracies": 1.0, "rewards/chosen": 0.041357122361660004, "rewards/margins": 2.4982049465179443, "rewards/rejected": -2.456847667694092, "step": 250 }, { "epoch": 0.06, "learning_rate": 1.2038369304556354e-07, "logps/chosen": -233.614990234375, "logps/rejected": -260.7721252441406, "loss": 0.1848, "losses/dpo": 0.012593653053045273, "losses/sft": 0.42412999272346497, "losses/total": 0.012593653053045273, "ref_logps/chosen": -234.0486602783203, "ref_logps/rejected": -239.71475219726562, "rewards/accuracies": 1.0, "rewards/chosen": 0.04336591809988022, "rewards/margins": 2.1491010189056396, "rewards/rejected": -2.1057353019714355, "step": 251 }, { "epoch": 0.06, "learning_rate": 1.2086330935251798e-07, "logps/chosen": -217.03094482421875, "logps/rejected": -248.67172241210938, "loss": 0.1707, "losses/dpo": 0.11710583418607712, "losses/sft": 0.5124498605728149, "losses/total": 0.11710583418607712, "ref_logps/chosen": -217.2735595703125, "ref_logps/rejected": -228.6622772216797, "rewards/accuracies": 1.0, "rewards/chosen": 0.024260729551315308, "rewards/margins": 2.0252060890197754, "rewards/rejected": -2.000945568084717, "step": 252 }, { "epoch": 0.06, "learning_rate": 1.2134292565947243e-07, "logps/chosen": -232.95010375976562, "logps/rejected": -242.58612060546875, "loss": 0.1814, "losses/dpo": 0.12052270025014877, "losses/sft": 0.5172476768493652, "losses/total": 0.12052270025014877, "ref_logps/chosen": -233.18902587890625, "ref_logps/rejected": -222.7397003173828, "rewards/accuracies": 1.0, "rewards/chosen": 0.023892294615507126, "rewards/margins": 2.0085349082946777, "rewards/rejected": -1.984642744064331, "step": 253 }, { "epoch": 0.06, "learning_rate": 1.2182254196642684e-07, "logps/chosen": -246.80282592773438, "logps/rejected": -235.47459411621094, "loss": 0.182, "losses/dpo": 0.01903417892754078, "losses/sft": 0.5886527895927429, "losses/total": 0.01903417892754078, "ref_logps/chosen": -247.07882690429688, "ref_logps/rejected": -215.4468536376953, "rewards/accuracies": 1.0, "rewards/chosen": 0.02760019339621067, "rewards/margins": 2.0303750038146973, "rewards/rejected": -2.002774715423584, "step": 254 }, { "epoch": 0.06, "learning_rate": 1.2230215827338128e-07, "logps/chosen": -229.16563415527344, "logps/rejected": -248.33114624023438, "loss": 0.1581, "losses/dpo": 0.07633142173290253, "losses/sft": 0.6008123755455017, "losses/total": 0.07633142173290253, "ref_logps/chosen": -229.17831420898438, "ref_logps/rejected": -225.24490356445312, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012675188481807709, "rewards/margins": 2.309892177581787, "rewards/rejected": -2.308624505996704, "step": 255 }, { "epoch": 0.06, "learning_rate": 1.2278177458033572e-07, "logps/chosen": -211.5328369140625, "logps/rejected": -213.6350555419922, "loss": 0.216, "losses/dpo": 0.1244046613574028, "losses/sft": 0.44822317361831665, "losses/total": 0.1244046613574028, "ref_logps/chosen": -211.79051208496094, "ref_logps/rejected": -194.6527862548828, "rewards/accuracies": 1.0, "rewards/chosen": 0.025769151747226715, "rewards/margins": 1.9239964485168457, "rewards/rejected": -1.898227334022522, "step": 256 }, { "epoch": 0.06, "learning_rate": 1.2326139088729017e-07, "logps/chosen": -220.3427734375, "logps/rejected": -246.41830444335938, "loss": 0.1281, "losses/dpo": 0.03739747777581215, "losses/sft": 0.7996841073036194, "losses/total": 0.03739747777581215, "ref_logps/chosen": -221.0841064453125, "ref_logps/rejected": -223.9773406982422, "rewards/accuracies": 1.0, "rewards/chosen": 0.07413351535797119, "rewards/margins": 2.318228244781494, "rewards/rejected": -2.2440948486328125, "step": 257 }, { "epoch": 0.06, "learning_rate": 1.237410071942446e-07, "logps/chosen": -230.95069885253906, "logps/rejected": -217.80142211914062, "loss": 0.1951, "losses/dpo": 0.06317833811044693, "losses/sft": 0.4232015907764435, "losses/total": 0.06317833811044693, "ref_logps/chosen": -231.2579803466797, "ref_logps/rejected": -199.32537841796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.03072834573686123, "rewards/margins": 1.8783330917358398, "rewards/rejected": -1.847604751586914, "step": 258 }, { "epoch": 0.06, "learning_rate": 1.2422062350119905e-07, "logps/chosen": -222.5234375, "logps/rejected": -269.60858154296875, "loss": 0.1539, "losses/dpo": 0.04457104206085205, "losses/sft": 1.0345909595489502, "losses/total": 0.04457104206085205, "ref_logps/chosen": -223.05499267578125, "ref_logps/rejected": -246.98757934570312, "rewards/accuracies": 1.0, "rewards/chosen": 0.05315570533275604, "rewards/margins": 2.3152577877044678, "rewards/rejected": -2.2621021270751953, "step": 259 }, { "epoch": 0.06, "learning_rate": 1.2470023980815346e-07, "logps/chosen": -239.89398193359375, "logps/rejected": -244.4452667236328, "loss": 0.1841, "losses/dpo": 0.03688852861523628, "losses/sft": 0.4362914264202118, "losses/total": 0.03688852861523628, "ref_logps/chosen": -240.3095245361328, "ref_logps/rejected": -223.98452758789062, "rewards/accuracies": 1.0, "rewards/chosen": 0.04155474156141281, "rewards/margins": 2.08762788772583, "rewards/rejected": -2.0460729598999023, "step": 260 }, { "epoch": 0.06, "learning_rate": 1.251798561151079e-07, "logps/chosen": -245.17422485351562, "logps/rejected": -243.1425018310547, "loss": 0.1603, "losses/dpo": 0.042788807302713394, "losses/sft": 0.6175804734230042, "losses/total": 0.042788807302713394, "ref_logps/chosen": -245.42239379882812, "ref_logps/rejected": -220.82711791992188, "rewards/accuracies": 1.0, "rewards/chosen": 0.02481662482023239, "rewards/margins": 2.2563533782958984, "rewards/rejected": -2.231536865234375, "step": 261 }, { "epoch": 0.06, "learning_rate": 1.2565947242206235e-07, "logps/chosen": -216.3980712890625, "logps/rejected": -253.26943969726562, "loss": 0.1534, "losses/dpo": 0.046675365418195724, "losses/sft": 0.5492117404937744, "losses/total": 0.046675365418195724, "ref_logps/chosen": -216.86708068847656, "ref_logps/rejected": -229.6472930908203, "rewards/accuracies": 1.0, "rewards/chosen": 0.04690227657556534, "rewards/margins": 2.409119129180908, "rewards/rejected": -2.3622169494628906, "step": 262 }, { "epoch": 0.06, "learning_rate": 1.2613908872901676e-07, "logps/chosen": -198.3968505859375, "logps/rejected": -247.4354248046875, "loss": 0.1778, "losses/dpo": 0.18178117275238037, "losses/sft": 0.5613694190979004, "losses/total": 0.18178117275238037, "ref_logps/chosen": -199.0281219482422, "ref_logps/rejected": -227.29013061523438, "rewards/accuracies": 1.0, "rewards/chosen": 0.06312864273786545, "rewards/margins": 2.07765793800354, "rewards/rejected": -2.0145294666290283, "step": 263 }, { "epoch": 0.06, "learning_rate": 1.266187050359712e-07, "logps/chosen": -245.50485229492188, "logps/rejected": -248.64569091796875, "loss": 0.1561, "losses/dpo": 0.018913956359028816, "losses/sft": 0.9333996772766113, "losses/total": 0.018913956359028816, "ref_logps/chosen": -245.76052856445312, "ref_logps/rejected": -226.41754150390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.025567375123500824, "rewards/margins": 2.2483837604522705, "rewards/rejected": -2.2228164672851562, "step": 264 }, { "epoch": 0.06, "learning_rate": 1.2709832134292567e-07, "logps/chosen": -210.9600830078125, "logps/rejected": -224.4310760498047, "loss": 0.2091, "losses/dpo": 0.04207427799701691, "losses/sft": 0.4584929347038269, "losses/total": 0.04207427799701691, "ref_logps/chosen": -211.25538635253906, "ref_logps/rejected": -206.0514373779297, "rewards/accuracies": 1.0, "rewards/chosen": 0.029530491679906845, "rewards/margins": 1.8674945831298828, "rewards/rejected": -1.8379640579223633, "step": 265 }, { "epoch": 0.06, "learning_rate": 1.275779376498801e-07, "logps/chosen": -236.6046142578125, "logps/rejected": -265.4197998046875, "loss": 0.1416, "losses/dpo": 0.021323688328266144, "losses/sft": 0.9060676693916321, "losses/total": 0.021323688328266144, "ref_logps/chosen": -236.98574829101562, "ref_logps/rejected": -240.8504180908203, "rewards/accuracies": 1.0, "rewards/chosen": 0.03811391070485115, "rewards/margins": 2.495054244995117, "rewards/rejected": -2.456940174102783, "step": 266 }, { "epoch": 0.06, "learning_rate": 1.2805755395683453e-07, "logps/chosen": -236.77774047851562, "logps/rejected": -281.59283447265625, "loss": 0.1309, "losses/dpo": 0.0035666944459080696, "losses/sft": 0.9428807497024536, "losses/total": 0.0035666944459080696, "ref_logps/chosen": -237.53282165527344, "ref_logps/rejected": -254.30345153808594, "rewards/accuracies": 1.0, "rewards/chosen": 0.07550930231809616, "rewards/margins": 2.8044493198394775, "rewards/rejected": -2.728940010070801, "step": 267 }, { "epoch": 0.06, "learning_rate": 1.2853717026378897e-07, "logps/chosen": -210.0772705078125, "logps/rejected": -221.297119140625, "loss": 0.1997, "losses/dpo": 0.12623567879199982, "losses/sft": 0.6770551800727844, "losses/total": 0.12623567879199982, "ref_logps/chosen": -210.44830322265625, "ref_logps/rejected": -200.73655700683594, "rewards/accuracies": 1.0, "rewards/chosen": 0.03710400313138962, "rewards/margins": 2.0931601524353027, "rewards/rejected": -2.056056261062622, "step": 268 }, { "epoch": 0.06, "learning_rate": 1.290167865707434e-07, "logps/chosen": -240.64752197265625, "logps/rejected": -265.59552001953125, "loss": 0.1339, "losses/dpo": 0.11393772810697556, "losses/sft": 0.635058581829071, "losses/total": 0.11393772810697556, "ref_logps/chosen": -241.08541870117188, "ref_logps/rejected": -239.659912109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.043788425624370575, "rewards/margins": 2.6373519897460938, "rewards/rejected": -2.5935635566711426, "step": 269 }, { "epoch": 0.06, "learning_rate": 1.2949640287769783e-07, "logps/chosen": -200.38165283203125, "logps/rejected": -241.82620239257812, "loss": 0.1508, "losses/dpo": 0.01563538797199726, "losses/sft": 0.5078610181808472, "losses/total": 0.01563538797199726, "ref_logps/chosen": -200.6396484375, "ref_logps/rejected": -220.3717803955078, "rewards/accuracies": 1.0, "rewards/chosen": 0.025800080969929695, "rewards/margins": 2.1712422370910645, "rewards/rejected": -2.145442008972168, "step": 270 }, { "epoch": 0.07, "learning_rate": 1.2997601918465227e-07, "logps/chosen": -215.73098754882812, "logps/rejected": -263.6153564453125, "loss": 0.1424, "losses/dpo": 0.015716837719082832, "losses/sft": 0.6508393287658691, "losses/total": 0.015716837719082832, "ref_logps/chosen": -216.17222595214844, "ref_logps/rejected": -238.60104370117188, "rewards/accuracies": 1.0, "rewards/chosen": 0.04412280395627022, "rewards/margins": 2.545553684234619, "rewards/rejected": -2.5014307498931885, "step": 271 }, { "epoch": 0.07, "learning_rate": 1.304556354916067e-07, "logps/chosen": -256.37127685546875, "logps/rejected": -244.478515625, "loss": 0.157, "losses/dpo": 0.04406672716140747, "losses/sft": 0.43102046847343445, "losses/total": 0.04406672716140747, "ref_logps/chosen": -256.525146484375, "ref_logps/rejected": -221.11814880371094, "rewards/accuracies": 1.0, "rewards/chosen": 0.015389859676361084, "rewards/margins": 2.351428747177124, "rewards/rejected": -2.336038589477539, "step": 272 }, { "epoch": 0.07, "learning_rate": 1.3093525179856115e-07, "logps/chosen": -248.576904296875, "logps/rejected": -264.14923095703125, "loss": 0.1224, "losses/dpo": 0.040836483240127563, "losses/sft": 1.1020139455795288, "losses/total": 0.040836483240127563, "ref_logps/chosen": -248.97601318359375, "ref_logps/rejected": -238.93707275390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.03991180285811424, "rewards/margins": 2.5611298084259033, "rewards/rejected": -2.5212180614471436, "step": 273 }, { "epoch": 0.07, "learning_rate": 1.314148681055156e-07, "logps/chosen": -216.18304443359375, "logps/rejected": -243.70394897460938, "loss": 0.1596, "losses/dpo": 0.039093125611543655, "losses/sft": 1.1892009973526, "losses/total": 0.039093125611543655, "ref_logps/chosen": -216.78091430664062, "ref_logps/rejected": -219.16131591796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.05978814512491226, "rewards/margins": 2.5140535831451416, "rewards/rejected": -2.4542653560638428, "step": 274 }, { "epoch": 0.07, "learning_rate": 1.3189448441247e-07, "logps/chosen": -263.00677490234375, "logps/rejected": -267.2589111328125, "loss": 0.1592, "losses/dpo": 0.12295721471309662, "losses/sft": 1.0623064041137695, "losses/total": 0.12295721471309662, "ref_logps/chosen": -262.7037048339844, "ref_logps/rejected": -244.9216766357422, "rewards/accuracies": 1.0, "rewards/chosen": -0.030308915302157402, "rewards/margins": 2.2034144401550293, "rewards/rejected": -2.2337234020233154, "step": 275 }, { "epoch": 0.07, "learning_rate": 1.3237410071942445e-07, "logps/chosen": -272.1014404296875, "logps/rejected": -277.50341796875, "loss": 0.0978, "losses/dpo": 0.02784036658704281, "losses/sft": 0.5690745115280151, "losses/total": 0.02784036658704281, "ref_logps/chosen": -272.1763000488281, "ref_logps/rejected": -248.1638946533203, "rewards/accuracies": 1.0, "rewards/chosen": 0.007486486807465553, "rewards/margins": 2.9414377212524414, "rewards/rejected": -2.9339513778686523, "step": 276 }, { "epoch": 0.07, "learning_rate": 1.328537170263789e-07, "logps/chosen": -205.32986450195312, "logps/rejected": -237.35488891601562, "loss": 0.1878, "losses/dpo": 0.021135607734322548, "losses/sft": 0.45211470127105713, "losses/total": 0.021135607734322548, "ref_logps/chosen": -206.03704833984375, "ref_logps/rejected": -216.6300811767578, "rewards/accuracies": 1.0, "rewards/chosen": 0.07071655988693237, "rewards/margins": 2.1431970596313477, "rewards/rejected": -2.0724806785583496, "step": 277 }, { "epoch": 0.07, "learning_rate": 1.333333333333333e-07, "logps/chosen": -226.4116973876953, "logps/rejected": -261.06622314453125, "loss": 0.1515, "losses/dpo": 0.04733847826719284, "losses/sft": 0.4563581943511963, "losses/total": 0.04733847826719284, "ref_logps/chosen": -226.7712860107422, "ref_logps/rejected": -234.45480346679688, "rewards/accuracies": 1.0, "rewards/chosen": 0.03595935180783272, "rewards/margins": 2.6971027851104736, "rewards/rejected": -2.6611433029174805, "step": 278 }, { "epoch": 0.07, "learning_rate": 1.3381294964028775e-07, "logps/chosen": -204.69876098632812, "logps/rejected": -247.0092010498047, "loss": 0.1711, "losses/dpo": 0.04067986086010933, "losses/sft": 0.5344972610473633, "losses/total": 0.04067986086010933, "ref_logps/chosen": -204.83201599121094, "ref_logps/rejected": -225.2942352294922, "rewards/accuracies": 1.0, "rewards/chosen": 0.0133244963362813, "rewards/margins": 2.184821605682373, "rewards/rejected": -2.171496868133545, "step": 279 }, { "epoch": 0.07, "learning_rate": 1.3429256594724222e-07, "logps/chosen": -158.52749633789062, "logps/rejected": -238.16873168945312, "loss": 0.1927, "losses/dpo": 0.01768941804766655, "losses/sft": 0.5263872146606445, "losses/total": 0.01768941804766655, "ref_logps/chosen": -158.97442626953125, "ref_logps/rejected": -216.92855834960938, "rewards/accuracies": 1.0, "rewards/chosen": 0.0446930006146431, "rewards/margins": 2.168710470199585, "rewards/rejected": -2.1240177154541016, "step": 280 }, { "epoch": 0.07, "learning_rate": 1.3477218225419663e-07, "logps/chosen": -251.95062255859375, "logps/rejected": -293.4254455566406, "loss": 0.0871, "losses/dpo": 0.0020838722120970488, "losses/sft": 1.0695749521255493, "losses/total": 0.0020838722120970488, "ref_logps/chosen": -252.60911560058594, "ref_logps/rejected": -263.66754150390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.06585009396076202, "rewards/margins": 3.041642427444458, "rewards/rejected": -2.975792407989502, "step": 281 }, { "epoch": 0.07, "learning_rate": 1.3525179856115108e-07, "logps/chosen": -242.6351318359375, "logps/rejected": -274.8751220703125, "loss": 0.1257, "losses/dpo": 0.044082652777433395, "losses/sft": 0.6119174361228943, "losses/total": 0.044082652777433395, "ref_logps/chosen": -242.7373809814453, "ref_logps/rejected": -246.12625122070312, "rewards/accuracies": 1.0, "rewards/chosen": 0.01022423803806305, "rewards/margins": 2.8851118087768555, "rewards/rejected": -2.8748879432678223, "step": 282 }, { "epoch": 0.07, "learning_rate": 1.3573141486810552e-07, "logps/chosen": -251.18133544921875, "logps/rejected": -261.9129638671875, "loss": 0.1256, "losses/dpo": 0.044861309230327606, "losses/sft": 0.5814718008041382, "losses/total": 0.044861309230327606, "ref_logps/chosen": -251.1887664794922, "ref_logps/rejected": -235.0281982421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.0007404275238513947, "rewards/margins": 2.6892197132110596, "rewards/rejected": -2.68847918510437, "step": 283 }, { "epoch": 0.07, "learning_rate": 1.3621103117505993e-07, "logps/chosen": -218.0936737060547, "logps/rejected": -243.3306121826172, "loss": 0.1742, "losses/dpo": 0.027061181142926216, "losses/sft": 0.314945787191391, "losses/total": 0.027061181142926216, "ref_logps/chosen": -218.3463897705078, "ref_logps/rejected": -221.3895721435547, "rewards/accuracies": 1.0, "rewards/chosen": 0.025271091610193253, "rewards/margins": 2.2193756103515625, "rewards/rejected": -2.1941044330596924, "step": 284 }, { "epoch": 0.07, "learning_rate": 1.3669064748201438e-07, "logps/chosen": -228.89463806152344, "logps/rejected": -269.19500732421875, "loss": 0.1375, "losses/dpo": 0.03260510414838791, "losses/sft": 0.5359758138656616, "losses/total": 0.03260510414838791, "ref_logps/chosen": -229.30892944335938, "ref_logps/rejected": -241.77886962890625, "rewards/accuracies": 1.0, "rewards/chosen": 0.04142855852842331, "rewards/margins": 2.7830419540405273, "rewards/rejected": -2.7416133880615234, "step": 285 }, { "epoch": 0.07, "learning_rate": 1.3717026378896882e-07, "logps/chosen": -278.52044677734375, "logps/rejected": -287.7493896484375, "loss": 0.097, "losses/dpo": 0.017156923189759254, "losses/sft": 0.5956503748893738, "losses/total": 0.017156923189759254, "ref_logps/chosen": -278.5535888671875, "ref_logps/rejected": -257.499267578125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0033106934279203415, "rewards/margins": 3.0283234119415283, "rewards/rejected": -3.025012493133545, "step": 286 }, { "epoch": 0.07, "learning_rate": 1.3764988009592326e-07, "logps/chosen": -191.19810485839844, "logps/rejected": -238.40908813476562, "loss": 0.1757, "losses/dpo": 0.026489878073334694, "losses/sft": 0.5135608315467834, "losses/total": 0.026489878073334694, "ref_logps/chosen": -191.46728515625, "ref_logps/rejected": -217.13525390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.02691909670829773, "rewards/margins": 2.1543028354644775, "rewards/rejected": -2.1273837089538574, "step": 287 }, { "epoch": 0.07, "learning_rate": 1.381294964028777e-07, "logps/chosen": -194.62802124023438, "logps/rejected": -205.9516143798828, "loss": 0.1929, "losses/dpo": 0.03171275556087494, "losses/sft": 0.5657708644866943, "losses/total": 0.03171275556087494, "ref_logps/chosen": -195.09524536132812, "ref_logps/rejected": -184.07321166992188, "rewards/accuracies": 1.0, "rewards/chosen": 0.04672352969646454, "rewards/margins": 2.2345621585845947, "rewards/rejected": -2.1878387928009033, "step": 288 }, { "epoch": 0.07, "learning_rate": 1.3860911270983214e-07, "logps/chosen": -203.01007080078125, "logps/rejected": -225.69020080566406, "loss": 0.1637, "losses/dpo": 0.0004985919804312289, "losses/sft": 0.8308814167976379, "losses/total": 0.0004985919804312289, "ref_logps/chosen": -203.471923828125, "ref_logps/rejected": -203.4219512939453, "rewards/accuracies": 1.0, "rewards/chosen": 0.04618567228317261, "rewards/margins": 2.2730112075805664, "rewards/rejected": -2.226825475692749, "step": 289 }, { "epoch": 0.07, "learning_rate": 1.3908872901678656e-07, "logps/chosen": -221.58175659179688, "logps/rejected": -261.43939208984375, "loss": 0.1433, "losses/dpo": 0.009290630929172039, "losses/sft": 1.002595067024231, "losses/total": 0.009290630929172039, "ref_logps/chosen": -221.87350463867188, "ref_logps/rejected": -235.8797149658203, "rewards/accuracies": 1.0, "rewards/chosen": 0.02917395904660225, "rewards/margins": 2.585142135620117, "rewards/rejected": -2.5559680461883545, "step": 290 }, { "epoch": 0.07, "learning_rate": 1.39568345323741e-07, "logps/chosen": -208.9764862060547, "logps/rejected": -222.25711059570312, "loss": 0.1784, "losses/dpo": 0.03189732879400253, "losses/sft": 0.5834529399871826, "losses/total": 0.03189732879400253, "ref_logps/chosen": -209.70761108398438, "ref_logps/rejected": -201.68569946289062, "rewards/accuracies": 1.0, "rewards/chosen": 0.07311302423477173, "rewards/margins": 2.130256175994873, "rewards/rejected": -2.057143211364746, "step": 291 }, { "epoch": 0.07, "learning_rate": 1.4004796163069544e-07, "logps/chosen": -259.2993469238281, "logps/rejected": -270.133056640625, "loss": 0.1141, "losses/dpo": 0.053703173995018005, "losses/sft": 0.697163462638855, "losses/total": 0.053703173995018005, "ref_logps/chosen": -259.67974853515625, "ref_logps/rejected": -240.07684326171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.038045238703489304, "rewards/margins": 3.043668746948242, "rewards/rejected": -3.0056233406066895, "step": 292 }, { "epoch": 0.07, "learning_rate": 1.4052757793764986e-07, "logps/chosen": -242.34170532226562, "logps/rejected": -246.8984832763672, "loss": 0.1682, "losses/dpo": 0.007924062199890614, "losses/sft": 0.6224687099456787, "losses/total": 0.007924062199890614, "ref_logps/chosen": -242.61880493164062, "ref_logps/rejected": -221.93621826171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.027708353474736214, "rewards/margins": 2.5239365100860596, "rewards/rejected": -2.496227979660034, "step": 293 }, { "epoch": 0.07, "learning_rate": 1.410071942446043e-07, "logps/chosen": -190.08639526367188, "logps/rejected": -256.3228454589844, "loss": 0.096, "losses/dpo": 0.009716110303997993, "losses/sft": 0.6354929804801941, "losses/total": 0.009716110303997993, "ref_logps/chosen": -190.45016479492188, "ref_logps/rejected": -225.71737670898438, "rewards/accuracies": 1.0, "rewards/chosen": 0.036376144737005234, "rewards/margins": 3.0969228744506836, "rewards/rejected": -3.060546875, "step": 294 }, { "epoch": 0.07, "learning_rate": 1.4148681055155877e-07, "logps/chosen": -252.14065551757812, "logps/rejected": -245.460205078125, "loss": 0.1446, "losses/dpo": 0.024778060615062714, "losses/sft": 0.9100992679595947, "losses/total": 0.024778060615062714, "ref_logps/chosen": -252.98345947265625, "ref_logps/rejected": -221.0264129638672, "rewards/accuracies": 1.0, "rewards/chosen": 0.08428068459033966, "rewards/margins": 2.5276594161987305, "rewards/rejected": -2.4433789253234863, "step": 295 }, { "epoch": 0.07, "learning_rate": 1.4196642685851318e-07, "logps/chosen": -217.31991577148438, "logps/rejected": -253.92872619628906, "loss": 0.1428, "losses/dpo": 0.0009695091866888106, "losses/sft": 1.0686347484588623, "losses/total": 0.0009695091866888106, "ref_logps/chosen": -217.55250549316406, "ref_logps/rejected": -229.40310668945312, "rewards/accuracies": 1.0, "rewards/chosen": 0.023258589208126068, "rewards/margins": 2.475821018218994, "rewards/rejected": -2.4525623321533203, "step": 296 }, { "epoch": 0.07, "learning_rate": 1.4244604316546762e-07, "logps/chosen": -255.78598022460938, "logps/rejected": -268.24273681640625, "loss": 0.0982, "losses/dpo": 0.02544843778014183, "losses/sft": 0.4649616479873657, "losses/total": 0.02544843778014183, "ref_logps/chosen": -256.517333984375, "ref_logps/rejected": -239.00120544433594, "rewards/accuracies": 1.0, "rewards/chosen": 0.07313409447669983, "rewards/margins": 2.9972872734069824, "rewards/rejected": -2.9241533279418945, "step": 297 }, { "epoch": 0.07, "learning_rate": 1.4292565947242206e-07, "logps/chosen": -270.2460632324219, "logps/rejected": -272.49212646484375, "loss": 0.1128, "losses/dpo": 0.013653604313731194, "losses/sft": 0.5197446942329407, "losses/total": 0.013653604313731194, "ref_logps/chosen": -270.73779296875, "ref_logps/rejected": -244.29678344726562, "rewards/accuracies": 1.0, "rewards/chosen": 0.049172595143318176, "rewards/margins": 2.868706703186035, "rewards/rejected": -2.8195338249206543, "step": 298 }, { "epoch": 0.07, "learning_rate": 1.4340527577937648e-07, "logps/chosen": -198.9384002685547, "logps/rejected": -238.16009521484375, "loss": 0.1536, "losses/dpo": 0.44800227880477905, "losses/sft": 0.5432268977165222, "losses/total": 0.44800227880477905, "ref_logps/chosen": -199.12525939941406, "ref_logps/rejected": -212.3064727783203, "rewards/accuracies": 1.0, "rewards/chosen": 0.018685098737478256, "rewards/margins": 2.6040472984313965, "rewards/rejected": -2.585362434387207, "step": 299 }, { "epoch": 0.07, "learning_rate": 1.4388489208633092e-07, "logps/chosen": -233.61065673828125, "logps/rejected": -231.55960083007812, "loss": 0.1481, "losses/dpo": 0.04792015999555588, "losses/sft": 0.5532643795013428, "losses/total": 0.04792015999555588, "ref_logps/chosen": -233.72503662109375, "ref_logps/rejected": -207.10626220703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.011439280584454536, "rewards/margins": 2.4567716121673584, "rewards/rejected": -2.4453322887420654, "step": 300 }, { "epoch": 0.07, "learning_rate": 1.4436450839328536e-07, "logps/chosen": -227.4835662841797, "logps/rejected": -246.65171813964844, "loss": 0.1259, "losses/dpo": 0.06975121796131134, "losses/sft": 0.59730464220047, "losses/total": 0.06975121796131134, "ref_logps/chosen": -227.62228393554688, "ref_logps/rejected": -219.404541015625, "rewards/accuracies": 1.0, "rewards/chosen": 0.013872338458895683, "rewards/margins": 2.738590717315674, "rewards/rejected": -2.7247185707092285, "step": 301 }, { "epoch": 0.07, "learning_rate": 1.448441247002398e-07, "logps/chosen": -257.1293029785156, "logps/rejected": -278.98883056640625, "loss": 0.1011, "losses/dpo": 0.015625352039933205, "losses/sft": 0.6209622025489807, "losses/total": 0.015625352039933205, "ref_logps/chosen": -257.89312744140625, "ref_logps/rejected": -249.53567504882812, "rewards/accuracies": 1.0, "rewards/chosen": 0.07638482749462128, "rewards/margins": 3.0217013359069824, "rewards/rejected": -2.9453163146972656, "step": 302 }, { "epoch": 0.07, "learning_rate": 1.4532374100719425e-07, "logps/chosen": -231.23316955566406, "logps/rejected": -256.2231750488281, "loss": 0.1389, "losses/dpo": 0.004736121743917465, "losses/sft": 0.49517661333084106, "losses/total": 0.004736121743917465, "ref_logps/chosen": -231.49913024902344, "ref_logps/rejected": -229.55197143554688, "rewards/accuracies": 1.0, "rewards/chosen": 0.026595260947942734, "rewards/margins": 2.6937179565429688, "rewards/rejected": -2.6671226024627686, "step": 303 }, { "epoch": 0.07, "learning_rate": 1.458033573141487e-07, "logps/chosen": -215.603759765625, "logps/rejected": -245.25973510742188, "loss": 0.1311, "losses/dpo": 0.018512053415179253, "losses/sft": 0.5994486212730408, "losses/total": 0.018512053415179253, "ref_logps/chosen": -216.3567352294922, "ref_logps/rejected": -220.990966796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.07529780268669128, "rewards/margins": 2.5021772384643555, "rewards/rejected": -2.426879405975342, "step": 304 }, { "epoch": 0.07, "learning_rate": 1.462829736211031e-07, "logps/chosen": -244.12823486328125, "logps/rejected": -259.1986999511719, "loss": 0.1116, "losses/dpo": 0.06462820619344711, "losses/sft": 1.0238806009292603, "losses/total": 0.06462820619344711, "ref_logps/chosen": -243.93060302734375, "ref_logps/rejected": -230.47213745117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.019760861992836, "rewards/margins": 2.8528952598571777, "rewards/rejected": -2.8726563453674316, "step": 305 }, { "epoch": 0.07, "learning_rate": 1.4676258992805754e-07, "logps/chosen": -196.1495819091797, "logps/rejected": -226.6968536376953, "loss": 0.1856, "losses/dpo": 0.009463776834309101, "losses/sft": 0.5506085753440857, "losses/total": 0.009463776834309101, "ref_logps/chosen": -196.37796020507812, "ref_logps/rejected": -204.86289978027344, "rewards/accuracies": 1.0, "rewards/chosen": 0.022838860750198364, "rewards/margins": 2.206234931945801, "rewards/rejected": -2.1833958625793457, "step": 306 }, { "epoch": 0.07, "learning_rate": 1.4724220623501199e-07, "logps/chosen": -199.2637939453125, "logps/rejected": -265.70880126953125, "loss": 0.1164, "losses/dpo": 0.06725816428661346, "losses/sft": 0.5654178261756897, "losses/total": 0.06725816428661346, "ref_logps/chosen": -199.37686157226562, "ref_logps/rejected": -239.4753875732422, "rewards/accuracies": 1.0, "rewards/chosen": 0.011305613443255424, "rewards/margins": 2.634645938873291, "rewards/rejected": -2.623340129852295, "step": 307 }, { "epoch": 0.07, "learning_rate": 1.477218225419664e-07, "logps/chosen": -208.40049743652344, "logps/rejected": -240.32608032226562, "loss": 0.1285, "losses/dpo": 0.020596696063876152, "losses/sft": 1.125730037689209, "losses/total": 0.020596696063876152, "ref_logps/chosen": -208.93576049804688, "ref_logps/rejected": -215.2783966064453, "rewards/accuracies": 1.0, "rewards/chosen": 0.053525134921073914, "rewards/margins": 2.558293342590332, "rewards/rejected": -2.504768133163452, "step": 308 }, { "epoch": 0.07, "learning_rate": 1.4820143884892084e-07, "logps/chosen": -221.0391845703125, "logps/rejected": -255.3882293701172, "loss": 0.1396, "losses/dpo": 0.09647955745458603, "losses/sft": 0.4571684002876282, "losses/total": 0.09647955745458603, "ref_logps/chosen": -221.3773193359375, "ref_logps/rejected": -231.85488891601562, "rewards/accuracies": 1.0, "rewards/chosen": 0.033814627677202225, "rewards/margins": 2.3871476650238037, "rewards/rejected": -2.353332996368408, "step": 309 }, { "epoch": 0.07, "learning_rate": 1.486810551558753e-07, "logps/chosen": -223.1595001220703, "logps/rejected": -239.6898956298828, "loss": 0.1564, "losses/dpo": 0.007379153277724981, "losses/sft": 0.6112187504768372, "losses/total": 0.007379153277724981, "ref_logps/chosen": -223.44361877441406, "ref_logps/rejected": -213.48501586914062, "rewards/accuracies": 1.0, "rewards/chosen": 0.028412649407982826, "rewards/margins": 2.648902654647827, "rewards/rejected": -2.620490074157715, "step": 310 }, { "epoch": 0.07, "learning_rate": 1.4916067146282973e-07, "logps/chosen": -232.369873046875, "logps/rejected": -269.61968994140625, "loss": 0.1023, "losses/dpo": 0.0022508157417178154, "losses/sft": 0.9644865393638611, "losses/total": 0.0022508157417178154, "ref_logps/chosen": -232.4925537109375, "ref_logps/rejected": -239.45584106445312, "rewards/accuracies": 1.0, "rewards/chosen": 0.012266566045582294, "rewards/margins": 3.028653621673584, "rewards/rejected": -3.0163869857788086, "step": 311 }, { "epoch": 0.07, "learning_rate": 1.4964028776978417e-07, "logps/chosen": -218.12197875976562, "logps/rejected": -245.8292694091797, "loss": 0.1399, "losses/dpo": 0.008040008135139942, "losses/sft": 1.103675127029419, "losses/total": 0.008040008135139942, "ref_logps/chosen": -218.43263244628906, "ref_logps/rejected": -220.03097534179688, "rewards/accuracies": 1.0, "rewards/chosen": 0.031066209077835083, "rewards/margins": 2.6108946800231934, "rewards/rejected": -2.5798282623291016, "step": 312 }, { "epoch": 0.08, "learning_rate": 1.501199040767386e-07, "logps/chosen": -214.12109375, "logps/rejected": -243.7325439453125, "loss": 0.1278, "losses/dpo": 0.008314738050103188, "losses/sft": 0.5763677954673767, "losses/total": 0.008314738050103188, "ref_logps/chosen": -214.476318359375, "ref_logps/rejected": -218.1216583251953, "rewards/accuracies": 1.0, "rewards/chosen": 0.03552220016717911, "rewards/margins": 2.596611976623535, "rewards/rejected": -2.561089515686035, "step": 313 }, { "epoch": 0.08, "learning_rate": 1.5059952038369303e-07, "logps/chosen": -203.20028686523438, "logps/rejected": -233.61151123046875, "loss": 0.132, "losses/dpo": 0.029185764491558075, "losses/sft": 0.625251829624176, "losses/total": 0.029185764491558075, "ref_logps/chosen": -203.7978973388672, "ref_logps/rejected": -209.380126953125, "rewards/accuracies": 1.0, "rewards/chosen": 0.059761129319667816, "rewards/margins": 2.4828996658325195, "rewards/rejected": -2.4231386184692383, "step": 314 }, { "epoch": 0.08, "learning_rate": 1.5107913669064747e-07, "logps/chosen": -229.59408569335938, "logps/rejected": -262.76861572265625, "loss": 0.1235, "losses/dpo": 0.020704859867691994, "losses/sft": 0.5342892408370972, "losses/total": 0.020704859867691994, "ref_logps/chosen": -229.91763305664062, "ref_logps/rejected": -237.05738830566406, "rewards/accuracies": 1.0, "rewards/chosen": 0.0323549248278141, "rewards/margins": 2.603478193283081, "rewards/rejected": -2.5711231231689453, "step": 315 }, { "epoch": 0.08, "learning_rate": 1.515587529976019e-07, "logps/chosen": -243.16639709472656, "logps/rejected": -270.04791259765625, "loss": 0.1063, "losses/dpo": 0.0008832419407553971, "losses/sft": 0.8287760615348816, "losses/total": 0.0008832419407553971, "ref_logps/chosen": -243.4913787841797, "ref_logps/rejected": -241.1365966796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.03249967098236084, "rewards/margins": 2.923633098602295, "rewards/rejected": -2.8911335468292236, "step": 316 }, { "epoch": 0.08, "learning_rate": 1.5203836930455635e-07, "logps/chosen": -225.3759307861328, "logps/rejected": -240.88404846191406, "loss": 0.1404, "losses/dpo": 0.00979212298989296, "losses/sft": 1.2174875736236572, "losses/total": 0.00979212298989296, "ref_logps/chosen": -225.28663635253906, "ref_logps/rejected": -212.9962921142578, "rewards/accuracies": 1.0, "rewards/chosen": -0.00892823189496994, "rewards/margins": 2.779845714569092, "rewards/rejected": -2.788774013519287, "step": 317 }, { "epoch": 0.08, "learning_rate": 1.525179856115108e-07, "logps/chosen": -240.62945556640625, "logps/rejected": -279.48956298828125, "loss": 0.1043, "losses/dpo": 0.012783228419721127, "losses/sft": 0.514988899230957, "losses/total": 0.012783228419721127, "ref_logps/chosen": -240.74705505371094, "ref_logps/rejected": -249.71768188476562, "rewards/accuracies": 1.0, "rewards/chosen": 0.011759845539927483, "rewards/margins": 2.9889473915100098, "rewards/rejected": -2.977187395095825, "step": 318 }, { "epoch": 0.08, "learning_rate": 1.5299760191846523e-07, "logps/chosen": -232.96217346191406, "logps/rejected": -248.25082397460938, "loss": 0.128, "losses/dpo": 0.012664888985455036, "losses/sft": 0.5619103908538818, "losses/total": 0.012664888985455036, "ref_logps/chosen": -232.9538116455078, "ref_logps/rejected": -221.03692626953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.0008365325629711151, "rewards/margins": 2.7205538749694824, "rewards/rejected": -2.7213902473449707, "step": 319 }, { "epoch": 0.08, "learning_rate": 1.5347721822541965e-07, "logps/chosen": -224.74166870117188, "logps/rejected": -256.1207275390625, "loss": 0.1107, "losses/dpo": 0.0011484422720968723, "losses/sft": 0.6358317136764526, "losses/total": 0.0011484422720968723, "ref_logps/chosen": -225.41847229003906, "ref_logps/rejected": -228.8552703857422, "rewards/accuracies": 1.0, "rewards/chosen": 0.06768245995044708, "rewards/margins": 2.794227123260498, "rewards/rejected": -2.7265446186065674, "step": 320 }, { "epoch": 0.08, "learning_rate": 1.539568345323741e-07, "logps/chosen": -212.65646362304688, "logps/rejected": -261.1561584472656, "loss": 0.1177, "losses/dpo": 0.000840063439682126, "losses/sft": 1.1737500429153442, "losses/total": 0.000840063439682126, "ref_logps/chosen": -212.43951416015625, "ref_logps/rejected": -230.56996154785156, "rewards/accuracies": 1.0, "rewards/chosen": -0.021692920476198196, "rewards/margins": 3.036923885345459, "rewards/rejected": -3.0586166381835938, "step": 321 }, { "epoch": 0.08, "learning_rate": 1.5443645083932853e-07, "logps/chosen": -226.43763732910156, "logps/rejected": -257.6418762207031, "loss": 0.1225, "losses/dpo": 0.005988163407891989, "losses/sft": 0.4970652163028717, "losses/total": 0.005988163407891989, "ref_logps/chosen": -226.77230834960938, "ref_logps/rejected": -229.1529083251953, "rewards/accuracies": 1.0, "rewards/chosen": 0.0334680937230587, "rewards/margins": 2.8823633193969727, "rewards/rejected": -2.8488950729370117, "step": 322 }, { "epoch": 0.08, "learning_rate": 1.5491606714628295e-07, "logps/chosen": -240.74087524414062, "logps/rejected": -243.82730102539062, "loss": 0.1324, "losses/dpo": 0.004906828980892897, "losses/sft": 0.6258312463760376, "losses/total": 0.004906828980892897, "ref_logps/chosen": -240.68023681640625, "ref_logps/rejected": -216.9535369873047, "rewards/accuracies": 1.0, "rewards/chosen": -0.006064943037927151, "rewards/margins": 2.6813130378723145, "rewards/rejected": -2.6873779296875, "step": 323 }, { "epoch": 0.08, "learning_rate": 1.5539568345323742e-07, "logps/chosen": -236.825927734375, "logps/rejected": -248.54962158203125, "loss": 0.1255, "losses/dpo": 0.004336100537329912, "losses/sft": 0.5310423970222473, "losses/total": 0.004336100537329912, "ref_logps/chosen": -236.6578826904297, "ref_logps/rejected": -222.24234008789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.0168045274913311, "rewards/margins": 2.613924503326416, "rewards/rejected": -2.6307291984558105, "step": 324 }, { "epoch": 0.08, "learning_rate": 1.5587529976019186e-07, "logps/chosen": -289.63525390625, "logps/rejected": -306.3829040527344, "loss": 0.0772, "losses/dpo": 0.0045837825164198875, "losses/sft": 0.5867018699645996, "losses/total": 0.0045837825164198875, "ref_logps/chosen": -289.798828125, "ref_logps/rejected": -274.8426818847656, "rewards/accuracies": 1.0, "rewards/chosen": 0.016357094049453735, "rewards/margins": 3.170379400253296, "rewards/rejected": -3.154022216796875, "step": 325 }, { "epoch": 0.08, "learning_rate": 1.5635491606714627e-07, "logps/chosen": -256.488525390625, "logps/rejected": -284.0159606933594, "loss": 0.1033, "losses/dpo": 0.006006593815982342, "losses/sft": 0.6380584239959717, "losses/total": 0.006006593815982342, "ref_logps/chosen": -256.95916748046875, "ref_logps/rejected": -252.11839294433594, "rewards/accuracies": 1.0, "rewards/chosen": 0.047066595405340195, "rewards/margins": 3.2368228435516357, "rewards/rejected": -3.189756393432617, "step": 326 }, { "epoch": 0.08, "learning_rate": 1.5683453237410071e-07, "logps/chosen": -229.03140258789062, "logps/rejected": -234.06439208984375, "loss": 0.1459, "losses/dpo": 0.003980903886258602, "losses/sft": 0.4297468960285187, "losses/total": 0.003980903886258602, "ref_logps/chosen": -229.63198852539062, "ref_logps/rejected": -208.4084014892578, "rewards/accuracies": 1.0, "rewards/chosen": 0.060058653354644775, "rewards/margins": 2.625657796859741, "rewards/rejected": -2.565598964691162, "step": 327 }, { "epoch": 0.08, "learning_rate": 1.5731414868105516e-07, "logps/chosen": -197.15769958496094, "logps/rejected": -248.92990112304688, "loss": 0.146, "losses/dpo": 0.03555295988917351, "losses/sft": 0.6332929730415344, "losses/total": 0.03555295988917351, "ref_logps/chosen": -197.5083465576172, "ref_logps/rejected": -221.75860595703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.03506510332226753, "rewards/margins": 2.7521955966949463, "rewards/rejected": -2.717130422592163, "step": 328 }, { "epoch": 0.08, "learning_rate": 1.5779376498800957e-07, "logps/chosen": -257.5069885253906, "logps/rejected": -276.9559631347656, "loss": 0.1001, "losses/dpo": 0.0017154511297121644, "losses/sft": 0.5833638310432434, "losses/total": 0.0017154511297121644, "ref_logps/chosen": -258.1302490234375, "ref_logps/rejected": -247.07366943359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.062326543033123016, "rewards/margins": 3.0505552291870117, "rewards/rejected": -2.9882285594940186, "step": 329 }, { "epoch": 0.08, "learning_rate": 1.5827338129496401e-07, "logps/chosen": -252.36483764648438, "logps/rejected": -273.6446533203125, "loss": 0.0786, "losses/dpo": 0.01898292638361454, "losses/sft": 0.433858186006546, "losses/total": 0.01898292638361454, "ref_logps/chosen": -252.40121459960938, "ref_logps/rejected": -243.9038543701172, "rewards/accuracies": 1.0, "rewards/chosen": 0.0036363229155540466, "rewards/margins": 2.977715015411377, "rewards/rejected": -2.97407865524292, "step": 330 }, { "epoch": 0.08, "learning_rate": 1.5875299760191845e-07, "logps/chosen": -239.1497039794922, "logps/rejected": -263.04937744140625, "loss": 0.1153, "losses/dpo": 0.009471051394939423, "losses/sft": 0.6728125810623169, "losses/total": 0.009471051394939423, "ref_logps/chosen": -239.23638916015625, "ref_logps/rejected": -232.71026611328125, "rewards/accuracies": 1.0, "rewards/chosen": 0.008668770082294941, "rewards/margins": 3.0425803661346436, "rewards/rejected": -3.03391170501709, "step": 331 }, { "epoch": 0.08, "learning_rate": 1.592326139088729e-07, "logps/chosen": -217.60189819335938, "logps/rejected": -236.82052612304688, "loss": 0.158, "losses/dpo": 0.013475533574819565, "losses/sft": 0.5070273280143738, "losses/total": 0.013475533574819565, "ref_logps/chosen": -217.4766387939453, "ref_logps/rejected": -210.2818145751953, "rewards/accuracies": 1.0, "rewards/chosen": -0.012525100260972977, "rewards/margins": 2.6413445472717285, "rewards/rejected": -2.65386962890625, "step": 332 }, { "epoch": 0.08, "learning_rate": 1.5971223021582734e-07, "logps/chosen": -217.39419555664062, "logps/rejected": -234.5841064453125, "loss": 0.1256, "losses/dpo": 0.007673332002013922, "losses/sft": 0.5969727635383606, "losses/total": 0.007673332002013922, "ref_logps/chosen": -217.9666748046875, "ref_logps/rejected": -208.02239990234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.057246431708335876, "rewards/margins": 2.7134172916412354, "rewards/rejected": -2.6561708450317383, "step": 333 }, { "epoch": 0.08, "learning_rate": 1.6019184652278178e-07, "logps/chosen": -237.50115966796875, "logps/rejected": -236.49029541015625, "loss": 0.1346, "losses/dpo": 0.01544650737196207, "losses/sft": 0.3828185796737671, "losses/total": 0.01544650737196207, "ref_logps/chosen": -237.706787109375, "ref_logps/rejected": -210.92578125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0205613411962986, "rewards/margins": 2.5770137310028076, "rewards/rejected": -2.556452512741089, "step": 334 }, { "epoch": 0.08, "learning_rate": 1.606714628297362e-07, "logps/chosen": -233.58901977539062, "logps/rejected": -239.13934326171875, "loss": 0.1125, "losses/dpo": 0.009261805564165115, "losses/sft": 1.2881720066070557, "losses/total": 0.009261805564165115, "ref_logps/chosen": -233.87393188476562, "ref_logps/rejected": -213.433349609375, "rewards/accuracies": 1.0, "rewards/chosen": 0.028490960597991943, "rewards/margins": 2.5990915298461914, "rewards/rejected": -2.5706005096435547, "step": 335 }, { "epoch": 0.08, "learning_rate": 1.6115107913669064e-07, "logps/chosen": -208.4798583984375, "logps/rejected": -244.25123596191406, "loss": 0.1279, "losses/dpo": 0.013498428277671337, "losses/sft": 0.6537903547286987, "losses/total": 0.013498428277671337, "ref_logps/chosen": -209.18350219726562, "ref_logps/rejected": -215.19522094726562, "rewards/accuracies": 1.0, "rewards/chosen": 0.07036397606134415, "rewards/margins": 2.975965976715088, "rewards/rejected": -2.905601978302002, "step": 336 }, { "epoch": 0.08, "learning_rate": 1.6163069544364508e-07, "logps/chosen": -213.82894897460938, "logps/rejected": -250.03274536132812, "loss": 0.1178, "losses/dpo": 0.011411982588469982, "losses/sft": 0.5292767882347107, "losses/total": 0.011411982588469982, "ref_logps/chosen": -214.49961853027344, "ref_logps/rejected": -221.58985900878906, "rewards/accuracies": 1.0, "rewards/chosen": 0.06706620007753372, "rewards/margins": 2.9113552570343018, "rewards/rejected": -2.8442893028259277, "step": 337 }, { "epoch": 0.08, "learning_rate": 1.621103117505995e-07, "logps/chosen": -261.24224853515625, "logps/rejected": -282.61077880859375, "loss": 0.0637, "losses/dpo": 0.006493390537798405, "losses/sft": 0.5213918685913086, "losses/total": 0.006493390537798405, "ref_logps/chosen": -261.642333984375, "ref_logps/rejected": -247.91236877441406, "rewards/accuracies": 1.0, "rewards/chosen": 0.04001128673553467, "rewards/margins": 3.509850025177002, "rewards/rejected": -3.4698386192321777, "step": 338 }, { "epoch": 0.08, "learning_rate": 1.6258992805755396e-07, "logps/chosen": -226.892822265625, "logps/rejected": -251.74923706054688, "loss": 0.1208, "losses/dpo": 0.021280979737639427, "losses/sft": 0.46064382791519165, "losses/total": 0.021280979737639427, "ref_logps/chosen": -226.87489318847656, "ref_logps/rejected": -224.25440979003906, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017921701073646545, "rewards/margins": 2.747690200805664, "rewards/rejected": -2.7494823932647705, "step": 339 }, { "epoch": 0.08, "learning_rate": 1.630695443645084e-07, "logps/chosen": -257.2532043457031, "logps/rejected": -263.73052978515625, "loss": 0.1033, "losses/dpo": 0.03735101595520973, "losses/sft": 0.7343640327453613, "losses/total": 0.03735101595520973, "ref_logps/chosen": -257.08905029296875, "ref_logps/rejected": -232.7646942138672, "rewards/accuracies": 1.0, "rewards/chosen": -0.016413547098636627, "rewards/margins": 3.080172061920166, "rewards/rejected": -3.096585750579834, "step": 340 }, { "epoch": 0.08, "learning_rate": 1.6354916067146282e-07, "logps/chosen": -224.83697509765625, "logps/rejected": -238.72561645507812, "loss": 0.1256, "losses/dpo": 0.005321309436112642, "losses/sft": 1.1323251724243164, "losses/total": 0.005321309436112642, "ref_logps/chosen": -225.30393981933594, "ref_logps/rejected": -211.79803466796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04669633507728577, "rewards/margins": 2.739455223083496, "rewards/rejected": -2.692758798599243, "step": 341 }, { "epoch": 0.08, "learning_rate": 1.6402877697841726e-07, "logps/chosen": -181.6143798828125, "logps/rejected": -235.5192413330078, "loss": 0.1416, "losses/dpo": 0.047273188829422, "losses/sft": 0.6893259882926941, "losses/total": 0.047273188829422, "ref_logps/chosen": -181.72821044921875, "ref_logps/rejected": -209.76747131347656, "rewards/accuracies": 1.0, "rewards/chosen": 0.01138225756585598, "rewards/margins": 2.586559772491455, "rewards/rejected": -2.5751774311065674, "step": 342 }, { "epoch": 0.08, "learning_rate": 1.645083932853717e-07, "logps/chosen": -218.29083251953125, "logps/rejected": -257.6624755859375, "loss": 0.121, "losses/dpo": 0.04330984875559807, "losses/sft": 0.630342423915863, "losses/total": 0.04330984875559807, "ref_logps/chosen": -218.37611389160156, "ref_logps/rejected": -230.1754913330078, "rewards/accuracies": 1.0, "rewards/chosen": 0.008528048172593117, "rewards/margins": 2.7572271823883057, "rewards/rejected": -2.748699188232422, "step": 343 }, { "epoch": 0.08, "learning_rate": 1.6498800959232612e-07, "logps/chosen": -273.145751953125, "logps/rejected": -247.91714477539062, "loss": 0.1008, "losses/dpo": 0.00808755587786436, "losses/sft": 0.5780286192893982, "losses/total": 0.00808755587786436, "ref_logps/chosen": -273.40777587890625, "ref_logps/rejected": -216.81956481933594, "rewards/accuracies": 1.0, "rewards/chosen": 0.02620229870080948, "rewards/margins": 3.135960578918457, "rewards/rejected": -3.109758138656616, "step": 344 }, { "epoch": 0.08, "learning_rate": 1.6546762589928056e-07, "logps/chosen": -219.42787170410156, "logps/rejected": -265.2581787109375, "loss": 0.0936, "losses/dpo": 0.02577284164726734, "losses/sft": 0.6038211584091187, "losses/total": 0.02577284164726734, "ref_logps/chosen": -219.83404541015625, "ref_logps/rejected": -233.04949951171875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04061523824930191, "rewards/margins": 3.261486530303955, "rewards/rejected": -3.2208709716796875, "step": 345 }, { "epoch": 0.08, "learning_rate": 1.65947242206235e-07, "logps/chosen": -180.294189453125, "logps/rejected": -216.74838256835938, "loss": 0.1408, "losses/dpo": 0.0052217524498701096, "losses/sft": 0.6031147837638855, "losses/total": 0.0052217524498701096, "ref_logps/chosen": -180.9489288330078, "ref_logps/rejected": -190.28509521484375, "rewards/accuracies": 1.0, "rewards/chosen": 0.06547581404447556, "rewards/margins": 2.711805820465088, "rewards/rejected": -2.646329879760742, "step": 346 }, { "epoch": 0.08, "learning_rate": 1.6642685851318944e-07, "logps/chosen": -208.96527099609375, "logps/rejected": -275.86944580078125, "loss": 0.0974, "losses/dpo": 0.011677740141749382, "losses/sft": 0.5867857336997986, "losses/total": 0.011677740141749382, "ref_logps/chosen": -209.09678649902344, "ref_logps/rejected": -242.67979431152344, "rewards/accuracies": 1.0, "rewards/chosen": 0.013152007013559341, "rewards/margins": 3.332120180130005, "rewards/rejected": -3.3189680576324463, "step": 347 }, { "epoch": 0.08, "learning_rate": 1.6690647482014388e-07, "logps/chosen": -211.7921142578125, "logps/rejected": -235.49481201171875, "loss": 0.1049, "losses/dpo": 0.11941453814506531, "losses/sft": 0.6203804612159729, "losses/total": 0.11941453814506531, "ref_logps/chosen": -212.1964111328125, "ref_logps/rejected": -207.73191833496094, "rewards/accuracies": 1.0, "rewards/chosen": 0.04043048992753029, "rewards/margins": 2.8167190551757812, "rewards/rejected": -2.7762887477874756, "step": 348 }, { "epoch": 0.08, "learning_rate": 1.6738609112709833e-07, "logps/chosen": -224.2265625, "logps/rejected": -289.0511169433594, "loss": 0.0907, "losses/dpo": 0.0185434240847826, "losses/sft": 0.7923979759216309, "losses/total": 0.0185434240847826, "ref_logps/chosen": -224.49790954589844, "ref_logps/rejected": -254.76904296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.027133066207170486, "rewards/margins": 3.4553380012512207, "rewards/rejected": -3.4282047748565674, "step": 349 }, { "epoch": 0.08, "learning_rate": 1.6786570743405274e-07, "logps/chosen": -229.2664794921875, "logps/rejected": -246.55178833007812, "loss": 0.1212, "losses/dpo": 0.02827809937298298, "losses/sft": 0.673829972743988, "losses/total": 0.02827809937298298, "ref_logps/chosen": -228.68051147460938, "ref_logps/rejected": -217.47998046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.05859590321779251, "rewards/margins": 2.848587989807129, "rewards/rejected": -2.9071836471557617, "step": 350 }, { "epoch": 0.08, "learning_rate": 1.6834532374100718e-07, "logps/chosen": -232.12347412109375, "logps/rejected": -276.9136657714844, "loss": 0.0862, "losses/dpo": 0.0032543588895350695, "losses/sft": 0.43715184926986694, "losses/total": 0.0032543588895350695, "ref_logps/chosen": -231.93678283691406, "ref_logps/rejected": -242.34315490722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.01867123320698738, "rewards/margins": 3.4383788108825684, "rewards/rejected": -3.457050085067749, "step": 351 }, { "epoch": 0.08, "learning_rate": 1.6882494004796162e-07, "logps/chosen": -214.50669860839844, "logps/rejected": -233.6807861328125, "loss": 0.1034, "losses/dpo": 0.029713228344917297, "losses/sft": 0.6764442920684814, "losses/total": 0.029713228344917297, "ref_logps/chosen": -214.69842529296875, "ref_logps/rejected": -205.9283447265625, "rewards/accuracies": 1.0, "rewards/chosen": 0.019171901047229767, "rewards/margins": 2.794414758682251, "rewards/rejected": -2.775242805480957, "step": 352 }, { "epoch": 0.08, "learning_rate": 1.6930455635491604e-07, "logps/chosen": -191.59219360351562, "logps/rejected": -242.74765014648438, "loss": 0.157, "losses/dpo": 0.00920718815177679, "losses/sft": 0.47644373774528503, "losses/total": 0.00920718815177679, "ref_logps/chosen": -191.64846801757812, "ref_logps/rejected": -216.05947875976562, "rewards/accuracies": 1.0, "rewards/chosen": 0.005625693127512932, "rewards/margins": 2.6744441986083984, "rewards/rejected": -2.668818473815918, "step": 353 }, { "epoch": 0.08, "learning_rate": 1.697841726618705e-07, "logps/chosen": -252.8436737060547, "logps/rejected": -270.502197265625, "loss": 0.0705, "losses/dpo": 0.005242886487394571, "losses/sft": 0.5072601437568665, "losses/total": 0.005242886487394571, "ref_logps/chosen": -253.52040100097656, "ref_logps/rejected": -235.37876892089844, "rewards/accuracies": 1.0, "rewards/chosen": 0.06767073273658752, "rewards/margins": 3.5800137519836426, "rewards/rejected": -3.512342929840088, "step": 354 }, { "epoch": 0.09, "learning_rate": 1.7026378896882495e-07, "logps/chosen": -216.5613250732422, "logps/rejected": -229.11245727539062, "loss": 0.0999, "losses/dpo": 0.013111166656017303, "losses/sft": 0.6191701889038086, "losses/total": 0.013111166656017303, "ref_logps/chosen": -217.18206787109375, "ref_logps/rejected": -199.19100952148438, "rewards/accuracies": 1.0, "rewards/chosen": 0.06207408756017685, "rewards/margins": 3.0542194843292236, "rewards/rejected": -2.992145299911499, "step": 355 }, { "epoch": 0.09, "learning_rate": 1.7074340527577937e-07, "logps/chosen": -198.9185028076172, "logps/rejected": -252.61729431152344, "loss": 0.1196, "losses/dpo": 0.0019377222051844, "losses/sft": 0.4973559081554413, "losses/total": 0.0019377222051844, "ref_logps/chosen": -199.39260864257812, "ref_logps/rejected": -223.46331787109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04741166904568672, "rewards/margins": 2.962808847427368, "rewards/rejected": -2.9153971672058105, "step": 356 }, { "epoch": 0.09, "learning_rate": 1.712230215827338e-07, "logps/chosen": -228.28045654296875, "logps/rejected": -250.3546142578125, "loss": 0.0877, "losses/dpo": 0.008617596700787544, "losses/sft": 0.709051251411438, "losses/total": 0.008617596700787544, "ref_logps/chosen": -228.39117431640625, "ref_logps/rejected": -219.56906127929688, "rewards/accuracies": 1.0, "rewards/chosen": 0.011072931811213493, "rewards/margins": 3.089629650115967, "rewards/rejected": -3.078557014465332, "step": 357 }, { "epoch": 0.09, "learning_rate": 1.7170263788968825e-07, "logps/chosen": -233.4622039794922, "logps/rejected": -275.82049560546875, "loss": 0.072, "losses/dpo": 0.003575374837964773, "losses/sft": 0.5009869933128357, "losses/total": 0.003575374837964773, "ref_logps/chosen": -233.38800048828125, "ref_logps/rejected": -237.12942504882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.007421508431434631, "rewards/margins": 3.8616862297058105, "rewards/rejected": -3.869107723236084, "step": 358 }, { "epoch": 0.09, "learning_rate": 1.7218225419664266e-07, "logps/chosen": -200.32211303710938, "logps/rejected": -249.76190185546875, "loss": 0.1107, "losses/dpo": 0.011164328083395958, "losses/sft": 0.5256338119506836, "losses/total": 0.011164328083395958, "ref_logps/chosen": -201.25350952148438, "ref_logps/rejected": -221.46697998046875, "rewards/accuracies": 1.0, "rewards/chosen": 0.09314021468162537, "rewards/margins": 2.922630548477173, "rewards/rejected": -2.8294901847839355, "step": 359 }, { "epoch": 0.09, "learning_rate": 1.726618705035971e-07, "logps/chosen": -291.82305908203125, "logps/rejected": -283.549072265625, "loss": 0.096, "losses/dpo": 0.004097616765648127, "losses/sft": 1.1351255178451538, "losses/total": 0.004097616765648127, "ref_logps/chosen": -291.96197509765625, "ref_logps/rejected": -249.23912048339844, "rewards/accuracies": 1.0, "rewards/chosen": 0.013894367963075638, "rewards/margins": 3.4448904991149902, "rewards/rejected": -3.4309961795806885, "step": 360 }, { "epoch": 0.09, "learning_rate": 1.7314148681055155e-07, "logps/chosen": -247.5580596923828, "logps/rejected": -285.9872131347656, "loss": 0.0745, "losses/dpo": 0.026240212842822075, "losses/sft": 0.24671557545661926, "losses/total": 0.026240212842822075, "ref_logps/chosen": -247.3787384033203, "ref_logps/rejected": -249.15081787109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.017933305352926254, "rewards/margins": 3.6657047271728516, "rewards/rejected": -3.6836376190185547, "step": 361 }, { "epoch": 0.09, "learning_rate": 1.73621103117506e-07, "logps/chosen": -220.98837280273438, "logps/rejected": -272.535888671875, "loss": 0.1284, "losses/dpo": 0.01105938758701086, "losses/sft": 0.6456508636474609, "losses/total": 0.01105938758701086, "ref_logps/chosen": -220.71005249023438, "ref_logps/rejected": -241.79510498046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.027833562344312668, "rewards/margins": 3.0462417602539062, "rewards/rejected": -3.07407546043396, "step": 362 }, { "epoch": 0.09, "learning_rate": 1.7410071942446043e-07, "logps/chosen": -232.9167938232422, "logps/rejected": -257.5305480957031, "loss": 0.0851, "losses/dpo": 0.005528003443032503, "losses/sft": 0.6366551518440247, "losses/total": 0.005528003443032503, "ref_logps/chosen": -233.59548950195312, "ref_logps/rejected": -225.80335998535156, "rewards/accuracies": 1.0, "rewards/chosen": 0.06786978244781494, "rewards/margins": 3.2405881881713867, "rewards/rejected": -3.1727185249328613, "step": 363 }, { "epoch": 0.09, "learning_rate": 1.7458033573141487e-07, "logps/chosen": -223.50347900390625, "logps/rejected": -246.57354736328125, "loss": 0.142, "losses/dpo": 0.00017391491564922035, "losses/sft": 0.5368149280548096, "losses/total": 0.00017391491564922035, "ref_logps/chosen": -223.18386840820312, "ref_logps/rejected": -217.053955078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.03196001052856445, "rewards/margins": 2.9199986457824707, "rewards/rejected": -2.951958656311035, "step": 364 }, { "epoch": 0.09, "learning_rate": 1.750599520383693e-07, "logps/chosen": -237.5111083984375, "logps/rejected": -263.10064697265625, "loss": 0.089, "losses/dpo": 0.006309497635811567, "losses/sft": 0.8299179673194885, "losses/total": 0.006309497635811567, "ref_logps/chosen": -237.7407684326172, "ref_logps/rejected": -230.08294677734375, "rewards/accuracies": 1.0, "rewards/chosen": 0.022965308278799057, "rewards/margins": 3.3247365951538086, "rewards/rejected": -3.3017711639404297, "step": 365 }, { "epoch": 0.09, "learning_rate": 1.7553956834532373e-07, "logps/chosen": -228.78953552246094, "logps/rejected": -257.3482971191406, "loss": 0.1047, "losses/dpo": 0.011349167674779892, "losses/sft": 0.571344256401062, "losses/total": 0.011349167674779892, "ref_logps/chosen": -229.13299560546875, "ref_logps/rejected": -226.96522521972656, "rewards/accuracies": 0.96875, "rewards/chosen": 0.03434696048498154, "rewards/margins": 3.0726547241210938, "rewards/rejected": -3.0383076667785645, "step": 366 }, { "epoch": 0.09, "learning_rate": 1.7601918465227817e-07, "logps/chosen": -219.94627380371094, "logps/rejected": -241.75888061523438, "loss": 0.0905, "losses/dpo": 0.006830682046711445, "losses/sft": 0.6241193413734436, "losses/total": 0.006830682046711445, "ref_logps/chosen": -219.92193603515625, "ref_logps/rejected": -210.32122802734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.0024332422763109207, "rewards/margins": 3.141331911087036, "rewards/rejected": -3.1437649726867676, "step": 367 }, { "epoch": 0.09, "learning_rate": 1.7649880095923259e-07, "logps/chosen": -248.18411254882812, "logps/rejected": -273.01934814453125, "loss": 0.0598, "losses/dpo": 0.01602974347770214, "losses/sft": 0.48619163036346436, "losses/total": 0.01602974347770214, "ref_logps/chosen": -248.36520385742188, "ref_logps/rejected": -235.47142028808594, "rewards/accuracies": 1.0, "rewards/chosen": 0.018109574913978577, "rewards/margins": 3.772901773452759, "rewards/rejected": -3.754791736602783, "step": 368 }, { "epoch": 0.09, "learning_rate": 1.7697841726618705e-07, "logps/chosen": -193.38824462890625, "logps/rejected": -229.52740478515625, "loss": 0.1012, "losses/dpo": 0.061648037284612656, "losses/sft": 0.5691829323768616, "losses/total": 0.061648037284612656, "ref_logps/chosen": -193.6048583984375, "ref_logps/rejected": -200.86056518554688, "rewards/accuracies": 1.0, "rewards/chosen": 0.02166163921356201, "rewards/margins": 2.8883447647094727, "rewards/rejected": -2.8666832447052, "step": 369 }, { "epoch": 0.09, "learning_rate": 1.774580335731415e-07, "logps/chosen": -217.8745574951172, "logps/rejected": -271.250732421875, "loss": 0.0825, "losses/dpo": 0.008489330299198627, "losses/sft": 1.0005085468292236, "losses/total": 0.008489330299198627, "ref_logps/chosen": -218.66697692871094, "ref_logps/rejected": -237.5027313232422, "rewards/accuracies": 1.0, "rewards/chosen": 0.0792430192232132, "rewards/margins": 3.454045295715332, "rewards/rejected": -3.3748021125793457, "step": 370 }, { "epoch": 0.09, "learning_rate": 1.779376498800959e-07, "logps/chosen": -277.2566223144531, "logps/rejected": -257.6632995605469, "loss": 0.0892, "losses/dpo": 0.3942951261997223, "losses/sft": 0.6169834136962891, "losses/total": 0.3942951261997223, "ref_logps/chosen": -277.58917236328125, "ref_logps/rejected": -223.05734252929688, "rewards/accuracies": 1.0, "rewards/chosen": 0.03325492516160011, "rewards/margins": 3.4938488006591797, "rewards/rejected": -3.4605941772460938, "step": 371 }, { "epoch": 0.09, "learning_rate": 1.7841726618705035e-07, "logps/chosen": -229.4576416015625, "logps/rejected": -265.296630859375, "loss": 0.1015, "losses/dpo": 0.00984125304967165, "losses/sft": 0.659243106842041, "losses/total": 0.00984125304967165, "ref_logps/chosen": -230.03701782226562, "ref_logps/rejected": -233.44447326660156, "rewards/accuracies": 1.0, "rewards/chosen": 0.057936396449804306, "rewards/margins": 3.2431535720825195, "rewards/rejected": -3.1852169036865234, "step": 372 }, { "epoch": 0.09, "learning_rate": 1.788968824940048e-07, "logps/chosen": -237.95484924316406, "logps/rejected": -271.6171875, "loss": 0.0919, "losses/dpo": 0.015163058415055275, "losses/sft": 0.6163766980171204, "losses/total": 0.015163058415055275, "ref_logps/chosen": -238.18605041503906, "ref_logps/rejected": -238.84259033203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.023120570927858353, "rewards/margins": 3.3005800247192383, "rewards/rejected": -3.2774596214294434, "step": 373 }, { "epoch": 0.09, "learning_rate": 1.793764988009592e-07, "logps/chosen": -217.43801879882812, "logps/rejected": -277.1588134765625, "loss": 0.0699, "losses/dpo": 0.054293349385261536, "losses/sft": 0.5851227045059204, "losses/total": 0.054293349385261536, "ref_logps/chosen": -217.71148681640625, "ref_logps/rejected": -240.57064819335938, "rewards/accuracies": 1.0, "rewards/chosen": 0.027346286922693253, "rewards/margins": 3.6861624717712402, "rewards/rejected": -3.65881609916687, "step": 374 }, { "epoch": 0.09, "learning_rate": 1.7985611510791365e-07, "logps/chosen": -234.130126953125, "logps/rejected": -265.5294189453125, "loss": 0.0647, "losses/dpo": 0.018438704311847687, "losses/sft": 0.602670431137085, "losses/total": 0.018438704311847687, "ref_logps/chosen": -234.38414001464844, "ref_logps/rejected": -227.68414306640625, "rewards/accuracies": 1.0, "rewards/chosen": 0.025401173159480095, "rewards/margins": 3.8099284172058105, "rewards/rejected": -3.78452730178833, "step": 375 }, { "epoch": 0.09, "learning_rate": 1.803357314148681e-07, "logps/chosen": -187.37432861328125, "logps/rejected": -254.8491668701172, "loss": 0.0836, "losses/dpo": 0.00666979281231761, "losses/sft": 0.6496092677116394, "losses/total": 0.00666979281231761, "ref_logps/chosen": -188.46670532226562, "ref_logps/rejected": -220.79298400878906, "rewards/accuracies": 1.0, "rewards/chosen": 0.10923853516578674, "rewards/margins": 3.5148558616638184, "rewards/rejected": -3.4056172370910645, "step": 376 }, { "epoch": 0.09, "learning_rate": 1.8081534772182253e-07, "logps/chosen": -235.498046875, "logps/rejected": -226.47300720214844, "loss": 0.1351, "losses/dpo": 0.00956125557422638, "losses/sft": 0.5669731497764587, "losses/total": 0.00956125557422638, "ref_logps/chosen": -235.99639892578125, "ref_logps/rejected": -197.2596893310547, "rewards/accuracies": 1.0, "rewards/chosen": 0.0498347282409668, "rewards/margins": 2.97116756439209, "rewards/rejected": -2.921332836151123, "step": 377 }, { "epoch": 0.09, "learning_rate": 1.8129496402877698e-07, "logps/chosen": -230.9638214111328, "logps/rejected": -252.48431396484375, "loss": 0.1046, "losses/dpo": 0.009275064803659916, "losses/sft": 0.6117644309997559, "losses/total": 0.009275064803659916, "ref_logps/chosen": -231.34292602539062, "ref_logps/rejected": -220.37869262695312, "rewards/accuracies": 1.0, "rewards/chosen": 0.037913255393505096, "rewards/margins": 3.2484750747680664, "rewards/rejected": -3.210561990737915, "step": 378 }, { "epoch": 0.09, "learning_rate": 1.8177458033573142e-07, "logps/chosen": -245.9505615234375, "logps/rejected": -247.2647247314453, "loss": 0.0868, "losses/dpo": 0.020411325618624687, "losses/sft": 0.5730230808258057, "losses/total": 0.020411325618624687, "ref_logps/chosen": -245.99871826171875, "ref_logps/rejected": -216.44879150390625, "rewards/accuracies": 1.0, "rewards/chosen": 0.004814818501472473, "rewards/margins": 3.0864057540893555, "rewards/rejected": -3.0815906524658203, "step": 379 }, { "epoch": 0.09, "learning_rate": 1.8225419664268583e-07, "logps/chosen": -162.7472686767578, "logps/rejected": -239.35897827148438, "loss": 0.1034, "losses/dpo": 0.006610231939703226, "losses/sft": 0.4069230556488037, "losses/total": 0.006610231939703226, "ref_logps/chosen": -164.1186981201172, "ref_logps/rejected": -207.73602294921875, "rewards/accuracies": 1.0, "rewards/chosen": 0.13714246451854706, "rewards/margins": 3.2994399070739746, "rewards/rejected": -3.162297248840332, "step": 380 }, { "epoch": 0.09, "learning_rate": 1.8273381294964028e-07, "logps/chosen": -206.97607421875, "logps/rejected": -258.1732177734375, "loss": 0.0696, "losses/dpo": 0.0028462212067097425, "losses/sft": 0.5959049463272095, "losses/total": 0.0028462212067097425, "ref_logps/chosen": -207.4652862548828, "ref_logps/rejected": -222.19100952148438, "rewards/accuracies": 1.0, "rewards/chosen": 0.04892311617732048, "rewards/margins": 3.6471402645111084, "rewards/rejected": -3.598217010498047, "step": 381 }, { "epoch": 0.09, "learning_rate": 1.8321342925659472e-07, "logps/chosen": -207.69935607910156, "logps/rejected": -230.4652099609375, "loss": 0.1099, "losses/dpo": 0.007126571610569954, "losses/sft": 0.6264685392379761, "losses/total": 0.007126571610569954, "ref_logps/chosen": -208.16107177734375, "ref_logps/rejected": -200.36248779296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04617094248533249, "rewards/margins": 3.056443929672241, "rewards/rejected": -3.010272979736328, "step": 382 }, { "epoch": 0.09, "learning_rate": 1.8369304556354913e-07, "logps/chosen": -231.8908233642578, "logps/rejected": -321.87518310546875, "loss": 0.0304, "losses/dpo": 0.02195567637681961, "losses/sft": 0.5999515652656555, "losses/total": 0.02195567637681961, "ref_logps/chosen": -231.86630249023438, "ref_logps/rejected": -277.62744140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.002451132982969284, "rewards/margins": 4.422320365905762, "rewards/rejected": -4.424771308898926, "step": 383 }, { "epoch": 0.09, "learning_rate": 1.841726618705036e-07, "logps/chosen": -195.158447265625, "logps/rejected": -246.74136352539062, "loss": 0.0796, "losses/dpo": 0.04420507699251175, "losses/sft": 0.516146719455719, "losses/total": 0.04420507699251175, "ref_logps/chosen": -196.11553955078125, "ref_logps/rejected": -215.2347869873047, "rewards/accuracies": 1.0, "rewards/chosen": 0.09570998698472977, "rewards/margins": 3.246368885040283, "rewards/rejected": -3.1506588459014893, "step": 384 }, { "epoch": 0.09, "learning_rate": 1.8465227817745804e-07, "logps/chosen": -205.1256103515625, "logps/rejected": -233.43154907226562, "loss": 0.0849, "losses/dpo": 0.013254445046186447, "losses/sft": 0.7280950546264648, "losses/total": 0.013254445046186447, "ref_logps/chosen": -205.13059997558594, "ref_logps/rejected": -203.46034240722656, "rewards/accuracies": 1.0, "rewards/chosen": 0.0005009463056921959, "rewards/margins": 2.997622013092041, "rewards/rejected": -2.9971210956573486, "step": 385 }, { "epoch": 0.09, "learning_rate": 1.8513189448441246e-07, "logps/chosen": -241.04440307617188, "logps/rejected": -280.53045654296875, "loss": 0.0633, "losses/dpo": 0.005946926772594452, "losses/sft": 0.7041234970092773, "losses/total": 0.005946926772594452, "ref_logps/chosen": -241.53778076171875, "ref_logps/rejected": -243.26223754882812, "rewards/accuracies": 1.0, "rewards/chosen": 0.04933660477399826, "rewards/margins": 3.7761588096618652, "rewards/rejected": -3.7268221378326416, "step": 386 }, { "epoch": 0.09, "learning_rate": 1.856115107913669e-07, "logps/chosen": -210.17239379882812, "logps/rejected": -250.02154541015625, "loss": 0.0932, "losses/dpo": 0.010393019765615463, "losses/sft": 0.8341076970100403, "losses/total": 0.010393019765615463, "ref_logps/chosen": -210.45855712890625, "ref_logps/rejected": -215.4253387451172, "rewards/accuracies": 1.0, "rewards/chosen": 0.02861649915575981, "rewards/margins": 3.488236427307129, "rewards/rejected": -3.4596199989318848, "step": 387 }, { "epoch": 0.09, "learning_rate": 1.8609112709832134e-07, "logps/chosen": -221.2245330810547, "logps/rejected": -252.39083862304688, "loss": 0.0868, "losses/dpo": 0.008232565596699715, "losses/sft": 0.6519103050231934, "losses/total": 0.008232565596699715, "ref_logps/chosen": -220.97784423828125, "ref_logps/rejected": -215.94235229492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.024667467921972275, "rewards/margins": 3.620180606842041, "rewards/rejected": -3.644848108291626, "step": 388 }, { "epoch": 0.09, "learning_rate": 1.8657074340527576e-07, "logps/chosen": -200.50082397460938, "logps/rejected": -266.19036865234375, "loss": 0.0682, "losses/dpo": 0.0007166899158619344, "losses/sft": 0.43350690603256226, "losses/total": 0.0007166899158619344, "ref_logps/chosen": -201.31642150878906, "ref_logps/rejected": -225.78475952148438, "rewards/accuracies": 1.0, "rewards/chosen": 0.0815603956580162, "rewards/margins": 4.122121334075928, "rewards/rejected": -4.040560722351074, "step": 389 }, { "epoch": 0.09, "learning_rate": 1.870503597122302e-07, "logps/chosen": -205.5830078125, "logps/rejected": -264.2600402832031, "loss": 0.083, "losses/dpo": 0.006757864262908697, "losses/sft": 0.5730425119400024, "losses/total": 0.006757864262908697, "ref_logps/chosen": -205.62557983398438, "ref_logps/rejected": -228.5718994140625, "rewards/accuracies": 1.0, "rewards/chosen": 0.004257636144757271, "rewards/margins": 3.5730738639831543, "rewards/rejected": -3.5688159465789795, "step": 390 }, { "epoch": 0.09, "learning_rate": 1.8752997601918467e-07, "logps/chosen": -231.0078125, "logps/rejected": -250.0488739013672, "loss": 0.0896, "losses/dpo": 0.01735752634704113, "losses/sft": 0.5472691059112549, "losses/total": 0.01735752634704113, "ref_logps/chosen": -230.7842254638672, "ref_logps/rejected": -212.7904510498047, "rewards/accuracies": 1.0, "rewards/chosen": -0.022358421236276627, "rewards/margins": 3.7034835815429688, "rewards/rejected": -3.725841999053955, "step": 391 }, { "epoch": 0.09, "learning_rate": 1.8800959232613908e-07, "logps/chosen": -189.73092651367188, "logps/rejected": -235.35845947265625, "loss": 0.0885, "losses/dpo": 0.004920247010886669, "losses/sft": 0.5317693948745728, "losses/total": 0.004920247010886669, "ref_logps/chosen": -189.90167236328125, "ref_logps/rejected": -204.15087890625, "rewards/accuracies": 1.0, "rewards/chosen": 0.01707419753074646, "rewards/margins": 3.137830972671509, "rewards/rejected": -3.1207566261291504, "step": 392 }, { "epoch": 0.09, "learning_rate": 1.8848920863309352e-07, "logps/chosen": -218.92433166503906, "logps/rejected": -272.8694152832031, "loss": 0.0728, "losses/dpo": 0.0005089619080536067, "losses/sft": 0.9753532409667969, "losses/total": 0.0005089619080536067, "ref_logps/chosen": -218.4021453857422, "ref_logps/rejected": -232.15328979492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.052216675132513046, "rewards/margins": 4.019397735595703, "rewards/rejected": -4.0716142654418945, "step": 393 }, { "epoch": 0.09, "learning_rate": 1.8896882494004796e-07, "logps/chosen": -238.1555938720703, "logps/rejected": -269.78656005859375, "loss": 0.0543, "losses/dpo": 0.012728152796626091, "losses/sft": 0.5496284365653992, "losses/total": 0.012728152796626091, "ref_logps/chosen": -237.9537811279297, "ref_logps/rejected": -226.89306640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.020179590210318565, "rewards/margins": 4.269171714782715, "rewards/rejected": -4.289351463317871, "step": 394 }, { "epoch": 0.09, "learning_rate": 1.8944844124700238e-07, "logps/chosen": -228.8469696044922, "logps/rejected": -268.09490966796875, "loss": 0.0812, "losses/dpo": 0.0019141945522278547, "losses/sft": 0.5006589293479919, "losses/total": 0.0019141945522278547, "ref_logps/chosen": -230.03451538085938, "ref_logps/rejected": -229.60153198242188, "rewards/accuracies": 1.0, "rewards/chosen": 0.11875525116920471, "rewards/margins": 3.968090295791626, "rewards/rejected": -3.849334955215454, "step": 395 }, { "epoch": 0.1, "learning_rate": 1.8992805755395682e-07, "logps/chosen": -195.7974853515625, "logps/rejected": -231.44091796875, "loss": 0.0901, "losses/dpo": 4.8780948418425396e-05, "losses/sft": 0.35733747482299805, "losses/total": 4.8780948418425396e-05, "ref_logps/chosen": -196.25958251953125, "ref_logps/rejected": -194.66387939453125, "rewards/accuracies": 1.0, "rewards/chosen": 0.046208981424570084, "rewards/margins": 3.7239131927490234, "rewards/rejected": -3.677704095840454, "step": 396 }, { "epoch": 0.1, "learning_rate": 1.9040767386091126e-07, "logps/chosen": -220.3060760498047, "logps/rejected": -254.25604248046875, "loss": 0.0653, "losses/dpo": 0.003860223339870572, "losses/sft": 0.6512227058410645, "losses/total": 0.003860223339870572, "ref_logps/chosen": -221.24212646484375, "ref_logps/rejected": -217.22996520996094, "rewards/accuracies": 1.0, "rewards/chosen": 0.09360373020172119, "rewards/margins": 3.796210527420044, "rewards/rejected": -3.702606439590454, "step": 397 }, { "epoch": 0.1, "learning_rate": 1.9088729016786568e-07, "logps/chosen": -193.33535766601562, "logps/rejected": -248.45053100585938, "loss": 0.0805, "losses/dpo": 0.005637552589178085, "losses/sft": 0.4435352087020874, "losses/total": 0.005637552589178085, "ref_logps/chosen": -193.52389526367188, "ref_logps/rejected": -213.92945861816406, "rewards/accuracies": 1.0, "rewards/chosen": 0.01885376125574112, "rewards/margins": 3.4709603786468506, "rewards/rejected": -3.452106475830078, "step": 398 }, { "epoch": 0.1, "learning_rate": 1.9136690647482015e-07, "logps/chosen": -212.4105987548828, "logps/rejected": -256.1532897949219, "loss": 0.0763, "losses/dpo": 0.001887063728645444, "losses/sft": 0.5911840200424194, "losses/total": 0.001887063728645444, "ref_logps/chosen": -212.69937133789062, "ref_logps/rejected": -221.60147094726562, "rewards/accuracies": 1.0, "rewards/chosen": 0.0288764126598835, "rewards/margins": 3.4840564727783203, "rewards/rejected": -3.4551801681518555, "step": 399 }, { "epoch": 0.1, "learning_rate": 1.918465227817746e-07, "logps/chosen": -192.9623260498047, "logps/rejected": -270.04547119140625, "loss": 0.0697, "losses/dpo": 0.000809791381470859, "losses/sft": 1.0878125429153442, "losses/total": 0.000809791381470859, "ref_logps/chosen": -193.41354370117188, "ref_logps/rejected": -234.93304443359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.04512327164411545, "rewards/margins": 3.556366443634033, "rewards/rejected": -3.5112431049346924, "step": 400 }, { "epoch": 0.1, "learning_rate": 1.92326139088729e-07, "logps/chosen": -219.4453125, "logps/rejected": -244.33065795898438, "loss": 0.0972, "losses/dpo": 0.5304702520370483, "losses/sft": 0.7700635194778442, "losses/total": 0.5304702520370483, "ref_logps/chosen": -218.0960693359375, "ref_logps/rejected": -210.88400268554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.13492245972156525, "rewards/margins": 3.209743022918701, "rewards/rejected": -3.34466552734375, "step": 401 }, { "epoch": 0.1, "learning_rate": 1.9280575539568344e-07, "logps/chosen": -210.28970336914062, "logps/rejected": -261.991455078125, "loss": 0.067, "losses/dpo": 0.0199581291526556, "losses/sft": 0.40830665826797485, "losses/total": 0.0199581291526556, "ref_logps/chosen": -210.73471069335938, "ref_logps/rejected": -223.81561279296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04449869692325592, "rewards/margins": 3.862086296081543, "rewards/rejected": -3.817587375640869, "step": 402 }, { "epoch": 0.1, "learning_rate": 1.9328537170263789e-07, "logps/chosen": -225.24281311035156, "logps/rejected": -270.65191650390625, "loss": 0.0658, "losses/dpo": 0.009339611977338791, "losses/sft": 0.5946564078330994, "losses/total": 0.009339611977338791, "ref_logps/chosen": -225.65615844726562, "ref_logps/rejected": -233.29217529296875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04133213683962822, "rewards/margins": 3.7773048877716064, "rewards/rejected": -3.7359726428985596, "step": 403 }, { "epoch": 0.1, "learning_rate": 1.937649880095923e-07, "logps/chosen": -211.42080688476562, "logps/rejected": -240.8201446533203, "loss": 0.0772, "losses/dpo": 0.00445047440007329, "losses/sft": 0.641703188419342, "losses/total": 0.00445047440007329, "ref_logps/chosen": -211.32196044921875, "ref_logps/rejected": -203.05697631835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.009884197264909744, "rewards/margins": 3.7664332389831543, "rewards/rejected": -3.776317596435547, "step": 404 }, { "epoch": 0.1, "learning_rate": 1.9424460431654674e-07, "logps/chosen": -212.4838409423828, "logps/rejected": -249.02716064453125, "loss": 0.0817, "losses/dpo": 0.006367148831486702, "losses/sft": 0.4832040071487427, "losses/total": 0.006367148831486702, "ref_logps/chosen": -212.94564819335938, "ref_logps/rejected": -213.692138671875, "rewards/accuracies": 1.0, "rewards/chosen": 0.04618174955248833, "rewards/margins": 3.5796847343444824, "rewards/rejected": -3.5335030555725098, "step": 405 }, { "epoch": 0.1, "learning_rate": 1.947242206235012e-07, "logps/chosen": -189.9763946533203, "logps/rejected": -214.00343322753906, "loss": 0.1025, "losses/dpo": 0.00020495560602284968, "losses/sft": 0.6548482775688171, "losses/total": 0.00020495560602284968, "ref_logps/chosen": -189.97805786132812, "ref_logps/rejected": -181.23094177246094, "rewards/accuracies": 1.0, "rewards/chosen": 0.00016667693853378296, "rewards/margins": 3.2774150371551514, "rewards/rejected": -3.2772481441497803, "step": 406 }, { "epoch": 0.1, "learning_rate": 1.9520383693045563e-07, "logps/chosen": -225.08729553222656, "logps/rejected": -277.8194885253906, "loss": 0.048, "losses/dpo": 0.0027749796863645315, "losses/sft": 0.38060373067855835, "losses/total": 0.0027749796863645315, "ref_logps/chosen": -225.60340881347656, "ref_logps/rejected": -236.9735565185547, "rewards/accuracies": 1.0, "rewards/chosen": 0.05161021649837494, "rewards/margins": 4.136204242706299, "rewards/rejected": -4.084593772888184, "step": 407 }, { "epoch": 0.1, "learning_rate": 1.9568345323741007e-07, "logps/chosen": -229.380859375, "logps/rejected": -251.2332763671875, "loss": 0.0865, "losses/dpo": 0.00953012052923441, "losses/sft": 0.5585977435112, "losses/total": 0.00953012052923441, "ref_logps/chosen": -229.523193359375, "ref_logps/rejected": -216.33494567871094, "rewards/accuracies": 1.0, "rewards/chosen": 0.014233124442398548, "rewards/margins": 3.5040647983551025, "rewards/rejected": -3.4898316860198975, "step": 408 }, { "epoch": 0.1, "learning_rate": 1.961630695443645e-07, "logps/chosen": -239.74375915527344, "logps/rejected": -260.2661437988281, "loss": 0.0598, "losses/dpo": 0.002115837065503001, "losses/sft": 0.5585288405418396, "losses/total": 0.002115837065503001, "ref_logps/chosen": -240.08035278320312, "ref_logps/rejected": -217.80931091308594, "rewards/accuracies": 1.0, "rewards/chosen": 0.03365885466337204, "rewards/margins": 4.2793426513671875, "rewards/rejected": -4.245683670043945, "step": 409 }, { "epoch": 0.1, "learning_rate": 1.9664268585131893e-07, "logps/chosen": -212.1595458984375, "logps/rejected": -257.0945129394531, "loss": 0.0806, "losses/dpo": 0.0053507862612605095, "losses/sft": 0.6979221701622009, "losses/total": 0.0053507862612605095, "ref_logps/chosen": -213.1703338623047, "ref_logps/rejected": -222.07034301757812, "rewards/accuracies": 1.0, "rewards/chosen": 0.10107828676700592, "rewards/margins": 3.6034960746765137, "rewards/rejected": -3.502418041229248, "step": 410 }, { "epoch": 0.1, "learning_rate": 1.9712230215827337e-07, "logps/chosen": -211.47576904296875, "logps/rejected": -278.1316223144531, "loss": 0.0402, "losses/dpo": 0.004020490683615208, "losses/sft": 0.45225659012794495, "losses/total": 0.004020490683615208, "ref_logps/chosen": -211.70944213867188, "ref_logps/rejected": -234.29067993164062, "rewards/accuracies": 1.0, "rewards/chosen": 0.023369234055280685, "rewards/margins": 4.407461643218994, "rewards/rejected": -4.384092330932617, "step": 411 }, { "epoch": 0.1, "learning_rate": 1.976019184652278e-07, "logps/chosen": -231.15081787109375, "logps/rejected": -278.1329650878906, "loss": 0.0741, "losses/dpo": 0.0003084212075918913, "losses/sft": 0.5435898303985596, "losses/total": 0.0003084212075918913, "ref_logps/chosen": -231.371337890625, "ref_logps/rejected": -236.33377075195312, "rewards/accuracies": 0.96875, "rewards/chosen": 0.022053740918636322, "rewards/margins": 4.201970100402832, "rewards/rejected": -4.1799163818359375, "step": 412 }, { "epoch": 0.1, "learning_rate": 1.9808153477218222e-07, "logps/chosen": -218.51348876953125, "logps/rejected": -236.87496948242188, "loss": 0.0673, "losses/dpo": 0.001256247516721487, "losses/sft": 0.6001503467559814, "losses/total": 0.001256247516721487, "ref_logps/chosen": -218.98602294921875, "ref_logps/rejected": -200.0592498779297, "rewards/accuracies": 1.0, "rewards/chosen": 0.04725222289562225, "rewards/margins": 3.728823661804199, "rewards/rejected": -3.6815717220306396, "step": 413 }, { "epoch": 0.1, "learning_rate": 1.985611510791367e-07, "logps/chosen": -279.1685485839844, "logps/rejected": -298.15338134765625, "loss": 0.0656, "losses/dpo": 0.0016459088074043393, "losses/sft": 0.6497185230255127, "losses/total": 0.0016459088074043393, "ref_logps/chosen": -278.14996337890625, "ref_logps/rejected": -254.3009796142578, "rewards/accuracies": 1.0, "rewards/chosen": -0.1018579751253128, "rewards/margins": 4.283383846282959, "rewards/rejected": -4.385241985321045, "step": 414 }, { "epoch": 0.1, "learning_rate": 1.9904076738609113e-07, "logps/chosen": -219.2233123779297, "logps/rejected": -266.6907958984375, "loss": 0.051, "losses/dpo": 0.0024445585440844297, "losses/sft": 0.5307902097702026, "losses/total": 0.0024445585440844297, "ref_logps/chosen": -220.29006958007812, "ref_logps/rejected": -226.164794921875, "rewards/accuracies": 1.0, "rewards/chosen": 0.10667642951011658, "rewards/margins": 4.159275531768799, "rewards/rejected": -4.0525994300842285, "step": 415 }, { "epoch": 0.1, "learning_rate": 1.9952038369304555e-07, "logps/chosen": -259.1754150390625, "logps/rejected": -270.16278076171875, "loss": 0.0734, "losses/dpo": 0.018178163096308708, "losses/sft": 0.46781742572784424, "losses/total": 0.018178163096308708, "ref_logps/chosen": -258.73431396484375, "ref_logps/rejected": -230.48788452148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.044108178466558456, "rewards/margins": 3.9233827590942383, "rewards/rejected": -3.9674909114837646, "step": 416 }, { "epoch": 0.1, "learning_rate": 2e-07, "logps/chosen": -221.57974243164062, "logps/rejected": -258.1894226074219, "loss": 0.0575, "losses/dpo": 0.0005623308243229985, "losses/sft": 0.7466726899147034, "losses/total": 0.0005623308243229985, "ref_logps/chosen": -221.02374267578125, "ref_logps/rejected": -216.87136840820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.05560081824660301, "rewards/margins": 4.076206207275391, "rewards/rejected": -4.131807327270508, "step": 417 }, { "epoch": 0.1, "learning_rate": 1.9994666666666667e-07, "logps/chosen": -241.60739135742188, "logps/rejected": -267.03271484375, "loss": 0.0478, "losses/dpo": 0.0036738133057951927, "losses/sft": 0.5178593397140503, "losses/total": 0.0036738133057951927, "ref_logps/chosen": -241.63613891601562, "ref_logps/rejected": -221.82009887695312, "rewards/accuracies": 1.0, "rewards/chosen": 0.0028744935989379883, "rewards/margins": 4.524135112762451, "rewards/rejected": -4.521260738372803, "step": 418 }, { "epoch": 0.1, "learning_rate": 1.9989333333333332e-07, "logps/chosen": -170.20782470703125, "logps/rejected": -223.89443969726562, "loss": 0.0913, "losses/dpo": 0.02516092360019684, "losses/sft": 0.654964029788971, "losses/total": 0.02516092360019684, "ref_logps/chosen": -170.6856689453125, "ref_logps/rejected": -192.34555053710938, "rewards/accuracies": 1.0, "rewards/chosen": 0.04778621718287468, "rewards/margins": 3.2026758193969727, "rewards/rejected": -3.1548895835876465, "step": 419 }, { "epoch": 0.1, "learning_rate": 1.9984e-07, "logps/chosen": -175.5120391845703, "logps/rejected": -260.85687255859375, "loss": 0.071, "losses/dpo": 0.0033355390187352896, "losses/sft": 0.44675251841545105, "losses/total": 0.0033355390187352896, "ref_logps/chosen": -175.8011932373047, "ref_logps/rejected": -221.12225341796875, "rewards/accuracies": 1.0, "rewards/chosen": 0.02891494147479534, "rewards/margins": 4.002374649047852, "rewards/rejected": -3.9734599590301514, "step": 420 }, { "epoch": 0.1, "learning_rate": 1.9978666666666667e-07, "logps/chosen": -243.59930419921875, "logps/rejected": -256.7080383300781, "loss": 0.0849, "losses/dpo": 0.0009621767094358802, "losses/sft": 0.8713856935501099, "losses/total": 0.0009621767094358802, "ref_logps/chosen": -243.54434204101562, "ref_logps/rejected": -218.67596435546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.005495523102581501, "rewards/margins": 3.797713279724121, "rewards/rejected": -3.803208827972412, "step": 421 }, { "epoch": 0.1, "learning_rate": 1.9973333333333332e-07, "logps/chosen": -214.68756103515625, "logps/rejected": -244.57427978515625, "loss": 0.0862, "losses/dpo": 0.001728493720293045, "losses/sft": 0.4437835216522217, "losses/total": 0.001728493720293045, "ref_logps/chosen": -215.43182373046875, "ref_logps/rejected": -207.54339599609375, "rewards/accuracies": 1.0, "rewards/chosen": 0.07442975044250488, "rewards/margins": 3.7775158882141113, "rewards/rejected": -3.7030863761901855, "step": 422 }, { "epoch": 0.1, "learning_rate": 1.9967999999999997e-07, "logps/chosen": -240.72845458984375, "logps/rejected": -251.85910034179688, "loss": 0.0484, "losses/dpo": 0.00013582146493718028, "losses/sft": 0.8405857682228088, "losses/total": 0.00013582146493718028, "ref_logps/chosen": -240.94879150390625, "ref_logps/rejected": -208.13551330566406, "rewards/accuracies": 1.0, "rewards/chosen": 0.022034630179405212, "rewards/margins": 4.394392013549805, "rewards/rejected": -4.372357368469238, "step": 423 }, { "epoch": 0.1, "learning_rate": 1.9962666666666665e-07, "logps/chosen": -207.5045166015625, "logps/rejected": -236.74752807617188, "loss": 0.0885, "losses/dpo": 0.002699431963264942, "losses/sft": 0.6597952246665955, "losses/total": 0.002699431963264942, "ref_logps/chosen": -207.24560546875, "ref_logps/rejected": -200.09152221679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.025891674682497978, "rewards/margins": 3.6397111415863037, "rewards/rejected": -3.6656031608581543, "step": 424 }, { "epoch": 0.1, "learning_rate": 1.9957333333333333e-07, "logps/chosen": -193.3130340576172, "logps/rejected": -279.0089111328125, "loss": 0.0457, "losses/dpo": 0.0006794788059778512, "losses/sft": 0.5741525292396545, "losses/total": 0.0006794788059778512, "ref_logps/chosen": -193.34918212890625, "ref_logps/rejected": -231.26605224609375, "rewards/accuracies": 1.0, "rewards/chosen": 0.0036149611696600914, "rewards/margins": 4.777900695800781, "rewards/rejected": -4.774285793304443, "step": 425 }, { "epoch": 0.1, "learning_rate": 1.9952e-07, "logps/chosen": -222.18728637695312, "logps/rejected": -293.56207275390625, "loss": 0.0393, "losses/dpo": 0.006192094646394253, "losses/sft": 0.6054272651672363, "losses/total": 0.006192094646394253, "ref_logps/chosen": -222.88980102539062, "ref_logps/rejected": -252.8831787109375, "rewards/accuracies": 1.0, "rewards/chosen": 0.07025016099214554, "rewards/margins": 4.1381378173828125, "rewards/rejected": -4.067888259887695, "step": 426 }, { "epoch": 0.1, "learning_rate": 1.9946666666666665e-07, "logps/chosen": -241.19873046875, "logps/rejected": -286.3368835449219, "loss": 0.0513, "losses/dpo": 0.004983088001608849, "losses/sft": 0.44527116417884827, "losses/total": 0.004983088001608849, "ref_logps/chosen": -241.65484619140625, "ref_logps/rejected": -244.5561981201172, "rewards/accuracies": 1.0, "rewards/chosen": 0.04561350494623184, "rewards/margins": 4.223682403564453, "rewards/rejected": -4.1780686378479, "step": 427 }, { "epoch": 0.1, "learning_rate": 1.9941333333333333e-07, "logps/chosen": -209.41722106933594, "logps/rejected": -271.6551513671875, "loss": 0.063, "losses/dpo": 0.0005006591090932488, "losses/sft": 0.44754454493522644, "losses/total": 0.0005006591090932488, "ref_logps/chosen": -209.8820037841797, "ref_logps/rejected": -230.9610595703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.04647738113999367, "rewards/margins": 4.115889549255371, "rewards/rejected": -4.0694122314453125, "step": 428 }, { "epoch": 0.1, "learning_rate": 1.9936e-07, "logps/chosen": -207.81350708007812, "logps/rejected": -277.62628173828125, "loss": 0.0528, "losses/dpo": 0.003410959616303444, "losses/sft": 0.6682607531547546, "losses/total": 0.003410959616303444, "ref_logps/chosen": -208.0696563720703, "ref_logps/rejected": -234.87220764160156, "rewards/accuracies": 1.0, "rewards/chosen": 0.025614969432353973, "rewards/margins": 4.301023483276367, "rewards/rejected": -4.275407791137695, "step": 429 }, { "epoch": 0.1, "learning_rate": 1.9930666666666666e-07, "logps/chosen": -267.386474609375, "logps/rejected": -288.93951416015625, "loss": 0.0418, "losses/dpo": 0.0046060713939368725, "losses/sft": 0.8342338800430298, "losses/total": 0.0046060713939368725, "ref_logps/chosen": -267.7779846191406, "ref_logps/rejected": -242.78321838378906, "rewards/accuracies": 1.0, "rewards/chosen": 0.03915094584226608, "rewards/margins": 4.654780864715576, "rewards/rejected": -4.61562967300415, "step": 430 }, { "epoch": 0.1, "learning_rate": 1.992533333333333e-07, "logps/chosen": -238.2349395751953, "logps/rejected": -249.4231414794922, "loss": 0.0489, "losses/dpo": 0.0024127059150487185, "losses/sft": 0.6954320073127747, "losses/total": 0.0024127059150487185, "ref_logps/chosen": -238.8325958251953, "ref_logps/rejected": -210.7586669921875, "rewards/accuracies": 1.0, "rewards/chosen": 0.05976716801524162, "rewards/margins": 3.9262142181396484, "rewards/rejected": -3.8664472103118896, "step": 431 }, { "epoch": 0.1, "learning_rate": 1.9919999999999998e-07, "logps/chosen": -249.59889221191406, "logps/rejected": -259.95709228515625, "loss": 0.053, "losses/dpo": 0.0015878792619332671, "losses/sft": 1.1116806268692017, "losses/total": 0.0015878792619332671, "ref_logps/chosen": -249.38143920898438, "ref_logps/rejected": -218.69644165039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.021745111793279648, "rewards/margins": 4.1043219566345215, "rewards/rejected": -4.126067161560059, "step": 432 }, { "epoch": 0.1, "learning_rate": 1.9914666666666666e-07, "logps/chosen": -250.92660522460938, "logps/rejected": -261.1123962402344, "loss": 0.0422, "losses/dpo": 0.050393447279930115, "losses/sft": 0.7047452330589294, "losses/total": 0.050393447279930115, "ref_logps/chosen": -251.383056640625, "ref_logps/rejected": -218.83203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.04564567282795906, "rewards/margins": 4.273683547973633, "rewards/rejected": -4.2280378341674805, "step": 433 }, { "epoch": 0.1, "learning_rate": 1.9909333333333334e-07, "logps/chosen": -239.21835327148438, "logps/rejected": -277.3555908203125, "loss": 0.0524, "losses/dpo": 0.0012977407313883305, "losses/sft": 0.5779939889907837, "losses/total": 0.0012977407313883305, "ref_logps/chosen": -239.06704711914062, "ref_logps/rejected": -233.77906799316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.015129730105400085, "rewards/margins": 4.342523574829102, "rewards/rejected": -4.3576531410217285, "step": 434 }, { "epoch": 0.1, "learning_rate": 1.9904e-07, "logps/chosen": -240.7468719482422, "logps/rejected": -294.72314453125, "loss": 0.0356, "losses/dpo": 0.06467873603105545, "losses/sft": 0.47783568501472473, "losses/total": 0.06467873603105545, "ref_logps/chosen": -240.51895141601562, "ref_logps/rejected": -251.6407928466797, "rewards/accuracies": 1.0, "rewards/chosen": -0.022791191935539246, "rewards/margins": 4.285445690155029, "rewards/rejected": -4.308237075805664, "step": 435 }, { "epoch": 0.1, "learning_rate": 1.9898666666666666e-07, "logps/chosen": -223.95947265625, "logps/rejected": -288.0596923828125, "loss": 0.0426, "losses/dpo": 0.00020143559959251434, "losses/sft": 0.5183254480361938, "losses/total": 0.00020143559959251434, "ref_logps/chosen": -224.0563507080078, "ref_logps/rejected": -244.92129516601562, "rewards/accuracies": 1.0, "rewards/chosen": 0.009689690545201302, "rewards/margins": 4.323531150817871, "rewards/rejected": -4.313841819763184, "step": 436 }, { "epoch": 0.1, "learning_rate": 1.9893333333333334e-07, "logps/chosen": -210.0992889404297, "logps/rejected": -222.79840087890625, "loss": 0.0643, "losses/dpo": 0.004652327857911587, "losses/sft": 0.9884487986564636, "losses/total": 0.004652327857911587, "ref_logps/chosen": -209.552001953125, "ref_logps/rejected": -185.51377868652344, "rewards/accuracies": 1.0, "rewards/chosen": -0.05473087728023529, "rewards/margins": 3.673731803894043, "rewards/rejected": -3.7284626960754395, "step": 437 }, { "epoch": 0.11, "learning_rate": 1.9888e-07, "logps/chosen": -235.4115447998047, "logps/rejected": -285.44219970703125, "loss": 0.0399, "losses/dpo": 0.0003371609782334417, "losses/sft": 0.4789685904979706, "losses/total": 0.0003371609782334417, "ref_logps/chosen": -235.90615844726562, "ref_logps/rejected": -238.44021606445312, "rewards/accuracies": 1.0, "rewards/chosen": 0.04946370795369148, "rewards/margins": 4.749660491943359, "rewards/rejected": -4.700196743011475, "step": 438 }, { "epoch": 0.11, "learning_rate": 1.9882666666666664e-07, "logps/chosen": -210.6427001953125, "logps/rejected": -265.882080078125, "loss": 0.0391, "losses/dpo": 0.0021712803281843662, "losses/sft": 0.45063623785972595, "losses/total": 0.0021712803281843662, "ref_logps/chosen": -210.27981567382812, "ref_logps/rejected": -224.09336853027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.03628900647163391, "rewards/margins": 4.142583847045898, "rewards/rejected": -4.178873062133789, "step": 439 }, { "epoch": 0.11, "learning_rate": 1.9877333333333332e-07, "logps/chosen": -241.41104125976562, "logps/rejected": -304.0252990722656, "loss": 0.036, "losses/dpo": 4.511409133556299e-05, "losses/sft": 0.5915945172309875, "losses/total": 4.511409133556299e-05, "ref_logps/chosen": -240.31698608398438, "ref_logps/rejected": -252.8621826171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.10940533876419067, "rewards/margins": 5.0069074630737305, "rewards/rejected": -5.1163129806518555, "step": 440 }, { "epoch": 0.11, "learning_rate": 1.9872e-07, "logps/chosen": -182.22561645507812, "logps/rejected": -257.801025390625, "loss": 0.0588, "losses/dpo": 0.00018104552873410285, "losses/sft": 0.6100059151649475, "losses/total": 0.00018104552873410285, "ref_logps/chosen": -181.71607971191406, "ref_logps/rejected": -214.27728271484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.050954315811395645, "rewards/margins": 4.301421642303467, "rewards/rejected": -4.3523759841918945, "step": 441 }, { "epoch": 0.11, "learning_rate": 1.9866666666666665e-07, "logps/chosen": -207.4700469970703, "logps/rejected": -240.8095703125, "loss": 0.0734, "losses/dpo": 0.004828807432204485, "losses/sft": 0.5322418808937073, "losses/total": 0.004828807432204485, "ref_logps/chosen": -207.50399780273438, "ref_logps/rejected": -201.49896240234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.0033941958099603653, "rewards/margins": 3.934455633163452, "rewards/rejected": -3.931061267852783, "step": 442 }, { "epoch": 0.11, "learning_rate": 1.9861333333333332e-07, "logps/chosen": -208.6500244140625, "logps/rejected": -245.49319458007812, "loss": 0.0653, "losses/dpo": 0.000520115252584219, "losses/sft": 0.5419037938117981, "losses/total": 0.000520115252584219, "ref_logps/chosen": -208.53578186035156, "ref_logps/rejected": -206.328857421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.011424858123064041, "rewards/margins": 3.905007839202881, "rewards/rejected": -3.9164328575134277, "step": 443 }, { "epoch": 0.11, "learning_rate": 1.9856e-07, "logps/chosen": -210.98023986816406, "logps/rejected": -272.7930603027344, "loss": 0.052, "losses/dpo": 0.00028039299650117755, "losses/sft": 0.7317469120025635, "losses/total": 0.00028039299650117755, "ref_logps/chosen": -210.54881286621094, "ref_logps/rejected": -228.7121124267578, "rewards/accuracies": 1.0, "rewards/chosen": -0.0431404709815979, "rewards/margins": 4.36495304107666, "rewards/rejected": -4.408093452453613, "step": 444 }, { "epoch": 0.11, "learning_rate": 1.9850666666666668e-07, "logps/chosen": -244.86941528320312, "logps/rejected": -235.16207885742188, "loss": 0.0737, "losses/dpo": 0.0006668432615697384, "losses/sft": 0.8049888014793396, "losses/total": 0.0006668432615697384, "ref_logps/chosen": -244.9074249267578, "ref_logps/rejected": -193.2578125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0038013793528079987, "rewards/margins": 4.1942291259765625, "rewards/rejected": -4.190427780151367, "step": 445 }, { "epoch": 0.11, "learning_rate": 1.9845333333333333e-07, "logps/chosen": -203.41860961914062, "logps/rejected": -270.5901184082031, "loss": 0.0531, "losses/dpo": 0.004118259064853191, "losses/sft": 0.6035458445549011, "losses/total": 0.004118259064853191, "ref_logps/chosen": -202.7349090576172, "ref_logps/rejected": -221.95062255859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.06837201118469238, "rewards/margins": 4.795576572418213, "rewards/rejected": -4.863948345184326, "step": 446 }, { "epoch": 0.11, "learning_rate": 1.9839999999999998e-07, "logps/chosen": -210.07464599609375, "logps/rejected": -260.4158935546875, "loss": 0.0554, "losses/dpo": 0.0008197681745514274, "losses/sft": 0.6514835953712463, "losses/total": 0.0008197681745514274, "ref_logps/chosen": -210.40113830566406, "ref_logps/rejected": -216.52566528320312, "rewards/accuracies": 1.0, "rewards/chosen": 0.03265036642551422, "rewards/margins": 4.421672344207764, "rewards/rejected": -4.389021873474121, "step": 447 }, { "epoch": 0.11, "learning_rate": 1.9834666666666665e-07, "logps/chosen": -233.51512145996094, "logps/rejected": -285.6983642578125, "loss": 0.0518, "losses/dpo": 0.0010411045514047146, "losses/sft": 0.6096970438957214, "losses/total": 0.0010411045514047146, "ref_logps/chosen": -234.01666259765625, "ref_logps/rejected": -244.1446533203125, "rewards/accuracies": 1.0, "rewards/chosen": 0.05015500262379646, "rewards/margins": 4.205526351928711, "rewards/rejected": -4.155370712280273, "step": 448 }, { "epoch": 0.11, "learning_rate": 1.9829333333333333e-07, "logps/chosen": -192.3249969482422, "logps/rejected": -254.86294555664062, "loss": 0.0747, "losses/dpo": 0.003069151658564806, "losses/sft": 0.47167250514030457, "losses/total": 0.003069151658564806, "ref_logps/chosen": -192.13211059570312, "ref_logps/rejected": -211.8514404296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.019290227442979813, "rewards/margins": 4.281859397888184, "rewards/rejected": -4.301149368286133, "step": 449 }, { "epoch": 0.11, "learning_rate": 1.9823999999999998e-07, "logps/chosen": -205.87977600097656, "logps/rejected": -254.63978576660156, "loss": 0.0569, "losses/dpo": 0.000350284855812788, "losses/sft": 0.5268433690071106, "losses/total": 0.000350284855812788, "ref_logps/chosen": -205.87600708007812, "ref_logps/rejected": -212.10696411132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.00037687644362449646, "rewards/margins": 4.252906322479248, "rewards/rejected": -4.253283500671387, "step": 450 }, { "epoch": 0.11, "learning_rate": 1.9818666666666666e-07, "logps/chosen": -253.13433837890625, "logps/rejected": -258.45635986328125, "loss": 0.0503, "losses/dpo": 0.0013691667700186372, "losses/sft": 0.7669347524642944, "losses/total": 0.0013691667700186372, "ref_logps/chosen": -252.00721740722656, "ref_logps/rejected": -214.57107543945312, "rewards/accuracies": 1.0, "rewards/chosen": -0.11271188408136368, "rewards/margins": 4.275815963745117, "rewards/rejected": -4.388527870178223, "step": 451 }, { "epoch": 0.11, "learning_rate": 1.9813333333333333e-07, "logps/chosen": -222.1473388671875, "logps/rejected": -232.51739501953125, "loss": 0.0516, "losses/dpo": 0.0002449072489980608, "losses/sft": 0.4474215805530548, "losses/total": 0.0002449072489980608, "ref_logps/chosen": -222.3138885498047, "ref_logps/rejected": -190.43966674804688, "rewards/accuracies": 1.0, "rewards/chosen": 0.01665543019771576, "rewards/margins": 4.224427223205566, "rewards/rejected": -4.207771301269531, "step": 452 }, { "epoch": 0.11, "learning_rate": 1.9807999999999998e-07, "logps/chosen": -236.42774963378906, "logps/rejected": -255.589111328125, "loss": 0.0768, "losses/dpo": 5.930348197580315e-05, "losses/sft": 0.6669339537620544, "losses/total": 5.930348197580315e-05, "ref_logps/chosen": -236.24270629882812, "ref_logps/rejected": -215.13351440429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.01850355602800846, "rewards/margins": 4.027056694030762, "rewards/rejected": -4.045559883117676, "step": 453 }, { "epoch": 0.11, "learning_rate": 1.9802666666666666e-07, "logps/chosen": -224.6739501953125, "logps/rejected": -255.80157470703125, "loss": 0.0533, "losses/dpo": 0.0015890757786110044, "losses/sft": 0.4504189193248749, "losses/total": 0.0015890757786110044, "ref_logps/chosen": -224.92018127441406, "ref_logps/rejected": -211.67913818359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.02462385594844818, "rewards/margins": 4.436867713928223, "rewards/rejected": -4.412243843078613, "step": 454 }, { "epoch": 0.11, "learning_rate": 1.9797333333333334e-07, "logps/chosen": -211.05313110351562, "logps/rejected": -254.28565979003906, "loss": 0.0417, "losses/dpo": 0.002286868402734399, "losses/sft": 0.4725096821784973, "losses/total": 0.002286868402734399, "ref_logps/chosen": -210.2997589111328, "ref_logps/rejected": -210.66525268554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.07533761858940125, "rewards/margins": 4.286700248718262, "rewards/rejected": -4.362037658691406, "step": 455 }, { "epoch": 0.11, "learning_rate": 1.9792e-07, "logps/chosen": -209.71470642089844, "logps/rejected": -286.0678405761719, "loss": 0.0425, "losses/dpo": 5.9847818192793056e-05, "losses/sft": 0.5728899836540222, "losses/total": 5.9847818192793056e-05, "ref_logps/chosen": -209.39459228515625, "ref_logps/rejected": -234.77178955078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.03201107308268547, "rewards/margins": 5.097596168518066, "rewards/rejected": -5.129607677459717, "step": 456 }, { "epoch": 0.11, "learning_rate": 1.9786666666666664e-07, "logps/chosen": -195.69107055664062, "logps/rejected": -245.29537963867188, "loss": 0.0543, "losses/dpo": 5.842046448378824e-05, "losses/sft": 0.5069923400878906, "losses/total": 5.842046448378824e-05, "ref_logps/chosen": -194.77999877929688, "ref_logps/rejected": -202.97427368164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.09110909700393677, "rewards/margins": 4.1410017013549805, "rewards/rejected": -4.232110977172852, "step": 457 }, { "epoch": 0.11, "learning_rate": 1.9781333333333332e-07, "logps/chosen": -162.86314392089844, "logps/rejected": -237.09109497070312, "loss": 0.0803, "losses/dpo": 0.009390389546751976, "losses/sft": 0.5947399139404297, "losses/total": 0.009390389546751976, "ref_logps/chosen": -162.9515380859375, "ref_logps/rejected": -196.25918579101562, "rewards/accuracies": 1.0, "rewards/chosen": 0.00883970595896244, "rewards/margins": 4.0920305252075195, "rewards/rejected": -4.08319091796875, "step": 458 }, { "epoch": 0.11, "learning_rate": 1.9776e-07, "logps/chosen": -191.86080932617188, "logps/rejected": -267.9547119140625, "loss": 0.0394, "losses/dpo": 0.035273704677820206, "losses/sft": 0.6367626786231995, "losses/total": 0.035273704677820206, "ref_logps/chosen": -192.2730712890625, "ref_logps/rejected": -222.61654663085938, "rewards/accuracies": 1.0, "rewards/chosen": 0.0412268228828907, "rewards/margins": 4.575039863586426, "rewards/rejected": -4.533812999725342, "step": 459 }, { "epoch": 0.11, "learning_rate": 1.9770666666666667e-07, "logps/chosen": -266.36834716796875, "logps/rejected": -252.90142822265625, "loss": 0.051, "losses/dpo": 0.00031349071650765836, "losses/sft": 0.7084897756576538, "losses/total": 0.00031349071650765836, "ref_logps/chosen": -266.05975341796875, "ref_logps/rejected": -209.16810607910156, "rewards/accuracies": 1.0, "rewards/chosen": -0.03085780143737793, "rewards/margins": 4.342472076416016, "rewards/rejected": -4.373330116271973, "step": 460 }, { "epoch": 0.11, "learning_rate": 1.9765333333333332e-07, "logps/chosen": -242.43643188476562, "logps/rejected": -279.5550231933594, "loss": 0.0305, "losses/dpo": 0.004272465128451586, "losses/sft": 0.6255767941474915, "losses/total": 0.004272465128451586, "ref_logps/chosen": -242.41009521484375, "ref_logps/rejected": -232.68011474609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.002632424235343933, "rewards/margins": 4.684858798980713, "rewards/rejected": -4.687491416931152, "step": 461 }, { "epoch": 0.11, "learning_rate": 1.976e-07, "logps/chosen": -221.83859252929688, "logps/rejected": -248.27879333496094, "loss": 0.0498, "losses/dpo": 0.017202196642756462, "losses/sft": 0.6355413794517517, "losses/total": 0.017202196642756462, "ref_logps/chosen": -221.8585205078125, "ref_logps/rejected": -208.34710693359375, "rewards/accuracies": 1.0, "rewards/chosen": 0.0019944682717323303, "rewards/margins": 3.9951624870300293, "rewards/rejected": -3.9931681156158447, "step": 462 }, { "epoch": 0.11, "learning_rate": 1.9754666666666667e-07, "logps/chosen": -238.18934631347656, "logps/rejected": -274.52459716796875, "loss": 0.0342, "losses/dpo": 0.003961719572544098, "losses/sft": 0.8698765635490417, "losses/total": 0.003961719572544098, "ref_logps/chosen": -237.95550537109375, "ref_logps/rejected": -224.8041229248047, "rewards/accuracies": 1.0, "rewards/chosen": -0.02338554710149765, "rewards/margins": 4.948659420013428, "rewards/rejected": -4.972045421600342, "step": 463 }, { "epoch": 0.11, "learning_rate": 1.9749333333333332e-07, "logps/chosen": -206.5590057373047, "logps/rejected": -275.5373229980469, "loss": 0.0449, "losses/dpo": 0.0031067358795553446, "losses/sft": 0.5101313591003418, "losses/total": 0.0031067358795553446, "ref_logps/chosen": -206.21539306640625, "ref_logps/rejected": -226.8532257080078, "rewards/accuracies": 1.0, "rewards/chosen": -0.03436284139752388, "rewards/margins": 4.834043979644775, "rewards/rejected": -4.868406772613525, "step": 464 }, { "epoch": 0.11, "learning_rate": 1.9743999999999997e-07, "logps/chosen": -203.1328125, "logps/rejected": -238.52920532226562, "loss": 0.0696, "losses/dpo": 0.0018221947830170393, "losses/sft": 0.4959322512149811, "losses/total": 0.0018221947830170393, "ref_logps/chosen": -203.77081298828125, "ref_logps/rejected": -198.3025665283203, "rewards/accuracies": 1.0, "rewards/chosen": 0.06379911303520203, "rewards/margins": 4.086462497711182, "rewards/rejected": -4.022663593292236, "step": 465 }, { "epoch": 0.11, "learning_rate": 1.9738666666666665e-07, "logps/chosen": -237.85768127441406, "logps/rejected": -235.32286071777344, "loss": 0.0652, "losses/dpo": 0.0017006141133606434, "losses/sft": 0.6443789601325989, "losses/total": 0.0017006141133606434, "ref_logps/chosen": -236.86544799804688, "ref_logps/rejected": -194.19805908203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.09922318905591965, "rewards/margins": 4.013256549835205, "rewards/rejected": -4.1124796867370605, "step": 466 }, { "epoch": 0.11, "learning_rate": 1.9733333333333333e-07, "logps/chosen": -224.64219665527344, "logps/rejected": -281.8807373046875, "loss": 0.0427, "losses/dpo": 0.0011107848258689046, "losses/sft": 1.1162128448486328, "losses/total": 0.0011107848258689046, "ref_logps/chosen": -223.83096313476562, "ref_logps/rejected": -231.66197204589844, "rewards/accuracies": 1.0, "rewards/chosen": -0.08112262189388275, "rewards/margins": 4.940752983093262, "rewards/rejected": -5.021875381469727, "step": 467 }, { "epoch": 0.11, "learning_rate": 1.9728e-07, "logps/chosen": -217.82164001464844, "logps/rejected": -286.131103515625, "loss": 0.0335, "losses/dpo": 0.003132896963506937, "losses/sft": 0.502668559551239, "losses/total": 0.003132896963506937, "ref_logps/chosen": -217.65155029296875, "ref_logps/rejected": -237.12554931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.01700877957046032, "rewards/margins": 4.883543968200684, "rewards/rejected": -4.900552749633789, "step": 468 }, { "epoch": 0.11, "learning_rate": 1.9722666666666665e-07, "logps/chosen": -179.6543426513672, "logps/rejected": -259.83587646484375, "loss": 0.0488, "losses/dpo": 0.0031593155581504107, "losses/sft": 0.3789460361003876, "losses/total": 0.0031593155581504107, "ref_logps/chosen": -179.43467712402344, "ref_logps/rejected": -215.36264038085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.021964989602565765, "rewards/margins": 4.425354957580566, "rewards/rejected": -4.447319984436035, "step": 469 }, { "epoch": 0.11, "learning_rate": 1.9717333333333333e-07, "logps/chosen": -212.6563720703125, "logps/rejected": -275.684814453125, "loss": 0.0526, "losses/dpo": 0.015096154995262623, "losses/sft": 0.620395302772522, "losses/total": 0.015096154995262623, "ref_logps/chosen": -212.148193359375, "ref_logps/rejected": -232.1359100341797, "rewards/accuracies": 1.0, "rewards/chosen": -0.050817497074604034, "rewards/margins": 4.30407190322876, "rewards/rejected": -4.354889869689941, "step": 470 }, { "epoch": 0.11, "learning_rate": 1.9712e-07, "logps/chosen": -203.8250732421875, "logps/rejected": -292.08447265625, "loss": 0.0406, "losses/dpo": 0.003220840124413371, "losses/sft": 0.49649858474731445, "losses/total": 0.003220840124413371, "ref_logps/chosen": -203.00099182128906, "ref_logps/rejected": -242.21893310546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.08240707218647003, "rewards/margins": 4.904149055480957, "rewards/rejected": -4.986556053161621, "step": 471 }, { "epoch": 0.11, "learning_rate": 1.9706666666666666e-07, "logps/chosen": -187.36764526367188, "logps/rejected": -239.36605834960938, "loss": 0.052, "losses/dpo": 0.0005771253490820527, "losses/sft": 1.1392848491668701, "losses/total": 0.0005771253490820527, "ref_logps/chosen": -187.20062255859375, "ref_logps/rejected": -195.54916381835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.0167009886354208, "rewards/margins": 4.364989280700684, "rewards/rejected": -4.381690502166748, "step": 472 }, { "epoch": 0.11, "learning_rate": 1.970133333333333e-07, "logps/chosen": -215.9027862548828, "logps/rejected": -255.51043701171875, "loss": 0.0427, "losses/dpo": 0.0014771748101338744, "losses/sft": 0.8154699206352234, "losses/total": 0.0014771748101338744, "ref_logps/chosen": -215.5841827392578, "ref_logps/rejected": -210.93698120117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.031859129667282104, "rewards/margins": 4.4254865646362305, "rewards/rejected": -4.457345962524414, "step": 473 }, { "epoch": 0.11, "learning_rate": 1.9695999999999998e-07, "logps/chosen": -244.95530700683594, "logps/rejected": -287.7817077636719, "loss": 0.0545, "losses/dpo": 0.0007773188408464193, "losses/sft": 0.589356005191803, "losses/total": 0.0007773188408464193, "ref_logps/chosen": -244.90182495117188, "ref_logps/rejected": -238.7423095703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.005348535254597664, "rewards/margins": 4.898589134216309, "rewards/rejected": -4.903938293457031, "step": 474 }, { "epoch": 0.11, "learning_rate": 1.9690666666666666e-07, "logps/chosen": -169.2880859375, "logps/rejected": -223.26019287109375, "loss": 0.0823, "losses/dpo": 0.00047570717288181186, "losses/sft": 0.718794584274292, "losses/total": 0.00047570717288181186, "ref_logps/chosen": -169.20555114746094, "ref_logps/rejected": -183.52386474609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.00825367122888565, "rewards/margins": 3.9653804302215576, "rewards/rejected": -3.9736340045928955, "step": 475 }, { "epoch": 0.11, "learning_rate": 1.968533333333333e-07, "logps/chosen": -268.54071044921875, "logps/rejected": -302.4468994140625, "loss": 0.0279, "losses/dpo": 0.0018221723148599267, "losses/sft": 0.5366832613945007, "losses/total": 0.0018221723148599267, "ref_logps/chosen": -267.51495361328125, "ref_logps/rejected": -247.10443115234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.10257553309202194, "rewards/margins": 5.431673049926758, "rewards/rejected": -5.534248352050781, "step": 476 }, { "epoch": 0.11, "learning_rate": 1.968e-07, "logps/chosen": -176.9971923828125, "logps/rejected": -249.34446716308594, "loss": 0.043, "losses/dpo": 0.018318165093660355, "losses/sft": 0.6752180457115173, "losses/total": 0.018318165093660355, "ref_logps/chosen": -177.43972778320312, "ref_logps/rejected": -205.7630615234375, "rewards/accuracies": 1.0, "rewards/chosen": 0.044254742562770844, "rewards/margins": 4.402396202087402, "rewards/rejected": -4.35814094543457, "step": 477 }, { "epoch": 0.11, "learning_rate": 1.9674666666666667e-07, "logps/chosen": -224.09471130371094, "logps/rejected": -269.2533874511719, "loss": 0.0493, "losses/dpo": 0.005559402517974377, "losses/sft": 0.512998640537262, "losses/total": 0.005559402517974377, "ref_logps/chosen": -224.0096893310547, "ref_logps/rejected": -221.05613708496094, "rewards/accuracies": 1.0, "rewards/chosen": -0.008501682430505753, "rewards/margins": 4.811221599578857, "rewards/rejected": -4.819723129272461, "step": 478 }, { "epoch": 0.11, "learning_rate": 1.9669333333333334e-07, "logps/chosen": -247.31619262695312, "logps/rejected": -273.10589599609375, "loss": 0.0413, "losses/dpo": 0.00023215115652419627, "losses/sft": 0.6907440423965454, "losses/total": 0.00023215115652419627, "ref_logps/chosen": -247.79107666015625, "ref_logps/rejected": -224.8311767578125, "rewards/accuracies": 1.0, "rewards/chosen": 0.0474892258644104, "rewards/margins": 4.87496280670166, "rewards/rejected": -4.827473163604736, "step": 479 }, { "epoch": 0.12, "learning_rate": 1.9664e-07, "logps/chosen": -229.72891235351562, "logps/rejected": -281.2899169921875, "loss": 0.0378, "losses/dpo": 0.0011429211590439081, "losses/sft": 0.5118415355682373, "losses/total": 0.0011429211590439081, "ref_logps/chosen": -229.96006774902344, "ref_logps/rejected": -230.24188232421875, "rewards/accuracies": 1.0, "rewards/chosen": 0.023115750402212143, "rewards/margins": 5.127922534942627, "rewards/rejected": -5.104806423187256, "step": 480 }, { "epoch": 0.12, "learning_rate": 1.9658666666666667e-07, "logps/chosen": -203.09515380859375, "logps/rejected": -247.62136840820312, "loss": 0.0492, "losses/dpo": 0.0001974880724446848, "losses/sft": 0.7220944762229919, "losses/total": 0.0001974880724446848, "ref_logps/chosen": -203.3564453125, "ref_logps/rejected": -202.50961303710938, "rewards/accuracies": 1.0, "rewards/chosen": 0.026128409430384636, "rewards/margins": 4.53730583190918, "rewards/rejected": -4.5111775398254395, "step": 481 }, { "epoch": 0.12, "learning_rate": 1.9653333333333332e-07, "logps/chosen": -305.73590087890625, "logps/rejected": -288.48321533203125, "loss": 0.0173, "losses/dpo": 0.016087742522358894, "losses/sft": 0.7455621361732483, "losses/total": 0.016087742522358894, "ref_logps/chosen": -304.95758056640625, "ref_logps/rejected": -234.98287963867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.07783345133066177, "rewards/margins": 5.272200107574463, "rewards/rejected": -5.350033760070801, "step": 482 }, { "epoch": 0.12, "learning_rate": 1.9648e-07, "logps/chosen": -262.68597412109375, "logps/rejected": -278.28106689453125, "loss": 0.0334, "losses/dpo": 0.00027571397367864847, "losses/sft": 0.5315371155738831, "losses/total": 0.00027571397367864847, "ref_logps/chosen": -260.9942932128906, "ref_logps/rejected": -227.76162719726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.1691661775112152, "rewards/margins": 4.882778167724609, "rewards/rejected": -5.051944732666016, "step": 483 }, { "epoch": 0.12, "learning_rate": 1.9642666666666665e-07, "logps/chosen": -212.1987762451172, "logps/rejected": -263.39404296875, "loss": 0.0389, "losses/dpo": 0.0018500791629776359, "losses/sft": 0.49347442388534546, "losses/total": 0.0018500791629776359, "ref_logps/chosen": -211.71017456054688, "ref_logps/rejected": -214.3931427001953, "rewards/accuracies": 1.0, "rewards/chosen": -0.048860229551792145, "rewards/margins": 4.851228713989258, "rewards/rejected": -4.900089263916016, "step": 484 }, { "epoch": 0.12, "learning_rate": 1.9637333333333332e-07, "logps/chosen": -230.8946533203125, "logps/rejected": -282.651123046875, "loss": 0.0389, "losses/dpo": 0.0007410281687043607, "losses/sft": 0.5615968704223633, "losses/total": 0.0007410281687043607, "ref_logps/chosen": -231.44871520996094, "ref_logps/rejected": -229.30361938476562, "rewards/accuracies": 1.0, "rewards/chosen": 0.05540613830089569, "rewards/margins": 5.390157222747803, "rewards/rejected": -5.334751129150391, "step": 485 }, { "epoch": 0.12, "learning_rate": 1.9632e-07, "logps/chosen": -222.18336486816406, "logps/rejected": -276.1203918457031, "loss": 0.0346, "losses/dpo": 9.528552618576214e-05, "losses/sft": 0.8274270296096802, "losses/total": 9.528552618576214e-05, "ref_logps/chosen": -222.07496643066406, "ref_logps/rejected": -223.88470458984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.010838055983185768, "rewards/margins": 5.21273136138916, "rewards/rejected": -5.223569393157959, "step": 486 }, { "epoch": 0.12, "learning_rate": 1.9626666666666665e-07, "logps/chosen": -226.80899047851562, "logps/rejected": -273.00238037109375, "loss": 0.0724, "losses/dpo": 0.008785046637058258, "losses/sft": 0.43912097811698914, "losses/total": 0.008785046637058258, "ref_logps/chosen": -224.9456024169922, "ref_logps/rejected": -224.44068908691406, "rewards/accuracies": 0.96875, "rewards/chosen": -0.18633931875228882, "rewards/margins": 4.669832229614258, "rewards/rejected": -4.856171607971191, "step": 487 }, { "epoch": 0.12, "learning_rate": 1.9621333333333333e-07, "logps/chosen": -226.8582305908203, "logps/rejected": -267.9808349609375, "loss": 0.0425, "losses/dpo": 0.0014213138492777944, "losses/sft": 0.780878484249115, "losses/total": 0.0014213138492777944, "ref_logps/chosen": -225.54859924316406, "ref_logps/rejected": -220.04356384277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.13096386194229126, "rewards/margins": 4.662761688232422, "rewards/rejected": -4.793725967407227, "step": 488 }, { "epoch": 0.12, "learning_rate": 1.9616e-07, "logps/chosen": -222.90139770507812, "logps/rejected": -287.74652099609375, "loss": 0.019, "losses/dpo": 0.0018827876774594188, "losses/sft": 0.5306023955345154, "losses/total": 0.0018827876774594188, "ref_logps/chosen": -221.58837890625, "ref_logps/rejected": -231.53060913085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.13130220770835876, "rewards/margins": 5.490291118621826, "rewards/rejected": -5.621593475341797, "step": 489 }, { "epoch": 0.12, "learning_rate": 1.9610666666666665e-07, "logps/chosen": -196.205810546875, "logps/rejected": -268.719482421875, "loss": 0.0535, "losses/dpo": 0.0006848751218058169, "losses/sft": 0.7662638425827026, "losses/total": 0.0006848751218058169, "ref_logps/chosen": -196.03546142578125, "ref_logps/rejected": -217.48451232910156, "rewards/accuracies": 1.0, "rewards/chosen": -0.01703658513724804, "rewards/margins": 5.1064605712890625, "rewards/rejected": -5.123497009277344, "step": 490 }, { "epoch": 0.12, "learning_rate": 1.960533333333333e-07, "logps/chosen": -244.09136962890625, "logps/rejected": -281.9866943359375, "loss": 0.0308, "losses/dpo": 2.0624893295462243e-05, "losses/sft": 0.5339211821556091, "losses/total": 2.0624893295462243e-05, "ref_logps/chosen": -243.4960479736328, "ref_logps/rejected": -227.22097778320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.05953293666243553, "rewards/margins": 5.417040824890137, "rewards/rejected": -5.476573944091797, "step": 491 }, { "epoch": 0.12, "learning_rate": 1.9599999999999998e-07, "logps/chosen": -187.91644287109375, "logps/rejected": -230.4845428466797, "loss": 0.0479, "losses/dpo": 0.000870658433996141, "losses/sft": 0.5273017287254333, "losses/total": 0.000870658433996141, "ref_logps/chosen": -187.89865112304688, "ref_logps/rejected": -187.568115234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.0017794966697692871, "rewards/margins": 4.289863586425781, "rewards/rejected": -4.291642665863037, "step": 492 }, { "epoch": 0.12, "learning_rate": 1.9594666666666666e-07, "logps/chosen": -262.3135986328125, "logps/rejected": -274.2283630371094, "loss": 0.0448, "losses/dpo": 0.00021386126172728837, "losses/sft": 0.4789522886276245, "losses/total": 0.00021386126172728837, "ref_logps/chosen": -260.43695068359375, "ref_logps/rejected": -227.37156677246094, "rewards/accuracies": 1.0, "rewards/chosen": -0.18766410648822784, "rewards/margins": 4.49801778793335, "rewards/rejected": -4.68568229675293, "step": 493 }, { "epoch": 0.12, "learning_rate": 1.9589333333333334e-07, "logps/chosen": -219.86502075195312, "logps/rejected": -285.60723876953125, "loss": 0.0422, "losses/dpo": 0.008738205768167973, "losses/sft": 0.5289340615272522, "losses/total": 0.008738205768167973, "ref_logps/chosen": -219.86627197265625, "ref_logps/rejected": -231.6346435546875, "rewards/accuracies": 1.0, "rewards/chosen": 0.00012370198965072632, "rewards/margins": 5.39738655090332, "rewards/rejected": -5.397263050079346, "step": 494 }, { "epoch": 0.12, "learning_rate": 1.9583999999999999e-07, "logps/chosen": -240.89886474609375, "logps/rejected": -277.1437072753906, "loss": 0.0331, "losses/dpo": 0.0003653813328128308, "losses/sft": 0.6216765642166138, "losses/total": 0.0003653813328128308, "ref_logps/chosen": -239.8816375732422, "ref_logps/rejected": -225.8678436279297, "rewards/accuracies": 1.0, "rewards/chosen": -0.10172353684902191, "rewards/margins": 5.025862693786621, "rewards/rejected": -5.127586364746094, "step": 495 }, { "epoch": 0.12, "learning_rate": 1.9578666666666666e-07, "logps/chosen": -244.54600524902344, "logps/rejected": -297.8708190917969, "loss": 0.019, "losses/dpo": 2.1409539840533398e-05, "losses/sft": 0.5055067539215088, "losses/total": 2.1409539840533398e-05, "ref_logps/chosen": -242.96859741210938, "ref_logps/rejected": -240.7670135498047, "rewards/accuracies": 1.0, "rewards/chosen": -0.15773972868919373, "rewards/margins": 5.552641868591309, "rewards/rejected": -5.710381984710693, "step": 496 }, { "epoch": 0.12, "learning_rate": 1.9573333333333334e-07, "logps/chosen": -240.17031860351562, "logps/rejected": -282.6920166015625, "loss": 0.0199, "losses/dpo": 0.0007642643176950514, "losses/sft": 0.8603014945983887, "losses/total": 0.0007642643176950514, "ref_logps/chosen": -238.8646697998047, "ref_logps/rejected": -225.99151611328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.13056564331054688, "rewards/margins": 5.539484977722168, "rewards/rejected": -5.670050621032715, "step": 497 }, { "epoch": 0.12, "learning_rate": 1.9568e-07, "logps/chosen": -250.47552490234375, "logps/rejected": -287.260986328125, "loss": 0.0365, "losses/dpo": 0.0008694375865161419, "losses/sft": 0.5779188275337219, "losses/total": 0.0008694375865161419, "ref_logps/chosen": -249.2882080078125, "ref_logps/rejected": -227.40322875976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.11872907727956772, "rewards/margins": 5.867044448852539, "rewards/rejected": -5.985774040222168, "step": 498 }, { "epoch": 0.12, "learning_rate": 1.9562666666666664e-07, "logps/chosen": -247.07373046875, "logps/rejected": -306.5794372558594, "loss": 0.0334, "losses/dpo": 0.0010775476694107056, "losses/sft": 0.5919786691665649, "losses/total": 0.0010775476694107056, "ref_logps/chosen": -248.17709350585938, "ref_logps/rejected": -252.98907470703125, "rewards/accuracies": 1.0, "rewards/chosen": 0.11033895611763, "rewards/margins": 5.469374179840088, "rewards/rejected": -5.359035015106201, "step": 499 }, { "epoch": 0.12, "learning_rate": 1.9557333333333332e-07, "logps/chosen": -236.56356811523438, "logps/rejected": -283.8231201171875, "loss": 0.0297, "losses/dpo": 0.0002258680178783834, "losses/sft": 0.5206022262573242, "losses/total": 0.0002258680178783834, "ref_logps/chosen": -235.85108947753906, "ref_logps/rejected": -225.1977996826172, "rewards/accuracies": 1.0, "rewards/chosen": -0.07124701887369156, "rewards/margins": 5.791286468505859, "rewards/rejected": -5.862533092498779, "step": 500 }, { "epoch": 0.12, "learning_rate": 1.9552e-07, "logps/chosen": -262.4732666015625, "logps/rejected": -320.380126953125, "loss": 0.0296, "losses/dpo": 1.3207485608290881e-05, "losses/sft": 0.5942274332046509, "losses/total": 1.3207485608290881e-05, "ref_logps/chosen": -261.375244140625, "ref_logps/rejected": -262.8646240234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.10980212688446045, "rewards/margins": 5.641749858856201, "rewards/rejected": -5.751552104949951, "step": 501 }, { "epoch": 0.12, "learning_rate": 1.9546666666666667e-07, "logps/chosen": -205.57550048828125, "logps/rejected": -261.47430419921875, "loss": 0.0408, "losses/dpo": 6.389798363670707e-05, "losses/sft": 0.5881379842758179, "losses/total": 6.389798363670707e-05, "ref_logps/chosen": -204.9127960205078, "ref_logps/rejected": -212.09938049316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.06627245992422104, "rewards/margins": 4.87122106552124, "rewards/rejected": -4.937494277954102, "step": 502 }, { "epoch": 0.12, "learning_rate": 1.9541333333333332e-07, "logps/chosen": -210.64846801757812, "logps/rejected": -282.85003662109375, "loss": 0.0475, "losses/dpo": 0.0003241585218347609, "losses/sft": 0.6168089509010315, "losses/total": 0.0003241585218347609, "ref_logps/chosen": -209.03076171875, "ref_logps/rejected": -234.642333984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.1617705225944519, "rewards/margins": 4.658999443054199, "rewards/rejected": -4.820769786834717, "step": 503 }, { "epoch": 0.12, "learning_rate": 1.9536e-07, "logps/chosen": -207.9778594970703, "logps/rejected": -274.77490234375, "loss": 0.0651, "losses/dpo": 0.0047376626171171665, "losses/sft": 0.6132023930549622, "losses/total": 0.0047376626171171665, "ref_logps/chosen": -207.22439575195312, "ref_logps/rejected": -220.19894409179688, "rewards/accuracies": 1.0, "rewards/chosen": -0.07534898072481155, "rewards/margins": 5.382246017456055, "rewards/rejected": -5.457594871520996, "step": 504 }, { "epoch": 0.12, "learning_rate": 1.9530666666666667e-07, "logps/chosen": -183.672607421875, "logps/rejected": -276.76239013671875, "loss": 0.0562, "losses/dpo": 0.0003125487710349262, "losses/sft": 0.9405765533447266, "losses/total": 0.0003125487710349262, "ref_logps/chosen": -184.61590576171875, "ref_logps/rejected": -231.4737548828125, "rewards/accuracies": 1.0, "rewards/chosen": 0.09432901442050934, "rewards/margins": 4.623190879821777, "rewards/rejected": -4.528861999511719, "step": 505 }, { "epoch": 0.12, "learning_rate": 1.9525333333333332e-07, "logps/chosen": -229.73635864257812, "logps/rejected": -266.0400390625, "loss": 0.0301, "losses/dpo": 0.002671043621376157, "losses/sft": 0.47688284516334534, "losses/total": 0.002671043621376157, "ref_logps/chosen": -228.97247314453125, "ref_logps/rejected": -216.59429931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.07638617604970932, "rewards/margins": 4.868186950683594, "rewards/rejected": -4.944573402404785, "step": 506 }, { "epoch": 0.12, "learning_rate": 1.9519999999999997e-07, "logps/chosen": -260.513916015625, "logps/rejected": -274.0113525390625, "loss": 0.0349, "losses/dpo": 0.0005668626399710774, "losses/sft": 0.5549825429916382, "losses/total": 0.0005668626399710774, "ref_logps/chosen": -259.588134765625, "ref_logps/rejected": -222.62376403808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.09257568418979645, "rewards/margins": 5.046182632446289, "rewards/rejected": -5.138758659362793, "step": 507 }, { "epoch": 0.12, "learning_rate": 1.9514666666666665e-07, "logps/chosen": -203.36074829101562, "logps/rejected": -262.8918151855469, "loss": 0.0283, "losses/dpo": 0.014007323421537876, "losses/sft": 0.5712645053863525, "losses/total": 0.014007323421537876, "ref_logps/chosen": -202.08346557617188, "ref_logps/rejected": -212.12380981445312, "rewards/accuracies": 1.0, "rewards/chosen": -0.12772852182388306, "rewards/margins": 4.94907283782959, "rewards/rejected": -5.076801300048828, "step": 508 }, { "epoch": 0.12, "learning_rate": 1.9509333333333333e-07, "logps/chosen": -228.52037048339844, "logps/rejected": -276.7561340332031, "loss": 0.0333, "losses/dpo": 0.00013790167577099055, "losses/sft": 0.8785051107406616, "losses/total": 0.00013790167577099055, "ref_logps/chosen": -228.21627807617188, "ref_logps/rejected": -222.79273986816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.030408764258027077, "rewards/margins": 5.365933418273926, "rewards/rejected": -5.3963422775268555, "step": 509 }, { "epoch": 0.12, "learning_rate": 1.9503999999999998e-07, "logps/chosen": -202.1248016357422, "logps/rejected": -283.52197265625, "loss": 0.023, "losses/dpo": 0.0009096666472032666, "losses/sft": 0.6525858640670776, "losses/total": 0.0009096666472032666, "ref_logps/chosen": -201.01222229003906, "ref_logps/rejected": -229.25184631347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.11125794053077698, "rewards/margins": 5.315756320953369, "rewards/rejected": -5.427014350891113, "step": 510 }, { "epoch": 0.12, "learning_rate": 1.9498666666666666e-07, "logps/chosen": -246.79261779785156, "logps/rejected": -285.3822021484375, "loss": 0.031, "losses/dpo": 0.0004984370898455381, "losses/sft": 0.5540149807929993, "losses/total": 0.0004984370898455381, "ref_logps/chosen": -246.42413330078125, "ref_logps/rejected": -232.95672607421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.03684854134917259, "rewards/margins": 5.205699920654297, "rewards/rejected": -5.242547988891602, "step": 511 }, { "epoch": 0.12, "learning_rate": 1.9493333333333333e-07, "logps/chosen": -201.35137939453125, "logps/rejected": -266.8376770019531, "loss": 0.0304, "losses/dpo": 0.0012851532083004713, "losses/sft": 0.6028497815132141, "losses/total": 0.0012851532083004713, "ref_logps/chosen": -201.17230224609375, "ref_logps/rejected": -215.91122436523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.017906472086906433, "rewards/margins": 5.074737071990967, "rewards/rejected": -5.092643737792969, "step": 512 }, { "epoch": 0.12, "learning_rate": 1.9488e-07, "logps/chosen": -216.60276794433594, "logps/rejected": -274.798583984375, "loss": 0.0437, "losses/dpo": 0.0006182650686241686, "losses/sft": 0.5461428761482239, "losses/total": 0.0006182650686241686, "ref_logps/chosen": -216.8577423095703, "ref_logps/rejected": -221.6320343017578, "rewards/accuracies": 1.0, "rewards/chosen": 0.025497108697891235, "rewards/margins": 5.3421525955200195, "rewards/rejected": -5.31665563583374, "step": 513 }, { "epoch": 0.12, "learning_rate": 1.9482666666666666e-07, "logps/chosen": -215.9144287109375, "logps/rejected": -278.709228515625, "loss": 0.0231, "losses/dpo": 0.0013716787798330188, "losses/sft": 0.5061013698577881, "losses/total": 0.0013716787798330188, "ref_logps/chosen": -215.7533721923828, "ref_logps/rejected": -228.57958984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.01610569655895233, "rewards/margins": 4.996859550476074, "rewards/rejected": -5.012965202331543, "step": 514 }, { "epoch": 0.12, "learning_rate": 1.9477333333333334e-07, "logps/chosen": -250.7087860107422, "logps/rejected": -284.9548645019531, "loss": 0.0386, "losses/dpo": 0.003928735386580229, "losses/sft": 0.6093958020210266, "losses/total": 0.003928735386580229, "ref_logps/chosen": -249.22445678710938, "ref_logps/rejected": -233.43878173828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.1484328955411911, "rewards/margins": 5.003173828125, "rewards/rejected": -5.151607036590576, "step": 515 }, { "epoch": 0.12, "learning_rate": 1.9471999999999999e-07, "logps/chosen": -242.50367736816406, "logps/rejected": -282.8984069824219, "loss": 0.038, "losses/dpo": 3.4305176086490974e-05, "losses/sft": 0.5046817064285278, "losses/total": 3.4305176086490974e-05, "ref_logps/chosen": -241.61236572265625, "ref_logps/rejected": -232.24241638183594, "rewards/accuracies": 1.0, "rewards/chosen": -0.08913049846887589, "rewards/margins": 4.976468563079834, "rewards/rejected": -5.06559944152832, "step": 516 }, { "epoch": 0.12, "learning_rate": 1.9466666666666666e-07, "logps/chosen": -194.62890625, "logps/rejected": -284.95892333984375, "loss": 0.0212, "losses/dpo": 0.0003045676276087761, "losses/sft": 0.6987672448158264, "losses/total": 0.0003045676276087761, "ref_logps/chosen": -194.14566040039062, "ref_logps/rejected": -225.08705139160156, "rewards/accuracies": 1.0, "rewards/chosen": -0.048326559364795685, "rewards/margins": 5.93886137008667, "rewards/rejected": -5.98718786239624, "step": 517 }, { "epoch": 0.12, "learning_rate": 1.946133333333333e-07, "logps/chosen": -231.07907104492188, "logps/rejected": -264.4855041503906, "loss": 0.0394, "losses/dpo": 0.00013708639016840607, "losses/sft": 0.6489329934120178, "losses/total": 0.00013708639016840607, "ref_logps/chosen": -230.90435791015625, "ref_logps/rejected": -215.66468811035156, "rewards/accuracies": 1.0, "rewards/chosen": -0.017473433166742325, "rewards/margins": 4.864609241485596, "rewards/rejected": -4.882082462310791, "step": 518 }, { "epoch": 0.12, "learning_rate": 1.9456e-07, "logps/chosen": -227.851318359375, "logps/rejected": -275.28692626953125, "loss": 0.0411, "losses/dpo": 0.0019740001298487186, "losses/sft": 0.3717336654663086, "losses/total": 0.0019740001298487186, "ref_logps/chosen": -226.22116088867188, "ref_logps/rejected": -220.55337524414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.16301649808883667, "rewards/margins": 5.310338973999023, "rewards/rejected": -5.473356246948242, "step": 519 }, { "epoch": 0.12, "learning_rate": 1.9450666666666667e-07, "logps/chosen": -258.08917236328125, "logps/rejected": -307.08917236328125, "loss": 0.0337, "losses/dpo": 8.781261385593098e-06, "losses/sft": 0.4752625524997711, "losses/total": 8.781261385593098e-06, "ref_logps/chosen": -257.34326171875, "ref_logps/rejected": -255.04034423828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.07459090650081635, "rewards/margins": 5.130293369293213, "rewards/rejected": -5.2048845291137695, "step": 520 }, { "epoch": 0.13, "learning_rate": 1.9445333333333332e-07, "logps/chosen": -222.65386962890625, "logps/rejected": -274.4390869140625, "loss": 0.0371, "losses/dpo": 0.00024077259877230972, "losses/sft": 0.6444891691207886, "losses/total": 0.00024077259877230972, "ref_logps/chosen": -222.90292358398438, "ref_logps/rejected": -224.24606323242188, "rewards/accuracies": 1.0, "rewards/chosen": 0.024904966354370117, "rewards/margins": 5.044208526611328, "rewards/rejected": -5.019303321838379, "step": 521 }, { "epoch": 0.13, "learning_rate": 1.944e-07, "logps/chosen": -248.44534301757812, "logps/rejected": -318.5035705566406, "loss": 0.031, "losses/dpo": 0.0015495363622903824, "losses/sft": 0.6780334115028381, "losses/total": 0.0015495363622903824, "ref_logps/chosen": -248.27871704101562, "ref_logps/rejected": -257.2454528808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.01666231080889702, "rewards/margins": 6.109152793884277, "rewards/rejected": -6.125814914703369, "step": 522 }, { "epoch": 0.13, "learning_rate": 1.9434666666666667e-07, "logps/chosen": -262.58538818359375, "logps/rejected": -288.0432434082031, "loss": 0.0523, "losses/dpo": 1.3305424545251299e-05, "losses/sft": 0.48947763442993164, "losses/total": 1.3305424545251299e-05, "ref_logps/chosen": -260.9541015625, "ref_logps/rejected": -230.58204650878906, "rewards/accuracies": 1.0, "rewards/chosen": -0.16313177347183228, "rewards/margins": 5.582988262176514, "rewards/rejected": -5.746120452880859, "step": 523 }, { "epoch": 0.13, "learning_rate": 1.9429333333333332e-07, "logps/chosen": -236.75247192382812, "logps/rejected": -302.0104064941406, "loss": 0.0227, "losses/dpo": 0.0058427173644304276, "losses/sft": 0.34488412737846375, "losses/total": 0.0058427173644304276, "ref_logps/chosen": -234.8700408935547, "ref_logps/rejected": -244.8223876953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.18824371695518494, "rewards/margins": 5.530555725097656, "rewards/rejected": -5.718799591064453, "step": 524 }, { "epoch": 0.13, "learning_rate": 1.9423999999999997e-07, "logps/chosen": -257.9975891113281, "logps/rejected": -259.31842041015625, "loss": 0.0881, "losses/dpo": 0.0005380308139137924, "losses/sft": 0.9496929049491882, "losses/total": 0.0005380308139137924, "ref_logps/chosen": -257.11187744140625, "ref_logps/rejected": -211.653564453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.08857065439224243, "rewards/margins": 4.677915573120117, "rewards/rejected": -4.766486167907715, "step": 525 }, { "epoch": 0.13, "learning_rate": 1.9418666666666665e-07, "logps/chosen": -256.76971435546875, "logps/rejected": -285.75439453125, "loss": 0.0338, "losses/dpo": 0.0010070811258628964, "losses/sft": 0.5763995051383972, "losses/total": 0.0010070811258628964, "ref_logps/chosen": -254.2735595703125, "ref_logps/rejected": -232.8920135498047, "rewards/accuracies": 1.0, "rewards/chosen": -0.2496155947446823, "rewards/margins": 5.036622047424316, "rewards/rejected": -5.286237716674805, "step": 526 }, { "epoch": 0.13, "learning_rate": 1.9413333333333332e-07, "logps/chosen": -225.6380157470703, "logps/rejected": -300.6522216796875, "loss": 0.0364, "losses/dpo": 0.0002042806736426428, "losses/sft": 0.5266841053962708, "losses/total": 0.0002042806736426428, "ref_logps/chosen": -224.55824279785156, "ref_logps/rejected": -246.26376342773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.10797493159770966, "rewards/margins": 5.330875396728516, "rewards/rejected": -5.438850402832031, "step": 527 }, { "epoch": 0.13, "learning_rate": 1.9408e-07, "logps/chosen": -218.62155151367188, "logps/rejected": -253.80438232421875, "loss": 0.0452, "losses/dpo": 6.682700040983036e-05, "losses/sft": 0.6495805382728577, "losses/total": 6.682700040983036e-05, "ref_logps/chosen": -217.1563720703125, "ref_logps/rejected": -202.9835968017578, "rewards/accuracies": 1.0, "rewards/chosen": -0.14651918411254883, "rewards/margins": 4.9355597496032715, "rewards/rejected": -5.08207893371582, "step": 528 }, { "epoch": 0.13, "learning_rate": 1.9402666666666665e-07, "logps/chosen": -216.2485809326172, "logps/rejected": -268.079833984375, "loss": 0.0238, "losses/dpo": 0.0326717384159565, "losses/sft": 0.5660243034362793, "losses/total": 0.0326717384159565, "ref_logps/chosen": -217.50311279296875, "ref_logps/rejected": -214.462646484375, "rewards/accuracies": 1.0, "rewards/chosen": 0.12545427680015564, "rewards/margins": 5.4871721267700195, "rewards/rejected": -5.361718654632568, "step": 529 }, { "epoch": 0.13, "learning_rate": 1.9397333333333333e-07, "logps/chosen": -253.84109497070312, "logps/rejected": -325.0635070800781, "loss": 0.0106, "losses/dpo": 0.00011307946260785684, "losses/sft": 0.5670117735862732, "losses/total": 0.00011307946260785684, "ref_logps/chosen": -252.71316528320312, "ref_logps/rejected": -264.0163879394531, "rewards/accuracies": 1.0, "rewards/chosen": -0.11279164254665375, "rewards/margins": 5.991922855377197, "rewards/rejected": -6.104714393615723, "step": 530 }, { "epoch": 0.13, "learning_rate": 1.9392e-07, "logps/chosen": -195.63177490234375, "logps/rejected": -250.7781982421875, "loss": 0.0395, "losses/dpo": 2.058716199826449e-05, "losses/sft": 0.8847008347511292, "losses/total": 2.058716199826449e-05, "ref_logps/chosen": -195.89382934570312, "ref_logps/rejected": -204.12603759765625, "rewards/accuracies": 1.0, "rewards/chosen": 0.026205461472272873, "rewards/margins": 4.691420555114746, "rewards/rejected": -4.665215015411377, "step": 531 }, { "epoch": 0.13, "learning_rate": 1.9386666666666666e-07, "logps/chosen": -221.90469360351562, "logps/rejected": -276.51141357421875, "loss": 0.0393, "losses/dpo": 0.0029748044908046722, "losses/sft": 0.718905508518219, "losses/total": 0.0029748044908046722, "ref_logps/chosen": -220.7667236328125, "ref_logps/rejected": -220.93942260742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.1137978658080101, "rewards/margins": 5.4434003829956055, "rewards/rejected": -5.557198524475098, "step": 532 }, { "epoch": 0.13, "learning_rate": 1.938133333333333e-07, "logps/chosen": -183.4857635498047, "logps/rejected": -246.24029541015625, "loss": 0.0384, "losses/dpo": 6.828863843111321e-05, "losses/sft": 0.5733599066734314, "losses/total": 6.828863843111321e-05, "ref_logps/chosen": -183.7529296875, "ref_logps/rejected": -193.29513549804688, "rewards/accuracies": 1.0, "rewards/chosen": 0.026713762432336807, "rewards/margins": 5.321229934692383, "rewards/rejected": -5.294516086578369, "step": 533 }, { "epoch": 0.13, "learning_rate": 1.9375999999999998e-07, "logps/chosen": -241.54925537109375, "logps/rejected": -292.4668884277344, "loss": 0.0241, "losses/dpo": 0.0017909365706145763, "losses/sft": 0.5136231184005737, "losses/total": 0.0017909365706145763, "ref_logps/chosen": -240.90977478027344, "ref_logps/rejected": -236.54940795898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.06394743919372559, "rewards/margins": 5.527799129486084, "rewards/rejected": -5.591746807098389, "step": 534 }, { "epoch": 0.13, "learning_rate": 1.9370666666666666e-07, "logps/chosen": -265.9327392578125, "logps/rejected": -296.93182373046875, "loss": 0.0376, "losses/dpo": 0.0005725742666982114, "losses/sft": 0.4931681454181671, "losses/total": 0.0005725742666982114, "ref_logps/chosen": -264.3113098144531, "ref_logps/rejected": -240.20880126953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.16214317083358765, "rewards/margins": 5.510158538818359, "rewards/rejected": -5.672301292419434, "step": 535 }, { "epoch": 0.13, "learning_rate": 1.9365333333333334e-07, "logps/chosen": -251.86354064941406, "logps/rejected": -306.0887451171875, "loss": 0.0254, "losses/dpo": 5.9696587413782254e-05, "losses/sft": 0.7248809337615967, "losses/total": 5.9696587413782254e-05, "ref_logps/chosen": -249.57395935058594, "ref_logps/rejected": -246.1682891845703, "rewards/accuracies": 1.0, "rewards/chosen": -0.22895768284797668, "rewards/margins": 5.763088226318359, "rewards/rejected": -5.992045879364014, "step": 536 }, { "epoch": 0.13, "learning_rate": 1.9359999999999999e-07, "logps/chosen": -233.3856201171875, "logps/rejected": -282.7174072265625, "loss": 0.024, "losses/dpo": 4.424824965099106e-06, "losses/sft": 0.44362032413482666, "losses/total": 4.424824965099106e-06, "ref_logps/chosen": -232.78436279296875, "ref_logps/rejected": -218.7183380126953, "rewards/accuracies": 1.0, "rewards/chosen": -0.060126278549432755, "rewards/margins": 6.339779853820801, "rewards/rejected": -6.399907112121582, "step": 537 }, { "epoch": 0.13, "learning_rate": 1.9354666666666666e-07, "logps/chosen": -236.1624755859375, "logps/rejected": -298.8553771972656, "loss": 0.0309, "losses/dpo": 0.016136709600687027, "losses/sft": 0.4755387008190155, "losses/total": 0.016136709600687027, "ref_logps/chosen": -234.61033630371094, "ref_logps/rejected": -241.75511169433594, "rewards/accuracies": 1.0, "rewards/chosen": -0.155213862657547, "rewards/margins": 5.554811954498291, "rewards/rejected": -5.710025787353516, "step": 538 }, { "epoch": 0.13, "learning_rate": 1.9349333333333334e-07, "logps/chosen": -208.27157592773438, "logps/rejected": -270.810791015625, "loss": 0.0456, "losses/dpo": 5.222758318268461e-06, "losses/sft": 0.637454628944397, "losses/total": 5.222758318268461e-06, "ref_logps/chosen": -207.28736877441406, "ref_logps/rejected": -212.9489288330078, "rewards/accuracies": 1.0, "rewards/chosen": -0.09841947257518768, "rewards/margins": 5.687768459320068, "rewards/rejected": -5.786187648773193, "step": 539 }, { "epoch": 0.13, "learning_rate": 1.9344e-07, "logps/chosen": -241.60476684570312, "logps/rejected": -290.7644348144531, "loss": 0.0382, "losses/dpo": 0.003742544213309884, "losses/sft": 0.7781136631965637, "losses/total": 0.003742544213309884, "ref_logps/chosen": -239.81130981445312, "ref_logps/rejected": -233.16355895996094, "rewards/accuracies": 1.0, "rewards/chosen": -0.17934754490852356, "rewards/margins": 5.580739498138428, "rewards/rejected": -5.760087490081787, "step": 540 }, { "epoch": 0.13, "learning_rate": 1.9338666666666664e-07, "logps/chosen": -253.95486450195312, "logps/rejected": -240.89805603027344, "loss": 0.0416, "losses/dpo": 0.0002801981463562697, "losses/sft": 0.6092159748077393, "losses/total": 0.0002801981463562697, "ref_logps/chosen": -252.51815795898438, "ref_logps/rejected": -193.0398406982422, "rewards/accuracies": 1.0, "rewards/chosen": -0.14367108047008514, "rewards/margins": 4.642149925231934, "rewards/rejected": -4.785820960998535, "step": 541 }, { "epoch": 0.13, "learning_rate": 1.9333333333333332e-07, "logps/chosen": -212.02157592773438, "logps/rejected": -252.56060791015625, "loss": 0.0356, "losses/dpo": 5.0246941100340337e-05, "losses/sft": 0.4879094362258911, "losses/total": 5.0246941100340337e-05, "ref_logps/chosen": -212.20257568359375, "ref_logps/rejected": -204.86866760253906, "rewards/accuracies": 1.0, "rewards/chosen": 0.018097661435604095, "rewards/margins": 4.787290573120117, "rewards/rejected": -4.769192695617676, "step": 542 }, { "epoch": 0.13, "learning_rate": 1.9328e-07, "logps/chosen": -223.86099243164062, "logps/rejected": -278.5662841796875, "loss": 0.0553, "losses/dpo": 0.001249072840437293, "losses/sft": 0.8128189444541931, "losses/total": 0.001249072840437293, "ref_logps/chosen": -222.9125518798828, "ref_logps/rejected": -227.2410888671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.09484565258026123, "rewards/margins": 5.037673473358154, "rewards/rejected": -5.132518768310547, "step": 543 }, { "epoch": 0.13, "learning_rate": 1.9322666666666664e-07, "logps/chosen": -241.1956329345703, "logps/rejected": -290.20654296875, "loss": 0.0309, "losses/dpo": 0.0008908970630727708, "losses/sft": 0.6907840967178345, "losses/total": 0.0008908970630727708, "ref_logps/chosen": -241.55239868164062, "ref_logps/rejected": -232.42283630371094, "rewards/accuracies": 1.0, "rewards/chosen": 0.03567618131637573, "rewards/margins": 5.8140482902526855, "rewards/rejected": -5.778371810913086, "step": 544 }, { "epoch": 0.13, "learning_rate": 1.9317333333333332e-07, "logps/chosen": -222.68215942382812, "logps/rejected": -298.9481201171875, "loss": 0.0251, "losses/dpo": 0.002448056358844042, "losses/sft": 0.5672305822372437, "losses/total": 0.002448056358844042, "ref_logps/chosen": -222.36595153808594, "ref_logps/rejected": -240.8396453857422, "rewards/accuracies": 1.0, "rewards/chosen": -0.031621046364307404, "rewards/margins": 5.779228210449219, "rewards/rejected": -5.810849189758301, "step": 545 }, { "epoch": 0.13, "learning_rate": 1.9312e-07, "logps/chosen": -233.56097412109375, "logps/rejected": -284.82586669921875, "loss": 0.0272, "losses/dpo": 6.647936061199289e-06, "losses/sft": 1.133476734161377, "losses/total": 6.647936061199289e-06, "ref_logps/chosen": -231.01980590820312, "ref_logps/rejected": -226.62001037597656, "rewards/accuracies": 1.0, "rewards/chosen": -0.2541171908378601, "rewards/margins": 5.566469192504883, "rewards/rejected": -5.820586204528809, "step": 546 }, { "epoch": 0.13, "learning_rate": 1.9306666666666667e-07, "logps/chosen": -212.05300903320312, "logps/rejected": -272.5029296875, "loss": 0.0348, "losses/dpo": 0.00034179314388893545, "losses/sft": 0.5320177674293518, "losses/total": 0.00034179314388893545, "ref_logps/chosen": -211.42510986328125, "ref_logps/rejected": -214.8641357421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.06279139220714569, "rewards/margins": 5.701086044311523, "rewards/rejected": -5.7638773918151855, "step": 547 }, { "epoch": 0.13, "learning_rate": 1.9301333333333333e-07, "logps/chosen": -201.613037109375, "logps/rejected": -238.49183654785156, "loss": 0.0418, "losses/dpo": 0.08885990083217621, "losses/sft": 0.5884774923324585, "losses/total": 0.08885990083217621, "ref_logps/chosen": -199.83062744140625, "ref_logps/rejected": -191.81005859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.1782413274049759, "rewards/margins": 4.4899373054504395, "rewards/rejected": -4.668178558349609, "step": 548 }, { "epoch": 0.13, "learning_rate": 1.9296e-07, "logps/chosen": -230.08843994140625, "logps/rejected": -276.41741943359375, "loss": 0.0484, "losses/dpo": 0.0002862724650185555, "losses/sft": 0.38048285245895386, "losses/total": 0.0002862724650185555, "ref_logps/chosen": -228.91751098632812, "ref_logps/rejected": -222.30799865722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.11709423363208771, "rewards/margins": 5.293846607208252, "rewards/rejected": -5.410941123962402, "step": 549 }, { "epoch": 0.13, "learning_rate": 1.9290666666666665e-07, "logps/chosen": -204.81715393066406, "logps/rejected": -283.7685852050781, "loss": 0.0266, "losses/dpo": 0.0006286029238253832, "losses/sft": 0.58090740442276, "losses/total": 0.0006286029238253832, "ref_logps/chosen": -204.0070037841797, "ref_logps/rejected": -224.8516845703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.08101440966129303, "rewards/margins": 5.810674667358398, "rewards/rejected": -5.891688346862793, "step": 550 }, { "epoch": 0.13, "learning_rate": 1.9285333333333333e-07, "logps/chosen": -255.91510009765625, "logps/rejected": -287.5010070800781, "loss": 0.0258, "losses/dpo": 0.0022182229440659285, "losses/sft": 0.5010700225830078, "losses/total": 0.0022182229440659285, "ref_logps/chosen": -253.98056030273438, "ref_logps/rejected": -231.45358276367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.19345623254776, "rewards/margins": 5.411285877227783, "rewards/rejected": -5.604742050170898, "step": 551 }, { "epoch": 0.13, "learning_rate": 1.9279999999999998e-07, "logps/chosen": -217.3665313720703, "logps/rejected": -271.4977111816406, "loss": 0.0438, "losses/dpo": 8.697545854374766e-05, "losses/sft": 0.5978262424468994, "losses/total": 8.697545854374766e-05, "ref_logps/chosen": -216.84658813476562, "ref_logps/rejected": -218.40687561035156, "rewards/accuracies": 1.0, "rewards/chosen": -0.051994454115629196, "rewards/margins": 5.257087230682373, "rewards/rejected": -5.30908203125, "step": 552 }, { "epoch": 0.13, "learning_rate": 1.9274666666666666e-07, "logps/chosen": -244.20059204101562, "logps/rejected": -296.51727294921875, "loss": 0.0176, "losses/dpo": 0.00042614812264218926, "losses/sft": 0.6299421191215515, "losses/total": 0.00042614812264218926, "ref_logps/chosen": -244.69488525390625, "ref_logps/rejected": -240.3931121826172, "rewards/accuracies": 1.0, "rewards/chosen": 0.04943039268255234, "rewards/margins": 5.661843776702881, "rewards/rejected": -5.61241340637207, "step": 553 }, { "epoch": 0.13, "learning_rate": 1.9269333333333333e-07, "logps/chosen": -234.94659423828125, "logps/rejected": -257.1839904785156, "loss": 0.0228, "losses/dpo": 0.002118112286552787, "losses/sft": 0.39747506380081177, "losses/total": 0.002118112286552787, "ref_logps/chosen": -234.03309631347656, "ref_logps/rejected": -204.25985717773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.09134909510612488, "rewards/margins": 5.201066017150879, "rewards/rejected": -5.292415142059326, "step": 554 }, { "epoch": 0.13, "learning_rate": 1.9263999999999998e-07, "logps/chosen": -220.91978454589844, "logps/rejected": -262.77154541015625, "loss": 0.0409, "losses/dpo": 0.00046852396917529404, "losses/sft": 0.6854633092880249, "losses/total": 0.00046852396917529404, "ref_logps/chosen": -219.78607177734375, "ref_logps/rejected": -205.50340270996094, "rewards/accuracies": 1.0, "rewards/chosen": -0.11337100714445114, "rewards/margins": 5.6134443283081055, "rewards/rejected": -5.726815700531006, "step": 555 }, { "epoch": 0.13, "learning_rate": 1.9258666666666666e-07, "logps/chosen": -244.59490966796875, "logps/rejected": -295.1272277832031, "loss": 0.0197, "losses/dpo": 0.00012511678505688906, "losses/sft": 0.4997722804546356, "losses/total": 0.00012511678505688906, "ref_logps/chosen": -242.6902618408203, "ref_logps/rejected": -229.8974609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.19046452641487122, "rewards/margins": 6.332510948181152, "rewards/rejected": -6.522975921630859, "step": 556 }, { "epoch": 0.13, "learning_rate": 1.9253333333333334e-07, "logps/chosen": -190.81741333007812, "logps/rejected": -255.6485137939453, "loss": 0.0481, "losses/dpo": 0.004096269607543945, "losses/sft": 0.556681215763092, "losses/total": 0.004096269607543945, "ref_logps/chosen": -189.8600616455078, "ref_logps/rejected": -202.8923797607422, "rewards/accuracies": 1.0, "rewards/chosen": -0.09573526680469513, "rewards/margins": 5.1798787117004395, "rewards/rejected": -5.275613784790039, "step": 557 }, { "epoch": 0.13, "learning_rate": 1.9248e-07, "logps/chosen": -194.78839111328125, "logps/rejected": -277.9974060058594, "loss": 0.025, "losses/dpo": 0.000448686012532562, "losses/sft": 0.488455593585968, "losses/total": 0.000448686012532562, "ref_logps/chosen": -195.61294555664062, "ref_logps/rejected": -221.4540557861328, "rewards/accuracies": 1.0, "rewards/chosen": 0.08245579898357391, "rewards/margins": 5.736791133880615, "rewards/rejected": -5.6543354988098145, "step": 558 }, { "epoch": 0.13, "learning_rate": 1.9242666666666664e-07, "logps/chosen": -247.3877716064453, "logps/rejected": -298.8175354003906, "loss": 0.0171, "losses/dpo": 9.034449612954631e-05, "losses/sft": 0.5542526245117188, "losses/total": 9.034449612954631e-05, "ref_logps/chosen": -245.61187744140625, "ref_logps/rejected": -239.13140869140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.1775892674922943, "rewards/margins": 5.791022300720215, "rewards/rejected": -5.968612194061279, "step": 559 }, { "epoch": 0.13, "learning_rate": 1.9237333333333331e-07, "logps/chosen": -178.5565948486328, "logps/rejected": -259.7945251464844, "loss": 0.0528, "losses/dpo": 0.0014604537282139063, "losses/sft": 0.610127866268158, "losses/total": 0.0014604537282139063, "ref_logps/chosen": -177.83705139160156, "ref_logps/rejected": -206.67901611328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.07195277512073517, "rewards/margins": 5.239598274230957, "rewards/rejected": -5.311551094055176, "step": 560 }, { "epoch": 0.13, "learning_rate": 1.9232e-07, "logps/chosen": -244.419921875, "logps/rejected": -315.5182189941406, "loss": 0.0125, "losses/dpo": 0.0004977818462066352, "losses/sft": 0.507482647895813, "losses/total": 0.0004977818462066352, "ref_logps/chosen": -243.14239501953125, "ref_logps/rejected": -245.34637451171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.12775246798992157, "rewards/margins": 6.889431953430176, "rewards/rejected": -7.017185211181641, "step": 561 }, { "epoch": 0.13, "learning_rate": 1.9226666666666667e-07, "logps/chosen": -234.33230590820312, "logps/rejected": -291.346923828125, "loss": 0.0374, "losses/dpo": 0.0007299612043425441, "losses/sft": 0.5352464914321899, "losses/total": 0.0007299612043425441, "ref_logps/chosen": -233.20556640625, "ref_logps/rejected": -232.006103515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.1126740574836731, "rewards/margins": 5.821408271789551, "rewards/rejected": -5.93408203125, "step": 562 }, { "epoch": 0.14, "learning_rate": 1.9221333333333332e-07, "logps/chosen": -192.34844970703125, "logps/rejected": -273.0919189453125, "loss": 0.0163, "losses/dpo": 0.0007374222041107714, "losses/sft": 1.067307710647583, "losses/total": 0.0007374222041107714, "ref_logps/chosen": -192.3092803955078, "ref_logps/rejected": -217.35226440429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.003918953239917755, "rewards/margins": 5.570046424865723, "rewards/rejected": -5.573966026306152, "step": 563 }, { "epoch": 0.14, "learning_rate": 1.9216e-07, "logps/chosen": -221.08050537109375, "logps/rejected": -298.5184326171875, "loss": 0.0119, "losses/dpo": 0.00031432282412424684, "losses/sft": 0.5590086579322815, "losses/total": 0.00031432282412424684, "ref_logps/chosen": -219.0028076171875, "ref_logps/rejected": -238.83978271484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.20777152478694916, "rewards/margins": 5.760094165802002, "rewards/rejected": -5.967865943908691, "step": 564 }, { "epoch": 0.14, "learning_rate": 1.9210666666666667e-07, "logps/chosen": -263.0570373535156, "logps/rejected": -291.01214599609375, "loss": 0.0359, "losses/dpo": 9.987186058424413e-05, "losses/sft": 0.574700117111206, "losses/total": 9.987186058424413e-05, "ref_logps/chosen": -259.5683898925781, "ref_logps/rejected": -230.26919555664062, "rewards/accuracies": 0.96875, "rewards/chosen": -0.34886568784713745, "rewards/margins": 5.725431442260742, "rewards/rejected": -6.074296951293945, "step": 565 }, { "epoch": 0.14, "learning_rate": 1.9205333333333332e-07, "logps/chosen": -215.0703887939453, "logps/rejected": -300.2003479003906, "loss": 0.0209, "losses/dpo": 0.00022576873016078025, "losses/sft": 0.49733513593673706, "losses/total": 0.00022576873016078025, "ref_logps/chosen": -214.91641235351562, "ref_logps/rejected": -239.60922241210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.015397297218441963, "rewards/margins": 6.0437164306640625, "rewards/rejected": -6.0591139793396, "step": 566 }, { "epoch": 0.14, "learning_rate": 1.9199999999999997e-07, "logps/chosen": -211.98570251464844, "logps/rejected": -266.9590148925781, "loss": 0.0481, "losses/dpo": 7.757305866107345e-05, "losses/sft": 0.6436522006988525, "losses/total": 7.757305866107345e-05, "ref_logps/chosen": -209.87484741210938, "ref_logps/rejected": -214.2095947265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.21108399331569672, "rewards/margins": 5.063857078552246, "rewards/rejected": -5.274941444396973, "step": 567 }, { "epoch": 0.14, "learning_rate": 1.9194666666666665e-07, "logps/chosen": -230.71597290039062, "logps/rejected": -306.76708984375, "loss": 0.017, "losses/dpo": 0.00011656402057269588, "losses/sft": 0.4221608340740204, "losses/total": 0.00011656402057269588, "ref_logps/chosen": -231.0066680908203, "ref_logps/rejected": -244.5668182373047, "rewards/accuracies": 1.0, "rewards/chosen": 0.02907133474946022, "rewards/margins": 6.249095439910889, "rewards/rejected": -6.220024108886719, "step": 568 }, { "epoch": 0.14, "learning_rate": 1.9189333333333333e-07, "logps/chosen": -234.2073974609375, "logps/rejected": -259.3880615234375, "loss": 0.0561, "losses/dpo": 0.002925461158156395, "losses/sft": 0.7035113573074341, "losses/total": 0.002925461158156395, "ref_logps/chosen": -232.43994140625, "ref_logps/rejected": -207.65318298339844, "rewards/accuracies": 1.0, "rewards/chosen": -0.1767461597919464, "rewards/margins": 4.996741771697998, "rewards/rejected": -5.173488140106201, "step": 569 }, { "epoch": 0.14, "learning_rate": 1.9184e-07, "logps/chosen": -260.6943359375, "logps/rejected": -288.30804443359375, "loss": 0.0175, "losses/dpo": 0.02983704023063183, "losses/sft": 0.644736111164093, "losses/total": 0.02983704023063183, "ref_logps/chosen": -260.0293884277344, "ref_logps/rejected": -232.286376953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.06649388372898102, "rewards/margins": 5.535675048828125, "rewards/rejected": -5.602168560028076, "step": 570 }, { "epoch": 0.14, "learning_rate": 1.9178666666666665e-07, "logps/chosen": -242.77574157714844, "logps/rejected": -319.10162353515625, "loss": 0.0152, "losses/dpo": 0.00010557468340266496, "losses/sft": 1.1349139213562012, "losses/total": 0.00010557468340266496, "ref_logps/chosen": -242.04832458496094, "ref_logps/rejected": -260.8663635253906, "rewards/accuracies": 1.0, "rewards/chosen": -0.07274206727743149, "rewards/margins": 5.750782012939453, "rewards/rejected": -5.823524475097656, "step": 571 }, { "epoch": 0.14, "learning_rate": 1.9173333333333333e-07, "logps/chosen": -260.534912109375, "logps/rejected": -302.0492858886719, "loss": 0.0397, "losses/dpo": 2.361161750741303e-05, "losses/sft": 0.9967859983444214, "losses/total": 2.361161750741303e-05, "ref_logps/chosen": -258.47314453125, "ref_logps/rejected": -240.0457000732422, "rewards/accuracies": 1.0, "rewards/chosen": -0.20617729425430298, "rewards/margins": 5.994181156158447, "rewards/rejected": -6.2003583908081055, "step": 572 }, { "epoch": 0.14, "learning_rate": 1.9168e-07, "logps/chosen": -270.11688232421875, "logps/rejected": -302.1210632324219, "loss": 0.0339, "losses/dpo": 5.125437382957898e-05, "losses/sft": 0.5037969350814819, "losses/total": 5.125437382957898e-05, "ref_logps/chosen": -269.8896789550781, "ref_logps/rejected": -240.81576538085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.02271992154419422, "rewards/margins": 6.1078081130981445, "rewards/rejected": -6.130528450012207, "step": 573 }, { "epoch": 0.14, "learning_rate": 1.9162666666666666e-07, "logps/chosen": -189.38644409179688, "logps/rejected": -284.4112548828125, "loss": 0.0148, "losses/dpo": 6.321738328551874e-05, "losses/sft": 0.5406598448753357, "losses/total": 6.321738328551874e-05, "ref_logps/chosen": -188.36317443847656, "ref_logps/rejected": -223.8560791015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.10232722759246826, "rewards/margins": 5.95319128036499, "rewards/rejected": -6.05551815032959, "step": 574 }, { "epoch": 0.14, "learning_rate": 1.915733333333333e-07, "logps/chosen": -211.54115295410156, "logps/rejected": -271.34033203125, "loss": 0.03, "losses/dpo": 0.00021936548000667244, "losses/sft": 0.5747847557067871, "losses/total": 0.00021936548000667244, "ref_logps/chosen": -210.27566528320312, "ref_logps/rejected": -213.89401245117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.1265488713979721, "rewards/margins": 5.618083477020264, "rewards/rejected": -5.744632720947266, "step": 575 }, { "epoch": 0.14, "learning_rate": 1.9151999999999998e-07, "logps/chosen": -236.50851440429688, "logps/rejected": -279.71356201171875, "loss": 0.0303, "losses/dpo": 0.000328696274664253, "losses/sft": 0.6312247514724731, "losses/total": 0.000328696274664253, "ref_logps/chosen": -235.28297424316406, "ref_logps/rejected": -223.30726623535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.1225530356168747, "rewards/margins": 5.518075942993164, "rewards/rejected": -5.640628814697266, "step": 576 }, { "epoch": 0.14, "learning_rate": 1.9146666666666666e-07, "logps/chosen": -233.7297821044922, "logps/rejected": -302.3677978515625, "loss": 0.0232, "losses/dpo": 0.003844582475721836, "losses/sft": 0.8064451217651367, "losses/total": 0.003844582475721836, "ref_logps/chosen": -231.92608642578125, "ref_logps/rejected": -241.1112060546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1803688257932663, "rewards/margins": 5.94528865814209, "rewards/rejected": -6.125657081604004, "step": 577 }, { "epoch": 0.14, "learning_rate": 1.914133333333333e-07, "logps/chosen": -269.6802978515625, "logps/rejected": -293.4240417480469, "loss": 0.0233, "losses/dpo": 0.0032706810161471367, "losses/sft": 0.49323102831840515, "losses/total": 0.0032706810161471367, "ref_logps/chosen": -266.56353759765625, "ref_logps/rejected": -227.3341064453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.31167370080947876, "rewards/margins": 6.297318458557129, "rewards/rejected": -6.608992099761963, "step": 578 }, { "epoch": 0.14, "learning_rate": 1.9136e-07, "logps/chosen": -259.9886169433594, "logps/rejected": -328.94207763671875, "loss": 0.0382, "losses/dpo": 0.009131121449172497, "losses/sft": 0.5788549184799194, "losses/total": 0.009131121449172497, "ref_logps/chosen": -258.0262451171875, "ref_logps/rejected": -260.99835205078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.19623804092407227, "rewards/margins": 6.598134994506836, "rewards/rejected": -6.794373035430908, "step": 579 }, { "epoch": 0.14, "learning_rate": 1.9130666666666666e-07, "logps/chosen": -223.94268798828125, "logps/rejected": -319.7686767578125, "loss": 0.0129, "losses/dpo": 0.0001507865235907957, "losses/sft": 0.8813892602920532, "losses/total": 0.0001507865235907957, "ref_logps/chosen": -222.0897674560547, "ref_logps/rejected": -258.16497802734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.18529179692268372, "rewards/margins": 5.9750800132751465, "rewards/rejected": -6.16037130355835, "step": 580 }, { "epoch": 0.14, "learning_rate": 1.9125333333333334e-07, "logps/chosen": -219.92037963867188, "logps/rejected": -269.48712158203125, "loss": 0.0463, "losses/dpo": 1.9114075257675722e-05, "losses/sft": 0.4035121500492096, "losses/total": 1.9114075257675722e-05, "ref_logps/chosen": -218.885498046875, "ref_logps/rejected": -217.56204223632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.10348939895629883, "rewards/margins": 5.089017868041992, "rewards/rejected": -5.192507743835449, "step": 581 }, { "epoch": 0.14, "learning_rate": 1.912e-07, "logps/chosen": -226.23390197753906, "logps/rejected": -255.25552368164062, "loss": 0.0312, "losses/dpo": 0.0019830605015158653, "losses/sft": 0.6803484559059143, "losses/total": 0.0019830605015158653, "ref_logps/chosen": -226.1061553955078, "ref_logps/rejected": -202.7032928466797, "rewards/accuracies": 1.0, "rewards/chosen": -0.012771908193826675, "rewards/margins": 5.2424516677856445, "rewards/rejected": -5.255223274230957, "step": 582 }, { "epoch": 0.14, "learning_rate": 1.9114666666666667e-07, "logps/chosen": -236.30555725097656, "logps/rejected": -279.0375671386719, "loss": 0.0508, "losses/dpo": 6.309500167844817e-05, "losses/sft": 0.596363365650177, "losses/total": 6.309500167844817e-05, "ref_logps/chosen": -234.59632873535156, "ref_logps/rejected": -223.23365783691406, "rewards/accuracies": 1.0, "rewards/chosen": -0.17092294991016388, "rewards/margins": 5.409466743469238, "rewards/rejected": -5.580389499664307, "step": 583 }, { "epoch": 0.14, "learning_rate": 1.9109333333333332e-07, "logps/chosen": -249.85186767578125, "logps/rejected": -305.06793212890625, "loss": 0.0145, "losses/dpo": 0.0026168227195739746, "losses/sft": 0.4869450628757477, "losses/total": 0.0026168227195739746, "ref_logps/chosen": -248.86953735351562, "ref_logps/rejected": -244.6089324951172, "rewards/accuracies": 1.0, "rewards/chosen": -0.09823320060968399, "rewards/margins": 5.947667598724365, "rewards/rejected": -6.045900821685791, "step": 584 }, { "epoch": 0.14, "learning_rate": 1.9104e-07, "logps/chosen": -212.30673217773438, "logps/rejected": -271.20611572265625, "loss": 0.0362, "losses/dpo": 0.006482822820544243, "losses/sft": 0.6259345412254333, "losses/total": 0.006482822820544243, "ref_logps/chosen": -210.77122497558594, "ref_logps/rejected": -212.41537475585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.15355214476585388, "rewards/margins": 5.725521087646484, "rewards/rejected": -5.879073143005371, "step": 585 }, { "epoch": 0.14, "learning_rate": 1.9098666666666665e-07, "logps/chosen": -209.62017822265625, "logps/rejected": -259.0879211425781, "loss": 0.0282, "losses/dpo": 0.0008841166854836047, "losses/sft": 0.3690822124481201, "losses/total": 0.0008841166854836047, "ref_logps/chosen": -208.83114624023438, "ref_logps/rejected": -202.8600616455078, "rewards/accuracies": 1.0, "rewards/chosen": -0.07890382409095764, "rewards/margins": 5.543883800506592, "rewards/rejected": -5.622787952423096, "step": 586 }, { "epoch": 0.14, "learning_rate": 1.9093333333333332e-07, "logps/chosen": -205.12420654296875, "logps/rejected": -267.40533447265625, "loss": 0.0317, "losses/dpo": 0.00039186267531476915, "losses/sft": 0.5964657664299011, "losses/total": 0.00039186267531476915, "ref_logps/chosen": -204.65919494628906, "ref_logps/rejected": -211.32986450195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.04650215432047844, "rewards/margins": 5.561043739318848, "rewards/rejected": -5.607546329498291, "step": 587 }, { "epoch": 0.14, "learning_rate": 1.9088e-07, "logps/chosen": -247.09576416015625, "logps/rejected": -303.8192443847656, "loss": 0.0131, "losses/dpo": 7.001708581810817e-05, "losses/sft": 0.7461417317390442, "losses/total": 7.001708581810817e-05, "ref_logps/chosen": -246.97247314453125, "ref_logps/rejected": -240.08177185058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.012329615652561188, "rewards/margins": 6.361418724060059, "rewards/rejected": -6.373748779296875, "step": 588 }, { "epoch": 0.14, "learning_rate": 1.9082666666666668e-07, "logps/chosen": -218.10292053222656, "logps/rejected": -262.2877502441406, "loss": 0.0668, "losses/dpo": 0.00010398645827081054, "losses/sft": 0.6142202615737915, "losses/total": 0.00010398645827081054, "ref_logps/chosen": -217.09689331054688, "ref_logps/rejected": -211.58065795898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.10060305893421173, "rewards/margins": 4.970107078552246, "rewards/rejected": -5.070710182189941, "step": 589 }, { "epoch": 0.14, "learning_rate": 1.9077333333333333e-07, "logps/chosen": -235.33090209960938, "logps/rejected": -289.07684326171875, "loss": 0.0333, "losses/dpo": 0.0002912480558734387, "losses/sft": 0.4805985987186432, "losses/total": 0.0002912480558734387, "ref_logps/chosen": -233.94747924804688, "ref_logps/rejected": -225.25244140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.13834059238433838, "rewards/margins": 6.244098663330078, "rewards/rejected": -6.382438659667969, "step": 590 }, { "epoch": 0.14, "learning_rate": 1.9072e-07, "logps/chosen": -260.3480224609375, "logps/rejected": -316.77728271484375, "loss": 0.0153, "losses/dpo": 0.0031861017923802137, "losses/sft": 0.4198523461818695, "losses/total": 0.0031861017923802137, "ref_logps/chosen": -257.5816345214844, "ref_logps/rejected": -250.93893432617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.2766399085521698, "rewards/margins": 6.3071980476379395, "rewards/rejected": -6.58383846282959, "step": 591 }, { "epoch": 0.14, "learning_rate": 1.9066666666666665e-07, "logps/chosen": -243.22552490234375, "logps/rejected": -305.4575500488281, "loss": 0.0245, "losses/dpo": 3.618007031036541e-05, "losses/sft": 0.6609069108963013, "losses/total": 3.618007031036541e-05, "ref_logps/chosen": -241.060546875, "ref_logps/rejected": -242.30169677734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.21649938821792603, "rewards/margins": 6.099085807800293, "rewards/rejected": -6.315585136413574, "step": 592 }, { "epoch": 0.14, "learning_rate": 1.906133333333333e-07, "logps/chosen": -235.42616271972656, "logps/rejected": -295.6114501953125, "loss": 0.0213, "losses/dpo": 4.923470987705514e-05, "losses/sft": 0.9894939661026001, "losses/total": 4.923470987705514e-05, "ref_logps/chosen": -234.72967529296875, "ref_logps/rejected": -234.86317443847656, "rewards/accuracies": 1.0, "rewards/chosen": -0.06964743882417679, "rewards/margins": 6.005181312561035, "rewards/rejected": -6.074828624725342, "step": 593 }, { "epoch": 0.14, "learning_rate": 1.9055999999999998e-07, "logps/chosen": -226.8382110595703, "logps/rejected": -266.85870361328125, "loss": 0.038, "losses/dpo": 0.0013178784865885973, "losses/sft": 0.5638512969017029, "losses/total": 0.0013178784865885973, "ref_logps/chosen": -225.01197814941406, "ref_logps/rejected": -208.75746154785156, "rewards/accuracies": 1.0, "rewards/chosen": -0.1826229691505432, "rewards/margins": 5.627500534057617, "rewards/rejected": -5.810122966766357, "step": 594 }, { "epoch": 0.14, "learning_rate": 1.9050666666666666e-07, "logps/chosen": -240.961181640625, "logps/rejected": -306.25921630859375, "loss": 0.0304, "losses/dpo": 0.011628915555775166, "losses/sft": 0.49958324432373047, "losses/total": 0.011628915555775166, "ref_logps/chosen": -239.00177001953125, "ref_logps/rejected": -243.42599487304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.1959385871887207, "rewards/margins": 6.087383270263672, "rewards/rejected": -6.283321380615234, "step": 595 }, { "epoch": 0.14, "learning_rate": 1.9045333333333333e-07, "logps/chosen": -225.70864868164062, "logps/rejected": -298.14959716796875, "loss": 0.0237, "losses/dpo": 0.0003870667133014649, "losses/sft": 0.5861716270446777, "losses/total": 0.0003870667133014649, "ref_logps/chosen": -224.13851928710938, "ref_logps/rejected": -233.21055603027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.15701404213905334, "rewards/margins": 6.3368916511535645, "rewards/rejected": -6.493906021118164, "step": 596 }, { "epoch": 0.14, "learning_rate": 1.9039999999999998e-07, "logps/chosen": -225.8697509765625, "logps/rejected": -293.3359680175781, "loss": 0.0545, "losses/dpo": 1.2040800356771797e-06, "losses/sft": 0.458538681268692, "losses/total": 1.2040800356771797e-06, "ref_logps/chosen": -224.50096130371094, "ref_logps/rejected": -232.6795654296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.13687795400619507, "rewards/margins": 5.928763389587402, "rewards/rejected": -6.065640926361084, "step": 597 }, { "epoch": 0.14, "learning_rate": 1.9034666666666666e-07, "logps/chosen": -225.10281372070312, "logps/rejected": -266.64544677734375, "loss": 0.0495, "losses/dpo": 1.4914094208506867e-05, "losses/sft": 0.654615581035614, "losses/total": 1.4914094208506867e-05, "ref_logps/chosen": -224.0142059326172, "ref_logps/rejected": -213.54771423339844, "rewards/accuracies": 1.0, "rewards/chosen": -0.10885941982269287, "rewards/margins": 5.20091438293457, "rewards/rejected": -5.3097734451293945, "step": 598 }, { "epoch": 0.14, "learning_rate": 1.9029333333333334e-07, "logps/chosen": -205.87503051757812, "logps/rejected": -284.75933837890625, "loss": 0.042, "losses/dpo": 0.0006424114108085632, "losses/sft": 0.43775323033332825, "losses/total": 0.0006424114108085632, "ref_logps/chosen": -204.57249450683594, "ref_logps/rejected": -227.66094970703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.13025391101837158, "rewards/margins": 5.579586029052734, "rewards/rejected": -5.709839820861816, "step": 599 }, { "epoch": 0.14, "learning_rate": 1.9024e-07, "logps/chosen": -242.86166381835938, "logps/rejected": -283.7806396484375, "loss": 0.0232, "losses/dpo": 3.2008956623030826e-05, "losses/sft": 0.4680567681789398, "losses/total": 3.2008956623030826e-05, "ref_logps/chosen": -242.4496612548828, "ref_logps/rejected": -222.09136962890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.04119894653558731, "rewards/margins": 6.1277289390563965, "rewards/rejected": -6.1689276695251465, "step": 600 }, { "epoch": 0.14, "learning_rate": 1.9018666666666664e-07, "logps/chosen": -211.15756225585938, "logps/rejected": -272.55859375, "loss": 0.029, "losses/dpo": 0.0005141609581187367, "losses/sft": 0.568399965763092, "losses/total": 0.0005141609581187367, "ref_logps/chosen": -209.15377807617188, "ref_logps/rejected": -211.3975372314453, "rewards/accuracies": 1.0, "rewards/chosen": -0.20037756860256195, "rewards/margins": 5.915726661682129, "rewards/rejected": -6.116105079650879, "step": 601 }, { "epoch": 0.14, "learning_rate": 1.9013333333333332e-07, "logps/chosen": -233.05429077148438, "logps/rejected": -279.03436279296875, "loss": 0.0286, "losses/dpo": 0.0010462423088029027, "losses/sft": 0.4954144358634949, "losses/total": 0.0010462423088029027, "ref_logps/chosen": -232.25784301757812, "ref_logps/rejected": -224.62808227539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.07964657247066498, "rewards/margins": 5.360979080200195, "rewards/rejected": -5.4406256675720215, "step": 602 }, { "epoch": 0.14, "learning_rate": 1.9008e-07, "logps/chosen": -219.15484619140625, "logps/rejected": -268.52923583984375, "loss": 0.0317, "losses/dpo": 5.5637883633608e-05, "losses/sft": 0.5890751481056213, "losses/total": 5.5637883633608e-05, "ref_logps/chosen": -218.53018188476562, "ref_logps/rejected": -208.09432983398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.06246552616357803, "rewards/margins": 5.981027603149414, "rewards/rejected": -6.043493270874023, "step": 603 }, { "epoch": 0.14, "learning_rate": 1.9002666666666667e-07, "logps/chosen": -193.87747192382812, "logps/rejected": -263.7756652832031, "loss": 0.0302, "losses/dpo": 0.0007590270834043622, "losses/sft": 0.5507609248161316, "losses/total": 0.0007590270834043622, "ref_logps/chosen": -193.55056762695312, "ref_logps/rejected": -208.57919311523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.03268997371196747, "rewards/margins": 5.486956596374512, "rewards/rejected": -5.519646167755127, "step": 604 }, { "epoch": 0.15, "learning_rate": 1.8997333333333332e-07, "logps/chosen": -249.23825073242188, "logps/rejected": -302.5553894042969, "loss": 0.0267, "losses/dpo": 0.0001319687580689788, "losses/sft": 0.5846229791641235, "losses/total": 0.0001319687580689788, "ref_logps/chosen": -247.16165161132812, "ref_logps/rejected": -234.56321716308594, "rewards/accuracies": 1.0, "rewards/chosen": -0.2076595425605774, "rewards/margins": 6.591556072235107, "rewards/rejected": -6.799215793609619, "step": 605 }, { "epoch": 0.15, "learning_rate": 1.8992e-07, "logps/chosen": -242.71688842773438, "logps/rejected": -297.69110107421875, "loss": 0.0185, "losses/dpo": 0.00015862729924265295, "losses/sft": 0.4254014194011688, "losses/total": 0.00015862729924265295, "ref_logps/chosen": -239.9459228515625, "ref_logps/rejected": -237.43032836914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.277096688747406, "rewards/margins": 5.748979568481445, "rewards/rejected": -6.026076316833496, "step": 606 }, { "epoch": 0.15, "learning_rate": 1.8986666666666667e-07, "logps/chosen": -238.02072143554688, "logps/rejected": -285.50726318359375, "loss": 0.0242, "losses/dpo": 0.0031794696114957333, "losses/sft": 0.5063117146492004, "losses/total": 0.0031794696114957333, "ref_logps/chosen": -236.019287109375, "ref_logps/rejected": -229.2255401611328, "rewards/accuracies": 1.0, "rewards/chosen": -0.2001437395811081, "rewards/margins": 5.428028106689453, "rewards/rejected": -5.628171920776367, "step": 607 }, { "epoch": 0.15, "learning_rate": 1.8981333333333332e-07, "logps/chosen": -222.21426391601562, "logps/rejected": -295.477294921875, "loss": 0.0233, "losses/dpo": 3.1150815630098805e-05, "losses/sft": 0.5427461266517639, "losses/total": 3.1150815630098805e-05, "ref_logps/chosen": -221.10565185546875, "ref_logps/rejected": -233.58291625976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.11086101830005646, "rewards/margins": 6.078577518463135, "rewards/rejected": -6.189438819885254, "step": 608 }, { "epoch": 0.15, "learning_rate": 1.8976e-07, "logps/chosen": -254.82003784179688, "logps/rejected": -303.2315673828125, "loss": 0.0122, "losses/dpo": 0.0026830036658793688, "losses/sft": 0.6765676140785217, "losses/total": 0.0026830036658793688, "ref_logps/chosen": -251.87489318847656, "ref_logps/rejected": -234.39747619628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.2945147752761841, "rewards/margins": 6.588891506195068, "rewards/rejected": -6.883406162261963, "step": 609 }, { "epoch": 0.15, "learning_rate": 1.8970666666666665e-07, "logps/chosen": -215.9941864013672, "logps/rejected": -294.0511474609375, "loss": 0.0356, "losses/dpo": 2.787016455840785e-05, "losses/sft": 0.5344851613044739, "losses/total": 2.787016455840785e-05, "ref_logps/chosen": -215.27288818359375, "ref_logps/rejected": -231.0452117919922, "rewards/accuracies": 1.0, "rewards/chosen": -0.07212908565998077, "rewards/margins": 6.228466033935547, "rewards/rejected": -6.300595283508301, "step": 610 }, { "epoch": 0.15, "learning_rate": 1.8965333333333333e-07, "logps/chosen": -201.4410858154297, "logps/rejected": -278.8097229003906, "loss": 0.0201, "losses/dpo": 6.864821625640616e-05, "losses/sft": 1.0803169012069702, "losses/total": 6.864821625640616e-05, "ref_logps/chosen": -199.62884521484375, "ref_logps/rejected": -220.9080810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.18122200667858124, "rewards/margins": 5.608941078186035, "rewards/rejected": -5.790163040161133, "step": 611 }, { "epoch": 0.15, "learning_rate": 1.8959999999999998e-07, "logps/chosen": -238.8749237060547, "logps/rejected": -288.66143798828125, "loss": 0.0282, "losses/dpo": 0.00020975785446353257, "losses/sft": 0.8848227858543396, "losses/total": 0.00020975785446353257, "ref_logps/chosen": -237.17803955078125, "ref_logps/rejected": -225.0447998046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.16968825459480286, "rewards/margins": 6.191977024078369, "rewards/rejected": -6.36166524887085, "step": 612 }, { "epoch": 0.15, "learning_rate": 1.8954666666666665e-07, "logps/chosen": -264.8121337890625, "logps/rejected": -316.841552734375, "loss": 0.0286, "losses/dpo": 0.0002383473183726892, "losses/sft": 1.1314905881881714, "losses/total": 0.0002383473183726892, "ref_logps/chosen": -262.5582275390625, "ref_logps/rejected": -257.18853759765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2253897488117218, "rewards/margins": 5.7399139404296875, "rewards/rejected": -5.965303421020508, "step": 613 }, { "epoch": 0.15, "learning_rate": 1.8949333333333333e-07, "logps/chosen": -286.6923522949219, "logps/rejected": -305.77716064453125, "loss": 0.0288, "losses/dpo": 0.0002155238325940445, "losses/sft": 0.4287835955619812, "losses/total": 0.0002155238325940445, "ref_logps/chosen": -283.922607421875, "ref_logps/rejected": -244.3067169189453, "rewards/accuracies": 1.0, "rewards/chosen": -0.2769756019115448, "rewards/margins": 5.870071887969971, "rewards/rejected": -6.147047519683838, "step": 614 }, { "epoch": 0.15, "learning_rate": 1.8944e-07, "logps/chosen": -250.33677673339844, "logps/rejected": -287.94793701171875, "loss": 0.0448, "losses/dpo": 0.0030109186191111803, "losses/sft": 0.6934131383895874, "losses/total": 0.0030109186191111803, "ref_logps/chosen": -248.19471740722656, "ref_logps/rejected": -224.50758361816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.2142074704170227, "rewards/margins": 6.12982702255249, "rewards/rejected": -6.344034671783447, "step": 615 }, { "epoch": 0.15, "learning_rate": 1.8938666666666666e-07, "logps/chosen": -243.78616333007812, "logps/rejected": -262.9642333984375, "loss": 0.0354, "losses/dpo": 0.018462078645825386, "losses/sft": 0.5738198757171631, "losses/total": 0.018462078645825386, "ref_logps/chosen": -243.3750457763672, "ref_logps/rejected": -208.53976440429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.04110997915267944, "rewards/margins": 5.401337146759033, "rewards/rejected": -5.442447185516357, "step": 616 }, { "epoch": 0.15, "learning_rate": 1.8933333333333333e-07, "logps/chosen": -238.81198120117188, "logps/rejected": -286.92279052734375, "loss": 0.0254, "losses/dpo": 0.007422829512506723, "losses/sft": 0.47077488899230957, "losses/total": 0.007422829512506723, "ref_logps/chosen": -237.00588989257812, "ref_logps/rejected": -228.5644989013672, "rewards/accuracies": 1.0, "rewards/chosen": -0.180605947971344, "rewards/margins": 5.655226707458496, "rewards/rejected": -5.8358330726623535, "step": 617 }, { "epoch": 0.15, "learning_rate": 1.8927999999999998e-07, "logps/chosen": -241.47763061523438, "logps/rejected": -293.0635986328125, "loss": 0.0247, "losses/dpo": 0.00013910196139477193, "losses/sft": 0.6617066264152527, "losses/total": 0.00013910196139477193, "ref_logps/chosen": -238.3504638671875, "ref_logps/rejected": -228.9840545654297, "rewards/accuracies": 1.0, "rewards/chosen": -0.31271541118621826, "rewards/margins": 6.095236778259277, "rewards/rejected": -6.407952308654785, "step": 618 }, { "epoch": 0.15, "learning_rate": 1.8922666666666666e-07, "logps/chosen": -226.13150024414062, "logps/rejected": -273.8498840332031, "loss": 0.0346, "losses/dpo": 1.3158202818885911e-05, "losses/sft": 0.49640092253685, "losses/total": 1.3158202818885911e-05, "ref_logps/chosen": -223.53173828125, "ref_logps/rejected": -214.01455688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.25997650623321533, "rewards/margins": 5.723555564880371, "rewards/rejected": -5.983532428741455, "step": 619 }, { "epoch": 0.15, "learning_rate": 1.891733333333333e-07, "logps/chosen": -177.43673706054688, "logps/rejected": -255.48513793945312, "loss": 0.0376, "losses/dpo": 3.93561931559816e-05, "losses/sft": 0.5835516452789307, "losses/total": 3.93561931559816e-05, "ref_logps/chosen": -176.91012573242188, "ref_logps/rejected": -198.43478393554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.05266132205724716, "rewards/margins": 5.6523756980896, "rewards/rejected": -5.7050371170043945, "step": 620 }, { "epoch": 0.15, "learning_rate": 1.8912e-07, "logps/chosen": -216.66400146484375, "logps/rejected": -299.62774658203125, "loss": 0.0378, "losses/dpo": 4.775619163410738e-05, "losses/sft": 0.49812451004981995, "losses/total": 4.775619163410738e-05, "ref_logps/chosen": -215.2349853515625, "ref_logps/rejected": -235.77127075195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.14290282130241394, "rewards/margins": 6.242746353149414, "rewards/rejected": -6.385649681091309, "step": 621 }, { "epoch": 0.15, "learning_rate": 1.8906666666666667e-07, "logps/chosen": -244.71597290039062, "logps/rejected": -289.2686767578125, "loss": 0.0099, "losses/dpo": 0.0008201483869925141, "losses/sft": 0.6881868839263916, "losses/total": 0.0008201483869925141, "ref_logps/chosen": -243.406982421875, "ref_logps/rejected": -223.52032470703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.13090205192565918, "rewards/margins": 6.443931579589844, "rewards/rejected": -6.574833869934082, "step": 622 }, { "epoch": 0.15, "learning_rate": 1.8901333333333334e-07, "logps/chosen": -168.77880859375, "logps/rejected": -266.7429504394531, "loss": 0.0319, "losses/dpo": 0.0007249268237501383, "losses/sft": 0.46919241547584534, "losses/total": 0.0007249268237501383, "ref_logps/chosen": -167.8909454345703, "ref_logps/rejected": -209.76028442382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.08878645300865173, "rewards/margins": 5.609480857849121, "rewards/rejected": -5.698267459869385, "step": 623 }, { "epoch": 0.15, "learning_rate": 1.8896e-07, "logps/chosen": -221.73553466796875, "logps/rejected": -307.6062927246094, "loss": 0.0161, "losses/dpo": 0.0005909677129238844, "losses/sft": 1.2885957956314087, "losses/total": 0.0005909677129238844, "ref_logps/chosen": -220.9830322265625, "ref_logps/rejected": -239.473876953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.07524921745061874, "rewards/margins": 6.73799467086792, "rewards/rejected": -6.813243389129639, "step": 624 }, { "epoch": 0.15, "learning_rate": 1.8890666666666667e-07, "logps/chosen": -225.412109375, "logps/rejected": -303.0155334472656, "loss": 0.0187, "losses/dpo": 0.00012231379514560103, "losses/sft": 0.6030899286270142, "losses/total": 0.00012231379514560103, "ref_logps/chosen": -226.08547973632812, "ref_logps/rejected": -238.05348205566406, "rewards/accuracies": 1.0, "rewards/chosen": 0.06733598560094833, "rewards/margins": 6.563541412353516, "rewards/rejected": -6.496204853057861, "step": 625 }, { "epoch": 0.15, "learning_rate": 1.8885333333333332e-07, "logps/chosen": -238.35015869140625, "logps/rejected": -260.57635498046875, "loss": 0.0263, "losses/dpo": 0.0006653151358477771, "losses/sft": 0.6042740941047668, "losses/total": 0.0006653151358477771, "ref_logps/chosen": -237.51806640625, "ref_logps/rejected": -204.84617614746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.08320823311805725, "rewards/margins": 5.489811897277832, "rewards/rejected": -5.573019981384277, "step": 626 }, { "epoch": 0.15, "learning_rate": 1.8879999999999997e-07, "logps/chosen": -219.67855834960938, "logps/rejected": -251.23184204101562, "loss": 0.0372, "losses/dpo": 0.0014682285254821181, "losses/sft": 0.4760993719100952, "losses/total": 0.0014682285254821181, "ref_logps/chosen": -219.5550537109375, "ref_logps/rejected": -198.00210571289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.012351464480161667, "rewards/margins": 5.310622215270996, "rewards/rejected": -5.32297420501709, "step": 627 }, { "epoch": 0.15, "learning_rate": 1.8874666666666665e-07, "logps/chosen": -206.96923828125, "logps/rejected": -297.18011474609375, "loss": 0.0136, "losses/dpo": 1.324632376054069e-05, "losses/sft": 0.7016860246658325, "losses/total": 1.324632376054069e-05, "ref_logps/chosen": -205.61868286132812, "ref_logps/rejected": -230.1177978515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.13505446910858154, "rewards/margins": 6.571177959442139, "rewards/rejected": -6.706232070922852, "step": 628 }, { "epoch": 0.15, "learning_rate": 1.8869333333333332e-07, "logps/chosen": -237.5966796875, "logps/rejected": -291.9406433105469, "loss": 0.0322, "losses/dpo": 5.296645758789964e-05, "losses/sft": 0.7048403024673462, "losses/total": 5.296645758789964e-05, "ref_logps/chosen": -235.84681701660156, "ref_logps/rejected": -227.8153076171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.17498531937599182, "rewards/margins": 6.237547874450684, "rewards/rejected": -6.412532806396484, "step": 629 }, { "epoch": 0.15, "learning_rate": 1.8864e-07, "logps/chosen": -227.9736328125, "logps/rejected": -308.1444396972656, "loss": 0.0196, "losses/dpo": 0.0007447335519827902, "losses/sft": 0.5747935175895691, "losses/total": 0.0007447335519827902, "ref_logps/chosen": -225.70143127441406, "ref_logps/rejected": -238.68641662597656, "rewards/accuracies": 1.0, "rewards/chosen": -0.22722364962100983, "rewards/margins": 6.718579292297363, "rewards/rejected": -6.945802688598633, "step": 630 }, { "epoch": 0.15, "learning_rate": 1.8858666666666665e-07, "logps/chosen": -221.04002380371094, "logps/rejected": -266.81793212890625, "loss": 0.0494, "losses/dpo": 0.0017956277588382363, "losses/sft": 0.6579630970954895, "losses/total": 0.0017956277588382363, "ref_logps/chosen": -220.24729919433594, "ref_logps/rejected": -208.0502166748047, "rewards/accuracies": 0.96875, "rewards/chosen": -0.07927396893501282, "rewards/margins": 5.797493934631348, "rewards/rejected": -5.876768112182617, "step": 631 }, { "epoch": 0.15, "learning_rate": 1.8853333333333333e-07, "logps/chosen": -225.74342346191406, "logps/rejected": -283.08746337890625, "loss": 0.032, "losses/dpo": 0.0026921287644654512, "losses/sft": 0.6565707921981812, "losses/total": 0.0026921287644654512, "ref_logps/chosen": -222.01075744628906, "ref_logps/rejected": -221.02040100097656, "rewards/accuracies": 1.0, "rewards/chosen": -0.37326526641845703, "rewards/margins": 5.833441734313965, "rewards/rejected": -6.206707000732422, "step": 632 }, { "epoch": 0.15, "learning_rate": 1.8848e-07, "logps/chosen": -219.8420867919922, "logps/rejected": -267.4110412597656, "loss": 0.0412, "losses/dpo": 0.00010132174065802246, "losses/sft": 0.670310914516449, "losses/total": 0.00010132174065802246, "ref_logps/chosen": -217.32534790039062, "ref_logps/rejected": -208.87344360351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.2516729235649109, "rewards/margins": 5.602087020874023, "rewards/rejected": -5.853759765625, "step": 633 }, { "epoch": 0.15, "learning_rate": 1.8842666666666665e-07, "logps/chosen": -222.03204345703125, "logps/rejected": -306.59185791015625, "loss": 0.0319, "losses/dpo": 3.152215140289627e-05, "losses/sft": 0.7089881300926208, "losses/total": 3.152215140289627e-05, "ref_logps/chosen": -220.80296325683594, "ref_logps/rejected": -240.88320922851562, "rewards/accuracies": 1.0, "rewards/chosen": -0.12290769815444946, "rewards/margins": 6.447957992553711, "rewards/rejected": -6.570865631103516, "step": 634 }, { "epoch": 0.15, "learning_rate": 1.883733333333333e-07, "logps/chosen": -238.98135375976562, "logps/rejected": -323.63983154296875, "loss": 0.0091, "losses/dpo": 0.0002622947213239968, "losses/sft": 0.7802329063415527, "losses/total": 0.0002622947213239968, "ref_logps/chosen": -237.730224609375, "ref_logps/rejected": -251.62322998046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.12511377036571503, "rewards/margins": 7.076547145843506, "rewards/rejected": -7.201661586761475, "step": 635 }, { "epoch": 0.15, "learning_rate": 1.8831999999999998e-07, "logps/chosen": -201.0921630859375, "logps/rejected": -273.217529296875, "loss": 0.0338, "losses/dpo": 2.14194164982473e-06, "losses/sft": 0.4941311180591583, "losses/total": 2.14194164982473e-06, "ref_logps/chosen": -198.70999145507812, "ref_logps/rejected": -208.95150756835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.23821547627449036, "rewards/margins": 6.188385963439941, "rewards/rejected": -6.426601409912109, "step": 636 }, { "epoch": 0.15, "learning_rate": 1.8826666666666666e-07, "logps/chosen": -213.56031799316406, "logps/rejected": -290.86505126953125, "loss": 0.0149, "losses/dpo": 0.00039096453110687435, "losses/sft": 0.592583179473877, "losses/total": 0.00039096453110687435, "ref_logps/chosen": -212.688720703125, "ref_logps/rejected": -228.68838500976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.08715924620628357, "rewards/margins": 6.1305060386657715, "rewards/rejected": -6.217665672302246, "step": 637 }, { "epoch": 0.15, "learning_rate": 1.8821333333333334e-07, "logps/chosen": -241.39292907714844, "logps/rejected": -296.51959228515625, "loss": 0.0264, "losses/dpo": 1.4990481759014074e-05, "losses/sft": 0.5955309867858887, "losses/total": 1.4990481759014074e-05, "ref_logps/chosen": -240.2103271484375, "ref_logps/rejected": -236.27667236328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.11826112866401672, "rewards/margins": 5.906031608581543, "rewards/rejected": -6.024292945861816, "step": 638 }, { "epoch": 0.15, "learning_rate": 1.8815999999999999e-07, "logps/chosen": -218.20327758789062, "logps/rejected": -290.22882080078125, "loss": 0.0186, "losses/dpo": 6.970010872464627e-05, "losses/sft": 0.5089530348777771, "losses/total": 6.970010872464627e-05, "ref_logps/chosen": -216.86331176757812, "ref_logps/rejected": -223.77838134765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.13399475812911987, "rewards/margins": 6.511049270629883, "rewards/rejected": -6.645044326782227, "step": 639 }, { "epoch": 0.15, "learning_rate": 1.8810666666666666e-07, "logps/chosen": -215.07394409179688, "logps/rejected": -271.5438537597656, "loss": 0.0312, "losses/dpo": 0.006258523091673851, "losses/sft": 0.5447894930839539, "losses/total": 0.006258523091673851, "ref_logps/chosen": -212.50022888183594, "ref_logps/rejected": -211.23147583007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2573707103729248, "rewards/margins": 5.773867607116699, "rewards/rejected": -6.031238079071045, "step": 640 }, { "epoch": 0.15, "learning_rate": 1.8805333333333334e-07, "logps/chosen": -217.394287109375, "logps/rejected": -271.2936706542969, "loss": 0.0338, "losses/dpo": 0.0012522351462394, "losses/sft": 0.5282384157180786, "losses/total": 0.0012522351462394, "ref_logps/chosen": -214.63551330566406, "ref_logps/rejected": -212.51614379882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2758796811103821, "rewards/margins": 5.601874351501465, "rewards/rejected": -5.877753734588623, "step": 641 }, { "epoch": 0.15, "learning_rate": 1.88e-07, "logps/chosen": -220.10382080078125, "logps/rejected": -256.01715087890625, "loss": 0.0229, "losses/dpo": 0.0005230975802987814, "losses/sft": 1.1324056386947632, "losses/total": 0.0005230975802987814, "ref_logps/chosen": -218.13427734375, "ref_logps/rejected": -198.24041748046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1969534009695053, "rewards/margins": 5.580718994140625, "rewards/rejected": -5.77767276763916, "step": 642 }, { "epoch": 0.15, "learning_rate": 1.8794666666666667e-07, "logps/chosen": -293.8524475097656, "logps/rejected": -338.57958984375, "loss": 0.0102, "losses/dpo": 0.0024688432458788157, "losses/sft": 0.6096547842025757, "losses/total": 0.0024688432458788157, "ref_logps/chosen": -290.9395751953125, "ref_logps/rejected": -269.2981262207031, "rewards/accuracies": 1.0, "rewards/chosen": -0.29128381609916687, "rewards/margins": 6.63686466217041, "rewards/rejected": -6.928149223327637, "step": 643 }, { "epoch": 0.15, "learning_rate": 1.8789333333333332e-07, "logps/chosen": -248.5002899169922, "logps/rejected": -282.35675048828125, "loss": 0.0214, "losses/dpo": 0.0025540159549564123, "losses/sft": 0.6911236643791199, "losses/total": 0.0025540159549564123, "ref_logps/chosen": -245.3611297607422, "ref_logps/rejected": -217.32614135742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.31391602754592896, "rewards/margins": 6.18914270401001, "rewards/rejected": -6.503058433532715, "step": 644 }, { "epoch": 0.15, "learning_rate": 1.8784e-07, "logps/chosen": -213.5814666748047, "logps/rejected": -276.2690734863281, "loss": 0.0351, "losses/dpo": 0.028490152209997177, "losses/sft": 0.6311760544776917, "losses/total": 0.028490152209997177, "ref_logps/chosen": -210.88023376464844, "ref_logps/rejected": -216.55926513671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2701233923435211, "rewards/margins": 5.700859069824219, "rewards/rejected": -5.970982074737549, "step": 645 }, { "epoch": 0.16, "learning_rate": 1.8778666666666664e-07, "logps/chosen": -259.81829833984375, "logps/rejected": -308.84869384765625, "loss": 0.0182, "losses/dpo": 0.0011782778892666101, "losses/sft": 0.5322298407554626, "losses/total": 0.0011782778892666101, "ref_logps/chosen": -256.353759765625, "ref_logps/rejected": -241.28848266601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.34645670652389526, "rewards/margins": 6.409562587738037, "rewards/rejected": -6.75601863861084, "step": 646 }, { "epoch": 0.16, "learning_rate": 1.8773333333333332e-07, "logps/chosen": -246.2601318359375, "logps/rejected": -292.0689697265625, "loss": 0.0103, "losses/dpo": 7.388588710455224e-05, "losses/sft": 0.5859594941139221, "losses/total": 7.388588710455224e-05, "ref_logps/chosen": -243.4990234375, "ref_logps/rejected": -225.93698120117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.27610936760902405, "rewards/margins": 6.337090492248535, "rewards/rejected": -6.6132001876831055, "step": 647 }, { "epoch": 0.16, "learning_rate": 1.8768e-07, "logps/chosen": -216.9329376220703, "logps/rejected": -296.78228759765625, "loss": 0.027, "losses/dpo": 1.938665627676528e-05, "losses/sft": 0.45570558309555054, "losses/total": 1.938665627676528e-05, "ref_logps/chosen": -215.74794006347656, "ref_logps/rejected": -235.07711791992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.11849897354841232, "rewards/margins": 6.052019119262695, "rewards/rejected": -6.170518398284912, "step": 648 }, { "epoch": 0.16, "learning_rate": 1.8762666666666667e-07, "logps/chosen": -221.81100463867188, "logps/rejected": -284.59832763671875, "loss": 0.015, "losses/dpo": 0.0008870274177752435, "losses/sft": 0.5978106260299683, "losses/total": 0.0008870274177752435, "ref_logps/chosen": -220.12759399414062, "ref_logps/rejected": -221.47579956054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.16833919286727905, "rewards/margins": 6.143914222717285, "rewards/rejected": -6.312253475189209, "step": 649 }, { "epoch": 0.16, "learning_rate": 1.8757333333333332e-07, "logps/chosen": -233.00360107421875, "logps/rejected": -266.37969970703125, "loss": 0.0186, "losses/dpo": 7.07672443240881e-05, "losses/sft": 0.5361364483833313, "losses/total": 7.07672443240881e-05, "ref_logps/chosen": -232.37701416015625, "ref_logps/rejected": -208.19361877441406, "rewards/accuracies": 1.0, "rewards/chosen": -0.06266096979379654, "rewards/margins": 5.755947113037109, "rewards/rejected": -5.818608283996582, "step": 650 }, { "epoch": 0.16, "learning_rate": 1.8752e-07, "logps/chosen": -236.14535522460938, "logps/rejected": -290.6435241699219, "loss": 0.0106, "losses/dpo": 0.0002374499017605558, "losses/sft": 0.7477763295173645, "losses/total": 0.0002374499017605558, "ref_logps/chosen": -234.59408569335938, "ref_logps/rejected": -224.50469970703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.15512922406196594, "rewards/margins": 6.458754539489746, "rewards/rejected": -6.6138834953308105, "step": 651 }, { "epoch": 0.16, "learning_rate": 1.8746666666666665e-07, "logps/chosen": -238.38937377929688, "logps/rejected": -302.9268798828125, "loss": 0.0339, "losses/dpo": 0.0004014938895124942, "losses/sft": 0.4927230477333069, "losses/total": 0.0004014938895124942, "ref_logps/chosen": -235.86135864257812, "ref_logps/rejected": -235.7247772216797, "rewards/accuracies": 1.0, "rewards/chosen": -0.2528027296066284, "rewards/margins": 6.467410087585449, "rewards/rejected": -6.720212936401367, "step": 652 }, { "epoch": 0.16, "learning_rate": 1.8741333333333333e-07, "logps/chosen": -231.888671875, "logps/rejected": -295.4974060058594, "loss": 0.041, "losses/dpo": 0.007224080618470907, "losses/sft": 0.6782195568084717, "losses/total": 0.007224080618470907, "ref_logps/chosen": -230.30836486816406, "ref_logps/rejected": -234.7332763671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1580316722393036, "rewards/margins": 5.918381690979004, "rewards/rejected": -6.076413631439209, "step": 653 }, { "epoch": 0.16, "learning_rate": 1.8735999999999998e-07, "logps/chosen": -221.6444091796875, "logps/rejected": -308.3798828125, "loss": 0.0046, "losses/dpo": 0.00032910771551541984, "losses/sft": 0.5477851033210754, "losses/total": 0.00032910771551541984, "ref_logps/chosen": -219.4768829345703, "ref_logps/rejected": -238.8629913330078, "rewards/accuracies": 1.0, "rewards/chosen": -0.21675355732440948, "rewards/margins": 6.734933853149414, "rewards/rejected": -6.951686859130859, "step": 654 }, { "epoch": 0.16, "learning_rate": 1.8730666666666666e-07, "logps/chosen": -182.93252563476562, "logps/rejected": -265.451904296875, "loss": 0.0161, "losses/dpo": 0.00010742506856331602, "losses/sft": 0.6696919798851013, "losses/total": 0.00010742506856331602, "ref_logps/chosen": -181.75341796875, "ref_logps/rejected": -200.32846069335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.11791086941957474, "rewards/margins": 6.39443302154541, "rewards/rejected": -6.512343406677246, "step": 655 }, { "epoch": 0.16, "learning_rate": 1.8725333333333333e-07, "logps/chosen": -276.2239685058594, "logps/rejected": -308.4798278808594, "loss": 0.0231, "losses/dpo": 1.7862764707388123e-06, "losses/sft": 0.5368639230728149, "losses/total": 1.7862764707388123e-06, "ref_logps/chosen": -274.1083068847656, "ref_logps/rejected": -233.77584838867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.21156609058380127, "rewards/margins": 7.258831977844238, "rewards/rejected": -7.470398426055908, "step": 656 }, { "epoch": 0.16, "learning_rate": 1.872e-07, "logps/chosen": -215.54237365722656, "logps/rejected": -268.05169677734375, "loss": 0.0442, "losses/dpo": 3.458732317085378e-05, "losses/sft": 0.4820009469985962, "losses/total": 3.458732317085378e-05, "ref_logps/chosen": -214.7371063232422, "ref_logps/rejected": -212.52906799316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.08052611351013184, "rewards/margins": 5.471735954284668, "rewards/rejected": -5.552262306213379, "step": 657 }, { "epoch": 0.16, "learning_rate": 1.8714666666666666e-07, "logps/chosen": -262.72674560546875, "logps/rejected": -329.8591613769531, "loss": 0.0049, "losses/dpo": 4.6335941306097084e-07, "losses/sft": 0.9374797344207764, "losses/total": 4.6335941306097084e-07, "ref_logps/chosen": -261.65911865234375, "ref_logps/rejected": -252.42904663085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.10676049441099167, "rewards/margins": 7.6362504959106445, "rewards/rejected": -7.743010520935059, "step": 658 }, { "epoch": 0.16, "learning_rate": 1.8709333333333334e-07, "logps/chosen": -218.09107971191406, "logps/rejected": -288.32208251953125, "loss": 0.0227, "losses/dpo": 0.1661529392004013, "losses/sft": 0.5350820422172546, "losses/total": 0.1661529392004013, "ref_logps/chosen": -216.59999084472656, "ref_logps/rejected": -219.2747039794922, "rewards/accuracies": 1.0, "rewards/chosen": -0.14910835027694702, "rewards/margins": 6.755631446838379, "rewards/rejected": -6.904739856719971, "step": 659 }, { "epoch": 0.16, "learning_rate": 1.8703999999999999e-07, "logps/chosen": -229.4038848876953, "logps/rejected": -285.8371276855469, "loss": 0.0155, "losses/dpo": 0.00023072330805007368, "losses/sft": 1.0085371732711792, "losses/total": 0.00023072330805007368, "ref_logps/chosen": -227.8189239501953, "ref_logps/rejected": -225.86444091796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1584944725036621, "rewards/margins": 5.838776588439941, "rewards/rejected": -5.9972710609436035, "step": 660 }, { "epoch": 0.16, "learning_rate": 1.8698666666666664e-07, "logps/chosen": -184.13641357421875, "logps/rejected": -253.84852600097656, "loss": 0.026, "losses/dpo": 0.0005215692217461765, "losses/sft": 0.8581805229187012, "losses/total": 0.0005215692217461765, "ref_logps/chosen": -183.2384033203125, "ref_logps/rejected": -202.4005126953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.08980140835046768, "rewards/margins": 5.055000305175781, "rewards/rejected": -5.144802093505859, "step": 661 }, { "epoch": 0.16, "learning_rate": 1.869333333333333e-07, "logps/chosen": -225.08099365234375, "logps/rejected": -276.2508239746094, "loss": 0.03, "losses/dpo": 0.03405534476041794, "losses/sft": 0.8662468791007996, "losses/total": 0.03405534476041794, "ref_logps/chosen": -223.42388916015625, "ref_logps/rejected": -211.0469970703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.1657097339630127, "rewards/margins": 6.354673385620117, "rewards/rejected": -6.520382881164551, "step": 662 }, { "epoch": 0.16, "learning_rate": 1.8688e-07, "logps/chosen": -259.5845031738281, "logps/rejected": -324.72430419921875, "loss": 0.0124, "losses/dpo": 0.000228702396270819, "losses/sft": 0.5420520901679993, "losses/total": 0.000228702396270819, "ref_logps/chosen": -256.451171875, "ref_logps/rejected": -252.51034545898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.31333285570144653, "rewards/margins": 6.908062934875488, "rewards/rejected": -7.221395492553711, "step": 663 }, { "epoch": 0.16, "learning_rate": 1.8682666666666667e-07, "logps/chosen": -242.31382751464844, "logps/rejected": -306.9019470214844, "loss": 0.0123, "losses/dpo": 0.0002810139558278024, "losses/sft": 0.6868353486061096, "losses/total": 0.0002810139558278024, "ref_logps/chosen": -238.4759521484375, "ref_logps/rejected": -237.61061096191406, "rewards/accuracies": 1.0, "rewards/chosen": -0.38378700613975525, "rewards/margins": 6.545347213745117, "rewards/rejected": -6.929134368896484, "step": 664 }, { "epoch": 0.16, "learning_rate": 1.8677333333333332e-07, "logps/chosen": -222.30572509765625, "logps/rejected": -277.72821044921875, "loss": 0.0313, "losses/dpo": 0.11752504110336304, "losses/sft": 0.6688773036003113, "losses/total": 0.11752504110336304, "ref_logps/chosen": -220.90289306640625, "ref_logps/rejected": -214.84457397460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.14028596878051758, "rewards/margins": 6.148077964782715, "rewards/rejected": -6.288364410400391, "step": 665 }, { "epoch": 0.16, "learning_rate": 1.8672e-07, "logps/chosen": -227.54730224609375, "logps/rejected": -284.73944091796875, "loss": 0.0319, "losses/dpo": 0.007091098465025425, "losses/sft": 0.5843459367752075, "losses/total": 0.007091098465025425, "ref_logps/chosen": -226.39723205566406, "ref_logps/rejected": -219.70639038085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.11500395834445953, "rewards/margins": 6.388303756713867, "rewards/rejected": -6.503307342529297, "step": 666 }, { "epoch": 0.16, "learning_rate": 1.8666666666666667e-07, "logps/chosen": -239.4264373779297, "logps/rejected": -240.60365295410156, "loss": 0.0287, "losses/dpo": 0.00012626829266082495, "losses/sft": 0.5599130392074585, "losses/total": 0.00012626829266082495, "ref_logps/chosen": -236.5335693359375, "ref_logps/rejected": -181.4728240966797, "rewards/accuracies": 1.0, "rewards/chosen": -0.28928589820861816, "rewards/margins": 5.62379789352417, "rewards/rejected": -5.913084030151367, "step": 667 }, { "epoch": 0.16, "learning_rate": 1.8661333333333332e-07, "logps/chosen": -234.26687622070312, "logps/rejected": -279.6156005859375, "loss": 0.0217, "losses/dpo": 1.765026718203444e-05, "losses/sft": 0.6612532734870911, "losses/total": 1.765026718203444e-05, "ref_logps/chosen": -231.9022674560547, "ref_logps/rejected": -213.045166015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.23646065592765808, "rewards/margins": 6.420584678649902, "rewards/rejected": -6.657045364379883, "step": 668 }, { "epoch": 0.16, "learning_rate": 1.8655999999999997e-07, "logps/chosen": -215.85137939453125, "logps/rejected": -259.5543212890625, "loss": 0.0279, "losses/dpo": 0.009853812865912914, "losses/sft": 0.5511882305145264, "losses/total": 0.009853812865912914, "ref_logps/chosen": -214.18833923339844, "ref_logps/rejected": -203.48556518554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.16630303859710693, "rewards/margins": 5.440569877624512, "rewards/rejected": -5.60687255859375, "step": 669 }, { "epoch": 0.16, "learning_rate": 1.8650666666666665e-07, "logps/chosen": -278.560791015625, "logps/rejected": -289.9013671875, "loss": 0.0118, "losses/dpo": 0.00010699689300963655, "losses/sft": 0.5714696049690247, "losses/total": 0.00010699689300963655, "ref_logps/chosen": -275.8511657714844, "ref_logps/rejected": -213.6124725341797, "rewards/accuracies": 1.0, "rewards/chosen": -0.27096086740493774, "rewards/margins": 7.357929706573486, "rewards/rejected": -7.628890037536621, "step": 670 }, { "epoch": 0.16, "learning_rate": 1.8645333333333332e-07, "logps/chosen": -184.7216796875, "logps/rejected": -286.60284423828125, "loss": 0.0324, "losses/dpo": 4.20232790929731e-05, "losses/sft": 0.3521326184272766, "losses/total": 4.20232790929731e-05, "ref_logps/chosen": -183.7681884765625, "ref_logps/rejected": -220.9744110107422, "rewards/accuracies": 1.0, "rewards/chosen": -0.09534813463687897, "rewards/margins": 6.467494010925293, "rewards/rejected": -6.562841892242432, "step": 671 }, { "epoch": 0.16, "learning_rate": 1.864e-07, "logps/chosen": -218.79052734375, "logps/rejected": -313.3693542480469, "loss": 0.013, "losses/dpo": 6.400445272447541e-06, "losses/sft": 0.6387640237808228, "losses/total": 6.400445272447541e-06, "ref_logps/chosen": -218.29151916503906, "ref_logps/rejected": -241.35009765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.04989960044622421, "rewards/margins": 7.152026653289795, "rewards/rejected": -7.201926231384277, "step": 672 }, { "epoch": 0.16, "learning_rate": 1.8634666666666665e-07, "logps/chosen": -221.23489379882812, "logps/rejected": -281.87042236328125, "loss": 0.0212, "losses/dpo": 0.0001941266527865082, "losses/sft": 0.6187587380409241, "losses/total": 0.0001941266527865082, "ref_logps/chosen": -219.0323944091797, "ref_logps/rejected": -213.79074096679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.22025009989738464, "rewards/margins": 6.5877203941345215, "rewards/rejected": -6.8079705238342285, "step": 673 }, { "epoch": 0.16, "learning_rate": 1.8629333333333333e-07, "logps/chosen": -241.4119415283203, "logps/rejected": -306.95208740234375, "loss": 0.0109, "losses/dpo": 0.0029362053610384464, "losses/sft": 0.8094533681869507, "losses/total": 0.0029362053610384464, "ref_logps/chosen": -240.44493103027344, "ref_logps/rejected": -235.8585205078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.09670178592205048, "rewards/margins": 7.0126543045043945, "rewards/rejected": -7.109355926513672, "step": 674 }, { "epoch": 0.16, "learning_rate": 1.8624e-07, "logps/chosen": -194.589111328125, "logps/rejected": -252.05923461914062, "loss": 0.0456, "losses/dpo": 0.05162296071648598, "losses/sft": 0.7078684568405151, "losses/total": 0.05162296071648598, "ref_logps/chosen": -193.4542694091797, "ref_logps/rejected": -197.54580688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.1134839728474617, "rewards/margins": 5.337858200073242, "rewards/rejected": -5.45134162902832, "step": 675 }, { "epoch": 0.16, "learning_rate": 1.8618666666666666e-07, "logps/chosen": -261.53094482421875, "logps/rejected": -315.80859375, "loss": 0.0175, "losses/dpo": 0.00019510170386638492, "losses/sft": 0.5158383250236511, "losses/total": 0.00019510170386638492, "ref_logps/chosen": -258.8567199707031, "ref_logps/rejected": -246.8179931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2674218416213989, "rewards/margins": 6.63163948059082, "rewards/rejected": -6.899061679840088, "step": 676 }, { "epoch": 0.16, "learning_rate": 1.8613333333333333e-07, "logps/chosen": -244.61056518554688, "logps/rejected": -296.7117919921875, "loss": 0.0161, "losses/dpo": 1.087839700630866e-05, "losses/sft": 0.604099690914154, "losses/total": 1.087839700630866e-05, "ref_logps/chosen": -243.0210723876953, "ref_logps/rejected": -231.46707153320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.1589500606060028, "rewards/margins": 6.365522861480713, "rewards/rejected": -6.524473190307617, "step": 677 }, { "epoch": 0.16, "learning_rate": 1.8607999999999998e-07, "logps/chosen": -248.85806274414062, "logps/rejected": -279.8270568847656, "loss": 0.0312, "losses/dpo": 0.004022915381938219, "losses/sft": 0.5774445533752441, "losses/total": 0.004022915381938219, "ref_logps/chosen": -246.80288696289062, "ref_logps/rejected": -214.56504821777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.20551836490631104, "rewards/margins": 6.320682525634766, "rewards/rejected": -6.526200771331787, "step": 678 }, { "epoch": 0.16, "learning_rate": 1.8602666666666666e-07, "logps/chosen": -238.51881408691406, "logps/rejected": -273.6817626953125, "loss": 0.0213, "losses/dpo": 0.00045731020509265363, "losses/sft": 1.157962441444397, "losses/total": 0.00045731020509265363, "ref_logps/chosen": -236.81101989746094, "ref_logps/rejected": -211.0719757080078, "rewards/accuracies": 1.0, "rewards/chosen": -0.1707782745361328, "rewards/margins": 6.090199947357178, "rewards/rejected": -6.260978698730469, "step": 679 }, { "epoch": 0.16, "learning_rate": 1.859733333333333e-07, "logps/chosen": -224.96353149414062, "logps/rejected": -285.5462951660156, "loss": 0.0179, "losses/dpo": 0.0020162169821560383, "losses/sft": 0.5897823572158813, "losses/total": 0.0020162169821560383, "ref_logps/chosen": -223.43887329101562, "ref_logps/rejected": -219.11700439453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.1524657905101776, "rewards/margins": 6.490462303161621, "rewards/rejected": -6.642928123474121, "step": 680 }, { "epoch": 0.16, "learning_rate": 1.8591999999999999e-07, "logps/chosen": -263.41607666015625, "logps/rejected": -321.76373291015625, "loss": 0.0098, "losses/dpo": 9.985357610275969e-05, "losses/sft": 0.49154868721961975, "losses/total": 9.985357610275969e-05, "ref_logps/chosen": -262.6204833984375, "ref_logps/rejected": -245.11566162109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.079559326171875, "rewards/margins": 7.585247039794922, "rewards/rejected": -7.664806365966797, "step": 681 }, { "epoch": 0.16, "learning_rate": 1.8586666666666666e-07, "logps/chosen": -237.572998046875, "logps/rejected": -307.3263244628906, "loss": 0.0115, "losses/dpo": 1.2283752539588022e-06, "losses/sft": 0.5875368118286133, "losses/total": 1.2283752539588022e-06, "ref_logps/chosen": -234.50221252441406, "ref_logps/rejected": -233.9991455078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.30707812309265137, "rewards/margins": 7.025638580322266, "rewards/rejected": -7.332716941833496, "step": 682 }, { "epoch": 0.16, "learning_rate": 1.8581333333333334e-07, "logps/chosen": -229.2255096435547, "logps/rejected": -297.5015869140625, "loss": 0.0237, "losses/dpo": 0.0014841097872704268, "losses/sft": 0.6465869545936584, "losses/total": 0.0014841097872704268, "ref_logps/chosen": -227.75222778320312, "ref_logps/rejected": -234.6844482421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.14732740819454193, "rewards/margins": 6.134385108947754, "rewards/rejected": -6.281712532043457, "step": 683 }, { "epoch": 0.16, "learning_rate": 1.8576e-07, "logps/chosen": -246.97381591796875, "logps/rejected": -291.9183349609375, "loss": 0.0196, "losses/dpo": 0.0011448716977611184, "losses/sft": 0.6321596503257751, "losses/total": 0.0011448716977611184, "ref_logps/chosen": -244.3732147216797, "ref_logps/rejected": -228.3494873046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.26005953550338745, "rewards/margins": 6.096830368041992, "rewards/rejected": -6.356889724731445, "step": 684 }, { "epoch": 0.16, "learning_rate": 1.8570666666666667e-07, "logps/chosen": -235.01937866210938, "logps/rejected": -255.99876403808594, "loss": 0.0326, "losses/dpo": 0.00011232905671931803, "losses/sft": 0.4842863380908966, "losses/total": 0.00011232905671931803, "ref_logps/chosen": -230.425048828125, "ref_logps/rejected": -195.148681640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.45943304896354675, "rewards/margins": 5.625576019287109, "rewards/rejected": -6.08500862121582, "step": 685 }, { "epoch": 0.16, "learning_rate": 1.8565333333333332e-07, "logps/chosen": -201.8472137451172, "logps/rejected": -272.8246765136719, "loss": 0.0284, "losses/dpo": 0.00014343622024171054, "losses/sft": 0.6098569631576538, "losses/total": 0.00014343622024171054, "ref_logps/chosen": -200.34019470214844, "ref_logps/rejected": -209.58609008789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.15069974958896637, "rewards/margins": 6.173157691955566, "rewards/rejected": -6.323857307434082, "step": 686 }, { "epoch": 0.16, "learning_rate": 1.856e-07, "logps/chosen": -231.53245544433594, "logps/rejected": -304.98681640625, "loss": 0.0199, "losses/dpo": 0.0015590059338137507, "losses/sft": 0.45013394951820374, "losses/total": 0.0015590059338137507, "ref_logps/chosen": -229.44793701171875, "ref_logps/rejected": -233.98880004882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2084517776966095, "rewards/margins": 6.89135217666626, "rewards/rejected": -7.099803924560547, "step": 687 }, { "epoch": 0.17, "learning_rate": 1.8554666666666664e-07, "logps/chosen": -220.70880126953125, "logps/rejected": -284.5465087890625, "loss": 0.0171, "losses/dpo": 0.000141073833219707, "losses/sft": 0.5982639789581299, "losses/total": 0.000141073833219707, "ref_logps/chosen": -218.52175903320312, "ref_logps/rejected": -218.7542724609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.218703493475914, "rewards/margins": 6.360522270202637, "rewards/rejected": -6.579225540161133, "step": 688 }, { "epoch": 0.17, "learning_rate": 1.8549333333333332e-07, "logps/chosen": -264.9764404296875, "logps/rejected": -307.04315185546875, "loss": 0.0196, "losses/dpo": 0.0008658909355290234, "losses/sft": 0.5917076468467712, "losses/total": 0.0008658909355290234, "ref_logps/chosen": -262.0057067871094, "ref_logps/rejected": -240.31982421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2970775067806244, "rewards/margins": 6.375255584716797, "rewards/rejected": -6.672333240509033, "step": 689 }, { "epoch": 0.17, "learning_rate": 1.8544e-07, "logps/chosen": -242.0786590576172, "logps/rejected": -267.6102600097656, "loss": 0.0362, "losses/dpo": 1.505218187958235e-05, "losses/sft": 0.8686763644218445, "losses/total": 1.505218187958235e-05, "ref_logps/chosen": -239.59902954101562, "ref_logps/rejected": -207.0113525390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2479628622531891, "rewards/margins": 5.81192684173584, "rewards/rejected": -6.059889793395996, "step": 690 }, { "epoch": 0.17, "learning_rate": 1.8538666666666667e-07, "logps/chosen": -227.7076873779297, "logps/rejected": -290.6796875, "loss": 0.0183, "losses/dpo": 0.0002792736340779811, "losses/sft": 0.7704082131385803, "losses/total": 0.0002792736340779811, "ref_logps/chosen": -226.79376220703125, "ref_logps/rejected": -226.30941772460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.09139163792133331, "rewards/margins": 6.345635890960693, "rewards/rejected": -6.4370269775390625, "step": 691 }, { "epoch": 0.17, "learning_rate": 1.8533333333333333e-07, "logps/chosen": -200.18844604492188, "logps/rejected": -276.5862121582031, "loss": 0.0209, "losses/dpo": 2.7643434350466123e-06, "losses/sft": 0.6044830679893494, "losses/total": 2.7643434350466123e-06, "ref_logps/chosen": -199.0006866455078, "ref_logps/rejected": -214.40399169921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.11877673864364624, "rewards/margins": 6.099445343017578, "rewards/rejected": -6.218222141265869, "step": 692 }, { "epoch": 0.17, "learning_rate": 1.8528e-07, "logps/chosen": -206.29888916015625, "logps/rejected": -305.70050048828125, "loss": 0.0085, "losses/dpo": 0.008168810978531837, "losses/sft": 0.6102321147918701, "losses/total": 0.008168810978531837, "ref_logps/chosen": -205.05653381347656, "ref_logps/rejected": -234.54624938964844, "rewards/accuracies": 1.0, "rewards/chosen": -0.12423482537269592, "rewards/margins": 6.991190433502197, "rewards/rejected": -7.115424633026123, "step": 693 }, { "epoch": 0.17, "learning_rate": 1.8522666666666665e-07, "logps/chosen": -212.00930786132812, "logps/rejected": -261.7713928222656, "loss": 0.0303, "losses/dpo": 6.52274175081402e-05, "losses/sft": 1.0303603410720825, "losses/total": 6.52274175081402e-05, "ref_logps/chosen": -210.95713806152344, "ref_logps/rejected": -199.35003662109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.10521753132343292, "rewards/margins": 6.136918067932129, "rewards/rejected": -6.242135047912598, "step": 694 }, { "epoch": 0.17, "learning_rate": 1.851733333333333e-07, "logps/chosen": -224.99388122558594, "logps/rejected": -312.29534912109375, "loss": 0.0094, "losses/dpo": 0.00011174677638337016, "losses/sft": 0.5213025212287903, "losses/total": 0.00011174677638337016, "ref_logps/chosen": -222.6319580078125, "ref_logps/rejected": -238.77606201171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.23619115352630615, "rewards/margins": 7.115734100341797, "rewards/rejected": -7.351924896240234, "step": 695 }, { "epoch": 0.17, "learning_rate": 1.8511999999999998e-07, "logps/chosen": -252.2552490234375, "logps/rejected": -308.5826416015625, "loss": 0.0096, "losses/dpo": 9.819791011977941e-05, "losses/sft": 0.5661923885345459, "losses/total": 9.819791011977941e-05, "ref_logps/chosen": -249.76519775390625, "ref_logps/rejected": -236.488037109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.24900774657726288, "rewards/margins": 6.96044921875, "rewards/rejected": -7.209456443786621, "step": 696 }, { "epoch": 0.17, "learning_rate": 1.8506666666666666e-07, "logps/chosen": -203.62301635742188, "logps/rejected": -262.93035888671875, "loss": 0.0334, "losses/dpo": 0.00025524329976178706, "losses/sft": 0.68362957239151, "losses/total": 0.00025524329976178706, "ref_logps/chosen": -201.79312133789062, "ref_logps/rejected": -201.99404907226562, "rewards/accuracies": 1.0, "rewards/chosen": -0.18298925459384918, "rewards/margins": 5.910642623901367, "rewards/rejected": -6.093631744384766, "step": 697 }, { "epoch": 0.17, "learning_rate": 1.8501333333333333e-07, "logps/chosen": -220.50970458984375, "logps/rejected": -282.8822021484375, "loss": 0.0152, "losses/dpo": 7.553648174507543e-05, "losses/sft": 0.7139968276023865, "losses/total": 7.553648174507543e-05, "ref_logps/chosen": -218.97903442382812, "ref_logps/rejected": -210.17056274414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.1530662328004837, "rewards/margins": 7.118098258972168, "rewards/rejected": -7.2711639404296875, "step": 698 }, { "epoch": 0.17, "learning_rate": 1.8495999999999998e-07, "logps/chosen": -204.6310577392578, "logps/rejected": -263.0224914550781, "loss": 0.0332, "losses/dpo": 0.0001290425134357065, "losses/sft": 0.4517315924167633, "losses/total": 0.0001290425134357065, "ref_logps/chosen": -202.91444396972656, "ref_logps/rejected": -201.34808349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.17166218161582947, "rewards/margins": 5.995779037475586, "rewards/rejected": -6.167440891265869, "step": 699 }, { "epoch": 0.17, "learning_rate": 1.8490666666666666e-07, "logps/chosen": -220.99169921875, "logps/rejected": -304.8153076171875, "loss": 0.0117, "losses/dpo": 0.0003853978996630758, "losses/sft": 0.6738167405128479, "losses/total": 0.0003853978996630758, "ref_logps/chosen": -218.5123291015625, "ref_logps/rejected": -232.47288513183594, "rewards/accuracies": 1.0, "rewards/chosen": -0.24793583154678345, "rewards/margins": 6.9863057136535645, "rewards/rejected": -7.234241962432861, "step": 700 }, { "epoch": 0.17, "learning_rate": 1.8485333333333334e-07, "logps/chosen": -217.9180450439453, "logps/rejected": -310.346923828125, "loss": 0.0239, "losses/dpo": 0.0008552124490961432, "losses/sft": 0.5458157658576965, "losses/total": 0.0008552124490961432, "ref_logps/chosen": -215.5461883544922, "ref_logps/rejected": -238.6108856201172, "rewards/accuracies": 1.0, "rewards/chosen": -0.23718446493148804, "rewards/margins": 6.93641996383667, "rewards/rejected": -7.173604488372803, "step": 701 }, { "epoch": 0.17, "learning_rate": 1.848e-07, "logps/chosen": -236.68138122558594, "logps/rejected": -340.0890197753906, "loss": 0.0164, "losses/dpo": 2.738213197517325e-06, "losses/sft": 0.8582168221473694, "losses/total": 2.738213197517325e-06, "ref_logps/chosen": -233.52427673339844, "ref_logps/rejected": -261.7547607421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.31570926308631897, "rewards/margins": 7.5177202224731445, "rewards/rejected": -7.833428859710693, "step": 702 }, { "epoch": 0.17, "learning_rate": 1.8474666666666664e-07, "logps/chosen": -205.98928833007812, "logps/rejected": -262.2579650878906, "loss": 0.0336, "losses/dpo": 0.0004938638885505497, "losses/sft": 0.3695017099380493, "losses/total": 0.0004938638885505497, "ref_logps/chosen": -203.4359588623047, "ref_logps/rejected": -197.082275390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2553342282772064, "rewards/margins": 6.262236595153809, "rewards/rejected": -6.517570495605469, "step": 703 }, { "epoch": 0.17, "learning_rate": 1.8469333333333331e-07, "logps/chosen": -223.62750244140625, "logps/rejected": -263.9992370605469, "loss": 0.0259, "losses/dpo": 0.002686644671484828, "losses/sft": 0.6653663516044617, "losses/total": 0.002686644671484828, "ref_logps/chosen": -221.13909912109375, "ref_logps/rejected": -204.80484008789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.248839870095253, "rewards/margins": 5.670598983764648, "rewards/rejected": -5.919439315795898, "step": 704 }, { "epoch": 0.17, "learning_rate": 1.8464e-07, "logps/chosen": -221.64785766601562, "logps/rejected": -289.2067565917969, "loss": 0.012, "losses/dpo": 3.4432705433573574e-05, "losses/sft": 0.47229138016700745, "losses/total": 3.4432705433573574e-05, "ref_logps/chosen": -219.33004760742188, "ref_logps/rejected": -223.67262268066406, "rewards/accuracies": 1.0, "rewards/chosen": -0.23178091645240784, "rewards/margins": 6.321632385253906, "rewards/rejected": -6.553413391113281, "step": 705 }, { "epoch": 0.17, "learning_rate": 1.8458666666666667e-07, "logps/chosen": -207.50038146972656, "logps/rejected": -301.0118713378906, "loss": 0.0132, "losses/dpo": 1.0335536870798023e-07, "losses/sft": 0.8041435480117798, "losses/total": 1.0335536870798023e-07, "ref_logps/chosen": -205.9412384033203, "ref_logps/rejected": -232.65365600585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.1559162437915802, "rewards/margins": 6.679904460906982, "rewards/rejected": -6.835820198059082, "step": 706 }, { "epoch": 0.17, "learning_rate": 1.8453333333333332e-07, "logps/chosen": -208.71124267578125, "logps/rejected": -303.382568359375, "loss": 0.0256, "losses/dpo": 1.7924249959833105e-06, "losses/sft": 0.6930893063545227, "losses/total": 1.7924249959833105e-06, "ref_logps/chosen": -206.9691162109375, "ref_logps/rejected": -231.07904052734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.1742125153541565, "rewards/margins": 7.056141376495361, "rewards/rejected": -7.230353355407715, "step": 707 }, { "epoch": 0.17, "learning_rate": 1.8448e-07, "logps/chosen": -226.82705688476562, "logps/rejected": -286.4395751953125, "loss": 0.0177, "losses/dpo": 0.0016432260163128376, "losses/sft": 0.4910352826118469, "losses/total": 0.0016432260163128376, "ref_logps/chosen": -224.85171508789062, "ref_logps/rejected": -215.5059051513672, "rewards/accuracies": 1.0, "rewards/chosen": -0.19753426313400269, "rewards/margins": 6.895830154418945, "rewards/rejected": -7.093364238739014, "step": 708 }, { "epoch": 0.17, "learning_rate": 1.8442666666666667e-07, "logps/chosen": -251.40365600585938, "logps/rejected": -284.0809631347656, "loss": 0.0361, "losses/dpo": 0.00028330410714261234, "losses/sft": 0.6546087265014648, "losses/total": 0.00028330410714261234, "ref_logps/chosen": -249.00741577148438, "ref_logps/rejected": -217.59432983398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.23962414264678955, "rewards/margins": 6.409041404724121, "rewards/rejected": -6.648665428161621, "step": 709 }, { "epoch": 0.17, "learning_rate": 1.8437333333333332e-07, "logps/chosen": -184.6872100830078, "logps/rejected": -301.12890625, "loss": 0.0206, "losses/dpo": 1.0446530723129399e-05, "losses/sft": 0.5615261197090149, "losses/total": 1.0446530723129399e-05, "ref_logps/chosen": -182.92596435546875, "ref_logps/rejected": -235.2908172607422, "rewards/accuracies": 1.0, "rewards/chosen": -0.1761244535446167, "rewards/margins": 6.4076828956604, "rewards/rejected": -6.583807468414307, "step": 710 }, { "epoch": 0.17, "learning_rate": 1.8432e-07, "logps/chosen": -213.3314208984375, "logps/rejected": -316.6830749511719, "loss": 0.0138, "losses/dpo": 1.3401852811512072e-05, "losses/sft": 0.8203626871109009, "losses/total": 1.3401852811512072e-05, "ref_logps/chosen": -211.35751342773438, "ref_logps/rejected": -243.75611877441406, "rewards/accuracies": 1.0, "rewards/chosen": -0.19739101827144623, "rewards/margins": 7.095305919647217, "rewards/rejected": -7.292696952819824, "step": 711 }, { "epoch": 0.17, "learning_rate": 1.8426666666666665e-07, "logps/chosen": -217.16783142089844, "logps/rejected": -292.7054443359375, "loss": 0.0251, "losses/dpo": 0.00047132972395047545, "losses/sft": 0.8907309770584106, "losses/total": 0.00047132972395047545, "ref_logps/chosen": -215.09275817871094, "ref_logps/rejected": -227.61952209472656, "rewards/accuracies": 1.0, "rewards/chosen": -0.20750805735588074, "rewards/margins": 6.301081657409668, "rewards/rejected": -6.508589744567871, "step": 712 }, { "epoch": 0.17, "learning_rate": 1.8421333333333333e-07, "logps/chosen": -252.916015625, "logps/rejected": -311.8254089355469, "loss": 0.0084, "losses/dpo": 3.0291168513940647e-05, "losses/sft": 0.6083870530128479, "losses/total": 3.0291168513940647e-05, "ref_logps/chosen": -249.10110473632812, "ref_logps/rejected": -238.65545654296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3814922571182251, "rewards/margins": 6.935502052307129, "rewards/rejected": -7.3169941902160645, "step": 713 }, { "epoch": 0.17, "learning_rate": 1.8415999999999998e-07, "logps/chosen": -241.83058166503906, "logps/rejected": -310.5653381347656, "loss": 0.0165, "losses/dpo": 0.004705921746790409, "losses/sft": 0.3776681125164032, "losses/total": 0.004705921746790409, "ref_logps/chosen": -238.7261199951172, "ref_logps/rejected": -242.63726806640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.31044623255729675, "rewards/margins": 6.482360363006592, "rewards/rejected": -6.792806625366211, "step": 714 }, { "epoch": 0.17, "learning_rate": 1.8410666666666665e-07, "logps/chosen": -223.82281494140625, "logps/rejected": -297.7036437988281, "loss": 0.0259, "losses/dpo": 0.0008974538650363684, "losses/sft": 0.6329580545425415, "losses/total": 0.0008974538650363684, "ref_logps/chosen": -221.7598876953125, "ref_logps/rejected": -230.08665466308594, "rewards/accuracies": 1.0, "rewards/chosen": -0.20629093050956726, "rewards/margins": 6.5554094314575195, "rewards/rejected": -6.76170015335083, "step": 715 }, { "epoch": 0.17, "learning_rate": 1.8405333333333333e-07, "logps/chosen": -203.8299560546875, "logps/rejected": -261.95391845703125, "loss": 0.0262, "losses/dpo": 0.04614021256566048, "losses/sft": 0.8990597724914551, "losses/total": 0.04614021256566048, "ref_logps/chosen": -201.81649780273438, "ref_logps/rejected": -199.86358642578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.20134663581848145, "rewards/margins": 6.007688522338867, "rewards/rejected": -6.2090349197387695, "step": 716 }, { "epoch": 0.17, "learning_rate": 1.84e-07, "logps/chosen": -216.01626586914062, "logps/rejected": -281.11920166015625, "loss": 0.0199, "losses/dpo": 7.254664069478167e-06, "losses/sft": 1.0556048154830933, "losses/total": 7.254664069478167e-06, "ref_logps/chosen": -214.65261840820312, "ref_logps/rejected": -211.8323974609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.13636259734630585, "rewards/margins": 6.79231595993042, "rewards/rejected": -6.928678512573242, "step": 717 }, { "epoch": 0.17, "learning_rate": 1.8394666666666666e-07, "logps/chosen": -228.41122436523438, "logps/rejected": -285.64324951171875, "loss": 0.0136, "losses/dpo": 7.13574399924255e-06, "losses/sft": 1.0330493450164795, "losses/total": 7.13574399924255e-06, "ref_logps/chosen": -227.626708984375, "ref_logps/rejected": -219.12208557128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.07845266163349152, "rewards/margins": 6.573662281036377, "rewards/rejected": -6.6521148681640625, "step": 718 }, { "epoch": 0.17, "learning_rate": 1.8389333333333333e-07, "logps/chosen": -239.4134063720703, "logps/rejected": -301.1939392089844, "loss": 0.0075, "losses/dpo": 6.537321951327613e-06, "losses/sft": 0.6376122832298279, "losses/total": 6.537321951327613e-06, "ref_logps/chosen": -237.8458251953125, "ref_logps/rejected": -224.906982421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1567579060792923, "rewards/margins": 7.471937656402588, "rewards/rejected": -7.628696441650391, "step": 719 }, { "epoch": 0.17, "learning_rate": 1.8383999999999998e-07, "logps/chosen": -240.90658569335938, "logps/rejected": -263.4462585449219, "loss": 0.0472, "losses/dpo": 0.00045551927178166807, "losses/sft": 0.665076732635498, "losses/total": 0.00045551927178166807, "ref_logps/chosen": -236.8219757080078, "ref_logps/rejected": -201.85076904296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4084607660770416, "rewards/margins": 5.7510881423950195, "rewards/rejected": -6.159549713134766, "step": 720 }, { "epoch": 0.17, "learning_rate": 1.8378666666666666e-07, "logps/chosen": -237.21011352539062, "logps/rejected": -296.7510681152344, "loss": 0.0165, "losses/dpo": 0.00011065317085012794, "losses/sft": 0.528278112411499, "losses/total": 0.00011065317085012794, "ref_logps/chosen": -234.27011108398438, "ref_logps/rejected": -223.77105712890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2940009832382202, "rewards/margins": 7.004000663757324, "rewards/rejected": -7.298001766204834, "step": 721 }, { "epoch": 0.17, "learning_rate": 1.837333333333333e-07, "logps/chosen": -253.56069946289062, "logps/rejected": -283.45526123046875, "loss": 0.029, "losses/dpo": 1.9863415218424052e-05, "losses/sft": 0.7061198949813843, "losses/total": 1.9863415218424052e-05, "ref_logps/chosen": -250.13922119140625, "ref_logps/rejected": -215.1302490234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.34215039014816284, "rewards/margins": 6.490346908569336, "rewards/rejected": -6.832498073577881, "step": 722 }, { "epoch": 0.17, "learning_rate": 1.8368e-07, "logps/chosen": -228.3936309814453, "logps/rejected": -296.97833251953125, "loss": 0.0166, "losses/dpo": 0.004925881512463093, "losses/sft": 0.528667688369751, "losses/total": 0.004925881512463093, "ref_logps/chosen": -226.2232666015625, "ref_logps/rejected": -228.14273071289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.217037171125412, "rewards/margins": 6.666524887084961, "rewards/rejected": -6.883562088012695, "step": 723 }, { "epoch": 0.17, "learning_rate": 1.8362666666666666e-07, "logps/chosen": -183.36163330078125, "logps/rejected": -271.7838134765625, "loss": 0.0299, "losses/dpo": 1.799530053858689e-07, "losses/sft": 0.7141940593719482, "losses/total": 1.799530053858689e-07, "ref_logps/chosen": -182.65826416015625, "ref_logps/rejected": -210.92800903320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.07033591717481613, "rewards/margins": 6.01524543762207, "rewards/rejected": -6.085582256317139, "step": 724 }, { "epoch": 0.17, "learning_rate": 1.8357333333333334e-07, "logps/chosen": -223.40255737304688, "logps/rejected": -299.0289306640625, "loss": 0.009, "losses/dpo": 0.00013481135829351842, "losses/sft": 0.3919670283794403, "losses/total": 0.00013481135829351842, "ref_logps/chosen": -221.87762451171875, "ref_logps/rejected": -229.30169677734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.15249471366405487, "rewards/margins": 6.820225238800049, "rewards/rejected": -6.972720146179199, "step": 725 }, { "epoch": 0.17, "learning_rate": 1.8352e-07, "logps/chosen": -207.5797119140625, "logps/rejected": -324.83648681640625, "loss": 0.0123, "losses/dpo": 1.710168544377666e-05, "losses/sft": 0.5592018365859985, "losses/total": 1.710168544377666e-05, "ref_logps/chosen": -206.35098266601562, "ref_logps/rejected": -252.77725219726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.12287311255931854, "rewards/margins": 7.083049774169922, "rewards/rejected": -7.205922603607178, "step": 726 }, { "epoch": 0.17, "learning_rate": 1.8346666666666667e-07, "logps/chosen": -247.23239135742188, "logps/rejected": -281.1535339355469, "loss": 0.0194, "losses/dpo": 0.0002391477901255712, "losses/sft": 0.5847192406654358, "losses/total": 0.0002391477901255712, "ref_logps/chosen": -245.2015838623047, "ref_logps/rejected": -214.72836303710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.20308105647563934, "rewards/margins": 6.43943452835083, "rewards/rejected": -6.642516136169434, "step": 727 }, { "epoch": 0.17, "learning_rate": 1.8341333333333332e-07, "logps/chosen": -241.7789764404297, "logps/rejected": -313.59527587890625, "loss": 0.0203, "losses/dpo": 2.0824294551857747e-05, "losses/sft": 0.6538466811180115, "losses/total": 2.0824294551857747e-05, "ref_logps/chosen": -240.47940063476562, "ref_logps/rejected": -246.01063537597656, "rewards/accuracies": 1.0, "rewards/chosen": -0.1299562156200409, "rewards/margins": 6.628506183624268, "rewards/rejected": -6.758462429046631, "step": 728 }, { "epoch": 0.17, "learning_rate": 1.8335999999999997e-07, "logps/chosen": -256.98394775390625, "logps/rejected": -306.2230224609375, "loss": 0.0231, "losses/dpo": 4.9544723879080266e-05, "losses/sft": 0.8952574729919434, "losses/total": 4.9544723879080266e-05, "ref_logps/chosen": -254.39796447753906, "ref_logps/rejected": -234.11508178710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.2586023807525635, "rewards/margins": 6.952191352844238, "rewards/rejected": -7.210793495178223, "step": 729 }, { "epoch": 0.18, "learning_rate": 1.8330666666666665e-07, "logps/chosen": -222.3695831298828, "logps/rejected": -306.9970397949219, "loss": 0.0101, "losses/dpo": 3.175493111484684e-05, "losses/sft": 0.9891304969787598, "losses/total": 3.175493111484684e-05, "ref_logps/chosen": -221.10134887695312, "ref_logps/rejected": -233.06466674804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.12682487070560455, "rewards/margins": 7.266412258148193, "rewards/rejected": -7.393237590789795, "step": 730 }, { "epoch": 0.18, "learning_rate": 1.8325333333333332e-07, "logps/chosen": -284.65301513671875, "logps/rejected": -306.78271484375, "loss": 0.021, "losses/dpo": 8.794535460765474e-06, "losses/sft": 0.7533326148986816, "losses/total": 8.794535460765474e-06, "ref_logps/chosen": -282.14422607421875, "ref_logps/rejected": -233.161865234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.25087955594062805, "rewards/margins": 7.111203670501709, "rewards/rejected": -7.362083435058594, "step": 731 }, { "epoch": 0.18, "learning_rate": 1.832e-07, "logps/chosen": -207.26226806640625, "logps/rejected": -284.65606689453125, "loss": 0.0297, "losses/dpo": 1.7253536270800396e-06, "losses/sft": 0.5827276706695557, "losses/total": 1.7253536270800396e-06, "ref_logps/chosen": -205.65170288085938, "ref_logps/rejected": -220.00845336914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.16105663776397705, "rewards/margins": 6.303701400756836, "rewards/rejected": -6.46475887298584, "step": 732 }, { "epoch": 0.18, "learning_rate": 1.8314666666666665e-07, "logps/chosen": -241.23854064941406, "logps/rejected": -331.87139892578125, "loss": 0.004, "losses/dpo": 9.906323248287663e-05, "losses/sft": 0.5709455609321594, "losses/total": 9.906323248287663e-05, "ref_logps/chosen": -237.5550079345703, "ref_logps/rejected": -256.2131042480469, "rewards/accuracies": 1.0, "rewards/chosen": -0.36835333704948425, "rewards/margins": 7.197473049163818, "rewards/rejected": -7.565825939178467, "step": 733 }, { "epoch": 0.18, "learning_rate": 1.8309333333333333e-07, "logps/chosen": -187.38302612304688, "logps/rejected": -278.1702880859375, "loss": 0.0159, "losses/dpo": 6.5265203375020064e-06, "losses/sft": 0.6167894005775452, "losses/total": 6.5265203375020064e-06, "ref_logps/chosen": -185.72000122070312, "ref_logps/rejected": -208.5277862548828, "rewards/accuracies": 1.0, "rewards/chosen": -0.1663035899400711, "rewards/margins": 6.797948837280273, "rewards/rejected": -6.964252471923828, "step": 734 }, { "epoch": 0.18, "learning_rate": 1.8304e-07, "logps/chosen": -211.73614501953125, "logps/rejected": -288.9454345703125, "loss": 0.0091, "losses/dpo": 0.0007459474727511406, "losses/sft": 0.5166183114051819, "losses/total": 0.0007459474727511406, "ref_logps/chosen": -209.0272216796875, "ref_logps/rejected": -223.37757873535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.2708940804004669, "rewards/margins": 6.285891532897949, "rewards/rejected": -6.556785583496094, "step": 735 }, { "epoch": 0.18, "learning_rate": 1.8298666666666668e-07, "logps/chosen": -232.69595336914062, "logps/rejected": -301.8782958984375, "loss": 0.0133, "losses/dpo": 9.218051673087757e-06, "losses/sft": 0.6459198594093323, "losses/total": 9.218051673087757e-06, "ref_logps/chosen": -230.54837036132812, "ref_logps/rejected": -230.33868408203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.21475833654403687, "rewards/margins": 6.93920373916626, "rewards/rejected": -7.153961658477783, "step": 736 }, { "epoch": 0.18, "learning_rate": 1.8293333333333333e-07, "logps/chosen": -175.35723876953125, "logps/rejected": -239.90203857421875, "loss": 0.0392, "losses/dpo": 8.925748261390254e-06, "losses/sft": 0.5967206358909607, "losses/total": 8.925748261390254e-06, "ref_logps/chosen": -173.15411376953125, "ref_logps/rejected": -180.4468994140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.22031357884407043, "rewards/margins": 5.725198745727539, "rewards/rejected": -5.945512771606445, "step": 737 }, { "epoch": 0.18, "learning_rate": 1.8287999999999998e-07, "logps/chosen": -216.36782836914062, "logps/rejected": -277.80462646484375, "loss": 0.0297, "losses/dpo": 0.0012046957854181528, "losses/sft": 0.5413949489593506, "losses/total": 0.0012046957854181528, "ref_logps/chosen": -213.76412963867188, "ref_logps/rejected": -204.15975952148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.2603718638420105, "rewards/margins": 7.104114532470703, "rewards/rejected": -7.364485740661621, "step": 738 }, { "epoch": 0.18, "learning_rate": 1.8282666666666666e-07, "logps/chosen": -239.76600646972656, "logps/rejected": -277.00677490234375, "loss": 0.0249, "losses/dpo": 0.2178698182106018, "losses/sft": 0.5777155160903931, "losses/total": 0.2178698182106018, "ref_logps/chosen": -235.50582885742188, "ref_logps/rejected": -214.70175170898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4260174036026001, "rewards/margins": 5.804485321044922, "rewards/rejected": -6.230503082275391, "step": 739 }, { "epoch": 0.18, "learning_rate": 1.8277333333333333e-07, "logps/chosen": -223.03195190429688, "logps/rejected": -291.6640930175781, "loss": 0.0254, "losses/dpo": 0.00033495030947960913, "losses/sft": 1.2506170272827148, "losses/total": 0.00033495030947960913, "ref_logps/chosen": -221.55899047851562, "ref_logps/rejected": -225.44384765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.14729654788970947, "rewards/margins": 6.474727630615234, "rewards/rejected": -6.6220245361328125, "step": 740 }, { "epoch": 0.18, "learning_rate": 1.8271999999999998e-07, "logps/chosen": -244.33473205566406, "logps/rejected": -270.43450927734375, "loss": 0.0383, "losses/dpo": 1.9880581021425314e-06, "losses/sft": 0.5468204021453857, "losses/total": 1.9880581021425314e-06, "ref_logps/chosen": -240.4906768798828, "ref_logps/rejected": -208.96022033691406, "rewards/accuracies": 1.0, "rewards/chosen": -0.38440555334091187, "rewards/margins": 5.763020038604736, "rewards/rejected": -6.147425651550293, "step": 741 }, { "epoch": 0.18, "learning_rate": 1.8266666666666666e-07, "logps/chosen": -254.1972198486328, "logps/rejected": -289.9871826171875, "loss": 0.0072, "losses/dpo": 3.116549578408012e-07, "losses/sft": 0.4610597789287567, "losses/total": 3.116549578408012e-07, "ref_logps/chosen": -252.90573120117188, "ref_logps/rejected": -220.76242065429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.12914806604385376, "rewards/margins": 6.793328285217285, "rewards/rejected": -6.922476291656494, "step": 742 }, { "epoch": 0.18, "learning_rate": 1.8261333333333334e-07, "logps/chosen": -237.84645080566406, "logps/rejected": -280.1368713378906, "loss": 0.026, "losses/dpo": 0.0002344648673897609, "losses/sft": 0.820138156414032, "losses/total": 0.0002344648673897609, "ref_logps/chosen": -234.97366333007812, "ref_logps/rejected": -210.8067169189453, "rewards/accuracies": 1.0, "rewards/chosen": -0.2872806787490845, "rewards/margins": 6.64573335647583, "rewards/rejected": -6.933014392852783, "step": 743 }, { "epoch": 0.18, "learning_rate": 1.8256e-07, "logps/chosen": -263.646484375, "logps/rejected": -307.05853271484375, "loss": 0.0076, "losses/dpo": 6.213145752553828e-06, "losses/sft": 0.7219012379646301, "losses/total": 6.213145752553828e-06, "ref_logps/chosen": -260.4364318847656, "ref_logps/rejected": -230.4541473388672, "rewards/accuracies": 1.0, "rewards/chosen": -0.3210061490535736, "rewards/margins": 7.339430809020996, "rewards/rejected": -7.66043758392334, "step": 744 }, { "epoch": 0.18, "learning_rate": 1.8250666666666667e-07, "logps/chosen": -218.94473266601562, "logps/rejected": -288.54595947265625, "loss": 0.0108, "losses/dpo": 0.0002310868148924783, "losses/sft": 0.988964319229126, "losses/total": 0.0002310868148924783, "ref_logps/chosen": -216.91802978515625, "ref_logps/rejected": -220.49468994140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2026723474264145, "rewards/margins": 6.602451324462891, "rewards/rejected": -6.805123805999756, "step": 745 }, { "epoch": 0.18, "learning_rate": 1.8245333333333332e-07, "logps/chosen": -261.15533447265625, "logps/rejected": -303.3995361328125, "loss": 0.0091, "losses/dpo": 0.0009333047200925648, "losses/sft": 0.5659186840057373, "losses/total": 0.0009333047200925648, "ref_logps/chosen": -258.96282958984375, "ref_logps/rejected": -232.0775146484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.2192530632019043, "rewards/margins": 6.912948131561279, "rewards/rejected": -7.132201194763184, "step": 746 }, { "epoch": 0.18, "learning_rate": 1.824e-07, "logps/chosen": -237.2837677001953, "logps/rejected": -320.2008361816406, "loss": 0.0136, "losses/dpo": 3.5803179798676865e-06, "losses/sft": 1.123445987701416, "losses/total": 3.5803179798676865e-06, "ref_logps/chosen": -235.92172241210938, "ref_logps/rejected": -238.59051513671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.13620460033416748, "rewards/margins": 8.024826049804688, "rewards/rejected": -8.161030769348145, "step": 747 }, { "epoch": 0.18, "learning_rate": 1.8234666666666664e-07, "logps/chosen": -208.2860107421875, "logps/rejected": -298.51519775390625, "loss": 0.0209, "losses/dpo": 0.0010935959871858358, "losses/sft": 0.5005756616592407, "losses/total": 0.0010935959871858358, "ref_logps/chosen": -207.2178192138672, "ref_logps/rejected": -228.5428009033203, "rewards/accuracies": 1.0, "rewards/chosen": -0.10681846737861633, "rewards/margins": 6.890421390533447, "rewards/rejected": -6.99724006652832, "step": 748 }, { "epoch": 0.18, "learning_rate": 1.8229333333333332e-07, "logps/chosen": -176.71160888671875, "logps/rejected": -247.3524169921875, "loss": 0.0539, "losses/dpo": 6.006130570312962e-05, "losses/sft": 0.6923603415489197, "losses/total": 6.006130570312962e-05, "ref_logps/chosen": -174.66722106933594, "ref_logps/rejected": -186.79005432128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.2044396847486496, "rewards/margins": 5.851796627044678, "rewards/rejected": -6.056236267089844, "step": 749 }, { "epoch": 0.18, "learning_rate": 1.8224e-07, "logps/chosen": -231.4087371826172, "logps/rejected": -309.48516845703125, "loss": 0.0226, "losses/dpo": 7.988329343788791e-06, "losses/sft": 0.5862058401107788, "losses/total": 7.988329343788791e-06, "ref_logps/chosen": -228.95079040527344, "ref_logps/rejected": -234.15032958984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.24579690396785736, "rewards/margins": 7.287687301635742, "rewards/rejected": -7.53348445892334, "step": 750 }, { "epoch": 0.18, "learning_rate": 1.8218666666666667e-07, "logps/chosen": -216.3301544189453, "logps/rejected": -292.26617431640625, "loss": 0.0253, "losses/dpo": 0.0011975705856457353, "losses/sft": 0.6909315586090088, "losses/total": 0.0011975705856457353, "ref_logps/chosen": -214.3068084716797, "ref_logps/rejected": -221.80137634277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.2023339867591858, "rewards/margins": 6.844146728515625, "rewards/rejected": -7.046480178833008, "step": 751 }, { "epoch": 0.18, "learning_rate": 1.8213333333333332e-07, "logps/chosen": -255.9632110595703, "logps/rejected": -328.28759765625, "loss": 0.0182, "losses/dpo": 1.9069630070589483e-05, "losses/sft": 0.5926684737205505, "losses/total": 1.9069630070589483e-05, "ref_logps/chosen": -252.59268188476562, "ref_logps/rejected": -250.4338836669922, "rewards/accuracies": 1.0, "rewards/chosen": -0.33705252408981323, "rewards/margins": 7.4483184814453125, "rewards/rejected": -7.785370349884033, "step": 752 }, { "epoch": 0.18, "learning_rate": 1.8208e-07, "logps/chosen": -210.9317169189453, "logps/rejected": -296.4449157714844, "loss": 0.0113, "losses/dpo": 0.000663541432004422, "losses/sft": 0.6370087265968323, "losses/total": 0.000663541432004422, "ref_logps/chosen": -209.44981384277344, "ref_logps/rejected": -226.85250854492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.14818927645683289, "rewards/margins": 6.811049461364746, "rewards/rejected": -6.959238529205322, "step": 753 }, { "epoch": 0.18, "learning_rate": 1.8202666666666665e-07, "logps/chosen": -209.67361450195312, "logps/rejected": -274.05133056640625, "loss": 0.0146, "losses/dpo": 0.00882202573120594, "losses/sft": 0.5984695553779602, "losses/total": 0.00882202573120594, "ref_logps/chosen": -207.65655517578125, "ref_logps/rejected": -205.9864044189453, "rewards/accuracies": 1.0, "rewards/chosen": -0.2017056792974472, "rewards/margins": 6.604785919189453, "rewards/rejected": -6.806491374969482, "step": 754 }, { "epoch": 0.18, "learning_rate": 1.8197333333333333e-07, "logps/chosen": -191.17568969726562, "logps/rejected": -288.12823486328125, "loss": 0.0269, "losses/dpo": 6.433633825508878e-05, "losses/sft": 0.5744246244430542, "losses/total": 6.433633825508878e-05, "ref_logps/chosen": -189.78610229492188, "ref_logps/rejected": -223.8434295654297, "rewards/accuracies": 1.0, "rewards/chosen": -0.13896125555038452, "rewards/margins": 6.289520263671875, "rewards/rejected": -6.428481101989746, "step": 755 }, { "epoch": 0.18, "learning_rate": 1.8191999999999998e-07, "logps/chosen": -264.8730773925781, "logps/rejected": -305.1334228515625, "loss": 0.0194, "losses/dpo": 7.298298896785127e-06, "losses/sft": 0.5772853493690491, "losses/total": 7.298298896785127e-06, "ref_logps/chosen": -260.6629638671875, "ref_logps/rejected": -233.2104034423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.421008825302124, "rewards/margins": 6.771295547485352, "rewards/rejected": -7.1923041343688965, "step": 756 }, { "epoch": 0.18, "learning_rate": 1.8186666666666665e-07, "logps/chosen": -243.25567626953125, "logps/rejected": -322.9764099121094, "loss": 0.0195, "losses/dpo": 0.0013754653045907617, "losses/sft": 0.684796929359436, "losses/total": 0.0013754653045907617, "ref_logps/chosen": -239.32261657714844, "ref_logps/rejected": -243.05569458007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.39330679178237915, "rewards/margins": 7.598763465881348, "rewards/rejected": -7.992070198059082, "step": 757 }, { "epoch": 0.18, "learning_rate": 1.8181333333333333e-07, "logps/chosen": -197.71713256835938, "logps/rejected": -263.8104248046875, "loss": 0.0312, "losses/dpo": 0.008694583550095558, "losses/sft": 1.0325082540512085, "losses/total": 0.008694583550095558, "ref_logps/chosen": -194.0986328125, "ref_logps/rejected": -200.63787841796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.36185115575790405, "rewards/margins": 5.9554033279418945, "rewards/rejected": -6.317255020141602, "step": 758 }, { "epoch": 0.18, "learning_rate": 1.8176e-07, "logps/chosen": -254.54161071777344, "logps/rejected": -344.4237060546875, "loss": 0.0094, "losses/dpo": 4.806967990589328e-05, "losses/sft": 0.5361879467964172, "losses/total": 4.806967990589328e-05, "ref_logps/chosen": -251.05145263671875, "ref_logps/rejected": -266.6216125488281, "rewards/accuracies": 1.0, "rewards/chosen": -0.34901559352874756, "rewards/margins": 7.4311933517456055, "rewards/rejected": -7.780208587646484, "step": 759 }, { "epoch": 0.18, "learning_rate": 1.8170666666666666e-07, "logps/chosen": -248.96493530273438, "logps/rejected": -267.24139404296875, "loss": 0.0166, "losses/dpo": 8.209246152546257e-05, "losses/sft": 0.6235129833221436, "losses/total": 8.209246152546257e-05, "ref_logps/chosen": -245.19479370117188, "ref_logps/rejected": -205.8109130859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.3770137131214142, "rewards/margins": 5.766032695770264, "rewards/rejected": -6.1430463790893555, "step": 760 }, { "epoch": 0.18, "learning_rate": 1.8165333333333333e-07, "logps/chosen": -238.03976440429688, "logps/rejected": -315.2229309082031, "loss": 0.0183, "losses/dpo": 0.0007535778568126261, "losses/sft": 0.574794590473175, "losses/total": 0.0007535778568126261, "ref_logps/chosen": -236.800048828125, "ref_logps/rejected": -238.95822143554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.12397058308124542, "rewards/margins": 7.502501487731934, "rewards/rejected": -7.626471519470215, "step": 761 }, { "epoch": 0.18, "learning_rate": 1.8159999999999999e-07, "logps/chosen": -242.73593139648438, "logps/rejected": -287.71356201171875, "loss": 0.0347, "losses/dpo": 0.0011056094663217664, "losses/sft": 0.38141462206840515, "losses/total": 0.0011056094663217664, "ref_logps/chosen": -240.34597778320312, "ref_logps/rejected": -220.1634979248047, "rewards/accuracies": 1.0, "rewards/chosen": -0.23899541795253754, "rewards/margins": 6.516009330749512, "rewards/rejected": -6.755004405975342, "step": 762 }, { "epoch": 0.18, "learning_rate": 1.8154666666666664e-07, "logps/chosen": -239.428466796875, "logps/rejected": -289.003173828125, "loss": 0.0206, "losses/dpo": 2.9052982426946983e-05, "losses/sft": 0.6155385375022888, "losses/total": 2.9052982426946983e-05, "ref_logps/chosen": -235.6551055908203, "ref_logps/rejected": -218.81285095214844, "rewards/accuracies": 1.0, "rewards/chosen": -0.37733399868011475, "rewards/margins": 6.641698837280273, "rewards/rejected": -7.0190324783325195, "step": 763 }, { "epoch": 0.18, "learning_rate": 1.814933333333333e-07, "logps/chosen": -216.29071044921875, "logps/rejected": -301.332275390625, "loss": 0.0078, "losses/dpo": 0.0020775855518877506, "losses/sft": 0.6386090517044067, "losses/total": 0.0020775855518877506, "ref_logps/chosen": -215.0421905517578, "ref_logps/rejected": -230.1698760986328, "rewards/accuracies": 1.0, "rewards/chosen": -0.12485063076019287, "rewards/margins": 6.991391181945801, "rewards/rejected": -7.116241931915283, "step": 764 }, { "epoch": 0.18, "learning_rate": 1.8144e-07, "logps/chosen": -217.95770263671875, "logps/rejected": -277.97705078125, "loss": 0.0276, "losses/dpo": 0.00020516580843832344, "losses/sft": 0.6003154516220093, "losses/total": 0.00020516580843832344, "ref_logps/chosen": -215.03555297851562, "ref_logps/rejected": -213.35813903808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.2922150492668152, "rewards/margins": 6.169675827026367, "rewards/rejected": -6.461891174316406, "step": 765 }, { "epoch": 0.18, "learning_rate": 1.8138666666666667e-07, "logps/chosen": -218.62850952148438, "logps/rejected": -268.3609313964844, "loss": 0.044, "losses/dpo": 0.00021087955974508077, "losses/sft": 1.0159729719161987, "losses/total": 0.00021087955974508077, "ref_logps/chosen": -217.18673706054688, "ref_logps/rejected": -204.28530883789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.1441780924797058, "rewards/margins": 6.263383388519287, "rewards/rejected": -6.407561302185059, "step": 766 }, { "epoch": 0.18, "learning_rate": 1.8133333333333332e-07, "logps/chosen": -223.97894287109375, "logps/rejected": -324.6712646484375, "loss": 0.0179, "losses/dpo": 7.137266777590412e-08, "losses/sft": 0.929526150226593, "losses/total": 7.137266777590412e-08, "ref_logps/chosen": -220.38868713378906, "ref_logps/rejected": -244.81085205078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.35902732610702515, "rewards/margins": 7.62701416015625, "rewards/rejected": -7.9860405921936035, "step": 767 }, { "epoch": 0.18, "learning_rate": 1.8128e-07, "logps/chosen": -249.56150817871094, "logps/rejected": -328.23480224609375, "loss": 0.0206, "losses/dpo": 8.835379361471496e-08, "losses/sft": 0.8817064762115479, "losses/total": 8.835379361471496e-08, "ref_logps/chosen": -246.49810791015625, "ref_logps/rejected": -245.77899169921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3063419759273529, "rewards/margins": 7.939237594604492, "rewards/rejected": -8.24557876586914, "step": 768 }, { "epoch": 0.18, "learning_rate": 1.8122666666666667e-07, "logps/chosen": -201.2489776611328, "logps/rejected": -285.6650085449219, "loss": 0.0121, "losses/dpo": 4.2932399082928896e-05, "losses/sft": 0.6545699238777161, "losses/total": 4.2932399082928896e-05, "ref_logps/chosen": -199.57342529296875, "ref_logps/rejected": -216.9048309326172, "rewards/accuracies": 1.0, "rewards/chosen": -0.167555034160614, "rewards/margins": 6.7084641456604, "rewards/rejected": -6.876019477844238, "step": 769 }, { "epoch": 0.18, "learning_rate": 1.8117333333333335e-07, "logps/chosen": -309.35980224609375, "logps/rejected": -326.00384521484375, "loss": 0.0222, "losses/dpo": 3.162533903378062e-05, "losses/sft": 0.9822425842285156, "losses/total": 3.162533903378062e-05, "ref_logps/chosen": -305.3708801269531, "ref_logps/rejected": -244.7895965576172, "rewards/accuracies": 1.0, "rewards/chosen": -0.39889293909072876, "rewards/margins": 7.722531318664551, "rewards/rejected": -8.121424674987793, "step": 770 }, { "epoch": 0.19, "learning_rate": 1.8112e-07, "logps/chosen": -255.4658660888672, "logps/rejected": -309.9596862792969, "loss": 0.0205, "losses/dpo": 0.0001355895510641858, "losses/sft": 0.5914327502250671, "losses/total": 0.0001355895510641858, "ref_logps/chosen": -252.0928192138672, "ref_logps/rejected": -234.60714721679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.33730548620224, "rewards/margins": 7.197948455810547, "rewards/rejected": -7.535253524780273, "step": 771 }, { "epoch": 0.19, "learning_rate": 1.8106666666666665e-07, "logps/chosen": -227.32284545898438, "logps/rejected": -296.239013671875, "loss": 0.0233, "losses/dpo": 8.987874025478959e-05, "losses/sft": 0.4358251094818115, "losses/total": 8.987874025478959e-05, "ref_logps/chosen": -224.0119171142578, "ref_logps/rejected": -228.5479736328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3310922086238861, "rewards/margins": 6.438011169433594, "rewards/rejected": -6.769103050231934, "step": 772 }, { "epoch": 0.19, "learning_rate": 1.8101333333333332e-07, "logps/chosen": -256.3084411621094, "logps/rejected": -340.4976806640625, "loss": 0.0109, "losses/dpo": 5.912906544836005e-06, "losses/sft": 0.5772972106933594, "losses/total": 5.912906544836005e-06, "ref_logps/chosen": -253.66827392578125, "ref_logps/rejected": -264.39605712890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2640170454978943, "rewards/margins": 7.346141338348389, "rewards/rejected": -7.610158920288086, "step": 773 }, { "epoch": 0.19, "learning_rate": 1.8096e-07, "logps/chosen": -198.06253051757812, "logps/rejected": -286.59027099609375, "loss": 0.0083, "losses/dpo": 0.0001921307120937854, "losses/sft": 0.6781694889068604, "losses/total": 0.0001921307120937854, "ref_logps/chosen": -197.4798583984375, "ref_logps/rejected": -210.02685546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.05826844647526741, "rewards/margins": 7.598072052001953, "rewards/rejected": -7.656341075897217, "step": 774 }, { "epoch": 0.19, "learning_rate": 1.8090666666666665e-07, "logps/chosen": -239.30584716796875, "logps/rejected": -281.42486572265625, "loss": 0.0181, "losses/dpo": 0.004641265608370304, "losses/sft": 1.0658022165298462, "losses/total": 0.004641265608370304, "ref_logps/chosen": -236.61181640625, "ref_logps/rejected": -211.20606994628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.26940423250198364, "rewards/margins": 6.752475261688232, "rewards/rejected": -7.02187967300415, "step": 775 }, { "epoch": 0.19, "learning_rate": 1.8085333333333333e-07, "logps/chosen": -269.210693359375, "logps/rejected": -275.30743408203125, "loss": 0.0141, "losses/dpo": 1.2843947843066417e-05, "losses/sft": 0.6454629898071289, "losses/total": 1.2843947843066417e-05, "ref_logps/chosen": -265.8135986328125, "ref_logps/rejected": -200.89431762695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.3397122621536255, "rewards/margins": 7.101599216461182, "rewards/rejected": -7.441311359405518, "step": 776 }, { "epoch": 0.19, "learning_rate": 1.808e-07, "logps/chosen": -278.9920959472656, "logps/rejected": -296.32183837890625, "loss": 0.0162, "losses/dpo": 4.199220711598173e-06, "losses/sft": 0.6791035532951355, "losses/total": 4.199220711598173e-06, "ref_logps/chosen": -275.7860107421875, "ref_logps/rejected": -220.04229736328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3206076920032501, "rewards/margins": 7.307347297668457, "rewards/rejected": -7.627955436706543, "step": 777 }, { "epoch": 0.19, "learning_rate": 1.8074666666666668e-07, "logps/chosen": -205.47052001953125, "logps/rejected": -266.1625061035156, "loss": 0.0126, "losses/dpo": 4.813386112800799e-05, "losses/sft": 0.6116545796394348, "losses/total": 4.813386112800799e-05, "ref_logps/chosen": -203.22723388671875, "ref_logps/rejected": -196.75381469726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.22433048486709595, "rewards/margins": 6.716538429260254, "rewards/rejected": -6.940868377685547, "step": 778 }, { "epoch": 0.19, "learning_rate": 1.8069333333333333e-07, "logps/chosen": -225.02853393554688, "logps/rejected": -304.46429443359375, "loss": 0.012, "losses/dpo": 7.313167316169711e-06, "losses/sft": 0.5657708644866943, "losses/total": 7.313167316169711e-06, "ref_logps/chosen": -223.08837890625, "ref_logps/rejected": -234.30029296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1940135359764099, "rewards/margins": 6.822391510009766, "rewards/rejected": -7.01640510559082, "step": 779 }, { "epoch": 0.19, "learning_rate": 1.8063999999999998e-07, "logps/chosen": -232.67343139648438, "logps/rejected": -300.10443115234375, "loss": 0.0127, "losses/dpo": 0.0006249402067624032, "losses/sft": 0.9457993507385254, "losses/total": 0.0006249402067624032, "ref_logps/chosen": -230.05006408691406, "ref_logps/rejected": -230.31272888183594, "rewards/accuracies": 1.0, "rewards/chosen": -0.2623353600502014, "rewards/margins": 6.71683406829834, "rewards/rejected": -6.979169845581055, "step": 780 }, { "epoch": 0.19, "learning_rate": 1.8058666666666666e-07, "logps/chosen": -242.79415893554688, "logps/rejected": -289.7237548828125, "loss": 0.0251, "losses/dpo": 0.00010322205343982205, "losses/sft": 0.6313387751579285, "losses/total": 0.00010322205343982205, "ref_logps/chosen": -239.10574340820312, "ref_logps/rejected": -218.46261596679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.3688424229621887, "rewards/margins": 6.757274150848389, "rewards/rejected": -7.1261162757873535, "step": 781 }, { "epoch": 0.19, "learning_rate": 1.805333333333333e-07, "logps/chosen": -229.4748992919922, "logps/rejected": -318.68157958984375, "loss": 0.0164, "losses/dpo": 1.943055394804105e-06, "losses/sft": 0.4880211055278778, "losses/total": 1.943055394804105e-06, "ref_logps/chosen": -226.76959228515625, "ref_logps/rejected": -238.85821533203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.2705303132534027, "rewards/margins": 7.711808204650879, "rewards/rejected": -7.982338905334473, "step": 782 }, { "epoch": 0.19, "learning_rate": 1.8047999999999999e-07, "logps/chosen": -222.4853973388672, "logps/rejected": -287.9163818359375, "loss": 0.0301, "losses/dpo": 1.6727111869840883e-05, "losses/sft": 0.8463619947433472, "losses/total": 1.6727111869840883e-05, "ref_logps/chosen": -220.70309448242188, "ref_logps/rejected": -213.51124572753906, "rewards/accuracies": 1.0, "rewards/chosen": -0.17823058366775513, "rewards/margins": 7.262281894683838, "rewards/rejected": -7.440512657165527, "step": 783 }, { "epoch": 0.19, "learning_rate": 1.8042666666666666e-07, "logps/chosen": -214.66522216796875, "logps/rejected": -277.2326354980469, "loss": 0.0263, "losses/dpo": 0.000889647810254246, "losses/sft": 0.4384409487247467, "losses/total": 0.000889647810254246, "ref_logps/chosen": -212.94744873046875, "ref_logps/rejected": -209.3245849609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.17177867889404297, "rewards/margins": 6.619027137756348, "rewards/rejected": -6.790805816650391, "step": 784 }, { "epoch": 0.19, "learning_rate": 1.8037333333333334e-07, "logps/chosen": -168.83734130859375, "logps/rejected": -264.34588623046875, "loss": 0.0412, "losses/dpo": 3.663706957013346e-05, "losses/sft": 0.5343042016029358, "losses/total": 3.663706957013346e-05, "ref_logps/chosen": -167.01211547851562, "ref_logps/rejected": -197.86874389648438, "rewards/accuracies": 1.0, "rewards/chosen": -0.18252229690551758, "rewards/margins": 6.465192794799805, "rewards/rejected": -6.647714614868164, "step": 785 }, { "epoch": 0.19, "learning_rate": 1.8032e-07, "logps/chosen": -245.25128173828125, "logps/rejected": -318.647705078125, "loss": 0.021, "losses/dpo": 4.407801952766022e-06, "losses/sft": 0.544945478439331, "losses/total": 4.407801952766022e-06, "ref_logps/chosen": -241.33364868164062, "ref_logps/rejected": -242.63272094726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3917628228664398, "rewards/margins": 7.209735870361328, "rewards/rejected": -7.601499080657959, "step": 786 }, { "epoch": 0.19, "learning_rate": 1.8026666666666667e-07, "logps/chosen": -255.9276123046875, "logps/rejected": -302.3027648925781, "loss": 0.0088, "losses/dpo": 0.001390225370414555, "losses/sft": 0.6475132703781128, "losses/total": 0.001390225370414555, "ref_logps/chosen": -254.34373474121094, "ref_logps/rejected": -224.6887969970703, "rewards/accuracies": 1.0, "rewards/chosen": -0.15838877856731415, "rewards/margins": 7.6030073165893555, "rewards/rejected": -7.761396408081055, "step": 787 }, { "epoch": 0.19, "learning_rate": 1.8021333333333332e-07, "logps/chosen": -205.05044555664062, "logps/rejected": -276.9435119628906, "loss": 0.0088, "losses/dpo": 0.01415167935192585, "losses/sft": 0.6437108516693115, "losses/total": 0.01415167935192585, "ref_logps/chosen": -202.55027770996094, "ref_logps/rejected": -209.0282440185547, "rewards/accuracies": 1.0, "rewards/chosen": -0.25001662969589233, "rewards/margins": 6.541509628295898, "rewards/rejected": -6.791526794433594, "step": 788 }, { "epoch": 0.19, "learning_rate": 1.8016e-07, "logps/chosen": -245.97840881347656, "logps/rejected": -292.3011474609375, "loss": 0.0048, "losses/dpo": 8.61920416355133e-05, "losses/sft": 0.5465759038925171, "losses/total": 8.61920416355133e-05, "ref_logps/chosen": -242.07833862304688, "ref_logps/rejected": -220.59878540039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3900074064731598, "rewards/margins": 6.7802276611328125, "rewards/rejected": -7.170234203338623, "step": 789 }, { "epoch": 0.19, "learning_rate": 1.8010666666666664e-07, "logps/chosen": -211.6861572265625, "logps/rejected": -277.37310791015625, "loss": 0.0206, "losses/dpo": 0.01928427256643772, "losses/sft": 0.3919408321380615, "losses/total": 0.01928427256643772, "ref_logps/chosen": -209.5952606201172, "ref_logps/rejected": -211.02066040039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.2090902030467987, "rewards/margins": 6.426156997680664, "rewards/rejected": -6.635247230529785, "step": 790 }, { "epoch": 0.19, "learning_rate": 1.8005333333333332e-07, "logps/chosen": -234.89137268066406, "logps/rejected": -312.33782958984375, "loss": 0.0059, "losses/dpo": 0.00015954907576087862, "losses/sft": 0.6913127899169922, "losses/total": 0.00015954907576087862, "ref_logps/chosen": -233.0442657470703, "ref_logps/rejected": -234.4135284423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.18471047282218933, "rewards/margins": 7.607723712921143, "rewards/rejected": -7.792433738708496, "step": 791 }, { "epoch": 0.19, "learning_rate": 1.8e-07, "logps/chosen": -259.10736083984375, "logps/rejected": -308.12884521484375, "loss": 0.0185, "losses/dpo": 0.0002350838913116604, "losses/sft": 0.6278625726699829, "losses/total": 0.0002350838913116604, "ref_logps/chosen": -254.817626953125, "ref_logps/rejected": -231.54368591308594, "rewards/accuracies": 1.0, "rewards/chosen": -0.42897510528564453, "rewards/margins": 7.229542255401611, "rewards/rejected": -7.658516883850098, "step": 792 }, { "epoch": 0.19, "learning_rate": 1.7994666666666667e-07, "logps/chosen": -243.14251708984375, "logps/rejected": -306.81488037109375, "loss": 0.0199, "losses/dpo": 0.0031717864330857992, "losses/sft": 0.4770890474319458, "losses/total": 0.0031717864330857992, "ref_logps/chosen": -239.1190948486328, "ref_logps/rejected": -238.52745056152344, "rewards/accuracies": 1.0, "rewards/chosen": -0.4023451507091522, "rewards/margins": 6.426398277282715, "rewards/rejected": -6.828743934631348, "step": 793 }, { "epoch": 0.19, "learning_rate": 1.7989333333333332e-07, "logps/chosen": -206.2320556640625, "logps/rejected": -284.4387512207031, "loss": 0.0243, "losses/dpo": 1.3648503909280407e-06, "losses/sft": 0.6004712581634521, "losses/total": 1.3648503909280407e-06, "ref_logps/chosen": -203.1265411376953, "ref_logps/rejected": -213.0287628173828, "rewards/accuracies": 1.0, "rewards/chosen": -0.3105529844760895, "rewards/margins": 6.830444812774658, "rewards/rejected": -7.140997886657715, "step": 794 }, { "epoch": 0.19, "learning_rate": 1.7984e-07, "logps/chosen": -281.1410217285156, "logps/rejected": -321.8227233886719, "loss": 0.0121, "losses/dpo": 0.0013797254068776965, "losses/sft": 0.6473496556282043, "losses/total": 0.0013797254068776965, "ref_logps/chosen": -277.74261474609375, "ref_logps/rejected": -249.56068420410156, "rewards/accuracies": 1.0, "rewards/chosen": -0.3398441970348358, "rewards/margins": 6.886361122131348, "rewards/rejected": -7.226204872131348, "step": 795 }, { "epoch": 0.19, "learning_rate": 1.7978666666666665e-07, "logps/chosen": -216.57308959960938, "logps/rejected": -297.7743835449219, "loss": 0.0076, "losses/dpo": 2.742403739830479e-05, "losses/sft": 0.6717926263809204, "losses/total": 2.742403739830479e-05, "ref_logps/chosen": -213.68295288085938, "ref_logps/rejected": -221.6839599609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.28901243209838867, "rewards/margins": 7.320030212402344, "rewards/rejected": -7.609042167663574, "step": 796 }, { "epoch": 0.19, "learning_rate": 1.797333333333333e-07, "logps/chosen": -198.84237670898438, "logps/rejected": -280.6567687988281, "loss": 0.0341, "losses/dpo": 3.5061093512922525e-06, "losses/sft": 0.5094792246818542, "losses/total": 3.5061093512922525e-06, "ref_logps/chosen": -196.49212646484375, "ref_logps/rejected": -210.26394653320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.23502539098262787, "rewards/margins": 6.804259777069092, "rewards/rejected": -7.039285182952881, "step": 797 }, { "epoch": 0.19, "learning_rate": 1.7967999999999998e-07, "logps/chosen": -178.99777221679688, "logps/rejected": -260.0669250488281, "loss": 0.0291, "losses/dpo": 7.85210886533605e-06, "losses/sft": 0.6228417754173279, "losses/total": 7.85210886533605e-06, "ref_logps/chosen": -176.57403564453125, "ref_logps/rejected": -197.5716552734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.24237322807312012, "rewards/margins": 6.0071539878845215, "rewards/rejected": -6.2495269775390625, "step": 798 }, { "epoch": 0.19, "learning_rate": 1.7962666666666666e-07, "logps/chosen": -245.51979064941406, "logps/rejected": -302.18804931640625, "loss": 0.0307, "losses/dpo": 6.243689131224528e-05, "losses/sft": 0.515067458152771, "losses/total": 6.243689131224528e-05, "ref_logps/chosen": -242.0814666748047, "ref_logps/rejected": -231.42092895507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.34383273124694824, "rewards/margins": 6.732881546020508, "rewards/rejected": -7.076714515686035, "step": 799 }, { "epoch": 0.19, "learning_rate": 1.7957333333333333e-07, "logps/chosen": -232.94595336914062, "logps/rejected": -324.11932373046875, "loss": 0.0057, "losses/dpo": 0.0003894583787769079, "losses/sft": 0.8579716682434082, "losses/total": 0.0003894583787769079, "ref_logps/chosen": -229.9775848388672, "ref_logps/rejected": -243.13330078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.296836793422699, "rewards/margins": 7.8017659187316895, "rewards/rejected": -8.098602294921875, "step": 800 }, { "epoch": 0.19, "learning_rate": 1.7951999999999998e-07, "logps/chosen": -237.28411865234375, "logps/rejected": -301.5018310546875, "loss": 0.0143, "losses/dpo": 2.4649029001011513e-05, "losses/sft": 0.6500996947288513, "losses/total": 2.4649029001011513e-05, "ref_logps/chosen": -233.57540893554688, "ref_logps/rejected": -225.35992431640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.37087202072143555, "rewards/margins": 7.243319988250732, "rewards/rejected": -7.614192008972168, "step": 801 }, { "epoch": 0.19, "learning_rate": 1.7946666666666666e-07, "logps/chosen": -279.93841552734375, "logps/rejected": -322.110107421875, "loss": 0.0113, "losses/dpo": 0.00013510711141861975, "losses/sft": 0.5268417596817017, "losses/total": 0.00013510711141861975, "ref_logps/chosen": -277.983642578125, "ref_logps/rejected": -243.24700927734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.19547714293003082, "rewards/margins": 7.690834999084473, "rewards/rejected": -7.8863115310668945, "step": 802 }, { "epoch": 0.19, "learning_rate": 1.7941333333333334e-07, "logps/chosen": -207.84713745117188, "logps/rejected": -249.60348510742188, "loss": 0.0335, "losses/dpo": 0.00028627601568587124, "losses/sft": 0.3791774809360504, "losses/total": 0.00028627601568587124, "ref_logps/chosen": -206.0592041015625, "ref_logps/rejected": -184.68838500976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.17879194021224976, "rewards/margins": 6.312718391418457, "rewards/rejected": -6.491509914398193, "step": 803 }, { "epoch": 0.19, "learning_rate": 1.7936e-07, "logps/chosen": -257.87890625, "logps/rejected": -307.822021484375, "loss": 0.0094, "losses/dpo": 0.00017562096763867885, "losses/sft": 0.9142799377441406, "losses/total": 0.00017562096763867885, "ref_logps/chosen": -256.3390197753906, "ref_logps/rejected": -232.29684448242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.153987854719162, "rewards/margins": 7.39853048324585, "rewards/rejected": -7.552518367767334, "step": 804 }, { "epoch": 0.19, "learning_rate": 1.7930666666666666e-07, "logps/chosen": -204.67477416992188, "logps/rejected": -285.650634765625, "loss": 0.0128, "losses/dpo": 1.7674523405730724e-05, "losses/sft": 0.49709799885749817, "losses/total": 1.7674523405730724e-05, "ref_logps/chosen": -202.77699279785156, "ref_logps/rejected": -211.47293090820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.1897794008255005, "rewards/margins": 7.227988243103027, "rewards/rejected": -7.417767524719238, "step": 805 }, { "epoch": 0.19, "learning_rate": 1.792533333333333e-07, "logps/chosen": -193.14047241210938, "logps/rejected": -332.11712646484375, "loss": 0.0101, "losses/dpo": 5.61411798116751e-05, "losses/sft": 0.4513106346130371, "losses/total": 5.61411798116751e-05, "ref_logps/chosen": -191.69073486328125, "ref_logps/rejected": -251.34963989257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.14497192203998566, "rewards/margins": 7.931777000427246, "rewards/rejected": -8.076749801635742, "step": 806 }, { "epoch": 0.19, "learning_rate": 1.792e-07, "logps/chosen": -279.9637756347656, "logps/rejected": -318.84033203125, "loss": 0.003, "losses/dpo": 0.0007040495402179658, "losses/sft": 0.5805478692054749, "losses/total": 0.0007040495402179658, "ref_logps/chosen": -276.43975830078125, "ref_logps/rejected": -240.84573364257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.35240089893341064, "rewards/margins": 7.4470601081848145, "rewards/rejected": -7.7994608879089355, "step": 807 }, { "epoch": 0.19, "learning_rate": 1.7914666666666667e-07, "logps/chosen": -216.1862335205078, "logps/rejected": -284.65216064453125, "loss": 0.0063, "losses/dpo": 4.826292934012599e-05, "losses/sft": 0.3868584632873535, "losses/total": 4.826292934012599e-05, "ref_logps/chosen": -212.12684631347656, "ref_logps/rejected": -211.41415405273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4059399962425232, "rewards/margins": 6.917863368988037, "rewards/rejected": -7.323803424835205, "step": 808 }, { "epoch": 0.19, "learning_rate": 1.7909333333333332e-07, "logps/chosen": -238.40072631835938, "logps/rejected": -302.1684875488281, "loss": 0.0196, "losses/dpo": 1.8936690366899711e-06, "losses/sft": 0.660865306854248, "losses/total": 1.8936690366899711e-06, "ref_logps/chosen": -233.7598876953125, "ref_logps/rejected": -229.28482055664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.46408361196517944, "rewards/margins": 6.824284076690674, "rewards/rejected": -7.288368225097656, "step": 809 }, { "epoch": 0.19, "learning_rate": 1.7904e-07, "logps/chosen": -218.91836547851562, "logps/rejected": -271.55804443359375, "loss": 0.0122, "losses/dpo": 6.5726735556381755e-06, "losses/sft": 0.8479605317115784, "losses/total": 6.5726735556381755e-06, "ref_logps/chosen": -216.03150939941406, "ref_logps/rejected": -205.75262451171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2886859178543091, "rewards/margins": 6.291857719421387, "rewards/rejected": -6.580543041229248, "step": 810 }, { "epoch": 0.19, "learning_rate": 1.7898666666666667e-07, "logps/chosen": -235.35630798339844, "logps/rejected": -312.43865966796875, "loss": 0.0191, "losses/dpo": 3.962867413065396e-05, "losses/sft": 0.4998382329940796, "losses/total": 3.962867413065396e-05, "ref_logps/chosen": -232.62832641601562, "ref_logps/rejected": -234.81442260742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.2727988660335541, "rewards/margins": 7.489625930786133, "rewards/rejected": -7.762425422668457, "step": 811 }, { "epoch": 0.19, "learning_rate": 1.7893333333333335e-07, "logps/chosen": -243.6611785888672, "logps/rejected": -275.0142822265625, "loss": 0.0113, "losses/dpo": 4.076711047673598e-05, "losses/sft": 0.6697161197662354, "losses/total": 4.076711047673598e-05, "ref_logps/chosen": -241.7296142578125, "ref_logps/rejected": -207.24783325195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.19315630197525024, "rewards/margins": 6.583487510681152, "rewards/rejected": -6.776643753051758, "step": 812 }, { "epoch": 0.2, "learning_rate": 1.7888e-07, "logps/chosen": -251.87677001953125, "logps/rejected": -309.57025146484375, "loss": 0.0189, "losses/dpo": 6.608917465200648e-05, "losses/sft": 0.7374908924102783, "losses/total": 6.608917465200648e-05, "ref_logps/chosen": -247.94024658203125, "ref_logps/rejected": -231.5111541748047, "rewards/accuracies": 1.0, "rewards/chosen": -0.3936520516872406, "rewards/margins": 7.412256240844727, "rewards/rejected": -7.805907726287842, "step": 813 }, { "epoch": 0.2, "learning_rate": 1.7882666666666665e-07, "logps/chosen": -212.79185485839844, "logps/rejected": -311.72113037109375, "loss": 0.0063, "losses/dpo": 0.00011511526827234775, "losses/sft": 0.5906044244766235, "losses/total": 0.00011511526827234775, "ref_logps/chosen": -209.10159301757812, "ref_logps/rejected": -237.23768615722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.3690270781517029, "rewards/margins": 7.079317092895508, "rewards/rejected": -7.4483442306518555, "step": 814 }, { "epoch": 0.2, "learning_rate": 1.7877333333333332e-07, "logps/chosen": -262.0848388671875, "logps/rejected": -318.31011962890625, "loss": 0.0111, "losses/dpo": 0.005121799651533365, "losses/sft": 0.6973617672920227, "losses/total": 0.005121799651533365, "ref_logps/chosen": -257.72418212890625, "ref_logps/rejected": -237.9014434814453, "rewards/accuracies": 1.0, "rewards/chosen": -0.43606799840927124, "rewards/margins": 7.60479736328125, "rewards/rejected": -8.040864944458008, "step": 815 }, { "epoch": 0.2, "learning_rate": 1.7871999999999998e-07, "logps/chosen": -195.01760864257812, "logps/rejected": -312.77679443359375, "loss": 0.0074, "losses/dpo": 0.0004763085162267089, "losses/sft": 0.6539808511734009, "losses/total": 0.0004763085162267089, "ref_logps/chosen": -192.66851806640625, "ref_logps/rejected": -233.77127075195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.23490911722183228, "rewards/margins": 7.665643692016602, "rewards/rejected": -7.900553226470947, "step": 816 }, { "epoch": 0.2, "learning_rate": 1.7866666666666665e-07, "logps/chosen": -238.19090270996094, "logps/rejected": -340.12530517578125, "loss": 0.0069, "losses/dpo": 1.9575953047024086e-05, "losses/sft": 0.5072950720787048, "losses/total": 1.9575953047024086e-05, "ref_logps/chosen": -233.32066345214844, "ref_logps/rejected": -257.23052978515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.48702317476272583, "rewards/margins": 7.802457809448242, "rewards/rejected": -8.289480209350586, "step": 817 }, { "epoch": 0.2, "learning_rate": 1.7861333333333333e-07, "logps/chosen": -223.3509521484375, "logps/rejected": -260.69281005859375, "loss": 0.0333, "losses/dpo": 1.3739188034378458e-05, "losses/sft": 0.9935981035232544, "losses/total": 1.3739188034378458e-05, "ref_logps/chosen": -220.32278442382812, "ref_logps/rejected": -196.2020263671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.30281752347946167, "rewards/margins": 6.146259307861328, "rewards/rejected": -6.4490766525268555, "step": 818 }, { "epoch": 0.2, "learning_rate": 1.7856e-07, "logps/chosen": -263.42108154296875, "logps/rejected": -295.71759033203125, "loss": 0.0204, "losses/dpo": 0.0004328391805756837, "losses/sft": 0.5346897840499878, "losses/total": 0.0004328391805756837, "ref_logps/chosen": -260.37921142578125, "ref_logps/rejected": -219.83238220214844, "rewards/accuracies": 1.0, "rewards/chosen": -0.304187536239624, "rewards/margins": 7.284331321716309, "rewards/rejected": -7.588519096374512, "step": 819 }, { "epoch": 0.2, "learning_rate": 1.7850666666666666e-07, "logps/chosen": -230.00254821777344, "logps/rejected": -316.001953125, "loss": 0.0186, "losses/dpo": 1.5975230098774773e-06, "losses/sft": 0.5734817385673523, "losses/total": 1.5975230098774773e-06, "ref_logps/chosen": -225.45816040039062, "ref_logps/rejected": -240.57281494140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.45443791151046753, "rewards/margins": 7.088479518890381, "rewards/rejected": -7.542917728424072, "step": 820 }, { "epoch": 0.2, "learning_rate": 1.7845333333333333e-07, "logps/chosen": -220.89646911621094, "logps/rejected": -284.9244384765625, "loss": 0.0241, "losses/dpo": 7.835352334950585e-06, "losses/sft": 0.8256763815879822, "losses/total": 7.835352334950585e-06, "ref_logps/chosen": -219.76107788085938, "ref_logps/rejected": -215.5464324951172, "rewards/accuracies": 1.0, "rewards/chosen": -0.11353892087936401, "rewards/margins": 6.824260234832764, "rewards/rejected": -6.937799453735352, "step": 821 }, { "epoch": 0.2, "learning_rate": 1.7839999999999998e-07, "logps/chosen": -186.63609313964844, "logps/rejected": -288.92095947265625, "loss": 0.0196, "losses/dpo": 4.024719601147808e-05, "losses/sft": 0.4708200991153717, "losses/total": 4.024719601147808e-05, "ref_logps/chosen": -185.69863891601562, "ref_logps/rejected": -219.82664489746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.09374458342790604, "rewards/margins": 6.8156867027282715, "rewards/rejected": -6.909431457519531, "step": 822 }, { "epoch": 0.2, "learning_rate": 1.7834666666666666e-07, "logps/chosen": -192.58773803710938, "logps/rejected": -300.19677734375, "loss": 0.0109, "losses/dpo": 4.548148808680708e-06, "losses/sft": 0.46143609285354614, "losses/total": 4.548148808680708e-06, "ref_logps/chosen": -192.34201049804688, "ref_logps/rejected": -220.92086791992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.024573247879743576, "rewards/margins": 7.903016090393066, "rewards/rejected": -7.927588939666748, "step": 823 }, { "epoch": 0.2, "learning_rate": 1.782933333333333e-07, "logps/chosen": -224.61326599121094, "logps/rejected": -297.9886474609375, "loss": 0.0112, "losses/dpo": 4.9050806410377845e-05, "losses/sft": 0.8266646862030029, "losses/total": 4.9050806410377845e-05, "ref_logps/chosen": -221.52098083496094, "ref_logps/rejected": -219.5428466796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.30922743678092957, "rewards/margins": 7.535348892211914, "rewards/rejected": -7.844576835632324, "step": 824 }, { "epoch": 0.2, "learning_rate": 1.7823999999999999e-07, "logps/chosen": -195.34156799316406, "logps/rejected": -292.18359375, "loss": 0.0099, "losses/dpo": 0.0058101508766412735, "losses/sft": 0.6015763878822327, "losses/total": 0.0058101508766412735, "ref_logps/chosen": -192.1941680908203, "ref_logps/rejected": -219.06182861328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.31474044919013977, "rewards/margins": 6.9974365234375, "rewards/rejected": -7.31217622756958, "step": 825 }, { "epoch": 0.2, "learning_rate": 1.7818666666666666e-07, "logps/chosen": -218.95553588867188, "logps/rejected": -296.4010314941406, "loss": 0.0157, "losses/dpo": 2.005195298693252e-08, "losses/sft": 0.4956941604614258, "losses/total": 2.005195298693252e-08, "ref_logps/chosen": -217.33383178710938, "ref_logps/rejected": -221.69456481933594, "rewards/accuracies": 1.0, "rewards/chosen": -0.1621706485748291, "rewards/margins": 7.30847692489624, "rewards/rejected": -7.47064733505249, "step": 826 }, { "epoch": 0.2, "learning_rate": 1.7813333333333334e-07, "logps/chosen": -260.73956298828125, "logps/rejected": -306.9859924316406, "loss": 0.0119, "losses/dpo": 5.594914000539575e-06, "losses/sft": 0.4843224287033081, "losses/total": 5.594914000539575e-06, "ref_logps/chosen": -258.5158996582031, "ref_logps/rejected": -229.3324432373047, "rewards/accuracies": 1.0, "rewards/chosen": -0.2223658561706543, "rewards/margins": 7.542990207672119, "rewards/rejected": -7.765355587005615, "step": 827 }, { "epoch": 0.2, "learning_rate": 1.7808e-07, "logps/chosen": -264.8351135253906, "logps/rejected": -320.05487060546875, "loss": 0.006, "losses/dpo": 3.841992679554096e-07, "losses/sft": 1.1568655967712402, "losses/total": 3.841992679554096e-07, "ref_logps/chosen": -260.84295654296875, "ref_logps/rejected": -239.34722900390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3992149531841278, "rewards/margins": 7.671548843383789, "rewards/rejected": -8.07076358795166, "step": 828 }, { "epoch": 0.2, "learning_rate": 1.7802666666666667e-07, "logps/chosen": -199.6001434326172, "logps/rejected": -299.58807373046875, "loss": 0.0239, "losses/dpo": 2.8912825200677617e-06, "losses/sft": 0.6026349663734436, "losses/total": 2.8912825200677617e-06, "ref_logps/chosen": -198.63894653320312, "ref_logps/rejected": -224.85107421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.0961190015077591, "rewards/margins": 7.3775811195373535, "rewards/rejected": -7.473700523376465, "step": 829 }, { "epoch": 0.2, "learning_rate": 1.7797333333333334e-07, "logps/chosen": -268.41656494140625, "logps/rejected": -304.4954528808594, "loss": 0.0055, "losses/dpo": 0.00013081639190204442, "losses/sft": 0.4443124830722809, "losses/total": 0.00013081639190204442, "ref_logps/chosen": -265.72210693359375, "ref_logps/rejected": -228.3326416015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2694442868232727, "rewards/margins": 7.346838474273682, "rewards/rejected": -7.6162824630737305, "step": 830 }, { "epoch": 0.2, "learning_rate": 1.7792e-07, "logps/chosen": -237.969482421875, "logps/rejected": -304.6568298339844, "loss": 0.0205, "losses/dpo": 2.36003161262488e-05, "losses/sft": 0.5347598791122437, "losses/total": 2.36003161262488e-05, "ref_logps/chosen": -232.8955841064453, "ref_logps/rejected": -227.47097778320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.5073883533477783, "rewards/margins": 7.211199760437012, "rewards/rejected": -7.718587875366211, "step": 831 }, { "epoch": 0.2, "learning_rate": 1.7786666666666664e-07, "logps/chosen": -239.21412658691406, "logps/rejected": -306.2551574707031, "loss": 0.0083, "losses/dpo": 1.3426696568785701e-05, "losses/sft": 0.6709677577018738, "losses/total": 1.3426696568785701e-05, "ref_logps/chosen": -236.16676330566406, "ref_logps/rejected": -232.6319122314453, "rewards/accuracies": 1.0, "rewards/chosen": -0.3047356605529785, "rewards/margins": 7.057589530944824, "rewards/rejected": -7.362325191497803, "step": 832 }, { "epoch": 0.2, "learning_rate": 1.7781333333333332e-07, "logps/chosen": -228.4100799560547, "logps/rejected": -297.8599548339844, "loss": 0.0096, "losses/dpo": 0.0003068489022552967, "losses/sft": 0.44605594873428345, "losses/total": 0.0003068489022552967, "ref_logps/chosen": -226.30242919921875, "ref_logps/rejected": -219.99319458007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2107643187046051, "rewards/margins": 7.575911045074463, "rewards/rejected": -7.786675453186035, "step": 833 }, { "epoch": 0.2, "learning_rate": 1.7776e-07, "logps/chosen": -237.46200561523438, "logps/rejected": -297.61700439453125, "loss": 0.0102, "losses/dpo": 0.0005663821357302368, "losses/sft": 0.6319602131843567, "losses/total": 0.0005663821357302368, "ref_logps/chosen": -234.1988525390625, "ref_logps/rejected": -224.61654663085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3263128995895386, "rewards/margins": 6.973734378814697, "rewards/rejected": -7.300047397613525, "step": 834 }, { "epoch": 0.2, "learning_rate": 1.7770666666666665e-07, "logps/chosen": -221.96629333496094, "logps/rejected": -288.146484375, "loss": 0.0137, "losses/dpo": 0.00010527186532272026, "losses/sft": 0.6769012808799744, "losses/total": 0.00010527186532272026, "ref_logps/chosen": -222.23370361328125, "ref_logps/rejected": -216.0267333984375, "rewards/accuracies": 1.0, "rewards/chosen": 0.02674146182835102, "rewards/margins": 7.238715171813965, "rewards/rejected": -7.211973190307617, "step": 835 }, { "epoch": 0.2, "learning_rate": 1.7765333333333333e-07, "logps/chosen": -232.011962890625, "logps/rejected": -295.1957092285156, "loss": 0.0081, "losses/dpo": 0.03092116490006447, "losses/sft": 0.6219330430030823, "losses/total": 0.03092116490006447, "ref_logps/chosen": -228.48858642578125, "ref_logps/rejected": -216.3610076904297, "rewards/accuracies": 1.0, "rewards/chosen": -0.3523368835449219, "rewards/margins": 7.531133651733398, "rewards/rejected": -7.883471488952637, "step": 836 }, { "epoch": 0.2, "learning_rate": 1.776e-07, "logps/chosen": -254.87075805664062, "logps/rejected": -325.4389343261719, "loss": 0.0129, "losses/dpo": 2.97417318506632e-06, "losses/sft": 1.0518654584884644, "losses/total": 2.97417318506632e-06, "ref_logps/chosen": -250.96107482910156, "ref_logps/rejected": -236.35699462890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.39096909761428833, "rewards/margins": 8.517223358154297, "rewards/rejected": -8.908191680908203, "step": 837 }, { "epoch": 0.2, "learning_rate": 1.7754666666666668e-07, "logps/chosen": -221.2044219970703, "logps/rejected": -323.8824768066406, "loss": 0.003, "losses/dpo": 6.94417803970282e-06, "losses/sft": 0.9459185600280762, "losses/total": 6.94417803970282e-06, "ref_logps/chosen": -220.17550659179688, "ref_logps/rejected": -241.35577392578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.10288997739553452, "rewards/margins": 8.149781227111816, "rewards/rejected": -8.25267219543457, "step": 838 }, { "epoch": 0.2, "learning_rate": 1.7749333333333333e-07, "logps/chosen": -201.92015075683594, "logps/rejected": -299.5522766113281, "loss": 0.0081, "losses/dpo": 1.5899055142654106e-05, "losses/sft": 0.7221096158027649, "losses/total": 1.5899055142654106e-05, "ref_logps/chosen": -199.11558532714844, "ref_logps/rejected": -225.04522705078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.2804569602012634, "rewards/margins": 7.170247554779053, "rewards/rejected": -7.4507036209106445, "step": 839 }, { "epoch": 0.2, "learning_rate": 1.7743999999999998e-07, "logps/chosen": -231.45733642578125, "logps/rejected": -306.6019592285156, "loss": 0.0345, "losses/dpo": 0.00010198150266660377, "losses/sft": 0.6481772065162659, "losses/total": 0.00010198150266660377, "ref_logps/chosen": -229.13516235351562, "ref_logps/rejected": -230.37413024902344, "rewards/accuracies": 1.0, "rewards/chosen": -0.232217937707901, "rewards/margins": 7.390565872192383, "rewards/rejected": -7.622783660888672, "step": 840 }, { "epoch": 0.2, "learning_rate": 1.7738666666666666e-07, "logps/chosen": -249.61459350585938, "logps/rejected": -329.8783264160156, "loss": 0.0139, "losses/dpo": 0.0002901747066061944, "losses/sft": 0.5975077152252197, "losses/total": 0.0002901747066061944, "ref_logps/chosen": -246.7811737060547, "ref_logps/rejected": -239.60821533203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.28334081172943115, "rewards/margins": 8.743668556213379, "rewards/rejected": -9.027009963989258, "step": 841 }, { "epoch": 0.2, "learning_rate": 1.7733333333333333e-07, "logps/chosen": -233.49818420410156, "logps/rejected": -296.4463195800781, "loss": 0.0332, "losses/dpo": 0.00020232651149854064, "losses/sft": 0.651628851890564, "losses/total": 0.00020232651149854064, "ref_logps/chosen": -230.4696044921875, "ref_logps/rejected": -222.7774658203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.302858829498291, "rewards/margins": 7.064028739929199, "rewards/rejected": -7.366888046264648, "step": 842 }, { "epoch": 0.2, "learning_rate": 1.7727999999999998e-07, "logps/chosen": -207.8602752685547, "logps/rejected": -315.8627624511719, "loss": 0.0079, "losses/dpo": 5.545988824451342e-05, "losses/sft": 0.7044622898101807, "losses/total": 5.545988824451342e-05, "ref_logps/chosen": -205.9367218017578, "ref_logps/rejected": -239.1097412109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.1923549920320511, "rewards/margins": 7.482947826385498, "rewards/rejected": -7.675302982330322, "step": 843 }, { "epoch": 0.2, "learning_rate": 1.7722666666666666e-07, "logps/chosen": -234.00770568847656, "logps/rejected": -331.64971923828125, "loss": 0.0072, "losses/dpo": 5.303320449456805e-07, "losses/sft": 0.4291670620441437, "losses/total": 5.303320449456805e-07, "ref_logps/chosen": -232.5503692626953, "ref_logps/rejected": -243.19927978515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.1457327902317047, "rewards/margins": 8.699313163757324, "rewards/rejected": -8.845046043395996, "step": 844 }, { "epoch": 0.2, "learning_rate": 1.7717333333333334e-07, "logps/chosen": -267.1375732421875, "logps/rejected": -303.07843017578125, "loss": 0.021, "losses/dpo": 1.271772816835437e-06, "losses/sft": 0.7978495955467224, "losses/total": 1.271772816835437e-06, "ref_logps/chosen": -263.9656677246094, "ref_logps/rejected": -228.42999267578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.31718841195106506, "rewards/margins": 7.147655010223389, "rewards/rejected": -7.46484375, "step": 845 }, { "epoch": 0.2, "learning_rate": 1.7712000000000001e-07, "logps/chosen": -232.86520385742188, "logps/rejected": -259.01361083984375, "loss": 0.0362, "losses/dpo": 0.0001225027663167566, "losses/sft": 0.519304096698761, "losses/total": 0.0001225027663167566, "ref_logps/chosen": -229.10629272460938, "ref_logps/rejected": -198.01141357421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3758895993232727, "rewards/margins": 5.724328994750977, "rewards/rejected": -6.100218296051025, "step": 846 }, { "epoch": 0.2, "learning_rate": 1.7706666666666666e-07, "logps/chosen": -217.03746032714844, "logps/rejected": -292.91998291015625, "loss": 0.0282, "losses/dpo": 0.0007477406179532409, "losses/sft": 0.5826835632324219, "losses/total": 0.0007477406179532409, "ref_logps/chosen": -214.218505859375, "ref_logps/rejected": -222.12808227539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.2818942666053772, "rewards/margins": 6.797295570373535, "rewards/rejected": -7.079189777374268, "step": 847 }, { "epoch": 0.2, "learning_rate": 1.7701333333333331e-07, "logps/chosen": -221.17819213867188, "logps/rejected": -325.462158203125, "loss": 0.0123, "losses/dpo": 4.460615343759855e-08, "losses/sft": 0.4943687915802002, "losses/total": 4.460615343759855e-08, "ref_logps/chosen": -218.2736358642578, "ref_logps/rejected": -243.6150665283203, "rewards/accuracies": 1.0, "rewards/chosen": -0.2904554009437561, "rewards/margins": 7.894251823425293, "rewards/rejected": -8.184706687927246, "step": 848 }, { "epoch": 0.2, "learning_rate": 1.7696e-07, "logps/chosen": -250.19915771484375, "logps/rejected": -290.711181640625, "loss": 0.0236, "losses/dpo": 4.239693225827068e-06, "losses/sft": 0.6231561303138733, "losses/total": 4.239693225827068e-06, "ref_logps/chosen": -246.78805541992188, "ref_logps/rejected": -212.62637329101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.34111011028289795, "rewards/margins": 7.4673662185668945, "rewards/rejected": -7.808476448059082, "step": 849 }, { "epoch": 0.2, "learning_rate": 1.7690666666666664e-07, "logps/chosen": -218.93453979492188, "logps/rejected": -301.00555419921875, "loss": 0.0171, "losses/dpo": 3.856499120047374e-08, "losses/sft": 0.7081054449081421, "losses/total": 3.856499120047374e-08, "ref_logps/chosen": -217.44869995117188, "ref_logps/rejected": -222.1451873779297, "rewards/accuracies": 1.0, "rewards/chosen": -0.1485825479030609, "rewards/margins": 7.737457275390625, "rewards/rejected": -7.886039733886719, "step": 850 }, { "epoch": 0.2, "learning_rate": 1.7685333333333332e-07, "logps/chosen": -215.454345703125, "logps/rejected": -309.7700500488281, "loss": 0.0161, "losses/dpo": 3.8110338209662586e-05, "losses/sft": 0.7719211578369141, "losses/total": 3.8110338209662586e-05, "ref_logps/chosen": -213.17820739746094, "ref_logps/rejected": -225.19273376464844, "rewards/accuracies": 1.0, "rewards/chosen": -0.22761482000350952, "rewards/margins": 8.230116844177246, "rewards/rejected": -8.457732200622559, "step": 851 }, { "epoch": 0.2, "learning_rate": 1.768e-07, "logps/chosen": -234.4149627685547, "logps/rejected": -323.2982482910156, "loss": 0.0147, "losses/dpo": 0.001776930526830256, "losses/sft": 0.7814204692840576, "losses/total": 0.001776930526830256, "ref_logps/chosen": -231.26480102539062, "ref_logps/rejected": -241.66555786132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.31501704454421997, "rewards/margins": 7.848252773284912, "rewards/rejected": -8.163269996643066, "step": 852 }, { "epoch": 0.2, "learning_rate": 1.7674666666666667e-07, "logps/chosen": -228.93896484375, "logps/rejected": -271.59869384765625, "loss": 0.0072, "losses/dpo": 7.942613592604175e-05, "losses/sft": 0.5755122303962708, "losses/total": 7.942613592604175e-05, "ref_logps/chosen": -224.7207489013672, "ref_logps/rejected": -197.82373046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.42182159423828125, "rewards/margins": 6.955677032470703, "rewards/rejected": -7.377498626708984, "step": 853 }, { "epoch": 0.2, "learning_rate": 1.7669333333333332e-07, "logps/chosen": -229.86102294921875, "logps/rejected": -292.39471435546875, "loss": 0.0093, "losses/dpo": 0.00010021808702731505, "losses/sft": 0.4075026512145996, "losses/total": 0.00010021808702731505, "ref_logps/chosen": -227.62562561035156, "ref_logps/rejected": -216.010986328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.22353798151016235, "rewards/margins": 7.414834499359131, "rewards/rejected": -7.638372898101807, "step": 854 }, { "epoch": 0.21, "learning_rate": 1.7664e-07, "logps/chosen": -226.6110382080078, "logps/rejected": -304.65509033203125, "loss": 0.0047, "losses/dpo": 1.0240882147627417e-05, "losses/sft": 0.5406229496002197, "losses/total": 1.0240882147627417e-05, "ref_logps/chosen": -222.0659942626953, "ref_logps/rejected": -225.93194580078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4545051157474518, "rewards/margins": 7.417810440063477, "rewards/rejected": -7.872314929962158, "step": 855 }, { "epoch": 0.21, "learning_rate": 1.7658666666666665e-07, "logps/chosen": -217.68865966796875, "logps/rejected": -306.62713623046875, "loss": 0.0053, "losses/dpo": 0.0001405076909577474, "losses/sft": 0.965703547000885, "losses/total": 0.0001405076909577474, "ref_logps/chosen": -216.1571807861328, "ref_logps/rejected": -231.34716796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.1531478613615036, "rewards/margins": 7.37484884262085, "rewards/rejected": -7.527997016906738, "step": 856 }, { "epoch": 0.21, "learning_rate": 1.7653333333333333e-07, "logps/chosen": -214.16317749023438, "logps/rejected": -297.9679260253906, "loss": 0.0267, "losses/dpo": 8.201656601158902e-05, "losses/sft": 0.7124246954917908, "losses/total": 8.201656601158902e-05, "ref_logps/chosen": -213.28704833984375, "ref_logps/rejected": -220.82000732421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.08761287480592728, "rewards/margins": 7.6271796226501465, "rewards/rejected": -7.714792251586914, "step": 857 }, { "epoch": 0.21, "learning_rate": 1.7647999999999998e-07, "logps/chosen": -210.84051513671875, "logps/rejected": -285.29345703125, "loss": 0.0201, "losses/dpo": 1.4896747302373115e-07, "losses/sft": 1.0340255498886108, "losses/total": 1.4896747302373115e-07, "ref_logps/chosen": -209.2246551513672, "ref_logps/rejected": -213.950927734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.16158604621887207, "rewards/margins": 6.972666263580322, "rewards/rejected": -7.134252548217773, "step": 858 }, { "epoch": 0.21, "learning_rate": 1.7642666666666665e-07, "logps/chosen": -205.5862274169922, "logps/rejected": -306.4796142578125, "loss": 0.0063, "losses/dpo": 0.00415765168145299, "losses/sft": 0.8261364102363586, "losses/total": 0.00415765168145299, "ref_logps/chosen": -203.93649291992188, "ref_logps/rejected": -223.7174072265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.1649719774723053, "rewards/margins": 8.111246109008789, "rewards/rejected": -8.27621841430664, "step": 859 }, { "epoch": 0.21, "learning_rate": 1.7637333333333333e-07, "logps/chosen": -240.15484619140625, "logps/rejected": -305.6497802734375, "loss": 0.0311, "losses/dpo": 2.4628869141452014e-05, "losses/sft": 1.0115216970443726, "losses/total": 2.4628869141452014e-05, "ref_logps/chosen": -237.4169921875, "ref_logps/rejected": -232.05487060546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2737860381603241, "rewards/margins": 7.085704326629639, "rewards/rejected": -7.359490394592285, "step": 860 }, { "epoch": 0.21, "learning_rate": 1.7632e-07, "logps/chosen": -226.95315551757812, "logps/rejected": -294.5758056640625, "loss": 0.0177, "losses/dpo": 2.4905375539674424e-07, "losses/sft": 0.5030496716499329, "losses/total": 2.4905375539674424e-07, "ref_logps/chosen": -226.5556182861328, "ref_logps/rejected": -220.28004455566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.039754465222358704, "rewards/margins": 7.389822959899902, "rewards/rejected": -7.429577827453613, "step": 861 }, { "epoch": 0.21, "learning_rate": 1.7626666666666666e-07, "logps/chosen": -235.05831909179688, "logps/rejected": -296.6875305175781, "loss": 0.0067, "losses/dpo": 5.892371063964674e-06, "losses/sft": 0.4147851765155792, "losses/total": 5.892371063964674e-06, "ref_logps/chosen": -232.56326293945312, "ref_logps/rejected": -219.2647247314453, "rewards/accuracies": 1.0, "rewards/chosen": -0.24950505793094635, "rewards/margins": 7.492778778076172, "rewards/rejected": -7.742283344268799, "step": 862 }, { "epoch": 0.21, "learning_rate": 1.7621333333333333e-07, "logps/chosen": -247.68563842773438, "logps/rejected": -283.26959228515625, "loss": 0.0086, "losses/dpo": 5.076732486486435e-05, "losses/sft": 0.5625148415565491, "losses/total": 5.076732486486435e-05, "ref_logps/chosen": -244.39923095703125, "ref_logps/rejected": -212.62030029296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.32864245772361755, "rewards/margins": 6.736285209655762, "rewards/rejected": -7.06492805480957, "step": 863 }, { "epoch": 0.21, "learning_rate": 1.7616e-07, "logps/chosen": -266.8288269042969, "logps/rejected": -329.11456298828125, "loss": 0.0123, "losses/dpo": 0.00024539418518543243, "losses/sft": 0.938873827457428, "losses/total": 0.00024539418518543243, "ref_logps/chosen": -263.2958984375, "ref_logps/rejected": -242.71754455566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.353293240070343, "rewards/margins": 8.28641128540039, "rewards/rejected": -8.639703750610352, "step": 864 }, { "epoch": 0.21, "learning_rate": 1.7610666666666666e-07, "logps/chosen": -223.1649169921875, "logps/rejected": -283.1258544921875, "loss": 0.0139, "losses/dpo": 0.002214510226622224, "losses/sft": 0.5041617155075073, "losses/total": 0.002214510226622224, "ref_logps/chosen": -220.41513061523438, "ref_logps/rejected": -207.4341278076172, "rewards/accuracies": 1.0, "rewards/chosen": -0.2749785780906677, "rewards/margins": 7.294194221496582, "rewards/rejected": -7.569171905517578, "step": 865 }, { "epoch": 0.21, "learning_rate": 1.760533333333333e-07, "logps/chosen": -250.3105926513672, "logps/rejected": -293.9263916015625, "loss": 0.0125, "losses/dpo": 2.5763381927390583e-05, "losses/sft": 0.6136470437049866, "losses/total": 2.5763381927390583e-05, "ref_logps/chosen": -247.55474853515625, "ref_logps/rejected": -219.99502563476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.2755841612815857, "rewards/margins": 7.117556571960449, "rewards/rejected": -7.39314079284668, "step": 866 }, { "epoch": 0.21, "learning_rate": 1.76e-07, "logps/chosen": -247.52728271484375, "logps/rejected": -331.1080017089844, "loss": 0.0116, "losses/dpo": 0.005376103799790144, "losses/sft": 0.5644733309745789, "losses/total": 0.005376103799790144, "ref_logps/chosen": -243.64488220214844, "ref_logps/rejected": -240.96060180664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3882398009300232, "rewards/margins": 8.62650203704834, "rewards/rejected": -9.014741897583008, "step": 867 }, { "epoch": 0.21, "learning_rate": 1.7594666666666666e-07, "logps/chosen": -249.3966522216797, "logps/rejected": -319.49591064453125, "loss": 0.0026, "losses/dpo": 6.626871618209407e-05, "losses/sft": 0.7027702927589417, "losses/total": 6.626871618209407e-05, "ref_logps/chosen": -245.98023986816406, "ref_logps/rejected": -232.49606323242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.3416408896446228, "rewards/margins": 8.358344078063965, "rewards/rejected": -8.69998550415039, "step": 868 }, { "epoch": 0.21, "learning_rate": 1.7589333333333331e-07, "logps/chosen": -241.95294189453125, "logps/rejected": -326.8497314453125, "loss": 0.0118, "losses/dpo": 1.0011842277890537e-05, "losses/sft": 0.5525265336036682, "losses/total": 1.0011842277890537e-05, "ref_logps/chosen": -238.96543884277344, "ref_logps/rejected": -238.95220947265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2987518906593323, "rewards/margins": 8.490999221801758, "rewards/rejected": -8.789751052856445, "step": 869 }, { "epoch": 0.21, "learning_rate": 1.7584e-07, "logps/chosen": -226.42599487304688, "logps/rejected": -268.9964904785156, "loss": 0.0339, "losses/dpo": 0.00038616539677605033, "losses/sft": 0.6265158653259277, "losses/total": 0.00038616539677605033, "ref_logps/chosen": -223.45669555664062, "ref_logps/rejected": -196.8480224609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.2969313859939575, "rewards/margins": 6.917915344238281, "rewards/rejected": -7.214846611022949, "step": 870 }, { "epoch": 0.21, "learning_rate": 1.7578666666666667e-07, "logps/chosen": -230.18185424804688, "logps/rejected": -272.6929626464844, "loss": 0.0289, "losses/dpo": 1.6216968390381226e-07, "losses/sft": 0.8774439692497253, "losses/total": 1.6216968390381226e-07, "ref_logps/chosen": -225.32778930664062, "ref_logps/rejected": -198.1168975830078, "rewards/accuracies": 1.0, "rewards/chosen": -0.48540621995925903, "rewards/margins": 6.972203254699707, "rewards/rejected": -7.457609176635742, "step": 871 }, { "epoch": 0.21, "learning_rate": 1.7573333333333335e-07, "logps/chosen": -289.266845703125, "logps/rejected": -342.29052734375, "loss": 0.0091, "losses/dpo": 7.566251042590011e-06, "losses/sft": 0.43326571583747864, "losses/total": 7.566251042590011e-06, "ref_logps/chosen": -286.9588623046875, "ref_logps/rejected": -255.0435791015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.2308005839586258, "rewards/margins": 8.493895530700684, "rewards/rejected": -8.72469711303711, "step": 872 }, { "epoch": 0.21, "learning_rate": 1.7568e-07, "logps/chosen": -214.91574096679688, "logps/rejected": -256.044189453125, "loss": 0.0289, "losses/dpo": 9.377086098538712e-05, "losses/sft": 0.5025050640106201, "losses/total": 9.377086098538712e-05, "ref_logps/chosen": -213.2642822265625, "ref_logps/rejected": -188.62283325195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.16514916718006134, "rewards/margins": 6.576986789703369, "rewards/rejected": -6.742136001586914, "step": 873 }, { "epoch": 0.21, "learning_rate": 1.7562666666666665e-07, "logps/chosen": -235.45846557617188, "logps/rejected": -295.33306884765625, "loss": 0.0076, "losses/dpo": 0.00018398823158349842, "losses/sft": 0.8718380331993103, "losses/total": 0.00018398823158349842, "ref_logps/chosen": -231.22518920898438, "ref_logps/rejected": -216.526611328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4233265519142151, "rewards/margins": 7.457321643829346, "rewards/rejected": -7.880648612976074, "step": 874 }, { "epoch": 0.21, "learning_rate": 1.7557333333333332e-07, "logps/chosen": -214.71038818359375, "logps/rejected": -286.77020263671875, "loss": 0.0248, "losses/dpo": 6.675257964161574e-07, "losses/sft": 0.5165572166442871, "losses/total": 6.675257964161574e-07, "ref_logps/chosen": -210.54855346679688, "ref_logps/rejected": -216.18350219726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.4161840081214905, "rewards/margins": 6.6424880027771, "rewards/rejected": -7.058671951293945, "step": 875 }, { "epoch": 0.21, "learning_rate": 1.7552e-07, "logps/chosen": -199.77403259277344, "logps/rejected": -309.2608947753906, "loss": 0.0088, "losses/dpo": 0.0008693182608112693, "losses/sft": 0.5271426439285278, "losses/total": 0.0008693182608112693, "ref_logps/chosen": -196.36395263671875, "ref_logps/rejected": -230.165771484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.34100791811943054, "rewards/margins": 7.568504333496094, "rewards/rejected": -7.909512519836426, "step": 876 }, { "epoch": 0.21, "learning_rate": 1.7546666666666665e-07, "logps/chosen": -213.99205017089844, "logps/rejected": -304.43353271484375, "loss": 0.0105, "losses/dpo": 0.00013311411021277308, "losses/sft": 0.6362010836601257, "losses/total": 0.00013311411021277308, "ref_logps/chosen": -209.23330688476562, "ref_logps/rejected": -229.1801300048828, "rewards/accuracies": 1.0, "rewards/chosen": -0.47587308287620544, "rewards/margins": 7.049466609954834, "rewards/rejected": -7.525339603424072, "step": 877 }, { "epoch": 0.21, "learning_rate": 1.7541333333333333e-07, "logps/chosen": -233.8582000732422, "logps/rejected": -313.47283935546875, "loss": 0.0097, "losses/dpo": 5.952193077973789e-06, "losses/sft": 0.729435384273529, "losses/total": 5.952193077973789e-06, "ref_logps/chosen": -230.64077758789062, "ref_logps/rejected": -225.76104736328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3217408061027527, "rewards/margins": 8.449440956115723, "rewards/rejected": -8.7711820602417, "step": 878 }, { "epoch": 0.21, "learning_rate": 1.7536e-07, "logps/chosen": -197.28463745117188, "logps/rejected": -249.657470703125, "loss": 0.034, "losses/dpo": 6.699910954921506e-06, "losses/sft": 0.7404444813728333, "losses/total": 6.699910954921506e-06, "ref_logps/chosen": -194.37388610839844, "ref_logps/rejected": -188.00088500976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.2910747826099396, "rewards/margins": 5.874581336975098, "rewards/rejected": -6.165656089782715, "step": 879 }, { "epoch": 0.21, "learning_rate": 1.7530666666666668e-07, "logps/chosen": -194.76829528808594, "logps/rejected": -291.4230651855469, "loss": 0.0107, "losses/dpo": 0.00031877850415185094, "losses/sft": 0.5846778154373169, "losses/total": 0.00031877850415185094, "ref_logps/chosen": -193.08175659179688, "ref_logps/rejected": -213.4586181640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.16865350306034088, "rewards/margins": 7.627793312072754, "rewards/rejected": -7.796446800231934, "step": 880 }, { "epoch": 0.21, "learning_rate": 1.7525333333333333e-07, "logps/chosen": -236.21337890625, "logps/rejected": -316.6245422363281, "loss": 0.0346, "losses/dpo": 0.00010455572919454426, "losses/sft": 0.6600440144538879, "losses/total": 0.00010455572919454426, "ref_logps/chosen": -231.98385620117188, "ref_logps/rejected": -238.93736267089844, "rewards/accuracies": 1.0, "rewards/chosen": -0.42295321822166443, "rewards/margins": 7.345764636993408, "rewards/rejected": -7.768718242645264, "step": 881 }, { "epoch": 0.21, "learning_rate": 1.7519999999999998e-07, "logps/chosen": -235.5697784423828, "logps/rejected": -291.6783142089844, "loss": 0.0281, "losses/dpo": 3.4628181310836226e-05, "losses/sft": 0.573235809803009, "losses/total": 3.4628181310836226e-05, "ref_logps/chosen": -231.94662475585938, "ref_logps/rejected": -220.1573486328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.36231544613838196, "rewards/margins": 6.78978157043457, "rewards/rejected": -7.152096271514893, "step": 882 }, { "epoch": 0.21, "learning_rate": 1.7514666666666666e-07, "logps/chosen": -207.51727294921875, "logps/rejected": -344.12274169921875, "loss": 0.0154, "losses/dpo": 0.00247157271951437, "losses/sft": 0.5670992732048035, "losses/total": 0.00247157271951437, "ref_logps/chosen": -205.0968475341797, "ref_logps/rejected": -250.03831481933594, "rewards/accuracies": 1.0, "rewards/chosen": -0.24204121530056, "rewards/margins": 9.166404724121094, "rewards/rejected": -9.408446311950684, "step": 883 }, { "epoch": 0.21, "learning_rate": 1.750933333333333e-07, "logps/chosen": -205.54769897460938, "logps/rejected": -266.87445068359375, "loss": 0.0398, "losses/dpo": 0.0007896585157141089, "losses/sft": 0.7307229042053223, "losses/total": 0.0007896585157141089, "ref_logps/chosen": -203.08346557617188, "ref_logps/rejected": -198.81838989257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.24642466008663177, "rewards/margins": 6.559183120727539, "rewards/rejected": -6.805607795715332, "step": 884 }, { "epoch": 0.21, "learning_rate": 1.7503999999999998e-07, "logps/chosen": -244.7657012939453, "logps/rejected": -325.0401611328125, "loss": 0.0116, "losses/dpo": 0.0003465301124379039, "losses/sft": 0.5819538235664368, "losses/total": 0.0003465301124379039, "ref_logps/chosen": -241.7021484375, "ref_logps/rejected": -242.57150268554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.3063553273677826, "rewards/margins": 7.9405083656311035, "rewards/rejected": -8.24686336517334, "step": 885 }, { "epoch": 0.21, "learning_rate": 1.7498666666666666e-07, "logps/chosen": -232.0035400390625, "logps/rejected": -320.2389831542969, "loss": 0.013, "losses/dpo": 0.00016926975513342768, "losses/sft": 0.6777315735816956, "losses/total": 0.00016926975513342768, "ref_logps/chosen": -229.3375701904297, "ref_logps/rejected": -238.68809509277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.2665967345237732, "rewards/margins": 7.888491630554199, "rewards/rejected": -8.155088424682617, "step": 886 }, { "epoch": 0.21, "learning_rate": 1.7493333333333334e-07, "logps/chosen": -233.00888061523438, "logps/rejected": -331.8414611816406, "loss": 0.0159, "losses/dpo": 1.0834390877789701e-06, "losses/sft": 0.8593574166297913, "losses/total": 1.0834390877789701e-06, "ref_logps/chosen": -230.80709838867188, "ref_logps/rejected": -243.03302001953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.22017911076545715, "rewards/margins": 8.660664558410645, "rewards/rejected": -8.880844116210938, "step": 887 }, { "epoch": 0.21, "learning_rate": 1.7488e-07, "logps/chosen": -263.4464111328125, "logps/rejected": -299.5776672363281, "loss": 0.0064, "losses/dpo": 0.0025396239943802357, "losses/sft": 1.0254541635513306, "losses/total": 0.0025396239943802357, "ref_logps/chosen": -259.4649658203125, "ref_logps/rejected": -217.56088256835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3981490135192871, "rewards/margins": 7.803529739379883, "rewards/rejected": -8.201679229736328, "step": 888 }, { "epoch": 0.21, "learning_rate": 1.7482666666666667e-07, "logps/chosen": -249.263427734375, "logps/rejected": -329.33648681640625, "loss": 0.0149, "losses/dpo": 9.442934242542833e-06, "losses/sft": 1.0004774332046509, "losses/total": 9.442934242542833e-06, "ref_logps/chosen": -245.8169403076172, "ref_logps/rejected": -243.3533477783203, "rewards/accuracies": 1.0, "rewards/chosen": -0.3446499705314636, "rewards/margins": 8.253662109375, "rewards/rejected": -8.598312377929688, "step": 889 }, { "epoch": 0.21, "learning_rate": 1.7477333333333332e-07, "logps/chosen": -252.21136474609375, "logps/rejected": -314.0810546875, "loss": 0.0169, "losses/dpo": 6.5458498283987865e-06, "losses/sft": 0.3827478289604187, "losses/total": 6.5458498283987865e-06, "ref_logps/chosen": -250.12759399414062, "ref_logps/rejected": -242.20840454101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.20837675034999847, "rewards/margins": 6.978885173797607, "rewards/rejected": -7.187261581420898, "step": 890 }, { "epoch": 0.21, "learning_rate": 1.7472e-07, "logps/chosen": -213.02279663085938, "logps/rejected": -291.61480712890625, "loss": 0.0061, "losses/dpo": 3.633099186117761e-05, "losses/sft": 0.42546725273132324, "losses/total": 3.633099186117761e-05, "ref_logps/chosen": -209.8051300048828, "ref_logps/rejected": -214.92352294921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3217669427394867, "rewards/margins": 7.347359657287598, "rewards/rejected": -7.669126987457275, "step": 891 }, { "epoch": 0.21, "learning_rate": 1.7466666666666664e-07, "logps/chosen": -257.66473388671875, "logps/rejected": -293.2585754394531, "loss": 0.0635, "losses/dpo": 0.0009876604890450835, "losses/sft": 0.36070340871810913, "losses/total": 0.0009876604890450835, "ref_logps/chosen": -253.8932647705078, "ref_logps/rejected": -220.74310302734375, "rewards/accuracies": 0.96875, "rewards/chosen": -0.3771466314792633, "rewards/margins": 6.874401092529297, "rewards/rejected": -7.251547813415527, "step": 892 }, { "epoch": 0.21, "learning_rate": 1.7461333333333332e-07, "logps/chosen": -243.539794921875, "logps/rejected": -301.91412353515625, "loss": 0.0125, "losses/dpo": 0.00243365322239697, "losses/sft": 0.6229187250137329, "losses/total": 0.00243365322239697, "ref_logps/chosen": -239.5991668701172, "ref_logps/rejected": -220.51138305664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3940636217594147, "rewards/margins": 7.746210098266602, "rewards/rejected": -8.140274047851562, "step": 893 }, { "epoch": 0.21, "learning_rate": 1.7456e-07, "logps/chosen": -262.01385498046875, "logps/rejected": -305.0146484375, "loss": 0.0153, "losses/dpo": 0.0001805407227948308, "losses/sft": 1.2877329587936401, "losses/total": 0.0001805407227948308, "ref_logps/chosen": -257.749267578125, "ref_logps/rejected": -221.15213012695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.42645806074142456, "rewards/margins": 7.959794521331787, "rewards/rejected": -8.386253356933594, "step": 894 }, { "epoch": 0.21, "learning_rate": 1.7450666666666667e-07, "logps/chosen": -257.619140625, "logps/rejected": -307.05859375, "loss": 0.013, "losses/dpo": 4.3060358621005435e-06, "losses/sft": 0.5723696351051331, "losses/total": 4.3060358621005435e-06, "ref_logps/chosen": -253.88800048828125, "ref_logps/rejected": -227.4263458251953, "rewards/accuracies": 1.0, "rewards/chosen": -0.3731105625629425, "rewards/margins": 7.590115547180176, "rewards/rejected": -7.963226318359375, "step": 895 }, { "epoch": 0.22, "learning_rate": 1.7445333333333332e-07, "logps/chosen": -248.78973388671875, "logps/rejected": -352.3802185058594, "loss": 0.0027, "losses/dpo": 3.258484184698318e-06, "losses/sft": 0.6543533205986023, "losses/total": 3.258484184698318e-06, "ref_logps/chosen": -246.40740966796875, "ref_logps/rejected": -256.38177490234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.23823177814483643, "rewards/margins": 9.361612319946289, "rewards/rejected": -9.599843978881836, "step": 896 }, { "epoch": 0.22, "learning_rate": 1.744e-07, "logps/chosen": -200.13626098632812, "logps/rejected": -275.3365783691406, "loss": 0.0161, "losses/dpo": 2.339585444133263e-05, "losses/sft": 0.7252234816551208, "losses/total": 2.339585444133263e-05, "ref_logps/chosen": -197.4921112060547, "ref_logps/rejected": -200.18792724609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.26441264152526855, "rewards/margins": 7.25045108795166, "rewards/rejected": -7.514863967895508, "step": 897 }, { "epoch": 0.22, "learning_rate": 1.7434666666666668e-07, "logps/chosen": -213.33392333984375, "logps/rejected": -305.2712707519531, "loss": 0.0245, "losses/dpo": 5.852190838595561e-07, "losses/sft": 0.5335944890975952, "losses/total": 5.852190838595561e-07, "ref_logps/chosen": -210.244384765625, "ref_logps/rejected": -224.54148864746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.30895331501960754, "rewards/margins": 7.76402473449707, "rewards/rejected": -8.072978019714355, "step": 898 }, { "epoch": 0.22, "learning_rate": 1.7429333333333333e-07, "logps/chosen": -232.7842254638672, "logps/rejected": -310.5924072265625, "loss": 0.0078, "losses/dpo": 0.0007395982975140214, "losses/sft": 0.5261110067367554, "losses/total": 0.0007395982975140214, "ref_logps/chosen": -229.28408813476562, "ref_logps/rejected": -231.1173095703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3500146269798279, "rewards/margins": 7.597492218017578, "rewards/rejected": -7.947507381439209, "step": 899 }, { "epoch": 0.22, "learning_rate": 1.7423999999999998e-07, "logps/chosen": -207.79360961914062, "logps/rejected": -272.03033447265625, "loss": 0.0139, "losses/dpo": 4.310593067202717e-05, "losses/sft": 0.4819704592227936, "losses/total": 4.310593067202717e-05, "ref_logps/chosen": -205.20440673828125, "ref_logps/rejected": -204.04196166992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.25892117619514465, "rewards/margins": 6.539914131164551, "rewards/rejected": -6.798835754394531, "step": 900 }, { "epoch": 0.22, "learning_rate": 1.7418666666666665e-07, "logps/chosen": -218.85086059570312, "logps/rejected": -299.6685791015625, "loss": 0.0113, "losses/dpo": 1.1949182407988701e-05, "losses/sft": 0.6061344146728516, "losses/total": 1.1949182407988701e-05, "ref_logps/chosen": -215.60247802734375, "ref_logps/rejected": -224.470947265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3248385787010193, "rewards/margins": 7.194925308227539, "rewards/rejected": -7.519763946533203, "step": 901 }, { "epoch": 0.22, "learning_rate": 1.7413333333333333e-07, "logps/chosen": -242.8331756591797, "logps/rejected": -314.26910400390625, "loss": 0.0114, "losses/dpo": 0.0002609836810734123, "losses/sft": 0.6778954863548279, "losses/total": 0.0002609836810734123, "ref_logps/chosen": -238.487548828125, "ref_logps/rejected": -235.6219024658203, "rewards/accuracies": 1.0, "rewards/chosen": -0.4345629811286926, "rewards/margins": 7.430158615112305, "rewards/rejected": -7.864721298217773, "step": 902 }, { "epoch": 0.22, "learning_rate": 1.7407999999999998e-07, "logps/chosen": -193.73944091796875, "logps/rejected": -298.55859375, "loss": 0.024, "losses/dpo": 7.213471235445468e-06, "losses/sft": 0.5733456015586853, "losses/total": 7.213471235445468e-06, "ref_logps/chosen": -189.75442504882812, "ref_logps/rejected": -221.65182495117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.3985033333301544, "rewards/margins": 7.292173862457275, "rewards/rejected": -7.690677165985107, "step": 903 }, { "epoch": 0.22, "learning_rate": 1.7402666666666666e-07, "logps/chosen": -231.35382080078125, "logps/rejected": -306.0281982421875, "loss": 0.0094, "losses/dpo": 9.474550211052701e-07, "losses/sft": 0.43701526522636414, "losses/total": 9.474550211052701e-07, "ref_logps/chosen": -227.33592224121094, "ref_logps/rejected": -225.33041381835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.40178894996643066, "rewards/margins": 7.6679887771606445, "rewards/rejected": -8.069778442382812, "step": 904 }, { "epoch": 0.22, "learning_rate": 1.7397333333333333e-07, "logps/chosen": -222.21478271484375, "logps/rejected": -298.51275634765625, "loss": 0.0189, "losses/dpo": 2.025846697506495e-06, "losses/sft": 0.8610896468162537, "losses/total": 2.025846697506495e-06, "ref_logps/chosen": -219.38775634765625, "ref_logps/rejected": -218.76087951660156, "rewards/accuracies": 1.0, "rewards/chosen": -0.28270116448402405, "rewards/margins": 7.69248628616333, "rewards/rejected": -7.975187301635742, "step": 905 }, { "epoch": 0.22, "learning_rate": 1.7392e-07, "logps/chosen": -195.33912658691406, "logps/rejected": -298.6282958984375, "loss": 0.0198, "losses/dpo": 3.228148852940649e-05, "losses/sft": 0.6446272730827332, "losses/total": 3.228148852940649e-05, "ref_logps/chosen": -190.72622680664062, "ref_logps/rejected": -217.50218200683594, "rewards/accuracies": 1.0, "rewards/chosen": -0.4612913131713867, "rewards/margins": 7.651319980621338, "rewards/rejected": -8.112611770629883, "step": 906 }, { "epoch": 0.22, "learning_rate": 1.7386666666666666e-07, "logps/chosen": -217.48934936523438, "logps/rejected": -289.374267578125, "loss": 0.0118, "losses/dpo": 0.0003580586635507643, "losses/sft": 0.7551294565200806, "losses/total": 0.0003580586635507643, "ref_logps/chosen": -214.5780029296875, "ref_logps/rejected": -217.65963745117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.29113438725471497, "rewards/margins": 6.880327224731445, "rewards/rejected": -7.171462059020996, "step": 907 }, { "epoch": 0.22, "learning_rate": 1.738133333333333e-07, "logps/chosen": -221.51458740234375, "logps/rejected": -307.4088134765625, "loss": 0.0079, "losses/dpo": 6.3938318817235995e-06, "losses/sft": 0.5742473602294922, "losses/total": 6.3938318817235995e-06, "ref_logps/chosen": -219.41519165039062, "ref_logps/rejected": -220.29649353027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.20993921160697937, "rewards/margins": 8.50129508972168, "rewards/rejected": -8.711233139038086, "step": 908 }, { "epoch": 0.22, "learning_rate": 1.7376e-07, "logps/chosen": -234.36074829101562, "logps/rejected": -318.1468505859375, "loss": 0.0154, "losses/dpo": 6.528213361889357e-06, "losses/sft": 0.4589352011680603, "losses/total": 6.528213361889357e-06, "ref_logps/chosen": -231.5409393310547, "ref_logps/rejected": -242.24716186523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.2819817066192627, "rewards/margins": 7.307989597320557, "rewards/rejected": -7.589972019195557, "step": 909 }, { "epoch": 0.22, "learning_rate": 1.7370666666666667e-07, "logps/chosen": -199.31056213378906, "logps/rejected": -260.32452392578125, "loss": 0.0267, "losses/dpo": 9.203601621265989e-06, "losses/sft": 0.7843689322471619, "losses/total": 9.203601621265989e-06, "ref_logps/chosen": -197.5284423828125, "ref_logps/rejected": -189.46310424804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.1782122552394867, "rewards/margins": 6.907928466796875, "rewards/rejected": -7.0861406326293945, "step": 910 }, { "epoch": 0.22, "learning_rate": 1.7365333333333332e-07, "logps/chosen": -197.37066650390625, "logps/rejected": -277.9931335449219, "loss": 0.0116, "losses/dpo": 0.0003685771080199629, "losses/sft": 0.5423823595046997, "losses/total": 0.0003685771080199629, "ref_logps/chosen": -193.95953369140625, "ref_logps/rejected": -203.73992919921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3411128520965576, "rewards/margins": 7.084208011627197, "rewards/rejected": -7.425320625305176, "step": 911 }, { "epoch": 0.22, "learning_rate": 1.736e-07, "logps/chosen": -230.3223114013672, "logps/rejected": -315.3545837402344, "loss": 0.0098, "losses/dpo": 7.596719569846755e-06, "losses/sft": 0.3402896821498871, "losses/total": 7.596719569846755e-06, "ref_logps/chosen": -226.844970703125, "ref_logps/rejected": -231.76365661621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.3477347493171692, "rewards/margins": 8.011360168457031, "rewards/rejected": -8.359095573425293, "step": 912 }, { "epoch": 0.22, "learning_rate": 1.7354666666666667e-07, "logps/chosen": -203.89816284179688, "logps/rejected": -291.29998779296875, "loss": 0.0133, "losses/dpo": 4.133210950385546e-06, "losses/sft": 1.0969114303588867, "losses/total": 4.133210950385546e-06, "ref_logps/chosen": -202.05328369140625, "ref_logps/rejected": -215.74655151367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.1844891607761383, "rewards/margins": 7.370857238769531, "rewards/rejected": -7.555346488952637, "step": 913 }, { "epoch": 0.22, "learning_rate": 1.7349333333333335e-07, "logps/chosen": -264.277099609375, "logps/rejected": -289.08795166015625, "loss": 0.0183, "losses/dpo": 3.711791941896081e-05, "losses/sft": 0.7523191571235657, "losses/total": 3.711791941896081e-05, "ref_logps/chosen": -259.8885498046875, "ref_logps/rejected": -210.44662475585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.43885481357574463, "rewards/margins": 7.4252777099609375, "rewards/rejected": -7.864132881164551, "step": 914 }, { "epoch": 0.22, "learning_rate": 1.7344e-07, "logps/chosen": -239.9247589111328, "logps/rejected": -311.4068603515625, "loss": 0.0053, "losses/dpo": 6.334133649943396e-05, "losses/sft": 0.7640432715415955, "losses/total": 6.334133649943396e-05, "ref_logps/chosen": -236.36044311523438, "ref_logps/rejected": -231.30392456054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.356431245803833, "rewards/margins": 7.653860092163086, "rewards/rejected": -8.010292053222656, "step": 915 }, { "epoch": 0.22, "learning_rate": 1.7338666666666665e-07, "logps/chosen": -220.41513061523438, "logps/rejected": -308.28839111328125, "loss": 0.007, "losses/dpo": 0.0006127685774117708, "losses/sft": 0.5240042209625244, "losses/total": 0.0006127685774117708, "ref_logps/chosen": -217.07763671875, "ref_logps/rejected": -224.57192993164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.33374783396720886, "rewards/margins": 8.037897109985352, "rewards/rejected": -8.371644973754883, "step": 916 }, { "epoch": 0.22, "learning_rate": 1.7333333333333332e-07, "logps/chosen": -233.47210693359375, "logps/rejected": -322.43597412109375, "loss": 0.0037, "losses/dpo": 2.5172555979224853e-05, "losses/sft": 0.6733924746513367, "losses/total": 2.5172555979224853e-05, "ref_logps/chosen": -229.77757263183594, "ref_logps/rejected": -228.69869995117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694537878036499, "rewards/margins": 9.004270553588867, "rewards/rejected": -9.373723983764648, "step": 917 }, { "epoch": 0.22, "learning_rate": 1.7327999999999997e-07, "logps/chosen": -257.1625061035156, "logps/rejected": -327.77813720703125, "loss": 0.0047, "losses/dpo": 1.3415429975793813e-06, "losses/sft": 1.2488347291946411, "losses/total": 1.3415429975793813e-06, "ref_logps/chosen": -255.12242126464844, "ref_logps/rejected": -240.7061004638672, "rewards/accuracies": 1.0, "rewards/chosen": -0.2040085345506668, "rewards/margins": 8.503193855285645, "rewards/rejected": -8.707201957702637, "step": 918 }, { "epoch": 0.22, "learning_rate": 1.7322666666666665e-07, "logps/chosen": -242.39035034179688, "logps/rejected": -326.48883056640625, "loss": 0.0069, "losses/dpo": 0.0006477694259956479, "losses/sft": 0.4102437198162079, "losses/total": 0.0006477694259956479, "ref_logps/chosen": -240.0083465576172, "ref_logps/rejected": -242.96929931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.23820121586322784, "rewards/margins": 8.113752365112305, "rewards/rejected": -8.351953506469727, "step": 919 }, { "epoch": 0.22, "learning_rate": 1.7317333333333333e-07, "logps/chosen": -193.8613739013672, "logps/rejected": -319.9640808105469, "loss": 0.0128, "losses/dpo": 0.00010206779552390799, "losses/sft": 0.9829915165901184, "losses/total": 0.00010206779552390799, "ref_logps/chosen": -192.1084442138672, "ref_logps/rejected": -236.11624145507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.17529264092445374, "rewards/margins": 8.209490776062012, "rewards/rejected": -8.384783744812012, "step": 920 }, { "epoch": 0.22, "learning_rate": 1.7312e-07, "logps/chosen": -237.0944061279297, "logps/rejected": -326.3660583496094, "loss": 0.0066, "losses/dpo": 2.8923054742335808e-06, "losses/sft": 0.6912876963615417, "losses/total": 2.8923054742335808e-06, "ref_logps/chosen": -233.87100219726562, "ref_logps/rejected": -242.22000122070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.3223402202129364, "rewards/margins": 8.092264175415039, "rewards/rejected": -8.414605140686035, "step": 921 }, { "epoch": 0.22, "learning_rate": 1.7306666666666665e-07, "logps/chosen": -250.96340942382812, "logps/rejected": -299.1307067871094, "loss": 0.0157, "losses/dpo": 2.23356937567587e-06, "losses/sft": 0.8360143303871155, "losses/total": 2.23356937567587e-06, "ref_logps/chosen": -245.78057861328125, "ref_logps/rejected": -212.01739501953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5182815194129944, "rewards/margins": 8.193049430847168, "rewards/rejected": -8.711331367492676, "step": 922 }, { "epoch": 0.22, "learning_rate": 1.7301333333333333e-07, "logps/chosen": -246.3037872314453, "logps/rejected": -314.272705078125, "loss": 0.0165, "losses/dpo": 3.234172982047312e-05, "losses/sft": 0.6838553547859192, "losses/total": 3.234172982047312e-05, "ref_logps/chosen": -243.36688232421875, "ref_logps/rejected": -235.72811889648438, "rewards/accuracies": 1.0, "rewards/chosen": -0.2936914563179016, "rewards/margins": 7.560767650604248, "rewards/rejected": -7.854458808898926, "step": 923 }, { "epoch": 0.22, "learning_rate": 1.7295999999999998e-07, "logps/chosen": -222.39535522460938, "logps/rejected": -266.80865478515625, "loss": 0.0261, "losses/dpo": 9.288601722801104e-05, "losses/sft": 0.6565678119659424, "losses/total": 9.288601722801104e-05, "ref_logps/chosen": -219.9400177001953, "ref_logps/rejected": -194.60818481445312, "rewards/accuracies": 1.0, "rewards/chosen": -0.24553373456001282, "rewards/margins": 6.974511623382568, "rewards/rejected": -7.220045566558838, "step": 924 }, { "epoch": 0.22, "learning_rate": 1.7290666666666666e-07, "logps/chosen": -221.12344360351562, "logps/rejected": -291.8224182128906, "loss": 0.0285, "losses/dpo": 3.0588158551836386e-05, "losses/sft": 0.7531899213790894, "losses/total": 3.0588158551836386e-05, "ref_logps/chosen": -217.95700073242188, "ref_logps/rejected": -211.9101104736328, "rewards/accuracies": 1.0, "rewards/chosen": -0.31664329767227173, "rewards/margins": 7.674588203430176, "rewards/rejected": -7.991231441497803, "step": 925 }, { "epoch": 0.22, "learning_rate": 1.728533333333333e-07, "logps/chosen": -217.07179260253906, "logps/rejected": -311.6395568847656, "loss": 0.011, "losses/dpo": 0.00014001666568219662, "losses/sft": 0.409464567899704, "losses/total": 0.00014001666568219662, "ref_logps/chosen": -214.53848266601562, "ref_logps/rejected": -230.38528442382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.25333136320114136, "rewards/margins": 7.872097015380859, "rewards/rejected": -8.125428199768066, "step": 926 }, { "epoch": 0.22, "learning_rate": 1.7279999999999999e-07, "logps/chosen": -226.2850341796875, "logps/rejected": -292.78253173828125, "loss": 0.0111, "losses/dpo": 3.9223439671332017e-05, "losses/sft": 0.5677570104598999, "losses/total": 3.9223439671332017e-05, "ref_logps/chosen": -223.27659606933594, "ref_logps/rejected": -216.75048828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3008418679237366, "rewards/margins": 7.302361488342285, "rewards/rejected": -7.603203773498535, "step": 927 }, { "epoch": 0.22, "learning_rate": 1.7274666666666666e-07, "logps/chosen": -211.11953735351562, "logps/rejected": -302.9476318359375, "loss": 0.0225, "losses/dpo": 0.009523610584437847, "losses/sft": 0.7103052139282227, "losses/total": 0.009523610584437847, "ref_logps/chosen": -206.74542236328125, "ref_logps/rejected": -219.52566528320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.4374108910560608, "rewards/margins": 7.904784202575684, "rewards/rejected": -8.342195510864258, "step": 928 }, { "epoch": 0.22, "learning_rate": 1.7269333333333334e-07, "logps/chosen": -231.73342895507812, "logps/rejected": -307.17828369140625, "loss": 0.0174, "losses/dpo": 0.00014589811326004565, "losses/sft": 0.6870776414871216, "losses/total": 0.00014589811326004565, "ref_logps/chosen": -228.07720947265625, "ref_logps/rejected": -227.76121520996094, "rewards/accuracies": 1.0, "rewards/chosen": -0.36562418937683105, "rewards/margins": 7.576083183288574, "rewards/rejected": -7.941707611083984, "step": 929 }, { "epoch": 0.22, "learning_rate": 1.7264e-07, "logps/chosen": -230.62322998046875, "logps/rejected": -301.4429626464844, "loss": 0.0108, "losses/dpo": 0.0007205812144093215, "losses/sft": 0.5038963556289673, "losses/total": 0.0007205812144093215, "ref_logps/chosen": -228.5020294189453, "ref_logps/rejected": -222.5272216796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.21212159097194672, "rewards/margins": 7.679451942443848, "rewards/rejected": -7.891573905944824, "step": 930 }, { "epoch": 0.22, "learning_rate": 1.7258666666666667e-07, "logps/chosen": -233.10055541992188, "logps/rejected": -269.5909729003906, "loss": 0.0162, "losses/dpo": 0.0002655798161868006, "losses/sft": 0.45623862743377686, "losses/total": 0.0002655798161868006, "ref_logps/chosen": -227.63795471191406, "ref_logps/rejected": -197.33859252929688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5462585091590881, "rewards/margins": 6.678979873657227, "rewards/rejected": -7.22523832321167, "step": 931 }, { "epoch": 0.22, "learning_rate": 1.7253333333333334e-07, "logps/chosen": -206.0832061767578, "logps/rejected": -286.3997802734375, "loss": 0.0209, "losses/dpo": 0.008335738442838192, "losses/sft": 0.500684916973114, "losses/total": 0.008335738442838192, "ref_logps/chosen": -203.05322265625, "ref_logps/rejected": -213.14151000976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3029975891113281, "rewards/margins": 7.022829532623291, "rewards/rejected": -7.325826644897461, "step": 932 }, { "epoch": 0.22, "learning_rate": 1.7248e-07, "logps/chosen": -252.69882202148438, "logps/rejected": -348.82647705078125, "loss": 0.0044, "losses/dpo": 9.102337230615376e-07, "losses/sft": 0.7573723196983337, "losses/total": 9.102337230615376e-07, "ref_logps/chosen": -248.51393127441406, "ref_logps/rejected": -256.5659484863281, "rewards/accuracies": 1.0, "rewards/chosen": -0.4184902310371399, "rewards/margins": 8.807563781738281, "rewards/rejected": -9.226055145263672, "step": 933 }, { "epoch": 0.22, "learning_rate": 1.7242666666666664e-07, "logps/chosen": -213.14175415039062, "logps/rejected": -288.0103454589844, "loss": 0.009, "losses/dpo": 3.206597102689557e-05, "losses/sft": 0.6003330945968628, "losses/total": 3.206597102689557e-05, "ref_logps/chosen": -209.1359405517578, "ref_logps/rejected": -211.03660583496094, "rewards/accuracies": 1.0, "rewards/chosen": -0.40058422088623047, "rewards/margins": 7.296788215637207, "rewards/rejected": -7.6973724365234375, "step": 934 }, { "epoch": 0.22, "learning_rate": 1.7237333333333332e-07, "logps/chosen": -241.57749938964844, "logps/rejected": -300.6519775390625, "loss": 0.0302, "losses/dpo": 3.27410307363607e-05, "losses/sft": 0.5880498886108398, "losses/total": 3.27410307363607e-05, "ref_logps/chosen": -238.4051513671875, "ref_logps/rejected": -223.02105712890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3172329068183899, "rewards/margins": 7.445858001708984, "rewards/rejected": -7.763091087341309, "step": 935 }, { "epoch": 0.22, "learning_rate": 1.7232e-07, "logps/chosen": -195.11517333984375, "logps/rejected": -290.1768798828125, "loss": 0.0065, "losses/dpo": 2.097227934427792e-06, "losses/sft": 0.8899042010307312, "losses/total": 2.097227934427792e-06, "ref_logps/chosen": -193.05899047851562, "ref_logps/rejected": -209.01101684570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.2056180238723755, "rewards/margins": 7.910967826843262, "rewards/rejected": -8.116585731506348, "step": 936 }, { "epoch": 0.22, "learning_rate": 1.7226666666666665e-07, "logps/chosen": -210.42245483398438, "logps/rejected": -294.1877136230469, "loss": 0.0241, "losses/dpo": 8.123666339088231e-05, "losses/sft": 0.8334330320358276, "losses/total": 8.123666339088231e-05, "ref_logps/chosen": -208.9781036376953, "ref_logps/rejected": -223.96444702148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.14443448185920715, "rewards/margins": 6.877894401550293, "rewards/rejected": -7.0223283767700195, "step": 937 }, { "epoch": 0.23, "learning_rate": 1.7221333333333332e-07, "logps/chosen": -252.46055603027344, "logps/rejected": -300.6495361328125, "loss": 0.0086, "losses/dpo": 2.610361116239801e-05, "losses/sft": 0.6619402170181274, "losses/total": 2.610361116239801e-05, "ref_logps/chosen": -249.24864196777344, "ref_logps/rejected": -221.4942626953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.32119113206863403, "rewards/margins": 7.594336986541748, "rewards/rejected": -7.915528297424316, "step": 938 }, { "epoch": 0.23, "learning_rate": 1.7216e-07, "logps/chosen": -245.35549926757812, "logps/rejected": -306.9198303222656, "loss": 0.0054, "losses/dpo": 5.3132371249375865e-06, "losses/sft": 0.6531238555908203, "losses/total": 5.3132371249375865e-06, "ref_logps/chosen": -240.33358764648438, "ref_logps/rejected": -227.13900756835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5021923184394836, "rewards/margins": 7.47589111328125, "rewards/rejected": -7.978083610534668, "step": 939 }, { "epoch": 0.23, "learning_rate": 1.7210666666666668e-07, "logps/chosen": -269.39666748046875, "logps/rejected": -318.19366455078125, "loss": 0.0106, "losses/dpo": 0.004429371561855078, "losses/sft": 0.5409132242202759, "losses/total": 0.004429371561855078, "ref_logps/chosen": -264.6045227050781, "ref_logps/rejected": -234.8404541015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.47921475768089294, "rewards/margins": 7.856106281280518, "rewards/rejected": -8.335320472717285, "step": 940 }, { "epoch": 0.23, "learning_rate": 1.7205333333333333e-07, "logps/chosen": -221.4081268310547, "logps/rejected": -303.65966796875, "loss": 0.0148, "losses/dpo": 1.6429086826974526e-05, "losses/sft": 0.644543468952179, "losses/total": 1.6429086826974526e-05, "ref_logps/chosen": -219.14686584472656, "ref_logps/rejected": -219.84454345703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.22612695395946503, "rewards/margins": 8.155389785766602, "rewards/rejected": -8.381515502929688, "step": 941 }, { "epoch": 0.23, "learning_rate": 1.7199999999999998e-07, "logps/chosen": -254.39895629882812, "logps/rejected": -312.6842346191406, "loss": 0.0135, "losses/dpo": 1.9666125808726065e-05, "losses/sft": 0.37561675906181335, "losses/total": 1.9666125808726065e-05, "ref_logps/chosen": -250.70428466796875, "ref_logps/rejected": -227.19178771972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.3694683611392975, "rewards/margins": 8.179777145385742, "rewards/rejected": -8.54924488067627, "step": 942 }, { "epoch": 0.23, "learning_rate": 1.7194666666666666e-07, "logps/chosen": -226.1549072265625, "logps/rejected": -306.2872619628906, "loss": 0.0122, "losses/dpo": 1.2123531405450194e-06, "losses/sft": 0.6617854833602905, "losses/total": 1.2123531405450194e-06, "ref_logps/chosen": -222.82843017578125, "ref_logps/rejected": -226.1137237548828, "rewards/accuracies": 1.0, "rewards/chosen": -0.33264559507369995, "rewards/margins": 7.684711456298828, "rewards/rejected": -8.017356872558594, "step": 943 }, { "epoch": 0.23, "learning_rate": 1.7189333333333333e-07, "logps/chosen": -276.16741943359375, "logps/rejected": -336.3488464355469, "loss": 0.0055, "losses/dpo": 3.910571194865042e-06, "losses/sft": 0.5990784168243408, "losses/total": 3.910571194865042e-06, "ref_logps/chosen": -272.0216979980469, "ref_logps/rejected": -246.788330078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.414574533700943, "rewards/margins": 8.54147720336914, "rewards/rejected": -8.956050872802734, "step": 944 }, { "epoch": 0.23, "learning_rate": 1.7183999999999998e-07, "logps/chosen": -279.63629150390625, "logps/rejected": -295.98089599609375, "loss": 0.0155, "losses/dpo": 1.3255291833047522e-06, "losses/sft": 0.5831518173217773, "losses/total": 1.3255291833047522e-06, "ref_logps/chosen": -275.9956359863281, "ref_logps/rejected": -216.5743408203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.36406832933425903, "rewards/margins": 7.576589107513428, "rewards/rejected": -7.940657138824463, "step": 945 }, { "epoch": 0.23, "learning_rate": 1.7178666666666666e-07, "logps/chosen": -238.81527709960938, "logps/rejected": -319.4889831542969, "loss": 0.0055, "losses/dpo": 0.0006328853196464479, "losses/sft": 0.5737382769584656, "losses/total": 0.0006328853196464479, "ref_logps/chosen": -234.81719970703125, "ref_logps/rejected": -235.69192504882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.3998074233531952, "rewards/margins": 7.9798994064331055, "rewards/rejected": -8.379706382751465, "step": 946 }, { "epoch": 0.23, "learning_rate": 1.7173333333333334e-07, "logps/chosen": -242.5753631591797, "logps/rejected": -288.45281982421875, "loss": 0.0127, "losses/dpo": 1.3606877473648638e-05, "losses/sft": 1.2924175262451172, "losses/total": 1.3606877473648638e-05, "ref_logps/chosen": -238.4095916748047, "ref_logps/rejected": -209.94610595703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.41657572984695435, "rewards/margins": 7.434098243713379, "rewards/rejected": -7.850673675537109, "step": 947 }, { "epoch": 0.23, "learning_rate": 1.7168e-07, "logps/chosen": -210.4381103515625, "logps/rejected": -296.1393127441406, "loss": 0.0188, "losses/dpo": 2.2397252905648202e-05, "losses/sft": 0.5528942942619324, "losses/total": 2.2397252905648202e-05, "ref_logps/chosen": -208.29855346679688, "ref_logps/rejected": -213.58567810058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.2139555960893631, "rewards/margins": 8.041410446166992, "rewards/rejected": -8.255365371704102, "step": 948 }, { "epoch": 0.23, "learning_rate": 1.7162666666666666e-07, "logps/chosen": -236.22398376464844, "logps/rejected": -312.31353759765625, "loss": 0.0128, "losses/dpo": 0.0005719371256418526, "losses/sft": 0.7780157327651978, "losses/total": 0.0005719371256418526, "ref_logps/chosen": -232.96934509277344, "ref_logps/rejected": -223.63449096679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.3254625201225281, "rewards/margins": 8.542442321777344, "rewards/rejected": -8.867904663085938, "step": 949 }, { "epoch": 0.23, "learning_rate": 1.7157333333333331e-07, "logps/chosen": -281.4952087402344, "logps/rejected": -349.88775634765625, "loss": 0.003, "losses/dpo": 0.0003567163657862693, "losses/sft": 0.5224173665046692, "losses/total": 0.0003567163657862693, "ref_logps/chosen": -277.1940612792969, "ref_logps/rejected": -260.488037109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4301130473613739, "rewards/margins": 8.509860038757324, "rewards/rejected": -8.939973831176758, "step": 950 }, { "epoch": 0.23, "learning_rate": 1.7152e-07, "logps/chosen": -233.69168090820312, "logps/rejected": -289.15985107421875, "loss": 0.0414, "losses/dpo": 0.00014439842198044062, "losses/sft": 0.5687118768692017, "losses/total": 0.00014439842198044062, "ref_logps/chosen": -229.99148559570312, "ref_logps/rejected": -218.0462646484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.3700222969055176, "rewards/margins": 6.741334915161133, "rewards/rejected": -7.111358165740967, "step": 951 }, { "epoch": 0.23, "learning_rate": 1.7146666666666664e-07, "logps/chosen": -262.9490051269531, "logps/rejected": -361.03765869140625, "loss": 0.0035, "losses/dpo": 1.6747544577810913e-05, "losses/sft": 0.6752359867095947, "losses/total": 1.6747544577810913e-05, "ref_logps/chosen": -258.5693054199219, "ref_logps/rejected": -258.8219299316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.4379708170890808, "rewards/margins": 9.783601760864258, "rewards/rejected": -10.221572875976562, "step": 952 }, { "epoch": 0.23, "learning_rate": 1.7141333333333332e-07, "logps/chosen": -235.77560424804688, "logps/rejected": -352.0504150390625, "loss": 0.0021, "losses/dpo": 4.297347004467156e-06, "losses/sft": 0.5679137706756592, "losses/total": 4.297347004467156e-06, "ref_logps/chosen": -232.08346557617188, "ref_logps/rejected": -251.02273559570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.36921393871307373, "rewards/margins": 9.733551025390625, "rewards/rejected": -10.102766036987305, "step": 953 }, { "epoch": 0.23, "learning_rate": 1.7136e-07, "logps/chosen": -232.73532104492188, "logps/rejected": -324.1063232421875, "loss": 0.0081, "losses/dpo": 1.1685813916528787e-07, "losses/sft": 0.908145546913147, "losses/total": 1.1685813916528787e-07, "ref_logps/chosen": -230.68222045898438, "ref_logps/rejected": -242.1986083984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.20531004667282104, "rewards/margins": 7.985459327697754, "rewards/rejected": -8.19076919555664, "step": 954 }, { "epoch": 0.23, "learning_rate": 1.7130666666666667e-07, "logps/chosen": -232.76011657714844, "logps/rejected": -322.7016296386719, "loss": 0.0073, "losses/dpo": 2.6477152914594626e-06, "losses/sft": 0.644284725189209, "losses/total": 2.6477152914594626e-06, "ref_logps/chosen": -229.62933349609375, "ref_logps/rejected": -231.2429962158203, "rewards/accuracies": 1.0, "rewards/chosen": -0.31307733058929443, "rewards/margins": 8.832785606384277, "rewards/rejected": -9.14586353302002, "step": 955 }, { "epoch": 0.23, "learning_rate": 1.7125333333333332e-07, "logps/chosen": -252.10585021972656, "logps/rejected": -285.31878662109375, "loss": 0.0204, "losses/dpo": 5.39888622697049e-09, "losses/sft": 0.8527103662490845, "losses/total": 5.39888622697049e-09, "ref_logps/chosen": -248.49810791015625, "ref_logps/rejected": -203.92202758789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3607756793498993, "rewards/margins": 7.778900146484375, "rewards/rejected": -8.139676094055176, "step": 956 }, { "epoch": 0.23, "learning_rate": 1.712e-07, "logps/chosen": -231.3818817138672, "logps/rejected": -311.6600036621094, "loss": 0.0207, "losses/dpo": 3.6127112252870575e-05, "losses/sft": 0.4700433313846588, "losses/total": 3.6127112252870575e-05, "ref_logps/chosen": -227.8498992919922, "ref_logps/rejected": -231.41746520996094, "rewards/accuracies": 1.0, "rewards/chosen": -0.353199303150177, "rewards/margins": 7.671055316925049, "rewards/rejected": -8.024253845214844, "step": 957 }, { "epoch": 0.23, "learning_rate": 1.7114666666666667e-07, "logps/chosen": -203.80010986328125, "logps/rejected": -289.1497802734375, "loss": 0.012, "losses/dpo": 2.393884415141656e-07, "losses/sft": 0.9121307134628296, "losses/total": 2.393884415141656e-07, "ref_logps/chosen": -202.64892578125, "ref_logps/rejected": -215.23236083984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.11511623859405518, "rewards/margins": 7.27662467956543, "rewards/rejected": -7.391740798950195, "step": 958 }, { "epoch": 0.23, "learning_rate": 1.7109333333333332e-07, "logps/chosen": -226.45216369628906, "logps/rejected": -324.11419677734375, "loss": 0.0067, "losses/dpo": 3.489411392365582e-05, "losses/sft": 0.5997603535652161, "losses/total": 3.489411392365582e-05, "ref_logps/chosen": -222.30404663085938, "ref_logps/rejected": -233.11737060546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4148091673851013, "rewards/margins": 8.68487548828125, "rewards/rejected": -9.099684715270996, "step": 959 }, { "epoch": 0.23, "learning_rate": 1.7103999999999998e-07, "logps/chosen": -264.9325256347656, "logps/rejected": -314.11993408203125, "loss": 0.0046, "losses/dpo": 0.00015221460489556193, "losses/sft": 0.6138193607330322, "losses/total": 0.00015221460489556193, "ref_logps/chosen": -260.60943603515625, "ref_logps/rejected": -226.50140380859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.43231114745140076, "rewards/margins": 8.329538345336914, "rewards/rejected": -8.761850357055664, "step": 960 }, { "epoch": 0.23, "learning_rate": 1.7098666666666665e-07, "logps/chosen": -226.13206481933594, "logps/rejected": -333.5746765136719, "loss": 0.0155, "losses/dpo": 3.3730375434970483e-06, "losses/sft": 0.8741248846054077, "losses/total": 3.3730375434970483e-06, "ref_logps/chosen": -222.9052276611328, "ref_logps/rejected": -239.80807495117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.32268375158309937, "rewards/margins": 9.053976058959961, "rewards/rejected": -9.376660346984863, "step": 961 }, { "epoch": 0.23, "learning_rate": 1.7093333333333333e-07, "logps/chosen": -217.07568359375, "logps/rejected": -283.77227783203125, "loss": 0.0129, "losses/dpo": 9.717396665109845e-08, "losses/sft": 0.48679977655410767, "losses/total": 9.717396665109845e-08, "ref_logps/chosen": -214.18165588378906, "ref_logps/rejected": -208.75802612304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.2894028127193451, "rewards/margins": 7.212025165557861, "rewards/rejected": -7.50142765045166, "step": 962 }, { "epoch": 0.23, "learning_rate": 1.7088e-07, "logps/chosen": -218.38893127441406, "logps/rejected": -260.37713623046875, "loss": 0.0111, "losses/dpo": 4.676484422816429e-06, "losses/sft": 0.5599210858345032, "losses/total": 4.676484422816429e-06, "ref_logps/chosen": -214.92405700683594, "ref_logps/rejected": -183.41201782226562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3464880883693695, "rewards/margins": 7.350025177001953, "rewards/rejected": -7.6965131759643555, "step": 963 }, { "epoch": 0.23, "learning_rate": 1.7082666666666666e-07, "logps/chosen": -217.26959228515625, "logps/rejected": -320.90570068359375, "loss": 0.0125, "losses/dpo": 1.032402110467956e-06, "losses/sft": 0.4156390130519867, "losses/total": 1.032402110467956e-06, "ref_logps/chosen": -214.877685546875, "ref_logps/rejected": -241.83026123046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.23919038474559784, "rewards/margins": 7.6683549880981445, "rewards/rejected": -7.90754508972168, "step": 964 }, { "epoch": 0.23, "learning_rate": 1.7077333333333333e-07, "logps/chosen": -202.61386108398438, "logps/rejected": -315.50042724609375, "loss": 0.02, "losses/dpo": 1.8733858269115444e-06, "losses/sft": 0.6491788625717163, "losses/total": 1.8733858269115444e-06, "ref_logps/chosen": -198.50946044921875, "ref_logps/rejected": -221.58523559570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.4104413390159607, "rewards/margins": 8.981077194213867, "rewards/rejected": -9.391517639160156, "step": 965 }, { "epoch": 0.23, "learning_rate": 1.7072e-07, "logps/chosen": -301.64501953125, "logps/rejected": -346.4744873046875, "loss": 0.0092, "losses/dpo": 6.990575229792739e-07, "losses/sft": 0.5001993775367737, "losses/total": 6.990575229792739e-07, "ref_logps/chosen": -297.60723876953125, "ref_logps/rejected": -262.275390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.403779000043869, "rewards/margins": 8.016130447387695, "rewards/rejected": -8.419910430908203, "step": 966 }, { "epoch": 0.23, "learning_rate": 1.7066666666666666e-07, "logps/chosen": -184.72242736816406, "logps/rejected": -291.2591552734375, "loss": 0.0329, "losses/dpo": 6.908921932335943e-07, "losses/sft": 0.7240629196166992, "losses/total": 6.908921932335943e-07, "ref_logps/chosen": -183.09176635742188, "ref_logps/rejected": -219.80178833007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.16306716203689575, "rewards/margins": 6.982667922973633, "rewards/rejected": -7.145735263824463, "step": 967 }, { "epoch": 0.23, "learning_rate": 1.706133333333333e-07, "logps/chosen": -203.6962127685547, "logps/rejected": -283.9836120605469, "loss": 0.0149, "losses/dpo": 0.00010691916395444423, "losses/sft": 0.5308470129966736, "losses/total": 0.00010691916395444423, "ref_logps/chosen": -199.86154174804688, "ref_logps/rejected": -208.31149291992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.38346755504608154, "rewards/margins": 7.183745384216309, "rewards/rejected": -7.56721305847168, "step": 968 }, { "epoch": 0.23, "learning_rate": 1.7055999999999999e-07, "logps/chosen": -250.13973999023438, "logps/rejected": -326.7091369628906, "loss": 0.0055, "losses/dpo": 2.816842243191786e-05, "losses/sft": 0.45071956515312195, "losses/total": 2.816842243191786e-05, "ref_logps/chosen": -245.53167724609375, "ref_logps/rejected": -233.30401611328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.46080678701400757, "rewards/margins": 8.879704475402832, "rewards/rejected": -9.340511322021484, "step": 969 }, { "epoch": 0.23, "learning_rate": 1.7050666666666666e-07, "logps/chosen": -231.65640258789062, "logps/rejected": -300.4049072265625, "loss": 0.0193, "losses/dpo": 0.00019013606652151793, "losses/sft": 0.6720739006996155, "losses/total": 0.00019013606652151793, "ref_logps/chosen": -227.1368408203125, "ref_logps/rejected": -224.74591064453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4519558250904083, "rewards/margins": 7.113944053649902, "rewards/rejected": -7.5659003257751465, "step": 970 }, { "epoch": 0.23, "learning_rate": 1.7045333333333331e-07, "logps/chosen": -247.21337890625, "logps/rejected": -303.42333984375, "loss": 0.0113, "losses/dpo": 5.342373697203584e-05, "losses/sft": 0.5295937657356262, "losses/total": 5.342373697203584e-05, "ref_logps/chosen": -244.02923583984375, "ref_logps/rejected": -224.15985107421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.31841281056404114, "rewards/margins": 7.607936859130859, "rewards/rejected": -7.926350116729736, "step": 971 }, { "epoch": 0.23, "learning_rate": 1.704e-07, "logps/chosen": -203.93402099609375, "logps/rejected": -280.0896301269531, "loss": 0.0235, "losses/dpo": 3.1363917514681816e-05, "losses/sft": 1.2349979877471924, "losses/total": 3.1363917514681816e-05, "ref_logps/chosen": -199.73175048828125, "ref_logps/rejected": -204.29922485351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.4202260971069336, "rewards/margins": 7.158816337585449, "rewards/rejected": -7.579042911529541, "step": 972 }, { "epoch": 0.23, "learning_rate": 1.7034666666666667e-07, "logps/chosen": -244.28228759765625, "logps/rejected": -310.83642578125, "loss": 0.0067, "losses/dpo": 0.0005187144852243364, "losses/sft": 1.0437560081481934, "losses/total": 0.0005187144852243364, "ref_logps/chosen": -240.9844970703125, "ref_logps/rejected": -228.054931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3297773599624634, "rewards/margins": 7.948373794555664, "rewards/rejected": -8.278151512145996, "step": 973 }, { "epoch": 0.23, "learning_rate": 1.7029333333333334e-07, "logps/chosen": -213.2187957763672, "logps/rejected": -281.08831787109375, "loss": 0.0173, "losses/dpo": 2.716383278311696e-06, "losses/sft": 0.5378488898277283, "losses/total": 2.716383278311696e-06, "ref_logps/chosen": -210.26881408691406, "ref_logps/rejected": -208.09169006347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.2949999272823334, "rewards/margins": 7.004663467407227, "rewards/rejected": -7.299663066864014, "step": 974 }, { "epoch": 0.23, "learning_rate": 1.7024e-07, "logps/chosen": -276.82012939453125, "logps/rejected": -331.83892822265625, "loss": 0.012, "losses/dpo": 0.00033212112612091005, "losses/sft": 0.5118963122367859, "losses/total": 0.00033212112612091005, "ref_logps/chosen": -270.9263916015625, "ref_logps/rejected": -245.18539428710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5893728137016296, "rewards/margins": 8.075981140136719, "rewards/rejected": -8.665353775024414, "step": 975 }, { "epoch": 0.23, "learning_rate": 1.7018666666666664e-07, "logps/chosen": -210.7969970703125, "logps/rejected": -268.09161376953125, "loss": 0.0171, "losses/dpo": 5.224511596679804e-07, "losses/sft": 0.5288769602775574, "losses/total": 5.224511596679804e-07, "ref_logps/chosen": -207.00575256347656, "ref_logps/rejected": -197.1728515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.37912517786026, "rewards/margins": 6.712754249572754, "rewards/rejected": -7.091879367828369, "step": 976 }, { "epoch": 0.23, "learning_rate": 1.7013333333333332e-07, "logps/chosen": -260.028564453125, "logps/rejected": -321.11468505859375, "loss": 0.0056, "losses/dpo": 3.83526321456884e-06, "losses/sft": 0.8382875323295593, "losses/total": 3.83526321456884e-06, "ref_logps/chosen": -257.7021484375, "ref_logps/rejected": -240.4881134033203, "rewards/accuracies": 1.0, "rewards/chosen": -0.23264513909816742, "rewards/margins": 7.830015182495117, "rewards/rejected": -8.062660217285156, "step": 977 }, { "epoch": 0.23, "learning_rate": 1.7008e-07, "logps/chosen": -241.5682373046875, "logps/rejected": -319.31085205078125, "loss": 0.0043, "losses/dpo": 3.97841313315439e-06, "losses/sft": 0.6460366249084473, "losses/total": 3.97841313315439e-06, "ref_logps/chosen": -235.79727172851562, "ref_logps/rejected": -232.27638244628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.5770962834358215, "rewards/margins": 8.126350402832031, "rewards/rejected": -8.703445434570312, "step": 978 }, { "epoch": 0.23, "learning_rate": 1.7002666666666665e-07, "logps/chosen": -242.64132690429688, "logps/rejected": -311.3553771972656, "loss": 0.0229, "losses/dpo": 0.0008480452233925462, "losses/sft": 0.4224834740161896, "losses/total": 0.0008480452233925462, "ref_logps/chosen": -238.30442810058594, "ref_logps/rejected": -226.87762451171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.43369266390800476, "rewards/margins": 8.014081954956055, "rewards/rejected": -8.447775840759277, "step": 979 }, { "epoch": 0.24, "learning_rate": 1.6997333333333333e-07, "logps/chosen": -267.19921875, "logps/rejected": -324.3103942871094, "loss": 0.0041, "losses/dpo": 3.080287740431231e-07, "losses/sft": 0.688037097454071, "losses/total": 3.080287740431231e-07, "ref_logps/chosen": -262.54949951171875, "ref_logps/rejected": -235.61129760742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.4649732708930969, "rewards/margins": 8.404935836791992, "rewards/rejected": -8.869909286499023, "step": 980 }, { "epoch": 0.24, "learning_rate": 1.6992e-07, "logps/chosen": -238.42343139648438, "logps/rejected": -314.6732177734375, "loss": 0.0064, "losses/dpo": 2.5474768335698172e-05, "losses/sft": 0.7080757021903992, "losses/total": 2.5474768335698172e-05, "ref_logps/chosen": -235.31698608398438, "ref_logps/rejected": -228.64340209960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3106447458267212, "rewards/margins": 8.292335510253906, "rewards/rejected": -8.60297966003418, "step": 981 }, { "epoch": 0.24, "learning_rate": 1.6986666666666668e-07, "logps/chosen": -225.08651733398438, "logps/rejected": -293.0179748535156, "loss": 0.0087, "losses/dpo": 1.4053632639843272e-06, "losses/sft": 0.5984224677085876, "losses/total": 1.4053632639843272e-06, "ref_logps/chosen": -221.16539001464844, "ref_logps/rejected": -209.95729064941406, "rewards/accuracies": 1.0, "rewards/chosen": -0.39211326837539673, "rewards/margins": 7.913955211639404, "rewards/rejected": -8.306068420410156, "step": 982 }, { "epoch": 0.24, "learning_rate": 1.6981333333333333e-07, "logps/chosen": -231.87521362304688, "logps/rejected": -322.6679382324219, "loss": 0.0091, "losses/dpo": 0.00013533365563489497, "losses/sft": 0.4840983748435974, "losses/total": 0.00013533365563489497, "ref_logps/chosen": -227.30259704589844, "ref_logps/rejected": -234.6487579345703, "rewards/accuracies": 1.0, "rewards/chosen": -0.4572608470916748, "rewards/margins": 8.344660758972168, "rewards/rejected": -8.801921844482422, "step": 983 }, { "epoch": 0.24, "learning_rate": 1.6975999999999998e-07, "logps/chosen": -191.45132446289062, "logps/rejected": -288.71795654296875, "loss": 0.0052, "losses/dpo": 0.0003544315986800939, "losses/sft": 0.5238521099090576, "losses/total": 0.0003544315986800939, "ref_logps/chosen": -188.67184448242188, "ref_logps/rejected": -212.5996551513672, "rewards/accuracies": 1.0, "rewards/chosen": -0.27794793248176575, "rewards/margins": 7.333880424499512, "rewards/rejected": -7.611827850341797, "step": 984 }, { "epoch": 0.24, "learning_rate": 1.6970666666666666e-07, "logps/chosen": -213.27122497558594, "logps/rejected": -300.53289794921875, "loss": 0.0113, "losses/dpo": 3.0023013096069917e-05, "losses/sft": 0.5926625728607178, "losses/total": 3.0023013096069917e-05, "ref_logps/chosen": -209.71189880371094, "ref_logps/rejected": -219.491943359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.35593247413635254, "rewards/margins": 7.7481608390808105, "rewards/rejected": -8.104093551635742, "step": 985 }, { "epoch": 0.24, "learning_rate": 1.696533333333333e-07, "logps/chosen": -215.6306915283203, "logps/rejected": -288.9935607910156, "loss": 0.0198, "losses/dpo": 6.421060788852628e-06, "losses/sft": 0.7365796566009521, "losses/total": 6.421060788852628e-06, "ref_logps/chosen": -212.89556884765625, "ref_logps/rejected": -214.0592041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.27351295948028564, "rewards/margins": 7.2199225425720215, "rewards/rejected": -7.493435382843018, "step": 986 }, { "epoch": 0.24, "learning_rate": 1.6959999999999998e-07, "logps/chosen": -212.1731719970703, "logps/rejected": -324.57513427734375, "loss": 0.0034, "losses/dpo": 4.965194420947228e-06, "losses/sft": 0.5810254216194153, "losses/total": 4.965194420947228e-06, "ref_logps/chosen": -208.05007934570312, "ref_logps/rejected": -234.92935180664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.4123075306415558, "rewards/margins": 8.552270889282227, "rewards/rejected": -8.964578628540039, "step": 987 }, { "epoch": 0.24, "learning_rate": 1.6954666666666666e-07, "logps/chosen": -222.38523864746094, "logps/rejected": -321.2757873535156, "loss": 0.0058, "losses/dpo": 1.8480037624613033e-06, "losses/sft": 0.7023553252220154, "losses/total": 1.8480037624613033e-06, "ref_logps/chosen": -218.8730010986328, "ref_logps/rejected": -229.52679443359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.351223886013031, "rewards/margins": 8.823675155639648, "rewards/rejected": -9.174899101257324, "step": 988 }, { "epoch": 0.24, "learning_rate": 1.6949333333333334e-07, "logps/chosen": -270.42657470703125, "logps/rejected": -379.34906005859375, "loss": 0.0059, "losses/dpo": 7.216467201942578e-05, "losses/sft": 0.6784871816635132, "losses/total": 7.216467201942578e-05, "ref_logps/chosen": -265.78662109375, "ref_logps/rejected": -272.26654052734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4639930725097656, "rewards/margins": 10.244260787963867, "rewards/rejected": -10.708253860473633, "step": 989 }, { "epoch": 0.24, "learning_rate": 1.6944e-07, "logps/chosen": -268.96484375, "logps/rejected": -323.70013427734375, "loss": 0.0092, "losses/dpo": 9.208167466567829e-05, "losses/sft": 0.6081080436706543, "losses/total": 9.208167466567829e-05, "ref_logps/chosen": -264.5418395996094, "ref_logps/rejected": -236.82379150390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.44230014085769653, "rewards/margins": 8.24533462524414, "rewards/rejected": -8.68763542175293, "step": 990 }, { "epoch": 0.24, "learning_rate": 1.6938666666666666e-07, "logps/chosen": -236.3406982421875, "logps/rejected": -289.9826965332031, "loss": 0.0121, "losses/dpo": 3.953782197640976e-06, "losses/sft": 1.0683844089508057, "losses/total": 3.953782197640976e-06, "ref_logps/chosen": -231.89425659179688, "ref_logps/rejected": -207.4973907470703, "rewards/accuracies": 1.0, "rewards/chosen": -0.4446446895599365, "rewards/margins": 7.8038835525512695, "rewards/rejected": -8.248528480529785, "step": 991 }, { "epoch": 0.24, "learning_rate": 1.6933333333333334e-07, "logps/chosen": -206.3361053466797, "logps/rejected": -286.421630859375, "loss": 0.0286, "losses/dpo": 2.124142461923384e-08, "losses/sft": 0.5782430171966553, "losses/total": 2.124142461923384e-08, "ref_logps/chosen": -203.92330932617188, "ref_logps/rejected": -219.90255737304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.241279736161232, "rewards/margins": 6.410630226135254, "rewards/rejected": -6.651909351348877, "step": 992 }, { "epoch": 0.24, "learning_rate": 1.6928e-07, "logps/chosen": -281.35321044921875, "logps/rejected": -344.93133544921875, "loss": 0.0037, "losses/dpo": 2.18285504161031e-06, "losses/sft": 0.6298311352729797, "losses/total": 2.18285504161031e-06, "ref_logps/chosen": -275.7375793457031, "ref_logps/rejected": -252.72640991210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5615626573562622, "rewards/margins": 8.65893268585205, "rewards/rejected": -9.220495223999023, "step": 993 }, { "epoch": 0.24, "learning_rate": 1.6922666666666664e-07, "logps/chosen": -286.71807861328125, "logps/rejected": -378.4259033203125, "loss": 0.0076, "losses/dpo": 4.4820907874054683e-07, "losses/sft": 0.5368049144744873, "losses/total": 4.4820907874054683e-07, "ref_logps/chosen": -281.447509765625, "ref_logps/rejected": -280.68255615234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5270578861236572, "rewards/margins": 9.247278213500977, "rewards/rejected": -9.774335861206055, "step": 994 }, { "epoch": 0.24, "learning_rate": 1.6917333333333332e-07, "logps/chosen": -216.36648559570312, "logps/rejected": -320.3516540527344, "loss": 0.0151, "losses/dpo": 5.3918051889922936e-06, "losses/sft": 0.6501696109771729, "losses/total": 5.3918051889922936e-06, "ref_logps/chosen": -211.57333374023438, "ref_logps/rejected": -230.35269165039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.4793153405189514, "rewards/margins": 8.52058219909668, "rewards/rejected": -8.999897003173828, "step": 995 }, { "epoch": 0.24, "learning_rate": 1.6912e-07, "logps/chosen": -246.62313842773438, "logps/rejected": -332.44659423828125, "loss": 0.0109, "losses/dpo": 2.9474615530489245e-06, "losses/sft": 0.8895033597946167, "losses/total": 2.9474615530489245e-06, "ref_logps/chosen": -242.1146240234375, "ref_logps/rejected": -247.0590362548828, "rewards/accuracies": 1.0, "rewards/chosen": -0.45085370540618896, "rewards/margins": 8.087902069091797, "rewards/rejected": -8.538755416870117, "step": 996 }, { "epoch": 0.24, "learning_rate": 1.6906666666666667e-07, "logps/chosen": -234.43406677246094, "logps/rejected": -304.23828125, "loss": 0.0131, "losses/dpo": 0.0006006818730384111, "losses/sft": 0.6998189091682434, "losses/total": 0.0006006818730384111, "ref_logps/chosen": -229.1569061279297, "ref_logps/rejected": -220.77967834472656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5277180671691895, "rewards/margins": 7.818143367767334, "rewards/rejected": -8.345861434936523, "step": 997 }, { "epoch": 0.24, "learning_rate": 1.6901333333333332e-07, "logps/chosen": -275.09600830078125, "logps/rejected": -338.9774169921875, "loss": 0.0107, "losses/dpo": 5.5561195040354505e-06, "losses/sft": 0.5967525839805603, "losses/total": 5.5561195040354505e-06, "ref_logps/chosen": -268.35040283203125, "ref_logps/rejected": -246.69960021972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.6745606660842896, "rewards/margins": 8.553220748901367, "rewards/rejected": -9.227782249450684, "step": 998 }, { "epoch": 0.24, "learning_rate": 1.6896e-07, "logps/chosen": -213.73452758789062, "logps/rejected": -316.1957702636719, "loss": 0.0089, "losses/dpo": 0.00030215978040359914, "losses/sft": 0.6120811700820923, "losses/total": 0.00030215978040359914, "ref_logps/chosen": -211.3541259765625, "ref_logps/rejected": -228.6566162109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.23803941905498505, "rewards/margins": 8.515876770019531, "rewards/rejected": -8.753915786743164, "step": 999 }, { "epoch": 0.24, "learning_rate": 1.6890666666666668e-07, "logps/chosen": -248.91567993164062, "logps/rejected": -335.9637145996094, "loss": 0.006, "losses/dpo": 7.83621180744376e-06, "losses/sft": 0.5180022716522217, "losses/total": 7.83621180744376e-06, "ref_logps/chosen": -243.94366455078125, "ref_logps/rejected": -244.84825134277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.4971998929977417, "rewards/margins": 8.614347457885742, "rewards/rejected": -9.111547470092773, "step": 1000 }, { "epoch": 0.24, "learning_rate": 1.6885333333333333e-07, "logps/chosen": -271.5135498046875, "logps/rejected": -303.2235412597656, "loss": 0.0242, "losses/dpo": 1.683598748059012e-05, "losses/sft": 0.6699543595314026, "losses/total": 1.683598748059012e-05, "ref_logps/chosen": -264.8837585449219, "ref_logps/rejected": -217.28614807128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.6629803776741028, "rewards/margins": 7.930757522583008, "rewards/rejected": -8.593738555908203, "step": 1001 }, { "epoch": 0.24, "learning_rate": 1.6879999999999998e-07, "logps/chosen": -248.2102508544922, "logps/rejected": -303.8780517578125, "loss": 0.0083, "losses/dpo": 6.768781304344884e-07, "losses/sft": 0.610529363155365, "losses/total": 6.768781304344884e-07, "ref_logps/chosen": -243.37179565429688, "ref_logps/rejected": -218.67337036132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.48384392261505127, "rewards/margins": 8.036622047424316, "rewards/rejected": -8.520465850830078, "step": 1002 }, { "epoch": 0.24, "learning_rate": 1.6874666666666665e-07, "logps/chosen": -272.9488525390625, "logps/rejected": -333.2247619628906, "loss": 0.0083, "losses/dpo": 0.0008259906317107379, "losses/sft": 0.5242751240730286, "losses/total": 0.0008259906317107379, "ref_logps/chosen": -267.89068603515625, "ref_logps/rejected": -244.631103515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5058159232139587, "rewards/margins": 8.35354995727539, "rewards/rejected": -8.859366416931152, "step": 1003 }, { "epoch": 0.24, "learning_rate": 1.6869333333333333e-07, "logps/chosen": -241.35336303710938, "logps/rejected": -306.3948669433594, "loss": 0.0161, "losses/dpo": 3.43763422279153e-05, "losses/sft": 0.6037856936454773, "losses/total": 3.43763422279153e-05, "ref_logps/chosen": -238.62289428710938, "ref_logps/rejected": -220.92056274414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.2730495035648346, "rewards/margins": 8.274382591247559, "rewards/rejected": -8.547431945800781, "step": 1004 }, { "epoch": 0.24, "learning_rate": 1.6863999999999998e-07, "logps/chosen": -198.99166870117188, "logps/rejected": -289.6034240722656, "loss": 0.0067, "losses/dpo": 5.787465124740265e-05, "losses/sft": 0.4695609509944916, "losses/total": 5.787465124740265e-05, "ref_logps/chosen": -195.72610473632812, "ref_logps/rejected": -212.2943115234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.3265545964241028, "rewards/margins": 7.404356956481934, "rewards/rejected": -7.730911731719971, "step": 1005 }, { "epoch": 0.24, "learning_rate": 1.6858666666666666e-07, "logps/chosen": -272.4588623046875, "logps/rejected": -324.2609558105469, "loss": 0.0084, "losses/dpo": 1.9242209248204745e-07, "losses/sft": 0.47138136625289917, "losses/total": 1.9242209248204745e-07, "ref_logps/chosen": -268.2049560546875, "ref_logps/rejected": -240.60198974609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.42539042234420776, "rewards/margins": 7.940505504608154, "rewards/rejected": -8.365896224975586, "step": 1006 }, { "epoch": 0.24, "learning_rate": 1.6853333333333333e-07, "logps/chosen": -206.77589416503906, "logps/rejected": -345.2403869628906, "loss": 0.0202, "losses/dpo": 3.795044278831483e-08, "losses/sft": 0.4991765320301056, "losses/total": 3.795044278831483e-08, "ref_logps/chosen": -204.7552490234375, "ref_logps/rejected": -251.75942993164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.20206578075885773, "rewards/margins": 9.146029472351074, "rewards/rejected": -9.348095893859863, "step": 1007 }, { "epoch": 0.24, "learning_rate": 1.6848e-07, "logps/chosen": -231.82211303710938, "logps/rejected": -288.80389404296875, "loss": 0.0049, "losses/dpo": 1.23662300666183e-06, "losses/sft": 0.7336336970329285, "losses/total": 1.23662300666183e-06, "ref_logps/chosen": -227.5737762451172, "ref_logps/rejected": -205.65521240234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.42483407258987427, "rewards/margins": 7.890036582946777, "rewards/rejected": -8.314870834350586, "step": 1008 }, { "epoch": 0.24, "learning_rate": 1.6842666666666666e-07, "logps/chosen": -263.6736145019531, "logps/rejected": -372.923095703125, "loss": 0.0015, "losses/dpo": 1.0985685548803303e-05, "losses/sft": 0.7628118395805359, "losses/total": 1.0985685548803303e-05, "ref_logps/chosen": -259.18072509765625, "ref_logps/rejected": -274.75775146484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4492890238761902, "rewards/margins": 9.367246627807617, "rewards/rejected": -9.816535949707031, "step": 1009 }, { "epoch": 0.24, "learning_rate": 1.683733333333333e-07, "logps/chosen": -212.26470947265625, "logps/rejected": -307.4449462890625, "loss": 0.012, "losses/dpo": 0.002912198891863227, "losses/sft": 0.6477035880088806, "losses/total": 0.002912198891863227, "ref_logps/chosen": -208.55372619628906, "ref_logps/rejected": -225.94717407226562, "rewards/accuracies": 1.0, "rewards/chosen": -0.37109845876693726, "rewards/margins": 7.7786760330200195, "rewards/rejected": -8.149774551391602, "step": 1010 }, { "epoch": 0.24, "learning_rate": 1.6832e-07, "logps/chosen": -225.13568115234375, "logps/rejected": -277.1495361328125, "loss": 0.0097, "losses/dpo": 0.00015723738761153072, "losses/sft": 0.5017319917678833, "losses/total": 0.00015723738761153072, "ref_logps/chosen": -220.97415161132812, "ref_logps/rejected": -199.59103393554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.4161534011363983, "rewards/margins": 7.339694499969482, "rewards/rejected": -7.755847454071045, "step": 1011 }, { "epoch": 0.24, "learning_rate": 1.6826666666666666e-07, "logps/chosen": -267.90325927734375, "logps/rejected": -326.4248046875, "loss": 0.0174, "losses/dpo": 3.4512460842961445e-05, "losses/sft": 0.6679194569587708, "losses/total": 3.4512460842961445e-05, "ref_logps/chosen": -263.13189697265625, "ref_logps/rejected": -238.70407104492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.4771369695663452, "rewards/margins": 8.294939041137695, "rewards/rejected": -8.772075653076172, "step": 1012 }, { "epoch": 0.24, "learning_rate": 1.6821333333333331e-07, "logps/chosen": -246.95639038085938, "logps/rejected": -319.8163757324219, "loss": 0.0098, "losses/dpo": 0.00028695649234578013, "losses/sft": 0.5150525569915771, "losses/total": 0.00028695649234578013, "ref_logps/chosen": -241.6982421875, "ref_logps/rejected": -235.0367431640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5258141160011292, "rewards/margins": 7.952145576477051, "rewards/rejected": -8.477959632873535, "step": 1013 }, { "epoch": 0.24, "learning_rate": 1.6816e-07, "logps/chosen": -267.5411376953125, "logps/rejected": -319.6210632324219, "loss": 0.0162, "losses/dpo": 7.5406780524645e-05, "losses/sft": 0.6064038276672363, "losses/total": 7.5406780524645e-05, "ref_logps/chosen": -261.4757080078125, "ref_logps/rejected": -231.4365997314453, "rewards/accuracies": 1.0, "rewards/chosen": -0.6065450310707092, "rewards/margins": 8.21190071105957, "rewards/rejected": -8.818445205688477, "step": 1014 }, { "epoch": 0.24, "learning_rate": 1.6810666666666667e-07, "logps/chosen": -191.70335388183594, "logps/rejected": -301.39910888671875, "loss": 0.0234, "losses/dpo": 1.9221464754082263e-05, "losses/sft": 0.62263423204422, "losses/total": 1.9221464754082263e-05, "ref_logps/chosen": -189.53968811035156, "ref_logps/rejected": -220.83465576171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.21636734902858734, "rewards/margins": 7.840078830718994, "rewards/rejected": -8.056446075439453, "step": 1015 }, { "epoch": 0.24, "learning_rate": 1.6805333333333335e-07, "logps/chosen": -255.2596435546875, "logps/rejected": -285.99981689453125, "loss": 0.0086, "losses/dpo": 1.6986798073048703e-05, "losses/sft": 0.6974373459815979, "losses/total": 1.6986798073048703e-05, "ref_logps/chosen": -252.08908081054688, "ref_logps/rejected": -207.326904296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3170565366744995, "rewards/margins": 7.550238609313965, "rewards/rejected": -7.867294788360596, "step": 1016 }, { "epoch": 0.24, "learning_rate": 1.68e-07, "logps/chosen": -234.09765625, "logps/rejected": -328.3238525390625, "loss": 0.0033, "losses/dpo": 0.0007772925309836864, "losses/sft": 0.6154706478118896, "losses/total": 0.0007772925309836864, "ref_logps/chosen": -230.78018188476562, "ref_logps/rejected": -238.12442016601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3317498564720154, "rewards/margins": 8.688196182250977, "rewards/rejected": -9.019946098327637, "step": 1017 }, { "epoch": 0.24, "learning_rate": 1.6794666666666665e-07, "logps/chosen": -214.04515075683594, "logps/rejected": -282.4276123046875, "loss": 0.0179, "losses/dpo": 0.0012748163426294923, "losses/sft": 0.6225574016571045, "losses/total": 0.0012748163426294923, "ref_logps/chosen": -210.32473754882812, "ref_logps/rejected": -206.52850341796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3720417320728302, "rewards/margins": 7.217870712280273, "rewards/rejected": -7.5899128913879395, "step": 1018 }, { "epoch": 0.24, "learning_rate": 1.6789333333333332e-07, "logps/chosen": -240.46937561035156, "logps/rejected": -265.45428466796875, "loss": 0.007, "losses/dpo": 2.200550625275355e-05, "losses/sft": 0.5606117844581604, "losses/total": 2.200550625275355e-05, "ref_logps/chosen": -238.57044982910156, "ref_logps/rejected": -192.16851806640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.18989163637161255, "rewards/margins": 7.138684272766113, "rewards/rejected": -7.32857608795166, "step": 1019 }, { "epoch": 0.24, "learning_rate": 1.6783999999999997e-07, "logps/chosen": -262.3636779785156, "logps/rejected": -337.44287109375, "loss": 0.0057, "losses/dpo": 3.216514232917689e-05, "losses/sft": 0.6477476954460144, "losses/total": 3.216514232917689e-05, "ref_logps/chosen": -257.4073486328125, "ref_logps/rejected": -247.09353637695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.49563586711883545, "rewards/margins": 8.539299011230469, "rewards/rejected": -9.034934043884277, "step": 1020 }, { "epoch": 0.25, "learning_rate": 1.6778666666666665e-07, "logps/chosen": -226.02783203125, "logps/rejected": -305.67572021484375, "loss": 0.0227, "losses/dpo": 2.2541648547758086e-07, "losses/sft": 0.46388086676597595, "losses/total": 2.2541648547758086e-07, "ref_logps/chosen": -222.080078125, "ref_logps/rejected": -218.15724182128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.394776850938797, "rewards/margins": 8.357070922851562, "rewards/rejected": -8.751848220825195, "step": 1021 }, { "epoch": 0.25, "learning_rate": 1.6773333333333333e-07, "logps/chosen": -237.78512573242188, "logps/rejected": -331.4925842285156, "loss": 0.0072, "losses/dpo": 2.7294990445625444e-07, "losses/sft": 0.5577172636985779, "losses/total": 2.7294990445625444e-07, "ref_logps/chosen": -234.51058959960938, "ref_logps/rejected": -243.79090881347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.32745522260665894, "rewards/margins": 8.442710876464844, "rewards/rejected": -8.770166397094727, "step": 1022 }, { "epoch": 0.25, "learning_rate": 1.6768e-07, "logps/chosen": -209.9251708984375, "logps/rejected": -295.70159912109375, "loss": 0.0139, "losses/dpo": 9.294132610193628e-07, "losses/sft": 0.5142320394515991, "losses/total": 9.294132610193628e-07, "ref_logps/chosen": -207.12579345703125, "ref_logps/rejected": -205.63609313964844, "rewards/accuracies": 1.0, "rewards/chosen": -0.27993708848953247, "rewards/margins": 8.726614952087402, "rewards/rejected": -9.006552696228027, "step": 1023 }, { "epoch": 0.25, "learning_rate": 1.6762666666666665e-07, "logps/chosen": -287.7361755371094, "logps/rejected": -327.243408203125, "loss": 0.0072, "losses/dpo": 1.9610758954513585e-06, "losses/sft": 0.3690319061279297, "losses/total": 1.9610758954513585e-06, "ref_logps/chosen": -283.38330078125, "ref_logps/rejected": -235.33828735351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.43528759479522705, "rewards/margins": 8.755226135253906, "rewards/rejected": -9.190513610839844, "step": 1024 }, { "epoch": 0.25, "learning_rate": 1.6757333333333333e-07, "logps/chosen": -229.08999633789062, "logps/rejected": -315.6673889160156, "loss": 0.0058, "losses/dpo": 5.16988791332551e-07, "losses/sft": 0.6526079773902893, "losses/total": 5.16988791332551e-07, "ref_logps/chosen": -225.81536865234375, "ref_logps/rejected": -225.31182861328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.32746148109436035, "rewards/margins": 8.708094596862793, "rewards/rejected": -9.035555839538574, "step": 1025 }, { "epoch": 0.25, "learning_rate": 1.6752e-07, "logps/chosen": -248.15745544433594, "logps/rejected": -292.1536560058594, "loss": 0.0051, "losses/dpo": 6.422047960086275e-08, "losses/sft": 0.48530709743499756, "losses/total": 6.422047960086275e-08, "ref_logps/chosen": -244.31924438476562, "ref_logps/rejected": -210.571533203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.38382232189178467, "rewards/margins": 7.77439022064209, "rewards/rejected": -8.158212661743164, "step": 1026 }, { "epoch": 0.25, "learning_rate": 1.6746666666666666e-07, "logps/chosen": -186.3841094970703, "logps/rejected": -300.6770935058594, "loss": 0.0207, "losses/dpo": 1.2402856555127073e-06, "losses/sft": 0.7345499396324158, "losses/total": 1.2402856555127073e-06, "ref_logps/chosen": -182.54998779296875, "ref_logps/rejected": -223.3016815185547, "rewards/accuracies": 1.0, "rewards/chosen": -0.38341328501701355, "rewards/margins": 7.354127407073975, "rewards/rejected": -7.737541198730469, "step": 1027 }, { "epoch": 0.25, "learning_rate": 1.674133333333333e-07, "logps/chosen": -216.92591857910156, "logps/rejected": -302.6016845703125, "loss": 0.0094, "losses/dpo": 3.571770790244955e-08, "losses/sft": 0.7476698756217957, "losses/total": 3.571770790244955e-08, "ref_logps/chosen": -213.9566650390625, "ref_logps/rejected": -219.14224243164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.29692497849464417, "rewards/margins": 8.049019813537598, "rewards/rejected": -8.345943450927734, "step": 1028 }, { "epoch": 0.25, "learning_rate": 1.6735999999999998e-07, "logps/chosen": -271.11407470703125, "logps/rejected": -312.7476806640625, "loss": 0.0149, "losses/dpo": 5.650055027217604e-05, "losses/sft": 0.6870683431625366, "losses/total": 5.650055027217604e-05, "ref_logps/chosen": -266.24468994140625, "ref_logps/rejected": -229.90834045410156, "rewards/accuracies": 1.0, "rewards/chosen": -0.48693880438804626, "rewards/margins": 7.796995639801025, "rewards/rejected": -8.283934593200684, "step": 1029 }, { "epoch": 0.25, "learning_rate": 1.6730666666666666e-07, "logps/chosen": -206.34765625, "logps/rejected": -302.04766845703125, "loss": 0.0269, "losses/dpo": 3.5443542856228305e-06, "losses/sft": 0.6097261905670166, "losses/total": 3.5443542856228305e-06, "ref_logps/chosen": -202.4879913330078, "ref_logps/rejected": -224.38824462890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.38596606254577637, "rewards/margins": 7.379978656768799, "rewards/rejected": -7.765944480895996, "step": 1030 }, { "epoch": 0.25, "learning_rate": 1.6725333333333334e-07, "logps/chosen": -250.65213012695312, "logps/rejected": -322.5745849609375, "loss": 0.0021, "losses/dpo": 1.626175162527943e-06, "losses/sft": 1.0208085775375366, "losses/total": 1.626175162527943e-06, "ref_logps/chosen": -248.36679077148438, "ref_logps/rejected": -232.45542907714844, "rewards/accuracies": 1.0, "rewards/chosen": -0.22853276133537292, "rewards/margins": 8.783382415771484, "rewards/rejected": -9.01191520690918, "step": 1031 }, { "epoch": 0.25, "learning_rate": 1.672e-07, "logps/chosen": -250.929443359375, "logps/rejected": -314.21881103515625, "loss": 0.0132, "losses/dpo": 4.221483686706051e-05, "losses/sft": 0.5660199522972107, "losses/total": 4.221483686706051e-05, "ref_logps/chosen": -246.1055908203125, "ref_logps/rejected": -232.0119171142578, "rewards/accuracies": 1.0, "rewards/chosen": -0.4823865294456482, "rewards/margins": 7.738302707672119, "rewards/rejected": -8.220688819885254, "step": 1032 }, { "epoch": 0.25, "learning_rate": 1.6714666666666667e-07, "logps/chosen": -220.08999633789062, "logps/rejected": -318.3841247558594, "loss": 0.0125, "losses/dpo": 1.6617721485090442e-05, "losses/sft": 0.5398431420326233, "losses/total": 1.6617721485090442e-05, "ref_logps/chosen": -218.2030487060547, "ref_logps/rejected": -227.03994750976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.18869337439537048, "rewards/margins": 8.945725440979004, "rewards/rejected": -9.134418487548828, "step": 1033 }, { "epoch": 0.25, "learning_rate": 1.6709333333333334e-07, "logps/chosen": -189.4740753173828, "logps/rejected": -265.8898620605469, "loss": 0.0482, "losses/dpo": 5.404026069300016e-06, "losses/sft": 0.41573643684387207, "losses/total": 5.404026069300016e-06, "ref_logps/chosen": -186.63467407226562, "ref_logps/rejected": -193.7358856201172, "rewards/accuracies": 1.0, "rewards/chosen": -0.28393876552581787, "rewards/margins": 6.931463241577148, "rewards/rejected": -7.215402126312256, "step": 1034 }, { "epoch": 0.25, "learning_rate": 1.6704e-07, "logps/chosen": -232.5015869140625, "logps/rejected": -318.2294921875, "loss": 0.0029, "losses/dpo": 0.00020521022088360041, "losses/sft": 0.586006224155426, "losses/total": 0.00020521022088360041, "ref_logps/chosen": -227.35769653320312, "ref_logps/rejected": -223.85394287109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5143893957138062, "rewards/margins": 8.923164367675781, "rewards/rejected": -9.437553405761719, "step": 1035 }, { "epoch": 0.25, "learning_rate": 1.6698666666666664e-07, "logps/chosen": -203.31773376464844, "logps/rejected": -307.28326416015625, "loss": 0.0096, "losses/dpo": 4.239470854372485e-06, "losses/sft": 0.3380734324455261, "losses/total": 4.239470854372485e-06, "ref_logps/chosen": -199.40087890625, "ref_logps/rejected": -226.66871643066406, "rewards/accuracies": 1.0, "rewards/chosen": -0.3916855454444885, "rewards/margins": 7.6697678565979, "rewards/rejected": -8.061453819274902, "step": 1036 }, { "epoch": 0.25, "learning_rate": 1.6693333333333332e-07, "logps/chosen": -285.659912109375, "logps/rejected": -341.2190856933594, "loss": 0.0071, "losses/dpo": 3.2929065696407633e-07, "losses/sft": 1.1348645687103271, "losses/total": 3.2929065696407633e-07, "ref_logps/chosen": -282.3112487792969, "ref_logps/rejected": -251.08438110351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3348672091960907, "rewards/margins": 8.678604125976562, "rewards/rejected": -9.013471603393555, "step": 1037 }, { "epoch": 0.25, "learning_rate": 1.6688e-07, "logps/chosen": -227.60916137695312, "logps/rejected": -300.94403076171875, "loss": 0.0169, "losses/dpo": 0.000246261537540704, "losses/sft": 0.6111275553703308, "losses/total": 0.000246261537540704, "ref_logps/chosen": -222.20101928710938, "ref_logps/rejected": -215.41744995117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.540816068649292, "rewards/margins": 8.011841773986816, "rewards/rejected": -8.552658081054688, "step": 1038 }, { "epoch": 0.25, "learning_rate": 1.6682666666666665e-07, "logps/chosen": -179.3246307373047, "logps/rejected": -266.6275329589844, "loss": 0.0459, "losses/dpo": 5.660519377670425e-07, "losses/sft": 0.9578845500946045, "losses/total": 5.660519377670425e-07, "ref_logps/chosen": -176.07455444335938, "ref_logps/rejected": -190.01361083984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.32500773668289185, "rewards/margins": 7.336385726928711, "rewards/rejected": -7.661394119262695, "step": 1039 }, { "epoch": 0.25, "learning_rate": 1.6677333333333332e-07, "logps/chosen": -264.48126220703125, "logps/rejected": -352.2628479003906, "loss": 0.0038, "losses/dpo": 1.2174033145129215e-05, "losses/sft": 0.7413780093193054, "losses/total": 1.2174033145129215e-05, "ref_logps/chosen": -259.17022705078125, "ref_logps/rejected": -256.59356689453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5311040878295898, "rewards/margins": 9.035825729370117, "rewards/rejected": -9.566929817199707, "step": 1040 }, { "epoch": 0.25, "learning_rate": 1.6672e-07, "logps/chosen": -257.3270568847656, "logps/rejected": -313.95501708984375, "loss": 0.0106, "losses/dpo": 4.231298225931823e-05, "losses/sft": 0.5563872456550598, "losses/total": 4.231298225931823e-05, "ref_logps/chosen": -252.36785888671875, "ref_logps/rejected": -226.44091796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.49591946601867676, "rewards/margins": 8.255489349365234, "rewards/rejected": -8.751409530639648, "step": 1041 }, { "epoch": 0.25, "learning_rate": 1.6666666666666668e-07, "logps/chosen": -260.02105712890625, "logps/rejected": -285.59014892578125, "loss": 0.024, "losses/dpo": 5.258140663499944e-05, "losses/sft": 0.6881401538848877, "losses/total": 5.258140663499944e-05, "ref_logps/chosen": -254.500244140625, "ref_logps/rejected": -208.8559112548828, "rewards/accuracies": 1.0, "rewards/chosen": -0.5520796775817871, "rewards/margins": 7.121342658996582, "rewards/rejected": -7.6734232902526855, "step": 1042 }, { "epoch": 0.25, "learning_rate": 1.6661333333333333e-07, "logps/chosen": -228.73330688476562, "logps/rejected": -322.0538024902344, "loss": 0.0065, "losses/dpo": 3.111856017312675e-08, "losses/sft": 0.6051989197731018, "losses/total": 3.111856017312675e-08, "ref_logps/chosen": -224.24766540527344, "ref_logps/rejected": -229.36109924316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.44856202602386475, "rewards/margins": 8.820707321166992, "rewards/rejected": -9.269269943237305, "step": 1043 }, { "epoch": 0.25, "learning_rate": 1.6655999999999998e-07, "logps/chosen": -193.6600341796875, "logps/rejected": -282.3317565917969, "loss": 0.0067, "losses/dpo": 6.00065595790511e-06, "losses/sft": 0.5457104444503784, "losses/total": 6.00065595790511e-06, "ref_logps/chosen": -190.9470977783203, "ref_logps/rejected": -200.28756713867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.27129340171813965, "rewards/margins": 7.933126926422119, "rewards/rejected": -8.20442008972168, "step": 1044 }, { "epoch": 0.25, "learning_rate": 1.6650666666666665e-07, "logps/chosen": -217.61312866210938, "logps/rejected": -297.7377624511719, "loss": 0.0041, "losses/dpo": 0.0008752068388275802, "losses/sft": 0.5330756306648254, "losses/total": 0.0008752068388275802, "ref_logps/chosen": -213.92587280273438, "ref_logps/rejected": -219.32321166992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.3687255382537842, "rewards/margins": 7.472728729248047, "rewards/rejected": -7.841454029083252, "step": 1045 }, { "epoch": 0.25, "learning_rate": 1.6645333333333333e-07, "logps/chosen": -245.59913635253906, "logps/rejected": -315.6767578125, "loss": 0.0143, "losses/dpo": 1.1256017842242727e-06, "losses/sft": 0.6013286709785461, "losses/total": 1.1256017842242727e-06, "ref_logps/chosen": -241.7451934814453, "ref_logps/rejected": -229.09085083007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.3853943943977356, "rewards/margins": 8.273195266723633, "rewards/rejected": -8.658590316772461, "step": 1046 }, { "epoch": 0.25, "learning_rate": 1.6639999999999998e-07, "logps/chosen": -249.5113067626953, "logps/rejected": -284.33026123046875, "loss": 0.0262, "losses/dpo": 2.4147075237124227e-05, "losses/sft": 0.5182355046272278, "losses/total": 2.4147075237124227e-05, "ref_logps/chosen": -244.5184326171875, "ref_logps/rejected": -201.1419677734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4992884397506714, "rewards/margins": 7.819543838500977, "rewards/rejected": -8.318832397460938, "step": 1047 }, { "epoch": 0.25, "learning_rate": 1.6634666666666666e-07, "logps/chosen": -236.86297607421875, "logps/rejected": -285.7398681640625, "loss": 0.0246, "losses/dpo": 2.585119364084676e-05, "losses/sft": 0.5978273153305054, "losses/total": 2.585119364084676e-05, "ref_logps/chosen": -233.10690307617188, "ref_logps/rejected": -204.10296630859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.37560808658599854, "rewards/margins": 7.788081169128418, "rewards/rejected": -8.163688659667969, "step": 1048 }, { "epoch": 0.25, "learning_rate": 1.6629333333333333e-07, "logps/chosen": -213.48509216308594, "logps/rejected": -284.41497802734375, "loss": 0.0108, "losses/dpo": 5.954119615125819e-07, "losses/sft": 0.7464039325714111, "losses/total": 5.954119615125819e-07, "ref_logps/chosen": -208.6313018798828, "ref_logps/rejected": -197.82546997070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.48537948727607727, "rewards/margins": 8.173572540283203, "rewards/rejected": -8.658952713012695, "step": 1049 }, { "epoch": 0.25, "learning_rate": 1.6624e-07, "logps/chosen": -228.09613037109375, "logps/rejected": -340.581787109375, "loss": 0.0128, "losses/dpo": 4.269330020179041e-05, "losses/sft": 0.5395565032958984, "losses/total": 4.269330020179041e-05, "ref_logps/chosen": -225.1246337890625, "ref_logps/rejected": -241.9838409423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.29715079069137573, "rewards/margins": 9.562644958496094, "rewards/rejected": -9.859796524047852, "step": 1050 }, { "epoch": 0.25, "learning_rate": 1.6618666666666666e-07, "logps/chosen": -246.12811279296875, "logps/rejected": -330.7427978515625, "loss": 0.0198, "losses/dpo": 6.38957935734652e-05, "losses/sft": 0.715198814868927, "losses/total": 6.38957935734652e-05, "ref_logps/chosen": -242.321044921875, "ref_logps/rejected": -241.41368103027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.3807080388069153, "rewards/margins": 8.552204132080078, "rewards/rejected": -8.932912826538086, "step": 1051 }, { "epoch": 0.25, "learning_rate": 1.661333333333333e-07, "logps/chosen": -208.2987060546875, "logps/rejected": -280.7796936035156, "loss": 0.0204, "losses/dpo": 3.773273533624888e-07, "losses/sft": 0.685158371925354, "losses/total": 3.773273533624888e-07, "ref_logps/chosen": -204.47088623046875, "ref_logps/rejected": -197.61129760742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.38278263807296753, "rewards/margins": 7.934057712554932, "rewards/rejected": -8.316840171813965, "step": 1052 }, { "epoch": 0.25, "learning_rate": 1.6608e-07, "logps/chosen": -280.94085693359375, "logps/rejected": -349.3674621582031, "loss": 0.0128, "losses/dpo": 0.00012372339551802725, "losses/sft": 0.4893743097782135, "losses/total": 0.00012372339551802725, "ref_logps/chosen": -274.46148681640625, "ref_logps/rejected": -258.64373779296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6479382514953613, "rewards/margins": 8.424433708190918, "rewards/rejected": -9.072371482849121, "step": 1053 }, { "epoch": 0.25, "learning_rate": 1.6602666666666664e-07, "logps/chosen": -249.2715301513672, "logps/rejected": -283.2001953125, "loss": 0.0193, "losses/dpo": 2.8652990295086056e-05, "losses/sft": 0.517291784286499, "losses/total": 2.8652990295086056e-05, "ref_logps/chosen": -244.9075927734375, "ref_logps/rejected": -201.40126037597656, "rewards/accuracies": 1.0, "rewards/chosen": -0.43639451265335083, "rewards/margins": 7.743498802185059, "rewards/rejected": -8.179893493652344, "step": 1054 }, { "epoch": 0.25, "learning_rate": 1.6597333333333332e-07, "logps/chosen": -224.59280395507812, "logps/rejected": -294.900146484375, "loss": 0.0131, "losses/dpo": 0.0004312737437430769, "losses/sft": 0.683036744594574, "losses/total": 0.0004312737437430769, "ref_logps/chosen": -220.3988494873047, "ref_logps/rejected": -211.6056365966797, "rewards/accuracies": 1.0, "rewards/chosen": -0.419394850730896, "rewards/margins": 7.910055160522461, "rewards/rejected": -8.329450607299805, "step": 1055 }, { "epoch": 0.25, "learning_rate": 1.6592e-07, "logps/chosen": -256.0829162597656, "logps/rejected": -326.3863830566406, "loss": 0.0064, "losses/dpo": 2.4578353986726142e-05, "losses/sft": 0.48053625226020813, "losses/total": 2.4578353986726142e-05, "ref_logps/chosen": -252.11361694335938, "ref_logps/rejected": -236.39939880371094, "rewards/accuracies": 1.0, "rewards/chosen": -0.39693063497543335, "rewards/margins": 8.601765632629395, "rewards/rejected": -8.998696327209473, "step": 1056 }, { "epoch": 0.25, "learning_rate": 1.6586666666666667e-07, "logps/chosen": -254.06085205078125, "logps/rejected": -335.80419921875, "loss": 0.0077, "losses/dpo": 1.1975240568062873e-06, "losses/sft": 0.5255292654037476, "losses/total": 1.1975240568062873e-06, "ref_logps/chosen": -250.23309326171875, "ref_logps/rejected": -241.90643310546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3827756643295288, "rewards/margins": 9.007001876831055, "rewards/rejected": -9.389777183532715, "step": 1057 }, { "epoch": 0.25, "learning_rate": 1.6581333333333332e-07, "logps/chosen": -258.3802185058594, "logps/rejected": -338.6360778808594, "loss": 0.0096, "losses/dpo": 4.138942585996119e-06, "losses/sft": 0.6800442337989807, "losses/total": 4.138942585996119e-06, "ref_logps/chosen": -255.4610137939453, "ref_logps/rejected": -240.951171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2919198274612427, "rewards/margins": 9.476571083068848, "rewards/rejected": -9.768491744995117, "step": 1058 }, { "epoch": 0.25, "learning_rate": 1.6576e-07, "logps/chosen": -228.1238555908203, "logps/rejected": -319.260498046875, "loss": 0.0122, "losses/dpo": 1.6313133528456092e-05, "losses/sft": 0.5884807705879211, "losses/total": 1.6313133528456092e-05, "ref_logps/chosen": -222.4395294189453, "ref_logps/rejected": -228.92953491210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.568433403968811, "rewards/margins": 8.464665412902832, "rewards/rejected": -9.033098220825195, "step": 1059 }, { "epoch": 0.25, "learning_rate": 1.6570666666666667e-07, "logps/chosen": -212.671142578125, "logps/rejected": -309.246826171875, "loss": 0.0158, "losses/dpo": 1.1773075812016032e-06, "losses/sft": 0.4838963747024536, "losses/total": 1.1773075812016032e-06, "ref_logps/chosen": -208.09197998046875, "ref_logps/rejected": -223.2904052734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4579155445098877, "rewards/margins": 8.13772964477539, "rewards/rejected": -8.595643997192383, "step": 1060 }, { "epoch": 0.25, "learning_rate": 1.6565333333333332e-07, "logps/chosen": -252.05142211914062, "logps/rejected": -337.3895263671875, "loss": 0.0015, "losses/dpo": 0.0022218499798327684, "losses/sft": 0.5178987383842468, "losses/total": 0.0022218499798327684, "ref_logps/chosen": -247.28741455078125, "ref_logps/rejected": -243.93020629882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.476399302482605, "rewards/margins": 8.86953353881836, "rewards/rejected": -9.345932960510254, "step": 1061 }, { "epoch": 0.25, "learning_rate": 1.6559999999999997e-07, "logps/chosen": -233.01271057128906, "logps/rejected": -315.02203369140625, "loss": 0.0086, "losses/dpo": 2.5953781005227938e-05, "losses/sft": 0.7667746543884277, "losses/total": 2.5953781005227938e-05, "ref_logps/chosen": -228.84837341308594, "ref_logps/rejected": -229.51145935058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.4164316654205322, "rewards/margins": 8.134627342224121, "rewards/rejected": -8.551058769226074, "step": 1062 }, { "epoch": 0.26, "learning_rate": 1.6554666666666665e-07, "logps/chosen": -238.6940460205078, "logps/rejected": -305.9718322753906, "loss": 0.0095, "losses/dpo": 4.370651367935352e-05, "losses/sft": 0.5545089244842529, "losses/total": 4.370651367935352e-05, "ref_logps/chosen": -233.58718872070312, "ref_logps/rejected": -215.41880798339844, "rewards/accuracies": 1.0, "rewards/chosen": -0.5106862783432007, "rewards/margins": 8.544617652893066, "rewards/rejected": -9.055303573608398, "step": 1063 }, { "epoch": 0.26, "learning_rate": 1.6549333333333333e-07, "logps/chosen": -201.483642578125, "logps/rejected": -321.712646484375, "loss": 0.004, "losses/dpo": 0.00023768845130689442, "losses/sft": 0.6753583550453186, "losses/total": 0.00023768845130689442, "ref_logps/chosen": -197.29603576660156, "ref_logps/rejected": -232.3717041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4187593460083008, "rewards/margins": 8.51533317565918, "rewards/rejected": -8.934093475341797, "step": 1064 }, { "epoch": 0.26, "learning_rate": 1.6544e-07, "logps/chosen": -246.38778686523438, "logps/rejected": -306.814697265625, "loss": 0.0072, "losses/dpo": 0.001967846881598234, "losses/sft": 0.7935401201248169, "losses/total": 0.001967846881598234, "ref_logps/chosen": -241.23118591308594, "ref_logps/rejected": -221.73634338378906, "rewards/accuracies": 1.0, "rewards/chosen": -0.5156592130661011, "rewards/margins": 7.99217414855957, "rewards/rejected": -8.507833480834961, "step": 1065 }, { "epoch": 0.26, "learning_rate": 1.6538666666666665e-07, "logps/chosen": -219.00753784179688, "logps/rejected": -307.9250793457031, "loss": 0.0024, "losses/dpo": 6.09699156939314e-07, "losses/sft": 0.6341100335121155, "losses/total": 6.09699156939314e-07, "ref_logps/chosen": -215.4927978515625, "ref_logps/rejected": -219.9945831298828, "rewards/accuracies": 1.0, "rewards/chosen": -0.35147416591644287, "rewards/margins": 8.441573143005371, "rewards/rejected": -8.793046951293945, "step": 1066 }, { "epoch": 0.26, "learning_rate": 1.6533333333333333e-07, "logps/chosen": -221.117919921875, "logps/rejected": -297.2166442871094, "loss": 0.032, "losses/dpo": 9.995996151701547e-06, "losses/sft": 0.5104149580001831, "losses/total": 9.995996151701547e-06, "ref_logps/chosen": -217.24386596679688, "ref_logps/rejected": -212.35169982910156, "rewards/accuracies": 1.0, "rewards/chosen": -0.3874037563800812, "rewards/margins": 8.099090576171875, "rewards/rejected": -8.486494064331055, "step": 1067 }, { "epoch": 0.26, "learning_rate": 1.6528e-07, "logps/chosen": -250.17080688476562, "logps/rejected": -332.06884765625, "loss": 0.0038, "losses/dpo": 5.724136372009525e-06, "losses/sft": 0.6781471967697144, "losses/total": 5.724136372009525e-06, "ref_logps/chosen": -246.92147827148438, "ref_logps/rejected": -235.8365936279297, "rewards/accuracies": 1.0, "rewards/chosen": -0.3249351978302002, "rewards/margins": 9.298290252685547, "rewards/rejected": -9.623225212097168, "step": 1068 }, { "epoch": 0.26, "learning_rate": 1.6522666666666666e-07, "logps/chosen": -219.05111694335938, "logps/rejected": -297.9358825683594, "loss": 0.0088, "losses/dpo": 0.0051598805002868176, "losses/sft": 0.5170899033546448, "losses/total": 0.0051598805002868176, "ref_logps/chosen": -214.6580047607422, "ref_logps/rejected": -214.494384765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.43931126594543457, "rewards/margins": 7.904839992523193, "rewards/rejected": -8.344151496887207, "step": 1069 }, { "epoch": 0.26, "learning_rate": 1.651733333333333e-07, "logps/chosen": -224.88668823242188, "logps/rejected": -342.210205078125, "loss": 0.0058, "losses/dpo": 1.5397454262711108e-05, "losses/sft": 0.4591645896434784, "losses/total": 1.5397454262711108e-05, "ref_logps/chosen": -221.83053588867188, "ref_logps/rejected": -240.6817169189453, "rewards/accuracies": 1.0, "rewards/chosen": -0.30561563372612, "rewards/margins": 9.847236633300781, "rewards/rejected": -10.152853012084961, "step": 1070 }, { "epoch": 0.26, "learning_rate": 1.6511999999999999e-07, "logps/chosen": -250.04287719726562, "logps/rejected": -306.5748291015625, "loss": 0.0125, "losses/dpo": 7.422330236295238e-05, "losses/sft": 0.4827194809913635, "losses/total": 7.422330236295238e-05, "ref_logps/chosen": -246.9451446533203, "ref_logps/rejected": -215.04803466796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3097720146179199, "rewards/margins": 8.842905044555664, "rewards/rejected": -9.152677536010742, "step": 1071 }, { "epoch": 0.26, "learning_rate": 1.6506666666666666e-07, "logps/chosen": -238.97828674316406, "logps/rejected": -323.131103515625, "loss": 0.0128, "losses/dpo": 1.0476221177668776e-05, "losses/sft": 0.6935738325119019, "losses/total": 1.0476221177668776e-05, "ref_logps/chosen": -234.23887634277344, "ref_logps/rejected": -235.11752319335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.4739404320716858, "rewards/margins": 8.327417373657227, "rewards/rejected": -8.801358222961426, "step": 1072 }, { "epoch": 0.26, "learning_rate": 1.650133333333333e-07, "logps/chosen": -260.4219970703125, "logps/rejected": -337.487060546875, "loss": 0.0031, "losses/dpo": 3.149466465401929e-07, "losses/sft": 0.6502716541290283, "losses/total": 3.149466465401929e-07, "ref_logps/chosen": -255.22930908203125, "ref_logps/rejected": -238.8742218017578, "rewards/accuracies": 1.0, "rewards/chosen": -0.5192722678184509, "rewards/margins": 9.342012405395508, "rewards/rejected": -9.861285209655762, "step": 1073 }, { "epoch": 0.26, "learning_rate": 1.6496e-07, "logps/chosen": -255.94253540039062, "logps/rejected": -331.32403564453125, "loss": 0.0144, "losses/dpo": 1.433755642210599e-06, "losses/sft": 0.5597394108772278, "losses/total": 1.433755642210599e-06, "ref_logps/chosen": -250.52346801757812, "ref_logps/rejected": -236.9294891357422, "rewards/accuracies": 1.0, "rewards/chosen": -0.5419058799743652, "rewards/margins": 8.897550582885742, "rewards/rejected": -9.43945598602295, "step": 1074 }, { "epoch": 0.26, "learning_rate": 1.6490666666666667e-07, "logps/chosen": -239.1215362548828, "logps/rejected": -327.6188659667969, "loss": 0.0028, "losses/dpo": 1.471614865522497e-07, "losses/sft": 0.9995285868644714, "losses/total": 1.471614865522497e-07, "ref_logps/chosen": -234.8173828125, "ref_logps/rejected": -233.2873077392578, "rewards/accuracies": 1.0, "rewards/chosen": -0.4304143190383911, "rewards/margins": 9.002740859985352, "rewards/rejected": -9.433155059814453, "step": 1075 }, { "epoch": 0.26, "learning_rate": 1.6485333333333334e-07, "logps/chosen": -218.60919189453125, "logps/rejected": -321.44189453125, "loss": 0.0145, "losses/dpo": 6.674426913377829e-06, "losses/sft": 0.5274972915649414, "losses/total": 6.674426913377829e-06, "ref_logps/chosen": -214.36288452148438, "ref_logps/rejected": -238.9093017578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.42463141679763794, "rewards/margins": 7.82862663269043, "rewards/rejected": -8.253257751464844, "step": 1076 }, { "epoch": 0.26, "learning_rate": 1.648e-07, "logps/chosen": -222.24395751953125, "logps/rejected": -321.9611511230469, "loss": 0.0072, "losses/dpo": 1.0581187780189794e-05, "losses/sft": 0.7774080038070679, "losses/total": 1.0581187780189794e-05, "ref_logps/chosen": -217.94407653808594, "ref_logps/rejected": -233.57362365722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.42998865246772766, "rewards/margins": 8.408763885498047, "rewards/rejected": -8.838752746582031, "step": 1077 }, { "epoch": 0.26, "learning_rate": 1.6474666666666664e-07, "logps/chosen": -230.9168243408203, "logps/rejected": -317.2247314453125, "loss": 0.0156, "losses/dpo": 0.0001430624834029004, "losses/sft": 0.5602034330368042, "losses/total": 0.0001430624834029004, "ref_logps/chosen": -226.826904296875, "ref_logps/rejected": -238.21490478515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.40899068117141724, "rewards/margins": 7.491990089416504, "rewards/rejected": -7.9009809494018555, "step": 1078 }, { "epoch": 0.26, "learning_rate": 1.6469333333333332e-07, "logps/chosen": -219.4798583984375, "logps/rejected": -330.0706787109375, "loss": 0.0061, "losses/dpo": 6.287972064455971e-05, "losses/sft": 0.4849882423877716, "losses/total": 6.287972064455971e-05, "ref_logps/chosen": -215.37088012695312, "ref_logps/rejected": -232.94117736816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.4109005331993103, "rewards/margins": 9.302051544189453, "rewards/rejected": -9.71295166015625, "step": 1079 }, { "epoch": 0.26, "learning_rate": 1.6464e-07, "logps/chosen": -240.19703674316406, "logps/rejected": -312.663818359375, "loss": 0.0109, "losses/dpo": 9.681368737801677e-07, "losses/sft": 0.4776720404624939, "losses/total": 9.681368737801677e-07, "ref_logps/chosen": -236.54019165039062, "ref_logps/rejected": -220.29150390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.36568570137023926, "rewards/margins": 8.87154483795166, "rewards/rejected": -9.237232208251953, "step": 1080 }, { "epoch": 0.26, "learning_rate": 1.6458666666666665e-07, "logps/chosen": -237.17955017089844, "logps/rejected": -315.5612487792969, "loss": 0.0032, "losses/dpo": 1.4594344293072936e-06, "losses/sft": 0.46818262338638306, "losses/total": 1.4594344293072936e-06, "ref_logps/chosen": -231.944091796875, "ref_logps/rejected": -229.965576171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5235453248023987, "rewards/margins": 8.03602123260498, "rewards/rejected": -8.559566497802734, "step": 1081 }, { "epoch": 0.26, "learning_rate": 1.6453333333333332e-07, "logps/chosen": -256.1077575683594, "logps/rejected": -309.59820556640625, "loss": 0.0079, "losses/dpo": 3.970616035076091e-06, "losses/sft": 0.6629073619842529, "losses/total": 3.970616035076091e-06, "ref_logps/chosen": -251.281494140625, "ref_logps/rejected": -221.21896362304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.48262539505958557, "rewards/margins": 8.355300903320312, "rewards/rejected": -8.837925910949707, "step": 1082 }, { "epoch": 0.26, "learning_rate": 1.6448e-07, "logps/chosen": -193.33578491210938, "logps/rejected": -291.2379150390625, "loss": 0.0293, "losses/dpo": 4.724624432128621e-06, "losses/sft": 0.34337398409843445, "losses/total": 4.724624432128621e-06, "ref_logps/chosen": -188.9903564453125, "ref_logps/rejected": -208.26675415039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.43454208970069885, "rewards/margins": 7.862573146820068, "rewards/rejected": -8.297115325927734, "step": 1083 }, { "epoch": 0.26, "learning_rate": 1.6442666666666668e-07, "logps/chosen": -241.37796020507812, "logps/rejected": -354.3179931640625, "loss": 0.0036, "losses/dpo": 6.028113602951635e-06, "losses/sft": 0.6220069527626038, "losses/total": 6.028113602951635e-06, "ref_logps/chosen": -237.12118530273438, "ref_logps/rejected": -257.18267822265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4256771504878998, "rewards/margins": 9.28785514831543, "rewards/rejected": -9.713531494140625, "step": 1084 }, { "epoch": 0.26, "learning_rate": 1.6437333333333333e-07, "logps/chosen": -281.2237548828125, "logps/rejected": -321.6523742675781, "loss": 0.0136, "losses/dpo": 6.060536907170899e-05, "losses/sft": 0.6471336483955383, "losses/total": 6.060536907170899e-05, "ref_logps/chosen": -275.501708984375, "ref_logps/rejected": -231.57986450195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.5722015500068665, "rewards/margins": 8.435049057006836, "rewards/rejected": -9.007251739501953, "step": 1085 }, { "epoch": 0.26, "learning_rate": 1.6432e-07, "logps/chosen": -233.88662719726562, "logps/rejected": -344.097900390625, "loss": 0.0092, "losses/dpo": 0.0004624215071089566, "losses/sft": 0.617963433265686, "losses/total": 0.0004624215071089566, "ref_logps/chosen": -230.74227905273438, "ref_logps/rejected": -248.02957153320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.31443458795547485, "rewards/margins": 9.292397499084473, "rewards/rejected": -9.606832504272461, "step": 1086 }, { "epoch": 0.26, "learning_rate": 1.6426666666666666e-07, "logps/chosen": -209.18545532226562, "logps/rejected": -324.04412841796875, "loss": 0.0029, "losses/dpo": 0.00026018457720056176, "losses/sft": 0.620208203792572, "losses/total": 0.00026018457720056176, "ref_logps/chosen": -204.52369689941406, "ref_logps/rejected": -234.8450927734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4661744236946106, "rewards/margins": 8.453726768493652, "rewards/rejected": -8.919900894165039, "step": 1087 }, { "epoch": 0.26, "learning_rate": 1.642133333333333e-07, "logps/chosen": -245.8065643310547, "logps/rejected": -306.893310546875, "loss": 0.009, "losses/dpo": 2.011928336287383e-05, "losses/sft": 0.5817844271659851, "losses/total": 2.011928336287383e-05, "ref_logps/chosen": -243.2110595703125, "ref_logps/rejected": -225.4821319580078, "rewards/accuracies": 1.0, "rewards/chosen": -0.2595512866973877, "rewards/margins": 7.881565093994141, "rewards/rejected": -8.14111614227295, "step": 1088 }, { "epoch": 0.26, "learning_rate": 1.6415999999999998e-07, "logps/chosen": -244.301025390625, "logps/rejected": -361.73321533203125, "loss": 0.0053, "losses/dpo": 5.2818770200246945e-05, "losses/sft": 0.46588099002838135, "losses/total": 5.2818770200246945e-05, "ref_logps/chosen": -239.6454620361328, "ref_logps/rejected": -251.58706665039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.46555593609809875, "rewards/margins": 10.54905891418457, "rewards/rejected": -11.014615058898926, "step": 1089 }, { "epoch": 0.26, "learning_rate": 1.6410666666666666e-07, "logps/chosen": -203.13922119140625, "logps/rejected": -310.9552917480469, "loss": 0.0201, "losses/dpo": 5.548394028664916e-07, "losses/sft": 0.5185356736183167, "losses/total": 5.548394028664916e-07, "ref_logps/chosen": -200.8794403076172, "ref_logps/rejected": -220.8304901123047, "rewards/accuracies": 1.0, "rewards/chosen": -0.22597751021385193, "rewards/margins": 8.786500930786133, "rewards/rejected": -9.01247787475586, "step": 1090 }, { "epoch": 0.26, "learning_rate": 1.6405333333333334e-07, "logps/chosen": -260.2601318359375, "logps/rejected": -304.9100341796875, "loss": 0.003, "losses/dpo": 1.2487819731177296e-05, "losses/sft": 0.5671398639678955, "losses/total": 1.2487819731177296e-05, "ref_logps/chosen": -255.46304321289062, "ref_logps/rejected": -219.08154296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.47970885038375854, "rewards/margins": 8.103139877319336, "rewards/rejected": -8.58284854888916, "step": 1091 }, { "epoch": 0.26, "learning_rate": 1.6399999999999999e-07, "logps/chosen": -227.64498901367188, "logps/rejected": -283.7200012207031, "loss": 0.021, "losses/dpo": 0.00012096785940229893, "losses/sft": 0.5326306819915771, "losses/total": 0.00012096785940229893, "ref_logps/chosen": -223.47659301757812, "ref_logps/rejected": -200.44973754882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.4168396592140198, "rewards/margins": 7.910187721252441, "rewards/rejected": -8.327027320861816, "step": 1092 }, { "epoch": 0.26, "learning_rate": 1.6394666666666666e-07, "logps/chosen": -214.46275329589844, "logps/rejected": -307.0354309082031, "loss": 0.0116, "losses/dpo": 9.545947250444442e-07, "losses/sft": 0.5794049501419067, "losses/total": 9.545947250444442e-07, "ref_logps/chosen": -210.99746704101562, "ref_logps/rejected": -219.70916748046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3465307652950287, "rewards/margins": 8.38609504699707, "rewards/rejected": -8.732625961303711, "step": 1093 }, { "epoch": 0.26, "learning_rate": 1.6389333333333334e-07, "logps/chosen": -195.31402587890625, "logps/rejected": -301.3881530761719, "loss": 0.0186, "losses/dpo": 7.67255733080674e-06, "losses/sft": 0.7670896053314209, "losses/total": 7.67255733080674e-06, "ref_logps/chosen": -191.36947631835938, "ref_logps/rejected": -219.94580078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.39445650577545166, "rewards/margins": 7.749777793884277, "rewards/rejected": -8.144234657287598, "step": 1094 }, { "epoch": 0.26, "learning_rate": 1.6384e-07, "logps/chosen": -222.82192993164062, "logps/rejected": -274.20416259765625, "loss": 0.0104, "losses/dpo": 0.00010332692909287289, "losses/sft": 0.5997670888900757, "losses/total": 0.00010332692909287289, "ref_logps/chosen": -219.00918579101562, "ref_logps/rejected": -196.94407653808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.38127386569976807, "rewards/margins": 7.344735622406006, "rewards/rejected": -7.726009368896484, "step": 1095 }, { "epoch": 0.26, "learning_rate": 1.6378666666666664e-07, "logps/chosen": -197.6004638671875, "logps/rejected": -306.67352294921875, "loss": 0.0075, "losses/dpo": 3.6980050026613753e-06, "losses/sft": 0.9106229543685913, "losses/total": 3.6980050026613753e-06, "ref_logps/chosen": -194.26904296875, "ref_logps/rejected": -216.2097625732422, "rewards/accuracies": 1.0, "rewards/chosen": -0.33314213156700134, "rewards/margins": 8.713234901428223, "rewards/rejected": -9.04637622833252, "step": 1096 }, { "epoch": 0.26, "learning_rate": 1.6373333333333332e-07, "logps/chosen": -188.0382537841797, "logps/rejected": -276.3831787109375, "loss": 0.0089, "losses/dpo": 2.0185952962492593e-06, "losses/sft": 0.547142744064331, "losses/total": 2.0185952962492593e-06, "ref_logps/chosen": -184.706787109375, "ref_logps/rejected": -198.3092498779297, "rewards/accuracies": 1.0, "rewards/chosen": -0.33314621448516846, "rewards/margins": 7.474245071411133, "rewards/rejected": -7.807391166687012, "step": 1097 }, { "epoch": 0.26, "learning_rate": 1.6368e-07, "logps/chosen": -218.20169067382812, "logps/rejected": -322.6071472167969, "loss": 0.0075, "losses/dpo": 0.00017699504678603262, "losses/sft": 0.5800955295562744, "losses/total": 0.00017699504678603262, "ref_logps/chosen": -213.79159545898438, "ref_logps/rejected": -232.87416076660156, "rewards/accuracies": 1.0, "rewards/chosen": -0.44100815057754517, "rewards/margins": 8.532289505004883, "rewards/rejected": -8.973298072814941, "step": 1098 }, { "epoch": 0.26, "learning_rate": 1.6362666666666667e-07, "logps/chosen": -240.22149658203125, "logps/rejected": -300.4964599609375, "loss": 0.0154, "losses/dpo": 6.71769342375228e-08, "losses/sft": 0.698857307434082, "losses/total": 6.71769342375228e-08, "ref_logps/chosen": -233.77520751953125, "ref_logps/rejected": -216.0216064453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6446294784545898, "rewards/margins": 7.802853584289551, "rewards/rejected": -8.44748306274414, "step": 1099 }, { "epoch": 0.26, "learning_rate": 1.6357333333333332e-07, "logps/chosen": -235.99826049804688, "logps/rejected": -316.71026611328125, "loss": 0.012, "losses/dpo": 5.282422989694169e-06, "losses/sft": 0.6234874725341797, "losses/total": 5.282422989694169e-06, "ref_logps/chosen": -230.55917358398438, "ref_logps/rejected": -223.486328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5439082384109497, "rewards/margins": 8.778484344482422, "rewards/rejected": -9.322393417358398, "step": 1100 }, { "epoch": 0.26, "learning_rate": 1.6352e-07, "logps/chosen": -249.1162109375, "logps/rejected": -316.736328125, "loss": 0.0063, "losses/dpo": 2.509662226657383e-05, "losses/sft": 0.566949188709259, "losses/total": 2.509662226657383e-05, "ref_logps/chosen": -243.39886474609375, "ref_logps/rejected": -222.11070251464844, "rewards/accuracies": 1.0, "rewards/chosen": -0.571733832359314, "rewards/margins": 8.890829086303711, "rewards/rejected": -9.462562561035156, "step": 1101 }, { "epoch": 0.26, "learning_rate": 1.6346666666666667e-07, "logps/chosen": -273.07000732421875, "logps/rejected": -336.7984313964844, "loss": 0.0069, "losses/dpo": 1.865079866547603e-05, "losses/sft": 0.5916767716407776, "losses/total": 1.865079866547603e-05, "ref_logps/chosen": -268.0878601074219, "ref_logps/rejected": -233.266845703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4982149302959442, "rewards/margins": 9.854945182800293, "rewards/rejected": -10.35315990447998, "step": 1102 }, { "epoch": 0.26, "learning_rate": 1.6341333333333332e-07, "logps/chosen": -235.90089416503906, "logps/rejected": -316.5350341796875, "loss": 0.0094, "losses/dpo": 0.00022589278523810208, "losses/sft": 0.7088958621025085, "losses/total": 0.00022589278523810208, "ref_logps/chosen": -230.70333862304688, "ref_logps/rejected": -221.163818359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.51975417137146, "rewards/margins": 9.017372131347656, "rewards/rejected": -9.537126541137695, "step": 1103 }, { "epoch": 0.26, "learning_rate": 1.6335999999999998e-07, "logps/chosen": -243.60816955566406, "logps/rejected": -317.4364318847656, "loss": 0.0105, "losses/dpo": 6.326907896436751e-05, "losses/sft": 0.45767468214035034, "losses/total": 6.326907896436751e-05, "ref_logps/chosen": -238.97958374023438, "ref_logps/rejected": -227.9213409423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.46285873651504517, "rewards/margins": 8.488649368286133, "rewards/rejected": -8.951508522033691, "step": 1104 }, { "epoch": 0.27, "learning_rate": 1.6330666666666665e-07, "logps/chosen": -231.34934997558594, "logps/rejected": -334.2886657714844, "loss": 0.0071, "losses/dpo": 1.1168393029947765e-05, "losses/sft": 0.6451650261878967, "losses/total": 1.1168393029947765e-05, "ref_logps/chosen": -224.66171264648438, "ref_logps/rejected": -233.20361328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6687645316123962, "rewards/margins": 9.439741134643555, "rewards/rejected": -10.108505249023438, "step": 1105 }, { "epoch": 0.27, "learning_rate": 1.6325333333333333e-07, "logps/chosen": -269.5260009765625, "logps/rejected": -353.89593505859375, "loss": 0.0026, "losses/dpo": 1.4524871971843822e-08, "losses/sft": 0.6005225777626038, "losses/total": 1.4524871971843822e-08, "ref_logps/chosen": -263.54461669921875, "ref_logps/rejected": -247.93630981445312, "rewards/accuracies": 1.0, "rewards/chosen": -0.5981390476226807, "rewards/margins": 9.997825622558594, "rewards/rejected": -10.595964431762695, "step": 1106 }, { "epoch": 0.27, "learning_rate": 1.6319999999999998e-07, "logps/chosen": -213.54336547851562, "logps/rejected": -296.4245910644531, "loss": 0.0224, "losses/dpo": 1.8410639313515276e-05, "losses/sft": 0.8805439472198486, "losses/total": 1.8410639313515276e-05, "ref_logps/chosen": -209.04144287109375, "ref_logps/rejected": -209.787109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4501934349536896, "rewards/margins": 8.213554382324219, "rewards/rejected": -8.66374683380127, "step": 1107 }, { "epoch": 0.27, "learning_rate": 1.6314666666666666e-07, "logps/chosen": -240.0233612060547, "logps/rejected": -288.387451171875, "loss": 0.0063, "losses/dpo": 0.003175538033246994, "losses/sft": 0.4701502025127411, "losses/total": 0.003175538033246994, "ref_logps/chosen": -234.21429443359375, "ref_logps/rejected": -202.28561401367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.580907940864563, "rewards/margins": 8.029275894165039, "rewards/rejected": -8.610183715820312, "step": 1108 }, { "epoch": 0.27, "learning_rate": 1.6309333333333333e-07, "logps/chosen": -261.6294860839844, "logps/rejected": -310.8537902832031, "loss": 0.0074, "losses/dpo": 0.0003417670668568462, "losses/sft": 0.9956147074699402, "losses/total": 0.0003417670668568462, "ref_logps/chosen": -256.8258056640625, "ref_logps/rejected": -224.7946319580078, "rewards/accuracies": 1.0, "rewards/chosen": -0.4803694486618042, "rewards/margins": 8.125545501708984, "rewards/rejected": -8.605915069580078, "step": 1109 }, { "epoch": 0.27, "learning_rate": 1.6304e-07, "logps/chosen": -248.397705078125, "logps/rejected": -328.503173828125, "loss": 0.0097, "losses/dpo": 8.058432285906747e-06, "losses/sft": 0.5915327072143555, "losses/total": 8.058432285906747e-06, "ref_logps/chosen": -244.29554748535156, "ref_logps/rejected": -239.02194213867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.41021567583084106, "rewards/margins": 8.537910461425781, "rewards/rejected": -8.948125839233398, "step": 1110 }, { "epoch": 0.27, "learning_rate": 1.6298666666666666e-07, "logps/chosen": -217.69534301757812, "logps/rejected": -301.444091796875, "loss": 0.0178, "losses/dpo": 4.044211164000444e-05, "losses/sft": 0.7861855030059814, "losses/total": 4.044211164000444e-05, "ref_logps/chosen": -213.73390197753906, "ref_logps/rejected": -215.11981201171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3961438536643982, "rewards/margins": 8.236283302307129, "rewards/rejected": -8.632427215576172, "step": 1111 }, { "epoch": 0.27, "learning_rate": 1.629333333333333e-07, "logps/chosen": -191.74398803710938, "logps/rejected": -270.8448181152344, "loss": 0.0182, "losses/dpo": 0.0003971128608100116, "losses/sft": 0.725942075252533, "losses/total": 0.0003971128608100116, "ref_logps/chosen": -187.25820922851562, "ref_logps/rejected": -190.91384887695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.4485795795917511, "rewards/margins": 7.544516563415527, "rewards/rejected": -7.993096351623535, "step": 1112 }, { "epoch": 0.27, "learning_rate": 1.6288e-07, "logps/chosen": -239.78390502929688, "logps/rejected": -315.8155517578125, "loss": 0.0047, "losses/dpo": 1.3138541362422984e-05, "losses/sft": 0.37171247601509094, "losses/total": 1.3138541362422984e-05, "ref_logps/chosen": -234.50241088867188, "ref_logps/rejected": -228.08847045898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5281476378440857, "rewards/margins": 8.244562149047852, "rewards/rejected": -8.772709846496582, "step": 1113 }, { "epoch": 0.27, "learning_rate": 1.6282666666666666e-07, "logps/chosen": -253.7200927734375, "logps/rejected": -323.9759216308594, "loss": 0.0163, "losses/dpo": 3.20836843457073e-05, "losses/sft": 0.6044346690177917, "losses/total": 3.20836843457073e-05, "ref_logps/chosen": -248.8291778564453, "ref_logps/rejected": -227.90109252929688, "rewards/accuracies": 1.0, "rewards/chosen": -0.4890902638435364, "rewards/margins": 9.118391990661621, "rewards/rejected": -9.60748291015625, "step": 1114 }, { "epoch": 0.27, "learning_rate": 1.6277333333333331e-07, "logps/chosen": -242.39366149902344, "logps/rejected": -325.183349609375, "loss": 0.0034, "losses/dpo": 4.535359039437026e-05, "losses/sft": 0.5651654601097107, "losses/total": 4.535359039437026e-05, "ref_logps/chosen": -237.8141326904297, "ref_logps/rejected": -231.92630004882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.45795220136642456, "rewards/margins": 8.867751121520996, "rewards/rejected": -9.325703620910645, "step": 1115 }, { "epoch": 0.27, "learning_rate": 1.6272e-07, "logps/chosen": -186.42941284179688, "logps/rejected": -260.26318359375, "loss": 0.0116, "losses/dpo": 1.4541364556919234e-08, "losses/sft": 0.6321532726287842, "losses/total": 1.4541364556919234e-08, "ref_logps/chosen": -184.75071716308594, "ref_logps/rejected": -183.60433959960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.1678714156150818, "rewards/margins": 7.498008728027344, "rewards/rejected": -7.66588020324707, "step": 1116 }, { "epoch": 0.27, "learning_rate": 1.6266666666666667e-07, "logps/chosen": -237.37208557128906, "logps/rejected": -281.82110595703125, "loss": 0.0239, "losses/dpo": 3.073333573411219e-06, "losses/sft": 0.5653253197669983, "losses/total": 3.073333573411219e-06, "ref_logps/chosen": -233.73594665527344, "ref_logps/rejected": -213.7664031982422, "rewards/accuracies": 1.0, "rewards/chosen": -0.36361461877822876, "rewards/margins": 6.441854953765869, "rewards/rejected": -6.805469512939453, "step": 1117 }, { "epoch": 0.27, "learning_rate": 1.6261333333333334e-07, "logps/chosen": -250.6980438232422, "logps/rejected": -308.5121154785156, "loss": 0.0072, "losses/dpo": 0.0002708381216507405, "losses/sft": 0.5339182019233704, "losses/total": 0.0002708381216507405, "ref_logps/chosen": -244.62759399414062, "ref_logps/rejected": -217.8408660888672, "rewards/accuracies": 1.0, "rewards/chosen": -0.6070427894592285, "rewards/margins": 8.460081100463867, "rewards/rejected": -9.067124366760254, "step": 1118 }, { "epoch": 0.27, "learning_rate": 1.6256e-07, "logps/chosen": -224.08743286132812, "logps/rejected": -304.5152893066406, "loss": 0.0113, "losses/dpo": 0.005225463770329952, "losses/sft": 0.3586600720882416, "losses/total": 0.005225463770329952, "ref_logps/chosen": -220.2293701171875, "ref_logps/rejected": -221.93544006347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.3858059048652649, "rewards/margins": 7.87217903137207, "rewards/rejected": -8.25798511505127, "step": 1119 }, { "epoch": 0.27, "learning_rate": 1.6250666666666667e-07, "logps/chosen": -230.94500732421875, "logps/rejected": -305.2582092285156, "loss": 0.0073, "losses/dpo": 0.00019842624897137284, "losses/sft": 0.6677121520042419, "losses/total": 0.00019842624897137284, "ref_logps/chosen": -225.34478759765625, "ref_logps/rejected": -215.74159240722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5600248575210571, "rewards/margins": 8.391636848449707, "rewards/rejected": -8.951662063598633, "step": 1120 }, { "epoch": 0.27, "learning_rate": 1.6245333333333332e-07, "logps/chosen": -261.83990478515625, "logps/rejected": -316.1853942871094, "loss": 0.0146, "losses/dpo": 2.3878581600911275e-07, "losses/sft": 0.5436355471611023, "losses/total": 2.3878581600911275e-07, "ref_logps/chosen": -256.7939453125, "ref_logps/rejected": -226.84384155273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5045957565307617, "rewards/margins": 8.429559707641602, "rewards/rejected": -8.934155464172363, "step": 1121 }, { "epoch": 0.27, "learning_rate": 1.624e-07, "logps/chosen": -224.4140167236328, "logps/rejected": -312.5706787109375, "loss": 0.0108, "losses/dpo": 2.8774680686183274e-05, "losses/sft": 0.5912872552871704, "losses/total": 2.8774680686183274e-05, "ref_logps/chosen": -220.64208984375, "ref_logps/rejected": -222.54205322265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3771927058696747, "rewards/margins": 8.625669479370117, "rewards/rejected": -9.002861976623535, "step": 1122 }, { "epoch": 0.27, "learning_rate": 1.6234666666666665e-07, "logps/chosen": -251.73574829101562, "logps/rejected": -325.2638244628906, "loss": 0.005, "losses/dpo": 0.002114641247317195, "losses/sft": 0.674163281917572, "losses/total": 0.002114641247317195, "ref_logps/chosen": -246.53297424316406, "ref_logps/rejected": -228.35043334960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5202797651290894, "rewards/margins": 9.171058654785156, "rewards/rejected": -9.691339492797852, "step": 1123 }, { "epoch": 0.27, "learning_rate": 1.6229333333333333e-07, "logps/chosen": -199.49081420898438, "logps/rejected": -285.19195556640625, "loss": 0.0201, "losses/dpo": 2.5515166726108873e-06, "losses/sft": 0.5139191150665283, "losses/total": 2.5515166726108873e-06, "ref_logps/chosen": -196.21826171875, "ref_logps/rejected": -206.75840759277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.32725492119789124, "rewards/margins": 7.5160980224609375, "rewards/rejected": -7.843353271484375, "step": 1124 }, { "epoch": 0.27, "learning_rate": 1.6224e-07, "logps/chosen": -265.99176025390625, "logps/rejected": -327.94580078125, "loss": 0.0055, "losses/dpo": 3.0433133360929787e-05, "losses/sft": 1.3215742111206055, "losses/total": 3.0433133360929787e-05, "ref_logps/chosen": -260.5416259765625, "ref_logps/rejected": -234.58551025390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5450139045715332, "rewards/margins": 8.791016578674316, "rewards/rejected": -9.336030960083008, "step": 1125 }, { "epoch": 0.27, "learning_rate": 1.6218666666666665e-07, "logps/chosen": -254.07110595703125, "logps/rejected": -339.7467041015625, "loss": 0.0053, "losses/dpo": 0.00045183603651821613, "losses/sft": 0.6242967247962952, "losses/total": 0.00045183603651821613, "ref_logps/chosen": -249.18836975097656, "ref_logps/rejected": -241.0879364013672, "rewards/accuracies": 1.0, "rewards/chosen": -0.4882742464542389, "rewards/margins": 9.377601623535156, "rewards/rejected": -9.865876197814941, "step": 1126 }, { "epoch": 0.27, "learning_rate": 1.6213333333333333e-07, "logps/chosen": -184.62979125976562, "logps/rejected": -282.81451416015625, "loss": 0.0193, "losses/dpo": 5.302283625496784e-07, "losses/sft": 0.7681980133056641, "losses/total": 5.302283625496784e-07, "ref_logps/chosen": -179.00091552734375, "ref_logps/rejected": -200.36968994140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5628880858421326, "rewards/margins": 7.681593418121338, "rewards/rejected": -8.244481086730957, "step": 1127 }, { "epoch": 0.27, "learning_rate": 1.6208e-07, "logps/chosen": -213.1852264404297, "logps/rejected": -324.3049011230469, "loss": 0.0035, "losses/dpo": 8.138903649523854e-05, "losses/sft": 0.7008992433547974, "losses/total": 8.138903649523854e-05, "ref_logps/chosen": -209.57546997070312, "ref_logps/rejected": -229.5164794921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3609755039215088, "rewards/margins": 9.117864608764648, "rewards/rejected": -9.478839874267578, "step": 1128 }, { "epoch": 0.27, "learning_rate": 1.6202666666666666e-07, "logps/chosen": -205.8533935546875, "logps/rejected": -292.85235595703125, "loss": 0.0147, "losses/dpo": 1.937726494816161e-07, "losses/sft": 0.6621019244194031, "losses/total": 1.937726494816161e-07, "ref_logps/chosen": -201.78717041015625, "ref_logps/rejected": -210.35623168945312, "rewards/accuracies": 1.0, "rewards/chosen": -0.4066222906112671, "rewards/margins": 7.842988014221191, "rewards/rejected": -8.24960994720459, "step": 1129 }, { "epoch": 0.27, "learning_rate": 1.619733333333333e-07, "logps/chosen": -247.87225341796875, "logps/rejected": -329.4400329589844, "loss": 0.0088, "losses/dpo": 5.591996341536287e-07, "losses/sft": 0.45255452394485474, "losses/total": 5.591996341536287e-07, "ref_logps/chosen": -243.70309448242188, "ref_logps/rejected": -236.15830993652344, "rewards/accuracies": 1.0, "rewards/chosen": -0.4169158339500427, "rewards/margins": 8.9112548828125, "rewards/rejected": -9.328171730041504, "step": 1130 }, { "epoch": 0.27, "learning_rate": 1.6191999999999998e-07, "logps/chosen": -207.38450622558594, "logps/rejected": -295.475341796875, "loss": 0.0282, "losses/dpo": 8.747298352318467e-07, "losses/sft": 0.631464958190918, "losses/total": 8.747298352318467e-07, "ref_logps/chosen": -202.6304931640625, "ref_logps/rejected": -209.3546600341797, "rewards/accuracies": 1.0, "rewards/chosen": -0.4754025936126709, "rewards/margins": 8.136667251586914, "rewards/rejected": -8.612070083618164, "step": 1131 }, { "epoch": 0.27, "learning_rate": 1.6186666666666666e-07, "logps/chosen": -198.538818359375, "logps/rejected": -308.7484130859375, "loss": 0.0112, "losses/dpo": 1.9305072783026844e-05, "losses/sft": 0.4072849154472351, "losses/total": 1.9305072783026844e-05, "ref_logps/chosen": -195.0639190673828, "ref_logps/rejected": -215.8079071044922, "rewards/accuracies": 1.0, "rewards/chosen": -0.3474907875061035, "rewards/margins": 8.946558952331543, "rewards/rejected": -9.294050216674805, "step": 1132 }, { "epoch": 0.27, "learning_rate": 1.6181333333333334e-07, "logps/chosen": -221.001708984375, "logps/rejected": -280.38720703125, "loss": 0.0098, "losses/dpo": 0.0001769781665643677, "losses/sft": 0.5482199788093567, "losses/total": 0.0001769781665643677, "ref_logps/chosen": -215.76983642578125, "ref_logps/rejected": -198.0987091064453, "rewards/accuracies": 1.0, "rewards/chosen": -0.5231865644454956, "rewards/margins": 7.7056660652160645, "rewards/rejected": -8.228853225708008, "step": 1133 }, { "epoch": 0.27, "learning_rate": 1.6176e-07, "logps/chosen": -196.160888671875, "logps/rejected": -278.6800537109375, "loss": 0.0082, "losses/dpo": 3.022041255462682e-06, "losses/sft": 0.46562978625297546, "losses/total": 3.022041255462682e-06, "ref_logps/chosen": -193.985595703125, "ref_logps/rejected": -198.23574829101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.2175271213054657, "rewards/margins": 7.826901912689209, "rewards/rejected": -8.044428825378418, "step": 1134 }, { "epoch": 0.27, "learning_rate": 1.6170666666666666e-07, "logps/chosen": -192.3503875732422, "logps/rejected": -297.96417236328125, "loss": 0.0142, "losses/dpo": 1.1728183380910195e-05, "losses/sft": 0.4662282466888428, "losses/total": 1.1728183380910195e-05, "ref_logps/chosen": -188.11178588867188, "ref_logps/rejected": -208.51113891601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.4238598346710205, "rewards/margins": 8.521442413330078, "rewards/rejected": -8.945302963256836, "step": 1135 }, { "epoch": 0.27, "learning_rate": 1.6165333333333334e-07, "logps/chosen": -255.0152130126953, "logps/rejected": -339.46649169921875, "loss": 0.0098, "losses/dpo": 0.001093612634576857, "losses/sft": 0.42177802324295044, "losses/total": 0.001093612634576857, "ref_logps/chosen": -251.29434204101562, "ref_logps/rejected": -244.61758422851562, "rewards/accuracies": 1.0, "rewards/chosen": -0.3720894455909729, "rewards/margins": 9.112799644470215, "rewards/rejected": -9.484889030456543, "step": 1136 }, { "epoch": 0.27, "learning_rate": 1.616e-07, "logps/chosen": -252.9120330810547, "logps/rejected": -338.5488586425781, "loss": 0.0083, "losses/dpo": 5.00208479934372e-05, "losses/sft": 0.9748411774635315, "losses/total": 5.00208479934372e-05, "ref_logps/chosen": -248.64686584472656, "ref_logps/rejected": -247.065185546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4265173375606537, "rewards/margins": 8.721854209899902, "rewards/rejected": -9.148370742797852, "step": 1137 }, { "epoch": 0.27, "learning_rate": 1.6154666666666664e-07, "logps/chosen": -252.62051391601562, "logps/rejected": -333.91717529296875, "loss": 0.0082, "losses/dpo": 1.1635848977675778e-06, "losses/sft": 0.9337674975395203, "losses/total": 1.1635848977675778e-06, "ref_logps/chosen": -247.53463745117188, "ref_logps/rejected": -238.18954467773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5085887908935547, "rewards/margins": 9.06417465209961, "rewards/rejected": -9.572763442993164, "step": 1138 }, { "epoch": 0.27, "learning_rate": 1.6149333333333332e-07, "logps/chosen": -224.48062133789062, "logps/rejected": -312.64935302734375, "loss": 0.0065, "losses/dpo": 7.281856596819125e-06, "losses/sft": 0.7148618698120117, "losses/total": 7.281856596819125e-06, "ref_logps/chosen": -219.72366333007812, "ref_logps/rejected": -212.33609008789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.47569453716278076, "rewards/margins": 9.555630683898926, "rewards/rejected": -10.031325340270996, "step": 1139 }, { "epoch": 0.27, "learning_rate": 1.6144e-07, "logps/chosen": -230.20986938476562, "logps/rejected": -335.5874938964844, "loss": 0.0065, "losses/dpo": 1.6886786397662945e-05, "losses/sft": 0.5463975667953491, "losses/total": 1.6886786397662945e-05, "ref_logps/chosen": -225.22283935546875, "ref_logps/rejected": -243.18019104003906, "rewards/accuracies": 1.0, "rewards/chosen": -0.49870336055755615, "rewards/margins": 8.742025375366211, "rewards/rejected": -9.240728378295898, "step": 1140 }, { "epoch": 0.27, "learning_rate": 1.6138666666666665e-07, "logps/chosen": -221.4991455078125, "logps/rejected": -320.3246765136719, "loss": 0.004, "losses/dpo": 1.0126007055077935e-06, "losses/sft": 0.6093847751617432, "losses/total": 1.0126007055077935e-06, "ref_logps/chosen": -217.495361328125, "ref_logps/rejected": -215.9403076171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4003788232803345, "rewards/margins": 10.038057327270508, "rewards/rejected": -10.438436508178711, "step": 1141 }, { "epoch": 0.27, "learning_rate": 1.6133333333333332e-07, "logps/chosen": -250.88848876953125, "logps/rejected": -322.98828125, "loss": 0.007, "losses/dpo": 1.8927927158074453e-05, "losses/sft": 0.5425437092781067, "losses/total": 1.8927927158074453e-05, "ref_logps/chosen": -246.26768493652344, "ref_logps/rejected": -228.9654541015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4620809257030487, "rewards/margins": 8.940201759338379, "rewards/rejected": -9.40228271484375, "step": 1142 }, { "epoch": 0.27, "learning_rate": 1.6128e-07, "logps/chosen": -216.18365478515625, "logps/rejected": -293.291015625, "loss": 0.0276, "losses/dpo": 5.2826675528194755e-05, "losses/sft": 0.5985231399536133, "losses/total": 5.2826675528194755e-05, "ref_logps/chosen": -210.8964080810547, "ref_logps/rejected": -201.775634765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5287234783172607, "rewards/margins": 8.622814178466797, "rewards/rejected": -9.15153694152832, "step": 1143 }, { "epoch": 0.27, "learning_rate": 1.6122666666666668e-07, "logps/chosen": -201.09283447265625, "logps/rejected": -294.90887451171875, "loss": 0.0127, "losses/dpo": 0.0005992997903376818, "losses/sft": 0.5220276713371277, "losses/total": 0.0005992997903376818, "ref_logps/chosen": -197.01699829101562, "ref_logps/rejected": -211.32945251464844, "rewards/accuracies": 1.0, "rewards/chosen": -0.40758296847343445, "rewards/margins": 7.950360298156738, "rewards/rejected": -8.357943534851074, "step": 1144 }, { "epoch": 0.27, "learning_rate": 1.6117333333333333e-07, "logps/chosen": -258.5621337890625, "logps/rejected": -346.127197265625, "loss": 0.0046, "losses/dpo": 6.394675438059494e-05, "losses/sft": 0.3570171594619751, "losses/total": 6.394675438059494e-05, "ref_logps/chosen": -254.00657653808594, "ref_logps/rejected": -240.86192321777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.4555589556694031, "rewards/margins": 10.070969581604004, "rewards/rejected": -10.526528358459473, "step": 1145 }, { "epoch": 0.28, "learning_rate": 1.6111999999999998e-07, "logps/chosen": -222.5377197265625, "logps/rejected": -338.0068664550781, "loss": 0.0064, "losses/dpo": 4.353002441348508e-05, "losses/sft": 0.44537824392318726, "losses/total": 4.353002441348508e-05, "ref_logps/chosen": -218.47817993164062, "ref_logps/rejected": -246.14804077148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4059528708457947, "rewards/margins": 8.779932022094727, "rewards/rejected": -9.185884475708008, "step": 1146 }, { "epoch": 0.28, "learning_rate": 1.6106666666666665e-07, "logps/chosen": -213.095458984375, "logps/rejected": -290.0361328125, "loss": 0.0101, "losses/dpo": 0.00024820174439810216, "losses/sft": 0.6187179684638977, "losses/total": 0.00024820174439810216, "ref_logps/chosen": -209.70980834960938, "ref_logps/rejected": -206.1544189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.33856338262557983, "rewards/margins": 8.04960823059082, "rewards/rejected": -8.388172149658203, "step": 1147 }, { "epoch": 0.28, "learning_rate": 1.6101333333333333e-07, "logps/chosen": -253.49708557128906, "logps/rejected": -321.6876220703125, "loss": 0.0152, "losses/dpo": 4.209265114241134e-08, "losses/sft": 0.631446897983551, "losses/total": 4.209265114241134e-08, "ref_logps/chosen": -249.1172637939453, "ref_logps/rejected": -226.47665405273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4379841089248657, "rewards/margins": 9.083112716674805, "rewards/rejected": -9.521097183227539, "step": 1148 }, { "epoch": 0.28, "learning_rate": 1.6095999999999998e-07, "logps/chosen": -240.71218872070312, "logps/rejected": -331.51092529296875, "loss": 0.0099, "losses/dpo": 6.449469947256148e-05, "losses/sft": 0.6680575609207153, "losses/total": 6.449469947256148e-05, "ref_logps/chosen": -233.07020568847656, "ref_logps/rejected": -231.92892456054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7641968727111816, "rewards/margins": 9.19400405883789, "rewards/rejected": -9.95820140838623, "step": 1149 }, { "epoch": 0.28, "learning_rate": 1.6090666666666666e-07, "logps/chosen": -269.211181640625, "logps/rejected": -322.5999755859375, "loss": 0.0049, "losses/dpo": 0.0001316637935815379, "losses/sft": 0.7891276478767395, "losses/total": 0.0001316637935815379, "ref_logps/chosen": -262.741455078125, "ref_logps/rejected": -223.96511840820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.6469736099243164, "rewards/margins": 9.216512680053711, "rewards/rejected": -9.863485336303711, "step": 1150 }, { "epoch": 0.28, "learning_rate": 1.6085333333333333e-07, "logps/chosen": -237.55075073242188, "logps/rejected": -297.8074035644531, "loss": 0.0098, "losses/dpo": 2.1096142518217675e-05, "losses/sft": 0.5377599596977234, "losses/total": 2.1096142518217675e-05, "ref_logps/chosen": -232.98593139648438, "ref_logps/rejected": -213.52195739746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.4564821422100067, "rewards/margins": 7.972062110900879, "rewards/rejected": -8.428544044494629, "step": 1151 }, { "epoch": 0.28, "learning_rate": 1.608e-07, "logps/chosen": -196.05615234375, "logps/rejected": -259.6943359375, "loss": 0.0158, "losses/dpo": 2.419767270112061e-06, "losses/sft": 0.4617113173007965, "losses/total": 2.419767270112061e-06, "ref_logps/chosen": -192.02743530273438, "ref_logps/rejected": -185.47409057617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.4028717279434204, "rewards/margins": 7.0191521644592285, "rewards/rejected": -7.422024250030518, "step": 1152 }, { "epoch": 0.28, "learning_rate": 1.6074666666666666e-07, "logps/chosen": -251.63963317871094, "logps/rejected": -307.5787658691406, "loss": 0.0083, "losses/dpo": 7.073194865370169e-06, "losses/sft": 0.5074294209480286, "losses/total": 7.073194865370169e-06, "ref_logps/chosen": -248.20562744140625, "ref_logps/rejected": -221.82162475585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.34339961409568787, "rewards/margins": 8.232315063476562, "rewards/rejected": -8.575714111328125, "step": 1153 }, { "epoch": 0.28, "learning_rate": 1.6069333333333334e-07, "logps/chosen": -197.034912109375, "logps/rejected": -299.96722412109375, "loss": 0.012, "losses/dpo": 7.184160494944081e-05, "losses/sft": 0.4618920683860779, "losses/total": 7.184160494944081e-05, "ref_logps/chosen": -192.51174926757812, "ref_logps/rejected": -209.2880096435547, "rewards/accuracies": 1.0, "rewards/chosen": -0.4523181915283203, "rewards/margins": 8.615601539611816, "rewards/rejected": -9.067919731140137, "step": 1154 }, { "epoch": 0.28, "learning_rate": 1.6064e-07, "logps/chosen": -185.22357177734375, "logps/rejected": -305.6374816894531, "loss": 0.0249, "losses/dpo": 1.1630069820967037e-05, "losses/sft": 0.5864198803901672, "losses/total": 1.1630069820967037e-05, "ref_logps/chosen": -182.18170166015625, "ref_logps/rejected": -208.6343994140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3041868209838867, "rewards/margins": 9.396119117736816, "rewards/rejected": -9.700305938720703, "step": 1155 }, { "epoch": 0.28, "learning_rate": 1.6058666666666666e-07, "logps/chosen": -241.39923095703125, "logps/rejected": -319.5164794921875, "loss": 0.0289, "losses/dpo": 0.00025233448832295835, "losses/sft": 0.8075910210609436, "losses/total": 0.00025233448832295835, "ref_logps/chosen": -237.382080078125, "ref_logps/rejected": -218.89491271972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.4017145037651062, "rewards/margins": 9.660443305969238, "rewards/rejected": -10.062158584594727, "step": 1156 }, { "epoch": 0.28, "learning_rate": 1.6053333333333331e-07, "logps/chosen": -221.91542053222656, "logps/rejected": -313.72998046875, "loss": 0.004, "losses/dpo": 2.773425694613252e-05, "losses/sft": 0.6946456432342529, "losses/total": 2.773425694613252e-05, "ref_logps/chosen": -216.72216796875, "ref_logps/rejected": -221.317626953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5193266868591309, "rewards/margins": 8.72191047668457, "rewards/rejected": -9.24123764038086, "step": 1157 }, { "epoch": 0.28, "learning_rate": 1.6048e-07, "logps/chosen": -213.89846801757812, "logps/rejected": -305.2623291015625, "loss": 0.0024, "losses/dpo": 0.001960332039743662, "losses/sft": 0.7113137245178223, "losses/total": 0.001960332039743662, "ref_logps/chosen": -212.12744140625, "ref_logps/rejected": -216.62744140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.17710210382938385, "rewards/margins": 8.68638801574707, "rewards/rejected": -8.863490104675293, "step": 1158 }, { "epoch": 0.28, "learning_rate": 1.6042666666666667e-07, "logps/chosen": -185.9448699951172, "logps/rejected": -314.2581481933594, "loss": 0.0092, "losses/dpo": 6.139130709925666e-05, "losses/sft": 0.6572833061218262, "losses/total": 6.139130709925666e-05, "ref_logps/chosen": -180.71397399902344, "ref_logps/rejected": -219.33432006835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5230889320373535, "rewards/margins": 8.969295501708984, "rewards/rejected": -9.49238395690918, "step": 1159 }, { "epoch": 0.28, "learning_rate": 1.6037333333333332e-07, "logps/chosen": -239.62693786621094, "logps/rejected": -289.697998046875, "loss": 0.0059, "losses/dpo": 3.4130678159272065e-06, "losses/sft": 0.6406005024909973, "losses/total": 3.4130678159272065e-06, "ref_logps/chosen": -235.86199951171875, "ref_logps/rejected": -199.752197265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.37649595737457275, "rewards/margins": 8.618085861206055, "rewards/rejected": -8.994582176208496, "step": 1160 }, { "epoch": 0.28, "learning_rate": 1.6032e-07, "logps/chosen": -218.09739685058594, "logps/rejected": -313.2002868652344, "loss": 0.0044, "losses/dpo": 4.485842964641051e-06, "losses/sft": 0.6697046756744385, "losses/total": 4.485842964641051e-06, "ref_logps/chosen": -214.71234130859375, "ref_logps/rejected": -224.26136779785156, "rewards/accuracies": 1.0, "rewards/chosen": -0.338506281375885, "rewards/margins": 8.555384635925293, "rewards/rejected": -8.893891334533691, "step": 1161 }, { "epoch": 0.28, "learning_rate": 1.6026666666666667e-07, "logps/chosen": -259.33056640625, "logps/rejected": -324.4783630371094, "loss": 0.0059, "losses/dpo": 2.9244149118312635e-05, "losses/sft": 0.7809543609619141, "losses/total": 2.9244149118312635e-05, "ref_logps/chosen": -255.859619140625, "ref_logps/rejected": -220.97152709960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3470938801765442, "rewards/margins": 10.003589630126953, "rewards/rejected": -10.350683212280273, "step": 1162 }, { "epoch": 0.28, "learning_rate": 1.6021333333333332e-07, "logps/chosen": -244.38986206054688, "logps/rejected": -326.1348876953125, "loss": 0.0046, "losses/dpo": 3.5507917345967144e-05, "losses/sft": 0.95566725730896, "losses/total": 3.5507917345967144e-05, "ref_logps/chosen": -240.83383178710938, "ref_logps/rejected": -236.65826416015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3556015193462372, "rewards/margins": 8.592062950134277, "rewards/rejected": -8.947664260864258, "step": 1163 }, { "epoch": 0.28, "learning_rate": 1.6015999999999997e-07, "logps/chosen": -221.2098388671875, "logps/rejected": -320.8876037597656, "loss": 0.0035, "losses/dpo": 1.0009704055846669e-05, "losses/sft": 0.9569573402404785, "losses/total": 1.0009704055846669e-05, "ref_logps/chosen": -217.143798828125, "ref_logps/rejected": -223.14822387695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.40660402178764343, "rewards/margins": 9.367335319519043, "rewards/rejected": -9.773940086364746, "step": 1164 }, { "epoch": 0.28, "learning_rate": 1.6010666666666665e-07, "logps/chosen": -203.30343627929688, "logps/rejected": -301.2158508300781, "loss": 0.009, "losses/dpo": 2.6230978619423695e-05, "losses/sft": 0.9168374538421631, "losses/total": 2.6230978619423695e-05, "ref_logps/chosen": -200.4078369140625, "ref_logps/rejected": -217.32810974121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.2895602285861969, "rewards/margins": 8.099214553833008, "rewards/rejected": -8.388774871826172, "step": 1165 }, { "epoch": 0.28, "learning_rate": 1.6005333333333333e-07, "logps/chosen": -258.73486328125, "logps/rejected": -347.9583740234375, "loss": 0.0093, "losses/dpo": 0.00016144687833730131, "losses/sft": 0.6078358888626099, "losses/total": 0.00016144687833730131, "ref_logps/chosen": -252.01449584960938, "ref_logps/rejected": -240.85186767578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6720367074012756, "rewards/margins": 10.038617134094238, "rewards/rejected": -10.710653305053711, "step": 1166 }, { "epoch": 0.28, "learning_rate": 1.6e-07, "logps/chosen": -278.9888916015625, "logps/rejected": -307.715576171875, "loss": 0.0184, "losses/dpo": 8.389668801100925e-05, "losses/sft": 0.699006199836731, "losses/total": 8.389668801100925e-05, "ref_logps/chosen": -274.3030090332031, "ref_logps/rejected": -218.2624969482422, "rewards/accuracies": 1.0, "rewards/chosen": -0.4685865044593811, "rewards/margins": 8.47672176361084, "rewards/rejected": -8.945308685302734, "step": 1167 }, { "epoch": 0.28, "learning_rate": 1.5994666666666665e-07, "logps/chosen": -211.9648895263672, "logps/rejected": -276.5823974609375, "loss": 0.0118, "losses/dpo": 4.037228791275993e-05, "losses/sft": 0.6170954704284668, "losses/total": 4.037228791275993e-05, "ref_logps/chosen": -207.42819213867188, "ref_logps/rejected": -194.10470581054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.45367202162742615, "rewards/margins": 7.794096946716309, "rewards/rejected": -8.24776840209961, "step": 1168 }, { "epoch": 0.28, "learning_rate": 1.5989333333333333e-07, "logps/chosen": -201.5165252685547, "logps/rejected": -302.0193176269531, "loss": 0.0137, "losses/dpo": 1.0145849955733865e-05, "losses/sft": 0.4769345223903656, "losses/total": 1.0145849955733865e-05, "ref_logps/chosen": -197.7200927734375, "ref_logps/rejected": -214.40847778320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.37964391708374023, "rewards/margins": 8.38144302368164, "rewards/rejected": -8.761086463928223, "step": 1169 }, { "epoch": 0.28, "learning_rate": 1.5984e-07, "logps/chosen": -221.02940368652344, "logps/rejected": -310.603515625, "loss": 0.0089, "losses/dpo": 0.00011189691576873884, "losses/sft": 0.6204862594604492, "losses/total": 0.00011189691576873884, "ref_logps/chosen": -216.35211181640625, "ref_logps/rejected": -222.51080322265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4677294194698334, "rewards/margins": 8.341541290283203, "rewards/rejected": -8.809269905090332, "step": 1170 }, { "epoch": 0.28, "learning_rate": 1.5978666666666666e-07, "logps/chosen": -180.21102905273438, "logps/rejected": -277.08685302734375, "loss": 0.0249, "losses/dpo": 9.495670383330435e-05, "losses/sft": 0.7202059626579285, "losses/total": 9.495670383330435e-05, "ref_logps/chosen": -177.0571746826172, "ref_logps/rejected": -197.71981811523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.3153858780860901, "rewards/margins": 7.621316909790039, "rewards/rejected": -7.936702728271484, "step": 1171 }, { "epoch": 0.28, "learning_rate": 1.597333333333333e-07, "logps/chosen": -225.33209228515625, "logps/rejected": -290.17535400390625, "loss": 0.0377, "losses/dpo": 1.8931554222945124e-05, "losses/sft": 0.7517935633659363, "losses/total": 1.8931554222945124e-05, "ref_logps/chosen": -221.53863525390625, "ref_logps/rejected": -208.7020263671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.37934449315071106, "rewards/margins": 7.767988204956055, "rewards/rejected": -8.147333145141602, "step": 1172 }, { "epoch": 0.28, "learning_rate": 1.5967999999999998e-07, "logps/chosen": -229.56849670410156, "logps/rejected": -298.78607177734375, "loss": 0.0081, "losses/dpo": 0.00015845934103708714, "losses/sft": 1.2014055252075195, "losses/total": 0.00015845934103708714, "ref_logps/chosen": -225.31446838378906, "ref_logps/rejected": -214.97901916503906, "rewards/accuracies": 1.0, "rewards/chosen": -0.42540213465690613, "rewards/margins": 7.955301284790039, "rewards/rejected": -8.380703926086426, "step": 1173 }, { "epoch": 0.28, "learning_rate": 1.5962666666666666e-07, "logps/chosen": -235.35165405273438, "logps/rejected": -314.750244140625, "loss": 0.0029, "losses/dpo": 1.9661692931549624e-06, "losses/sft": 0.8334255218505859, "losses/total": 1.9661692931549624e-06, "ref_logps/chosen": -231.91578674316406, "ref_logps/rejected": -225.4693603515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.343586802482605, "rewards/margins": 8.58449935913086, "rewards/rejected": -8.928085327148438, "step": 1174 }, { "epoch": 0.28, "learning_rate": 1.595733333333333e-07, "logps/chosen": -198.29092407226562, "logps/rejected": -332.26739501953125, "loss": 0.0118, "losses/dpo": 1.279567385381597e-07, "losses/sft": 0.7078176736831665, "losses/total": 1.279567385381597e-07, "ref_logps/chosen": -195.256591796875, "ref_logps/rejected": -231.46551513671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.30343517661094666, "rewards/margins": 9.776750564575195, "rewards/rejected": -10.08018684387207, "step": 1175 }, { "epoch": 0.28, "learning_rate": 1.5952e-07, "logps/chosen": -232.09500122070312, "logps/rejected": -305.72967529296875, "loss": 0.0086, "losses/dpo": 0.035057637840509415, "losses/sft": 0.6721334457397461, "losses/total": 0.035057637840509415, "ref_logps/chosen": -228.08251953125, "ref_logps/rejected": -214.3968963623047, "rewards/accuracies": 1.0, "rewards/chosen": -0.40124809741973877, "rewards/margins": 8.73202896118164, "rewards/rejected": -9.133275985717773, "step": 1176 }, { "epoch": 0.28, "learning_rate": 1.5946666666666667e-07, "logps/chosen": -217.51527404785156, "logps/rejected": -295.322021484375, "loss": 0.0147, "losses/dpo": 2.574418886069907e-06, "losses/sft": 0.9925723671913147, "losses/total": 2.574418886069907e-06, "ref_logps/chosen": -214.11537170410156, "ref_logps/rejected": -211.28903198242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.33998972177505493, "rewards/margins": 8.063308715820312, "rewards/rejected": -8.403299331665039, "step": 1177 }, { "epoch": 0.28, "learning_rate": 1.5941333333333334e-07, "logps/chosen": -241.2336883544922, "logps/rejected": -338.3731689453125, "loss": 0.0019, "losses/dpo": 2.0525716593056131e-07, "losses/sft": 0.5404543876647949, "losses/total": 2.0525716593056131e-07, "ref_logps/chosen": -235.27508544921875, "ref_logps/rejected": -240.83877563476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.5958588123321533, "rewards/margins": 9.157581329345703, "rewards/rejected": -9.753440856933594, "step": 1178 }, { "epoch": 0.28, "learning_rate": 1.5936e-07, "logps/chosen": -271.7259826660156, "logps/rejected": -357.13916015625, "loss": 0.0021, "losses/dpo": 0.00011167763295816258, "losses/sft": 0.5768582224845886, "losses/total": 0.00011167763295816258, "ref_logps/chosen": -265.689453125, "ref_logps/rejected": -262.2110595703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6036533713340759, "rewards/margins": 8.88916015625, "rewards/rejected": -9.492813110351562, "step": 1179 }, { "epoch": 0.28, "learning_rate": 1.5930666666666667e-07, "logps/chosen": -255.02035522460938, "logps/rejected": -317.771484375, "loss": 0.0065, "losses/dpo": 6.82112101912935e-07, "losses/sft": 0.8650135397911072, "losses/total": 6.82112101912935e-07, "ref_logps/chosen": -248.9051513671875, "ref_logps/rejected": -223.58360290527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6115206480026245, "rewards/margins": 8.807267189025879, "rewards/rejected": -9.418787002563477, "step": 1180 }, { "epoch": 0.28, "learning_rate": 1.5925333333333332e-07, "logps/chosen": -262.3930969238281, "logps/rejected": -354.6094970703125, "loss": 0.0172, "losses/dpo": 1.911162144097034e-06, "losses/sft": 0.5220153331756592, "losses/total": 1.911162144097034e-06, "ref_logps/chosen": -256.3526916503906, "ref_logps/rejected": -259.88201904296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6040400266647339, "rewards/margins": 8.868707656860352, "rewards/rejected": -9.472746849060059, "step": 1181 }, { "epoch": 0.28, "learning_rate": 1.592e-07, "logps/chosen": -249.93603515625, "logps/rejected": -334.1038818359375, "loss": 0.0014, "losses/dpo": 5.892623903491767e-06, "losses/sft": 1.0035189390182495, "losses/total": 5.892623903491767e-06, "ref_logps/chosen": -246.5472869873047, "ref_logps/rejected": -236.58755493164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3388751149177551, "rewards/margins": 9.41275691986084, "rewards/rejected": -9.751631736755371, "step": 1182 }, { "epoch": 0.28, "learning_rate": 1.5914666666666665e-07, "logps/chosen": -218.16175842285156, "logps/rejected": -329.49359130859375, "loss": 0.0021, "losses/dpo": 2.042599049900673e-07, "losses/sft": 0.7862468957901001, "losses/total": 2.042599049900673e-07, "ref_logps/chosen": -213.5908203125, "ref_logps/rejected": -230.32614135742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.457092821598053, "rewards/margins": 9.459650039672852, "rewards/rejected": -9.916744232177734, "step": 1183 }, { "epoch": 0.28, "learning_rate": 1.5909333333333332e-07, "logps/chosen": -214.80224609375, "logps/rejected": -279.64794921875, "loss": 0.0188, "losses/dpo": 8.379039400097099e-07, "losses/sft": 0.9760599136352539, "losses/total": 8.379039400097099e-07, "ref_logps/chosen": -210.84439086914062, "ref_logps/rejected": -191.97869873046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.3957856297492981, "rewards/margins": 8.371139526367188, "rewards/rejected": -8.766923904418945, "step": 1184 }, { "epoch": 0.28, "learning_rate": 1.5904e-07, "logps/chosen": -209.96115112304688, "logps/rejected": -301.0637512207031, "loss": 0.0137, "losses/dpo": 3.6057119814358884e-06, "losses/sft": 0.5999115705490112, "losses/total": 3.6057119814358884e-06, "ref_logps/chosen": -205.86962890625, "ref_logps/rejected": -209.93954467773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4091533422470093, "rewards/margins": 8.703266143798828, "rewards/rejected": -9.112420082092285, "step": 1185 }, { "epoch": 0.28, "learning_rate": 1.5898666666666668e-07, "logps/chosen": -214.45985412597656, "logps/rejected": -296.0555725097656, "loss": 0.008, "losses/dpo": 2.67083066773921e-07, "losses/sft": 1.078750491142273, "losses/total": 2.67083066773921e-07, "ref_logps/chosen": -210.21583557128906, "ref_logps/rejected": -205.65423583984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4244013726711273, "rewards/margins": 8.615735054016113, "rewards/rejected": -9.040136337280273, "step": 1186 }, { "epoch": 0.28, "learning_rate": 1.5893333333333333e-07, "logps/chosen": -216.55801391601562, "logps/rejected": -312.8250427246094, "loss": 0.0058, "losses/dpo": 2.2245603759074584e-05, "losses/sft": 0.6735621690750122, "losses/total": 2.2245603759074584e-05, "ref_logps/chosen": -213.64764404296875, "ref_logps/rejected": -220.49539184570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.29103797674179077, "rewards/margins": 8.941926002502441, "rewards/rejected": -9.232963562011719, "step": 1187 }, { "epoch": 0.29, "learning_rate": 1.5888e-07, "logps/chosen": -222.5192413330078, "logps/rejected": -288.73529052734375, "loss": 0.0194, "losses/dpo": 7.480933163606096e-06, "losses/sft": 0.7408782243728638, "losses/total": 7.480933163606096e-06, "ref_logps/chosen": -217.32164001464844, "ref_logps/rejected": -204.1632080078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5197609663009644, "rewards/margins": 7.937448978424072, "rewards/rejected": -8.457210540771484, "step": 1188 }, { "epoch": 0.29, "learning_rate": 1.5882666666666665e-07, "logps/chosen": -260.3497314453125, "logps/rejected": -336.95953369140625, "loss": 0.009, "losses/dpo": 6.384414155036211e-05, "losses/sft": 0.5388484001159668, "losses/total": 6.384414155036211e-05, "ref_logps/chosen": -253.9727325439453, "ref_logps/rejected": -236.39932250976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6376982927322388, "rewards/margins": 9.418319702148438, "rewards/rejected": -10.056018829345703, "step": 1189 }, { "epoch": 0.29, "learning_rate": 1.5877333333333333e-07, "logps/chosen": -234.16453552246094, "logps/rejected": -298.8039855957031, "loss": 0.0131, "losses/dpo": 5.620492447633296e-05, "losses/sft": 0.4973466992378235, "losses/total": 5.620492447633296e-05, "ref_logps/chosen": -228.96978759765625, "ref_logps/rejected": -207.30467224121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.519476056098938, "rewards/margins": 8.630455017089844, "rewards/rejected": -9.149930953979492, "step": 1190 }, { "epoch": 0.29, "learning_rate": 1.5871999999999998e-07, "logps/chosen": -231.482666015625, "logps/rejected": -351.21636962890625, "loss": 0.0045, "losses/dpo": 8.947424066718668e-05, "losses/sft": 0.8322038650512695, "losses/total": 8.947424066718668e-05, "ref_logps/chosen": -227.46792602539062, "ref_logps/rejected": -251.989013671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4014733135700226, "rewards/margins": 9.521262168884277, "rewards/rejected": -9.922735214233398, "step": 1191 }, { "epoch": 0.29, "learning_rate": 1.5866666666666666e-07, "logps/chosen": -227.24281311035156, "logps/rejected": -307.0352478027344, "loss": 0.0078, "losses/dpo": 1.0469153322034686e-09, "losses/sft": 0.9891782999038696, "losses/total": 1.0469153322034686e-09, "ref_logps/chosen": -224.53579711914062, "ref_logps/rejected": -219.05810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.2707017958164215, "rewards/margins": 8.527012825012207, "rewards/rejected": -8.797715187072754, "step": 1192 }, { "epoch": 0.29, "learning_rate": 1.5861333333333333e-07, "logps/chosen": -203.75694274902344, "logps/rejected": -299.3144226074219, "loss": 0.0069, "losses/dpo": 1.4486110558209475e-05, "losses/sft": 0.5523771643638611, "losses/total": 1.4486110558209475e-05, "ref_logps/chosen": -201.28421020507812, "ref_logps/rejected": -212.1500244140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.24727419018745422, "rewards/margins": 8.469165802001953, "rewards/rejected": -8.716440200805664, "step": 1193 }, { "epoch": 0.29, "learning_rate": 1.5855999999999999e-07, "logps/chosen": -209.92938232421875, "logps/rejected": -333.9148254394531, "loss": 0.0063, "losses/dpo": 1.0507693559702602e-07, "losses/sft": 0.6750990748405457, "losses/total": 1.0507693559702602e-07, "ref_logps/chosen": -206.9490966796875, "ref_logps/rejected": -235.06881713867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.2980293035507202, "rewards/margins": 9.586572647094727, "rewards/rejected": -9.884601593017578, "step": 1194 }, { "epoch": 0.29, "learning_rate": 1.5850666666666666e-07, "logps/chosen": -247.13055419921875, "logps/rejected": -351.99981689453125, "loss": 0.0089, "losses/dpo": 0.0002827756688930094, "losses/sft": 0.5403079390525818, "losses/total": 0.0002827756688930094, "ref_logps/chosen": -241.45343017578125, "ref_logps/rejected": -252.8442840576172, "rewards/accuracies": 1.0, "rewards/chosen": -0.5677115321159363, "rewards/margins": 9.34783935546875, "rewards/rejected": -9.91555118560791, "step": 1195 }, { "epoch": 0.29, "learning_rate": 1.5845333333333334e-07, "logps/chosen": -144.99908447265625, "logps/rejected": -265.49493408203125, "loss": 0.0272, "losses/dpo": 2.0030907488255423e-10, "losses/sft": 0.6891390681266785, "losses/total": 2.0030907488255423e-10, "ref_logps/chosen": -143.9313507080078, "ref_logps/rejected": -189.66165161132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.10677353292703629, "rewards/margins": 7.476552963256836, "rewards/rejected": -7.583326816558838, "step": 1196 }, { "epoch": 0.29, "learning_rate": 1.584e-07, "logps/chosen": -225.27438354492188, "logps/rejected": -339.36004638671875, "loss": 0.0121, "losses/dpo": 6.224953722266946e-06, "losses/sft": 0.5521683096885681, "losses/total": 6.224953722266946e-06, "ref_logps/chosen": -220.40773010253906, "ref_logps/rejected": -242.31573486328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4866662621498108, "rewards/margins": 9.21776294708252, "rewards/rejected": -9.704429626464844, "step": 1197 }, { "epoch": 0.29, "learning_rate": 1.5834666666666664e-07, "logps/chosen": -259.69561767578125, "logps/rejected": -338.5205078125, "loss": 0.0032, "losses/dpo": 6.158778376175178e-08, "losses/sft": 0.9748654961585999, "losses/total": 6.158778376175178e-08, "ref_logps/chosen": -254.20449829101562, "ref_logps/rejected": -237.17176818847656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5491108894348145, "rewards/margins": 9.585762977600098, "rewards/rejected": -10.13487434387207, "step": 1198 }, { "epoch": 0.29, "learning_rate": 1.5829333333333332e-07, "logps/chosen": -241.836181640625, "logps/rejected": -293.933349609375, "loss": 0.0028, "losses/dpo": 1.76427306541882e-07, "losses/sft": 0.578137218952179, "losses/total": 1.76427306541882e-07, "ref_logps/chosen": -238.2745361328125, "ref_logps/rejected": -210.4554443359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.3561629056930542, "rewards/margins": 7.991626262664795, "rewards/rejected": -8.347789764404297, "step": 1199 }, { "epoch": 0.29, "learning_rate": 1.5824e-07, "logps/chosen": -245.84681701660156, "logps/rejected": -342.0043640136719, "loss": 0.0089, "losses/dpo": 0.0006975354044698179, "losses/sft": 0.7425802946090698, "losses/total": 0.0006975354044698179, "ref_logps/chosen": -241.5284423828125, "ref_logps/rejected": -243.73065185546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4318383038043976, "rewards/margins": 9.395533561706543, "rewards/rejected": -9.827371597290039, "step": 1200 }, { "epoch": 0.29, "learning_rate": 1.5818666666666667e-07, "logps/chosen": -256.3808288574219, "logps/rejected": -334.4014587402344, "loss": 0.0061, "losses/dpo": 7.309442935365951e-07, "losses/sft": 0.6593630909919739, "losses/total": 7.309442935365951e-07, "ref_logps/chosen": -252.4275665283203, "ref_logps/rejected": -240.99624633789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.3953264355659485, "rewards/margins": 8.945194244384766, "rewards/rejected": -9.340520858764648, "step": 1201 }, { "epoch": 0.29, "learning_rate": 1.5813333333333332e-07, "logps/chosen": -205.44007873535156, "logps/rejected": -340.15130615234375, "loss": 0.007, "losses/dpo": 1.1898850971192587e-05, "losses/sft": 0.6511049866676331, "losses/total": 1.1898850971192587e-05, "ref_logps/chosen": -203.39764404296875, "ref_logps/rejected": -236.76712036132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.2042442113161087, "rewards/margins": 10.134173393249512, "rewards/rejected": -10.338418960571289, "step": 1202 }, { "epoch": 0.29, "learning_rate": 1.5808e-07, "logps/chosen": -234.09507751464844, "logps/rejected": -328.3758544921875, "loss": 0.0052, "losses/dpo": 0.0009326955187134445, "losses/sft": 0.4545285105705261, "losses/total": 0.0009326955187134445, "ref_logps/chosen": -232.49771118164062, "ref_logps/rejected": -238.96902465820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.15973931550979614, "rewards/margins": 8.78094482421875, "rewards/rejected": -8.94068431854248, "step": 1203 }, { "epoch": 0.29, "learning_rate": 1.5802666666666667e-07, "logps/chosen": -220.68997192382812, "logps/rejected": -320.6006164550781, "loss": 0.0045, "losses/dpo": 0.00041019756463356316, "losses/sft": 0.21533827483654022, "losses/total": 0.00041019756463356316, "ref_logps/chosen": -216.98263549804688, "ref_logps/rejected": -228.35691833496094, "rewards/accuracies": 1.0, "rewards/chosen": -0.37073466181755066, "rewards/margins": 8.853636741638184, "rewards/rejected": -9.224370956420898, "step": 1204 }, { "epoch": 0.29, "learning_rate": 1.5797333333333332e-07, "logps/chosen": -251.0227508544922, "logps/rejected": -307.022705078125, "loss": 0.0058, "losses/dpo": 7.097270281519741e-05, "losses/sft": 0.6789131164550781, "losses/total": 7.097270281519741e-05, "ref_logps/chosen": -243.0373992919922, "ref_logps/rejected": -211.55029296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7985357642173767, "rewards/margins": 8.748706817626953, "rewards/rejected": -9.547242164611816, "step": 1205 }, { "epoch": 0.29, "learning_rate": 1.5791999999999997e-07, "logps/chosen": -221.37576293945312, "logps/rejected": -312.284423828125, "loss": 0.0044, "losses/dpo": 8.327473324243329e-07, "losses/sft": 1.1934592723846436, "losses/total": 8.327473324243329e-07, "ref_logps/chosen": -215.04782104492188, "ref_logps/rejected": -216.71826171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6327941417694092, "rewards/margins": 8.923822402954102, "rewards/rejected": -9.556615829467773, "step": 1206 }, { "epoch": 0.29, "learning_rate": 1.5786666666666665e-07, "logps/chosen": -241.64479064941406, "logps/rejected": -340.65802001953125, "loss": 0.0015, "losses/dpo": 2.8128499707236188e-06, "losses/sft": 0.8094635605812073, "losses/total": 2.8128499707236188e-06, "ref_logps/chosen": -235.31141662597656, "ref_logps/rejected": -238.89849853515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.633337140083313, "rewards/margins": 9.54261302947998, "rewards/rejected": -10.175950050354004, "step": 1207 }, { "epoch": 0.29, "learning_rate": 1.5781333333333333e-07, "logps/chosen": -252.51675415039062, "logps/rejected": -312.8394470214844, "loss": 0.0165, "losses/dpo": 3.449992436799221e-05, "losses/sft": 0.5435553193092346, "losses/total": 3.449992436799221e-05, "ref_logps/chosen": -245.89463806152344, "ref_logps/rejected": -223.05271911621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.6622097492218018, "rewards/margins": 8.3164644241333, "rewards/rejected": -8.978673934936523, "step": 1208 }, { "epoch": 0.29, "learning_rate": 1.5775999999999998e-07, "logps/chosen": -193.60317993164062, "logps/rejected": -317.6153259277344, "loss": 0.0015, "losses/dpo": 0.0001026005411404185, "losses/sft": 0.8629012703895569, "losses/total": 0.0001026005411404185, "ref_logps/chosen": -189.17507934570312, "ref_logps/rejected": -219.5804901123047, "rewards/accuracies": 1.0, "rewards/chosen": -0.4428083896636963, "rewards/margins": 9.360675811767578, "rewards/rejected": -9.803483963012695, "step": 1209 }, { "epoch": 0.29, "learning_rate": 1.5770666666666665e-07, "logps/chosen": -199.76663208007812, "logps/rejected": -288.7474365234375, "loss": 0.0109, "losses/dpo": 0.008908621966838837, "losses/sft": 0.6753736138343811, "losses/total": 0.008908621966838837, "ref_logps/chosen": -195.79034423828125, "ref_logps/rejected": -209.62359619140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.39762815833091736, "rewards/margins": 7.514753341674805, "rewards/rejected": -7.912382125854492, "step": 1210 }, { "epoch": 0.29, "learning_rate": 1.5765333333333333e-07, "logps/chosen": -204.74307250976562, "logps/rejected": -358.7725830078125, "loss": 0.0012, "losses/dpo": 1.1205560213056742e-06, "losses/sft": 0.503153920173645, "losses/total": 1.1205560213056742e-06, "ref_logps/chosen": -199.40841674804688, "ref_logps/rejected": -248.0216522216797, "rewards/accuracies": 1.0, "rewards/chosen": -0.5334645509719849, "rewards/margins": 10.541629791259766, "rewards/rejected": -11.075094223022461, "step": 1211 }, { "epoch": 0.29, "learning_rate": 1.576e-07, "logps/chosen": -208.25440979003906, "logps/rejected": -305.54376220703125, "loss": 0.0054, "losses/dpo": 3.26906047121156e-06, "losses/sft": 0.3983682096004486, "losses/total": 3.26906047121156e-06, "ref_logps/chosen": -203.9764404296875, "ref_logps/rejected": -215.29022216796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4277957081794739, "rewards/margins": 8.59755802154541, "rewards/rejected": -9.02535343170166, "step": 1212 }, { "epoch": 0.29, "learning_rate": 1.5754666666666666e-07, "logps/chosen": -250.45773315429688, "logps/rejected": -345.9722595214844, "loss": 0.0027, "losses/dpo": 1.0294621461071074e-05, "losses/sft": 0.5554839372634888, "losses/total": 1.0294621461071074e-05, "ref_logps/chosen": -243.25650024414062, "ref_logps/rejected": -242.22055053710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7201236486434937, "rewards/margins": 9.655044555664062, "rewards/rejected": -10.375167846679688, "step": 1213 }, { "epoch": 0.29, "learning_rate": 1.5749333333333334e-07, "logps/chosen": -168.61154174804688, "logps/rejected": -295.24505615234375, "loss": 0.0107, "losses/dpo": 1.241312208577483e-08, "losses/sft": 0.5511359572410583, "losses/total": 1.241312208577483e-08, "ref_logps/chosen": -166.93775939941406, "ref_logps/rejected": -206.88641357421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.16737693548202515, "rewards/margins": 8.66848373413086, "rewards/rejected": -8.835860252380371, "step": 1214 }, { "epoch": 0.29, "learning_rate": 1.5743999999999999e-07, "logps/chosen": -240.9099578857422, "logps/rejected": -308.66864013671875, "loss": 0.0119, "losses/dpo": 2.8443031624192372e-05, "losses/sft": 0.4570766091346741, "losses/total": 2.8443031624192372e-05, "ref_logps/chosen": -235.868408203125, "ref_logps/rejected": -217.74974060058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.5041546821594238, "rewards/margins": 8.587735176086426, "rewards/rejected": -9.091889381408691, "step": 1215 }, { "epoch": 0.29, "learning_rate": 1.5738666666666666e-07, "logps/chosen": -204.01504516601562, "logps/rejected": -298.970947265625, "loss": 0.0191, "losses/dpo": 0.0006293426267802715, "losses/sft": 0.648844301700592, "losses/total": 0.0006293426267802715, "ref_logps/chosen": -198.8653106689453, "ref_logps/rejected": -207.41078186035156, "rewards/accuracies": 1.0, "rewards/chosen": -0.5149739980697632, "rewards/margins": 8.641043663024902, "rewards/rejected": -9.156017303466797, "step": 1216 }, { "epoch": 0.29, "learning_rate": 1.573333333333333e-07, "logps/chosen": -189.009033203125, "logps/rejected": -282.0081787109375, "loss": 0.0257, "losses/dpo": 2.2118264553228073e-07, "losses/sft": 0.615964949131012, "losses/total": 2.2118264553228073e-07, "ref_logps/chosen": -185.41212463378906, "ref_logps/rejected": -196.12326049804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.3596901297569275, "rewards/margins": 8.228805541992188, "rewards/rejected": -8.588495254516602, "step": 1217 }, { "epoch": 0.29, "learning_rate": 1.5728e-07, "logps/chosen": -234.732177734375, "logps/rejected": -359.0458984375, "loss": 0.0062, "losses/dpo": 8.317766742038657e-07, "losses/sft": 0.7069624662399292, "losses/total": 8.317766742038657e-07, "ref_logps/chosen": -230.62351989746094, "ref_logps/rejected": -255.8897247314453, "rewards/accuracies": 1.0, "rewards/chosen": -0.4108654856681824, "rewards/margins": 9.904754638671875, "rewards/rejected": -10.315620422363281, "step": 1218 }, { "epoch": 0.29, "learning_rate": 1.5722666666666667e-07, "logps/chosen": -230.70968627929688, "logps/rejected": -326.68585205078125, "loss": 0.0041, "losses/dpo": 0.00024242994550149888, "losses/sft": 0.7351371049880981, "losses/total": 0.00024242994550149888, "ref_logps/chosen": -226.18008422851562, "ref_logps/rejected": -228.5223846435547, "rewards/accuracies": 1.0, "rewards/chosen": -0.45295971632003784, "rewards/margins": 9.363386154174805, "rewards/rejected": -9.816346168518066, "step": 1219 }, { "epoch": 0.29, "learning_rate": 1.5717333333333334e-07, "logps/chosen": -247.82186889648438, "logps/rejected": -318.20269775390625, "loss": 0.0194, "losses/dpo": 4.1803250496741384e-06, "losses/sft": 0.7595615386962891, "losses/total": 4.1803250496741384e-06, "ref_logps/chosen": -239.08773803710938, "ref_logps/rejected": -224.98104858398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8734138607978821, "rewards/margins": 8.448750495910645, "rewards/rejected": -9.322164535522461, "step": 1220 }, { "epoch": 0.29, "learning_rate": 1.5712e-07, "logps/chosen": -229.80709838867188, "logps/rejected": -294.6924133300781, "loss": 0.0245, "losses/dpo": 6.55995080478533e-08, "losses/sft": 0.5641210079193115, "losses/total": 6.55995080478533e-08, "ref_logps/chosen": -222.4475555419922, "ref_logps/rejected": -209.597900390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7359521389007568, "rewards/margins": 7.773499011993408, "rewards/rejected": -8.509450912475586, "step": 1221 }, { "epoch": 0.29, "learning_rate": 1.5706666666666667e-07, "logps/chosen": -175.7428741455078, "logps/rejected": -259.0398254394531, "loss": 0.0163, "losses/dpo": 9.963076081476174e-06, "losses/sft": 0.5014711618423462, "losses/total": 9.963076081476174e-06, "ref_logps/chosen": -172.46478271484375, "ref_logps/rejected": -182.16021728515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.32780906558036804, "rewards/margins": 7.3601531982421875, "rewards/rejected": -7.687962055206299, "step": 1222 }, { "epoch": 0.29, "learning_rate": 1.5701333333333332e-07, "logps/chosen": -243.56686401367188, "logps/rejected": -335.29541015625, "loss": 0.002, "losses/dpo": 6.650878731306875e-06, "losses/sft": 0.8683505654335022, "losses/total": 6.650878731306875e-06, "ref_logps/chosen": -238.7523956298828, "ref_logps/rejected": -237.946044921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.48144739866256714, "rewards/margins": 9.25349235534668, "rewards/rejected": -9.734939575195312, "step": 1223 }, { "epoch": 0.29, "learning_rate": 1.5696e-07, "logps/chosen": -210.29086303710938, "logps/rejected": -282.91522216796875, "loss": 0.0094, "losses/dpo": 1.4645605006080586e-05, "losses/sft": 0.4965025782585144, "losses/total": 1.4645605006080586e-05, "ref_logps/chosen": -204.7000274658203, "ref_logps/rejected": -192.59664916992188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5590837001800537, "rewards/margins": 8.472774505615234, "rewards/rejected": -9.031859397888184, "step": 1224 }, { "epoch": 0.29, "learning_rate": 1.5690666666666665e-07, "logps/chosen": -217.1562042236328, "logps/rejected": -326.6448974609375, "loss": 0.0032, "losses/dpo": 4.0399267930979477e-08, "losses/sft": 1.083921194076538, "losses/total": 4.0399267930979477e-08, "ref_logps/chosen": -209.27651977539062, "ref_logps/rejected": -230.22830200195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.7879694700241089, "rewards/margins": 8.853691101074219, "rewards/rejected": -9.6416597366333, "step": 1225 }, { "epoch": 0.29, "learning_rate": 1.5685333333333332e-07, "logps/chosen": -271.8254699707031, "logps/rejected": -326.94683837890625, "loss": 0.0031, "losses/dpo": 0.00036221451591700315, "losses/sft": 0.4410064220428467, "losses/total": 0.00036221451591700315, "ref_logps/chosen": -265.95477294921875, "ref_logps/rejected": -233.1473846435547, "rewards/accuracies": 1.0, "rewards/chosen": -0.5870682001113892, "rewards/margins": 8.792880058288574, "rewards/rejected": -9.379947662353516, "step": 1226 }, { "epoch": 0.29, "learning_rate": 1.568e-07, "logps/chosen": -218.26986694335938, "logps/rejected": -320.13714599609375, "loss": 0.0175, "losses/dpo": 1.913858977786731e-05, "losses/sft": 0.5049278736114502, "losses/total": 1.913858977786731e-05, "ref_logps/chosen": -213.48052978515625, "ref_logps/rejected": -221.43763732910156, "rewards/accuracies": 1.0, "rewards/chosen": -0.4789338707923889, "rewards/margins": 9.391016960144043, "rewards/rejected": -9.869950294494629, "step": 1227 }, { "epoch": 0.29, "learning_rate": 1.5674666666666665e-07, "logps/chosen": -274.9866943359375, "logps/rejected": -308.9238586425781, "loss": 0.0066, "losses/dpo": 0.002717583905905485, "losses/sft": 0.8344677090644836, "losses/total": 0.002717583905905485, "ref_logps/chosen": -264.76373291015625, "ref_logps/rejected": -210.20703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0222973823547363, "rewards/margins": 8.849386215209961, "rewards/rejected": -9.871683120727539, "step": 1228 }, { "epoch": 0.29, "learning_rate": 1.5669333333333333e-07, "logps/chosen": -220.40228271484375, "logps/rejected": -305.7164306640625, "loss": 0.0091, "losses/dpo": 0.0001383775961585343, "losses/sft": 0.6330100297927856, "losses/total": 0.0001383775961585343, "ref_logps/chosen": -215.57798767089844, "ref_logps/rejected": -215.9233856201172, "rewards/accuracies": 1.0, "rewards/chosen": -0.4824307858943939, "rewards/margins": 8.49687385559082, "rewards/rejected": -8.979305267333984, "step": 1229 }, { "epoch": 0.3, "learning_rate": 1.5664e-07, "logps/chosen": -205.10183715820312, "logps/rejected": -287.17974853515625, "loss": 0.0122, "losses/dpo": 0.0003695365448947996, "losses/sft": 0.4319477081298828, "losses/total": 0.0003695365448947996, "ref_logps/chosen": -201.55532836914062, "ref_logps/rejected": -201.4537353515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.3546510934829712, "rewards/margins": 8.217948913574219, "rewards/rejected": -8.572600364685059, "step": 1230 }, { "epoch": 0.3, "learning_rate": 1.5658666666666666e-07, "logps/chosen": -229.29653930664062, "logps/rejected": -345.80975341796875, "loss": 0.0087, "losses/dpo": 2.526246589695802e-08, "losses/sft": 0.4985724985599518, "losses/total": 2.526246589695802e-08, "ref_logps/chosen": -220.81619262695312, "ref_logps/rejected": -241.7655792236328, "rewards/accuracies": 1.0, "rewards/chosen": -0.8480342626571655, "rewards/margins": 9.556385040283203, "rewards/rejected": -10.4044189453125, "step": 1231 }, { "epoch": 0.3, "learning_rate": 1.565333333333333e-07, "logps/chosen": -220.76754760742188, "logps/rejected": -311.39697265625, "loss": 0.0031, "losses/dpo": 4.1437806430622e-06, "losses/sft": 0.5073609352111816, "losses/total": 4.1437806430622e-06, "ref_logps/chosen": -215.46633911132812, "ref_logps/rejected": -212.89767456054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5301222801208496, "rewards/margins": 9.319809913635254, "rewards/rejected": -9.849933624267578, "step": 1232 }, { "epoch": 0.3, "learning_rate": 1.5647999999999998e-07, "logps/chosen": -229.99671936035156, "logps/rejected": -322.62847900390625, "loss": 0.0061, "losses/dpo": 1.061417151504429e-06, "losses/sft": 0.8159807920455933, "losses/total": 1.061417151504429e-06, "ref_logps/chosen": -225.56649780273438, "ref_logps/rejected": -224.116455078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4430217146873474, "rewards/margins": 9.408180236816406, "rewards/rejected": -9.851202011108398, "step": 1233 }, { "epoch": 0.3, "learning_rate": 1.5642666666666666e-07, "logps/chosen": -202.38082885742188, "logps/rejected": -314.8982238769531, "loss": 0.0106, "losses/dpo": 5.558050179388374e-05, "losses/sft": 0.5822467803955078, "losses/total": 5.558050179388374e-05, "ref_logps/chosen": -196.96844482421875, "ref_logps/rejected": -223.07662963867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.541237473487854, "rewards/margins": 8.640922546386719, "rewards/rejected": -9.182159423828125, "step": 1234 }, { "epoch": 0.3, "learning_rate": 1.5637333333333334e-07, "logps/chosen": -222.64559936523438, "logps/rejected": -344.70220947265625, "loss": 0.0049, "losses/dpo": 0.00016393515397794545, "losses/sft": 0.4782753884792328, "losses/total": 0.00016393515397794545, "ref_logps/chosen": -217.61346435546875, "ref_logps/rejected": -245.09617614746094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5032134056091309, "rewards/margins": 9.457391738891602, "rewards/rejected": -9.96060562133789, "step": 1235 }, { "epoch": 0.3, "learning_rate": 1.5631999999999999e-07, "logps/chosen": -231.0933837890625, "logps/rejected": -298.6788330078125, "loss": 0.0074, "losses/dpo": 3.365193333593197e-05, "losses/sft": 0.5900442004203796, "losses/total": 3.365193333593197e-05, "ref_logps/chosen": -225.88880920410156, "ref_logps/rejected": -211.46604919433594, "rewards/accuracies": 1.0, "rewards/chosen": -0.5204544067382812, "rewards/margins": 8.200824737548828, "rewards/rejected": -8.721278190612793, "step": 1236 }, { "epoch": 0.3, "learning_rate": 1.5626666666666666e-07, "logps/chosen": -209.06813049316406, "logps/rejected": -367.6751708984375, "loss": 0.0009, "losses/dpo": 1.0269257444406321e-07, "losses/sft": 0.8125948309898376, "losses/total": 1.0269257444406321e-07, "ref_logps/chosen": -205.2718505859375, "ref_logps/rejected": -259.3652038574219, "rewards/accuracies": 1.0, "rewards/chosen": -0.37962913513183594, "rewards/margins": 10.451370239257812, "rewards/rejected": -10.830999374389648, "step": 1237 }, { "epoch": 0.3, "learning_rate": 1.5621333333333334e-07, "logps/chosen": -265.9210205078125, "logps/rejected": -345.564453125, "loss": 0.0028, "losses/dpo": 5.068218888482079e-05, "losses/sft": 0.661011815071106, "losses/total": 5.068218888482079e-05, "ref_logps/chosen": -259.84039306640625, "ref_logps/rejected": -240.5866241455078, "rewards/accuracies": 1.0, "rewards/chosen": -0.6080598831176758, "rewards/margins": 9.889724731445312, "rewards/rejected": -10.497784614562988, "step": 1238 }, { "epoch": 0.3, "learning_rate": 1.5616e-07, "logps/chosen": -245.87803649902344, "logps/rejected": -336.025634765625, "loss": 0.0083, "losses/dpo": 0.001321726362220943, "losses/sft": 0.8019676804542542, "losses/total": 0.001321726362220943, "ref_logps/chosen": -240.02053833007812, "ref_logps/rejected": -243.23497009277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.5857502222061157, "rewards/margins": 8.693315505981445, "rewards/rejected": -9.27906608581543, "step": 1239 }, { "epoch": 0.3, "learning_rate": 1.5610666666666664e-07, "logps/chosen": -188.74647521972656, "logps/rejected": -304.01593017578125, "loss": 0.0029, "losses/dpo": 1.0518170938667026e-06, "losses/sft": 0.5944394469261169, "losses/total": 1.0518170938667026e-06, "ref_logps/chosen": -183.91470336914062, "ref_logps/rejected": -215.9570770263672, "rewards/accuracies": 1.0, "rewards/chosen": -0.4831780195236206, "rewards/margins": 8.32270622253418, "rewards/rejected": -8.805885314941406, "step": 1240 }, { "epoch": 0.3, "learning_rate": 1.5605333333333332e-07, "logps/chosen": -263.94451904296875, "logps/rejected": -313.42034912109375, "loss": 0.02, "losses/dpo": 6.617132839892292e-06, "losses/sft": 0.7774032354354858, "losses/total": 6.617132839892292e-06, "ref_logps/chosen": -255.3319091796875, "ref_logps/rejected": -226.6964111328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8612605929374695, "rewards/margins": 7.811132907867432, "rewards/rejected": -8.672393798828125, "step": 1241 }, { "epoch": 0.3, "learning_rate": 1.56e-07, "logps/chosen": -242.11642456054688, "logps/rejected": -299.33856201171875, "loss": 0.0195, "losses/dpo": 7.1880904215504415e-06, "losses/sft": 0.44429129362106323, "losses/total": 7.1880904215504415e-06, "ref_logps/chosen": -238.12234497070312, "ref_logps/rejected": -213.70028686523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.399406373500824, "rewards/margins": 8.164421081542969, "rewards/rejected": -8.563828468322754, "step": 1242 }, { "epoch": 0.3, "learning_rate": 1.5594666666666664e-07, "logps/chosen": -250.488037109375, "logps/rejected": -368.9501647949219, "loss": 0.005, "losses/dpo": 9.223329655583257e-09, "losses/sft": 0.6423609256744385, "losses/total": 9.223329655583257e-09, "ref_logps/chosen": -243.2746124267578, "ref_logps/rejected": -264.006591796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7213422060012817, "rewards/margins": 9.7730131149292, "rewards/rejected": -10.494355201721191, "step": 1243 }, { "epoch": 0.3, "learning_rate": 1.5589333333333332e-07, "logps/chosen": -222.01177978515625, "logps/rejected": -312.1578063964844, "loss": 0.0132, "losses/dpo": 8.549810672775493e-07, "losses/sft": 0.5187113881111145, "losses/total": 8.549810672775493e-07, "ref_logps/chosen": -217.6536865234375, "ref_logps/rejected": -220.4456024169922, "rewards/accuracies": 1.0, "rewards/chosen": -0.43580904603004456, "rewards/margins": 8.735408782958984, "rewards/rejected": -9.171217918395996, "step": 1244 }, { "epoch": 0.3, "learning_rate": 1.5584e-07, "logps/chosen": -222.72467041015625, "logps/rejected": -315.6297607421875, "loss": 0.0047, "losses/dpo": 5.474071258504409e-07, "losses/sft": 1.029208779335022, "losses/total": 5.474071258504409e-07, "ref_logps/chosen": -218.24191284179688, "ref_logps/rejected": -222.53558349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.44827646017074585, "rewards/margins": 8.861139297485352, "rewards/rejected": -9.309415817260742, "step": 1245 }, { "epoch": 0.3, "learning_rate": 1.5578666666666667e-07, "logps/chosen": -224.7595977783203, "logps/rejected": -324.29315185546875, "loss": 0.0101, "losses/dpo": 1.2504139021984884e-06, "losses/sft": 0.7935410141944885, "losses/total": 1.2504139021984884e-06, "ref_logps/chosen": -221.25466918945312, "ref_logps/rejected": -234.40908813476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.35049185156822205, "rewards/margins": 8.637913703918457, "rewards/rejected": -8.98840618133545, "step": 1246 }, { "epoch": 0.3, "learning_rate": 1.5573333333333332e-07, "logps/chosen": -238.296875, "logps/rejected": -335.4449157714844, "loss": 0.0124, "losses/dpo": 3.0409837563638575e-05, "losses/sft": 0.5481995344161987, "losses/total": 3.0409837563638575e-05, "ref_logps/chosen": -234.55374145507812, "ref_logps/rejected": -236.8326416015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.37431415915489197, "rewards/margins": 9.486912727355957, "rewards/rejected": -9.861227035522461, "step": 1247 }, { "epoch": 0.3, "learning_rate": 1.5568e-07, "logps/chosen": -227.65164184570312, "logps/rejected": -320.133056640625, "loss": 0.0138, "losses/dpo": 8.698507940607669e-07, "losses/sft": 0.9026473760604858, "losses/total": 8.698507940607669e-07, "ref_logps/chosen": -220.59889221191406, "ref_logps/rejected": -225.62916564941406, "rewards/accuracies": 1.0, "rewards/chosen": -0.7052738070487976, "rewards/margins": 8.745113372802734, "rewards/rejected": -9.450386047363281, "step": 1248 }, { "epoch": 0.3, "learning_rate": 1.5562666666666665e-07, "logps/chosen": -214.17657470703125, "logps/rejected": -289.15673828125, "loss": 0.0208, "losses/dpo": 3.691465281008277e-06, "losses/sft": 1.0170202255249023, "losses/total": 3.691465281008277e-06, "ref_logps/chosen": -209.32737731933594, "ref_logps/rejected": -202.49703979492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.48491957783699036, "rewards/margins": 8.181045532226562, "rewards/rejected": -8.66596508026123, "step": 1249 }, { "epoch": 0.3, "learning_rate": 1.5557333333333333e-07, "logps/chosen": -261.45074462890625, "logps/rejected": -323.44281005859375, "loss": 0.0111, "losses/dpo": 6.570207187905908e-05, "losses/sft": 0.9249168634414673, "losses/total": 6.570207187905908e-05, "ref_logps/chosen": -255.77447509765625, "ref_logps/rejected": -231.1844482421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5676265954971313, "rewards/margins": 8.658210754394531, "rewards/rejected": -9.225837707519531, "step": 1250 }, { "epoch": 0.3, "learning_rate": 1.5551999999999998e-07, "logps/chosen": -229.83053588867188, "logps/rejected": -312.2069396972656, "loss": 0.0101, "losses/dpo": 6.906051112309797e-06, "losses/sft": 0.49107131361961365, "losses/total": 6.906051112309797e-06, "ref_logps/chosen": -221.59954833984375, "ref_logps/rejected": -220.48251342773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8230988383293152, "rewards/margins": 8.349346160888672, "rewards/rejected": -9.172445297241211, "step": 1251 }, { "epoch": 0.3, "learning_rate": 1.5546666666666666e-07, "logps/chosen": -262.71807861328125, "logps/rejected": -319.3741149902344, "loss": 0.0024, "losses/dpo": 2.6885433612733323e-07, "losses/sft": 0.9662647843360901, "losses/total": 2.6885433612733323e-07, "ref_logps/chosen": -257.95086669921875, "ref_logps/rejected": -223.79344177246094, "rewards/accuracies": 1.0, "rewards/chosen": -0.4767242968082428, "rewards/margins": 9.081342697143555, "rewards/rejected": -9.558067321777344, "step": 1252 }, { "epoch": 0.3, "learning_rate": 1.5541333333333333e-07, "logps/chosen": -182.83053588867188, "logps/rejected": -323.9176025390625, "loss": 0.0109, "losses/dpo": 8.536651563417763e-08, "losses/sft": 0.539486825466156, "losses/total": 8.536651563417763e-08, "ref_logps/chosen": -179.5562744140625, "ref_logps/rejected": -225.05697631835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.3274269700050354, "rewards/margins": 9.558634757995605, "rewards/rejected": -9.886061668395996, "step": 1253 }, { "epoch": 0.3, "learning_rate": 1.5536e-07, "logps/chosen": -243.44857788085938, "logps/rejected": -340.60394287109375, "loss": 0.002, "losses/dpo": 5.530126045982797e-08, "losses/sft": 0.5707559585571289, "losses/total": 5.530126045982797e-08, "ref_logps/chosen": -238.7167205810547, "ref_logps/rejected": -235.62905883789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.47318601608276367, "rewards/margins": 10.024299621582031, "rewards/rejected": -10.497486114501953, "step": 1254 }, { "epoch": 0.3, "learning_rate": 1.5530666666666666e-07, "logps/chosen": -224.25059509277344, "logps/rejected": -334.98297119140625, "loss": 0.0147, "losses/dpo": 4.7055260665729293e-07, "losses/sft": 1.0464543104171753, "losses/total": 4.7055260665729293e-07, "ref_logps/chosen": -218.80519104003906, "ref_logps/rejected": -232.93637084960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5445423126220703, "rewards/margins": 9.66011905670166, "rewards/rejected": -10.204660415649414, "step": 1255 }, { "epoch": 0.3, "learning_rate": 1.5525333333333334e-07, "logps/chosen": -221.43357849121094, "logps/rejected": -295.84515380859375, "loss": 0.0053, "losses/dpo": 8.136746146192309e-06, "losses/sft": 0.6160399913787842, "losses/total": 8.136746146192309e-06, "ref_logps/chosen": -218.10943603515625, "ref_logps/rejected": -205.91009521484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.3324130177497864, "rewards/margins": 8.661092758178711, "rewards/rejected": -8.993505477905273, "step": 1256 }, { "epoch": 0.3, "learning_rate": 1.552e-07, "logps/chosen": -219.71078491210938, "logps/rejected": -310.3875427246094, "loss": 0.0076, "losses/dpo": 1.160765350505244e-05, "losses/sft": 0.5657063126564026, "losses/total": 1.160765350505244e-05, "ref_logps/chosen": -215.7532501220703, "ref_logps/rejected": -217.64111328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3957555592060089, "rewards/margins": 8.878887176513672, "rewards/rejected": -9.274641990661621, "step": 1257 }, { "epoch": 0.3, "learning_rate": 1.5514666666666666e-07, "logps/chosen": -248.3994140625, "logps/rejected": -356.0810241699219, "loss": 0.0048, "losses/dpo": 1.3196241525292862e-05, "losses/sft": 0.7132801413536072, "losses/total": 1.3196241525292862e-05, "ref_logps/chosen": -242.98294067382812, "ref_logps/rejected": -261.3005676269531, "rewards/accuracies": 1.0, "rewards/chosen": -0.5416470766067505, "rewards/margins": 8.936399459838867, "rewards/rejected": -9.478046417236328, "step": 1258 }, { "epoch": 0.3, "learning_rate": 1.5509333333333331e-07, "logps/chosen": -203.61849975585938, "logps/rejected": -317.44793701171875, "loss": 0.0015, "losses/dpo": 0.00034986078389920294, "losses/sft": 0.4334481656551361, "losses/total": 0.00034986078389920294, "ref_logps/chosen": -199.69479370117188, "ref_logps/rejected": -219.7209014892578, "rewards/accuracies": 1.0, "rewards/chosen": -0.39237070083618164, "rewards/margins": 9.380331039428711, "rewards/rejected": -9.77270221710205, "step": 1259 }, { "epoch": 0.3, "learning_rate": 1.5504e-07, "logps/chosen": -232.50067138671875, "logps/rejected": -335.7529296875, "loss": 0.0126, "losses/dpo": 0.0001244402228621766, "losses/sft": 0.7872216105461121, "losses/total": 0.0001244402228621766, "ref_logps/chosen": -226.00653076171875, "ref_logps/rejected": -231.09461975097656, "rewards/accuracies": 1.0, "rewards/chosen": -0.649412989616394, "rewards/margins": 9.816421508789062, "rewards/rejected": -10.46583366394043, "step": 1260 }, { "epoch": 0.3, "learning_rate": 1.5498666666666667e-07, "logps/chosen": -238.7518310546875, "logps/rejected": -314.4451599121094, "loss": 0.0071, "losses/dpo": 8.549205085728317e-05, "losses/sft": 0.7464805841445923, "losses/total": 8.549205085728317e-05, "ref_logps/chosen": -233.74322509765625, "ref_logps/rejected": -223.3853302001953, "rewards/accuracies": 1.0, "rewards/chosen": -0.5008625984191895, "rewards/margins": 8.605120658874512, "rewards/rejected": -9.10598373413086, "step": 1261 }, { "epoch": 0.3, "learning_rate": 1.5493333333333332e-07, "logps/chosen": -258.5528564453125, "logps/rejected": -307.86773681640625, "loss": 0.0133, "losses/dpo": 6.473026132880477e-06, "losses/sft": 0.5462654829025269, "losses/total": 6.473026132880477e-06, "ref_logps/chosen": -253.97421264648438, "ref_logps/rejected": -214.31466674804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.4578625559806824, "rewards/margins": 8.897444725036621, "rewards/rejected": -9.355306625366211, "step": 1262 }, { "epoch": 0.3, "learning_rate": 1.5488e-07, "logps/chosen": -253.67236328125, "logps/rejected": -340.4366455078125, "loss": 0.0066, "losses/dpo": 7.391021881630877e-06, "losses/sft": 0.39549335837364197, "losses/total": 7.391021881630877e-06, "ref_logps/chosen": -247.1615753173828, "ref_logps/rejected": -239.41494750976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.651078462600708, "rewards/margins": 9.451092720031738, "rewards/rejected": -10.102170944213867, "step": 1263 }, { "epoch": 0.3, "learning_rate": 1.5482666666666667e-07, "logps/chosen": -197.90365600585938, "logps/rejected": -298.0989074707031, "loss": 0.0286, "losses/dpo": 1.3288035916048102e-07, "losses/sft": 0.5246617794036865, "losses/total": 1.3288035916048102e-07, "ref_logps/chosen": -192.09725952148438, "ref_logps/rejected": -207.11041259765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.580637514591217, "rewards/margins": 8.51821231842041, "rewards/rejected": -9.09885025024414, "step": 1264 }, { "epoch": 0.3, "learning_rate": 1.5477333333333332e-07, "logps/chosen": -230.5642547607422, "logps/rejected": -323.71624755859375, "loss": 0.0118, "losses/dpo": 2.2832782065052015e-07, "losses/sft": 0.5054766535758972, "losses/total": 2.2832782065052015e-07, "ref_logps/chosen": -225.32266235351562, "ref_logps/rejected": -227.4381866455078, "rewards/accuracies": 1.0, "rewards/chosen": -0.5241568684577942, "rewards/margins": 9.103647232055664, "rewards/rejected": -9.627803802490234, "step": 1265 }, { "epoch": 0.3, "learning_rate": 1.5471999999999997e-07, "logps/chosen": -213.13717651367188, "logps/rejected": -308.641845703125, "loss": 0.0029, "losses/dpo": 0.004306885879486799, "losses/sft": 0.3847660422325134, "losses/total": 0.004306885879486799, "ref_logps/chosen": -208.69869995117188, "ref_logps/rejected": -213.1486053466797, "rewards/accuracies": 1.0, "rewards/chosen": -0.4438459277153015, "rewards/margins": 9.105478286743164, "rewards/rejected": -9.549324035644531, "step": 1266 }, { "epoch": 0.3, "learning_rate": 1.5466666666666665e-07, "logps/chosen": -205.25241088867188, "logps/rejected": -326.7038879394531, "loss": 0.0075, "losses/dpo": 2.3431740991952665e-09, "losses/sft": 0.5259135365486145, "losses/total": 2.3431740991952665e-09, "ref_logps/chosen": -198.54342651367188, "ref_logps/rejected": -225.7536163330078, "rewards/accuracies": 1.0, "rewards/chosen": -0.6709000468254089, "rewards/margins": 9.424128532409668, "rewards/rejected": -10.0950288772583, "step": 1267 }, { "epoch": 0.3, "learning_rate": 1.5461333333333333e-07, "logps/chosen": -223.62109375, "logps/rejected": -305.6531677246094, "loss": 0.0061, "losses/dpo": 1.1598899618547875e-06, "losses/sft": 0.5721191763877869, "losses/total": 1.1598899618547875e-06, "ref_logps/chosen": -221.72689819335938, "ref_logps/rejected": -216.14089965820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.18941882252693176, "rewards/margins": 8.761808395385742, "rewards/rejected": -8.951226234436035, "step": 1268 }, { "epoch": 0.3, "learning_rate": 1.5456e-07, "logps/chosen": -233.58160400390625, "logps/rejected": -312.43450927734375, "loss": 0.0306, "losses/dpo": 0.09452291578054428, "losses/sft": 0.4645574986934662, "losses/total": 0.09452291578054428, "ref_logps/chosen": -225.9017333984375, "ref_logps/rejected": -220.95416259765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7679876089096069, "rewards/margins": 8.380046844482422, "rewards/rejected": -9.14803409576416, "step": 1269 }, { "epoch": 0.3, "learning_rate": 1.5450666666666665e-07, "logps/chosen": -229.97216796875, "logps/rejected": -299.97332763671875, "loss": 0.0122, "losses/dpo": 1.7779899508241215e-07, "losses/sft": 0.701418936252594, "losses/total": 1.7779899508241215e-07, "ref_logps/chosen": -223.93692016601562, "ref_logps/rejected": -207.4124755859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6035261154174805, "rewards/margins": 8.652558326721191, "rewards/rejected": -9.256085395812988, "step": 1270 }, { "epoch": 0.31, "learning_rate": 1.5445333333333333e-07, "logps/chosen": -252.32940673828125, "logps/rejected": -329.078369140625, "loss": 0.0028, "losses/dpo": 2.9057166102575138e-05, "losses/sft": 0.5450447201728821, "losses/total": 2.9057166102575138e-05, "ref_logps/chosen": -244.42457580566406, "ref_logps/rejected": -228.8743438720703, "rewards/accuracies": 1.0, "rewards/chosen": -0.7904833555221558, "rewards/margins": 9.229920387268066, "rewards/rejected": -10.020402908325195, "step": 1271 }, { "epoch": 0.31, "learning_rate": 1.544e-07, "logps/chosen": -230.76666259765625, "logps/rejected": -337.1124267578125, "loss": 0.0022, "losses/dpo": 0.0001237973920069635, "losses/sft": 0.5447176098823547, "losses/total": 0.0001237973920069635, "ref_logps/chosen": -225.71080017089844, "ref_logps/rejected": -229.16021728515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5055866241455078, "rewards/margins": 10.28963565826416, "rewards/rejected": -10.795222282409668, "step": 1272 }, { "epoch": 0.31, "learning_rate": 1.5434666666666666e-07, "logps/chosen": -203.1470184326172, "logps/rejected": -319.7229309082031, "loss": 0.0068, "losses/dpo": 1.0774185454920371e-07, "losses/sft": 0.570380449295044, "losses/total": 1.0774185454920371e-07, "ref_logps/chosen": -199.40672302246094, "ref_logps/rejected": -218.77610778808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.3740294277667999, "rewards/margins": 9.72065258026123, "rewards/rejected": -10.094682693481445, "step": 1273 }, { "epoch": 0.31, "learning_rate": 1.542933333333333e-07, "logps/chosen": -219.36842346191406, "logps/rejected": -293.8841857910156, "loss": 0.0197, "losses/dpo": 5.382433982958901e-07, "losses/sft": 0.674654483795166, "losses/total": 5.382433982958901e-07, "ref_logps/chosen": -214.97532653808594, "ref_logps/rejected": -203.00840759277344, "rewards/accuracies": 1.0, "rewards/chosen": -0.4393094778060913, "rewards/margins": 8.648268699645996, "rewards/rejected": -9.087578773498535, "step": 1274 }, { "epoch": 0.31, "learning_rate": 1.5423999999999998e-07, "logps/chosen": -202.63278198242188, "logps/rejected": -314.0445556640625, "loss": 0.0065, "losses/dpo": 1.3097952660245937e-06, "losses/sft": 0.555736243724823, "losses/total": 1.3097952660245937e-06, "ref_logps/chosen": -197.1507110595703, "ref_logps/rejected": -225.14230346679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5482059717178345, "rewards/margins": 8.342018127441406, "rewards/rejected": -8.89022445678711, "step": 1275 }, { "epoch": 0.31, "learning_rate": 1.5418666666666666e-07, "logps/chosen": -254.06044006347656, "logps/rejected": -359.02618408203125, "loss": 0.0041, "losses/dpo": 7.438724878738867e-06, "losses/sft": 0.6438769102096558, "losses/total": 7.438724878738867e-06, "ref_logps/chosen": -247.995849609375, "ref_logps/rejected": -256.3502502441406, "rewards/accuracies": 1.0, "rewards/chosen": -0.606460690498352, "rewards/margins": 9.6611328125, "rewards/rejected": -10.267593383789062, "step": 1276 }, { "epoch": 0.31, "learning_rate": 1.5413333333333334e-07, "logps/chosen": -193.07162475585938, "logps/rejected": -312.63848876953125, "loss": 0.0139, "losses/dpo": 4.512811301538022e-06, "losses/sft": 0.6338217854499817, "losses/total": 4.512811301538022e-06, "ref_logps/chosen": -189.97528076171875, "ref_logps/rejected": -223.29664611816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.30963462591171265, "rewards/margins": 8.624549865722656, "rewards/rejected": -8.934184074401855, "step": 1277 }, { "epoch": 0.31, "learning_rate": 1.5408e-07, "logps/chosen": -217.4915313720703, "logps/rejected": -300.5253601074219, "loss": 0.0041, "losses/dpo": 6.330462929327041e-05, "losses/sft": 0.4812081456184387, "losses/total": 6.330462929327041e-05, "ref_logps/chosen": -212.13711547851562, "ref_logps/rejected": -206.61846923828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5354423522949219, "rewards/margins": 8.85524845123291, "rewards/rejected": -9.390689849853516, "step": 1278 }, { "epoch": 0.31, "learning_rate": 1.5402666666666666e-07, "logps/chosen": -254.78662109375, "logps/rejected": -325.8052978515625, "loss": 0.0206, "losses/dpo": 1.6395454167650314e-06, "losses/sft": 1.289985179901123, "losses/total": 1.6395454167650314e-06, "ref_logps/chosen": -248.28469848632812, "ref_logps/rejected": -231.73406982421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.650191068649292, "rewards/margins": 8.75693130493164, "rewards/rejected": -9.407121658325195, "step": 1279 }, { "epoch": 0.31, "learning_rate": 1.5397333333333334e-07, "logps/chosen": -237.11093139648438, "logps/rejected": -322.9643249511719, "loss": 0.0148, "losses/dpo": 0.00016672803030814976, "losses/sft": 0.531230628490448, "losses/total": 0.00016672803030814976, "ref_logps/chosen": -230.29525756835938, "ref_logps/rejected": -219.13658142089844, "rewards/accuracies": 1.0, "rewards/chosen": -0.6815675497055054, "rewards/margins": 9.70120906829834, "rewards/rejected": -10.382776260375977, "step": 1280 }, { "epoch": 0.31, "learning_rate": 1.5392e-07, "logps/chosen": -248.75686645507812, "logps/rejected": -308.6773681640625, "loss": 0.0113, "losses/dpo": 1.971221536223311e-06, "losses/sft": 0.4671797752380371, "losses/total": 1.971221536223311e-06, "ref_logps/chosen": -242.86752319335938, "ref_logps/rejected": -216.9508514404297, "rewards/accuracies": 1.0, "rewards/chosen": -0.5889337658882141, "rewards/margins": 8.583717346191406, "rewards/rejected": -9.172650337219238, "step": 1281 }, { "epoch": 0.31, "learning_rate": 1.5386666666666667e-07, "logps/chosen": -250.84754943847656, "logps/rejected": -341.6280212402344, "loss": 0.012, "losses/dpo": 4.6403201849898323e-05, "losses/sft": 0.5497161149978638, "losses/total": 4.6403201849898323e-05, "ref_logps/chosen": -243.9247589111328, "ref_logps/rejected": -234.911865234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6922784447669983, "rewards/margins": 9.979336738586426, "rewards/rejected": -10.671615600585938, "step": 1282 }, { "epoch": 0.31, "learning_rate": 1.5381333333333332e-07, "logps/chosen": -242.254638671875, "logps/rejected": -343.16998291015625, "loss": 0.0129, "losses/dpo": 1.3825174960402364e-07, "losses/sft": 0.6482068300247192, "losses/total": 1.3825174960402364e-07, "ref_logps/chosen": -234.86180114746094, "ref_logps/rejected": -237.034912109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7392830848693848, "rewards/margins": 9.874223709106445, "rewards/rejected": -10.613506317138672, "step": 1283 }, { "epoch": 0.31, "learning_rate": 1.5376e-07, "logps/chosen": -241.14862060546875, "logps/rejected": -348.802001953125, "loss": 0.0009, "losses/dpo": 9.233437481270812e-08, "losses/sft": 0.6025311946868896, "losses/total": 9.233437481270812e-08, "ref_logps/chosen": -234.51898193359375, "ref_logps/rejected": -241.609619140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6629651784896851, "rewards/margins": 10.056273460388184, "rewards/rejected": -10.71923828125, "step": 1284 }, { "epoch": 0.31, "learning_rate": 1.5370666666666665e-07, "logps/chosen": -220.093505859375, "logps/rejected": -298.13067626953125, "loss": 0.0069, "losses/dpo": 1.2448491304439813e-07, "losses/sft": 0.5842992663383484, "losses/total": 1.2448491304439813e-07, "ref_logps/chosen": -215.59568786621094, "ref_logps/rejected": -210.87167358398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4497790038585663, "rewards/margins": 8.276123046875, "rewards/rejected": -8.725900650024414, "step": 1285 }, { "epoch": 0.31, "learning_rate": 1.5365333333333332e-07, "logps/chosen": -212.64620971679688, "logps/rejected": -326.470703125, "loss": 0.0017, "losses/dpo": 2.9290762881828414e-07, "losses/sft": 0.7193203568458557, "losses/total": 2.9290762881828414e-07, "ref_logps/chosen": -207.8659210205078, "ref_logps/rejected": -227.8123779296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4780287444591522, "rewards/margins": 9.387804985046387, "rewards/rejected": -9.86583423614502, "step": 1286 }, { "epoch": 0.31, "learning_rate": 1.536e-07, "logps/chosen": -229.51058959960938, "logps/rejected": -317.28631591796875, "loss": 0.0261, "losses/dpo": 1.7021752682921942e-06, "losses/sft": 1.2384840250015259, "losses/total": 1.7021752682921942e-06, "ref_logps/chosen": -225.3968048095703, "ref_logps/rejected": -219.27099609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.41137948632240295, "rewards/margins": 9.390154838562012, "rewards/rejected": -9.801533699035645, "step": 1287 }, { "epoch": 0.31, "learning_rate": 1.5354666666666668e-07, "logps/chosen": -231.47731018066406, "logps/rejected": -318.172607421875, "loss": 0.0101, "losses/dpo": 2.0112109268666245e-05, "losses/sft": 0.61398845911026, "losses/total": 2.0112109268666245e-05, "ref_logps/chosen": -226.2255096435547, "ref_logps/rejected": -216.79086303710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5251813530921936, "rewards/margins": 9.612993240356445, "rewards/rejected": -10.138175010681152, "step": 1288 }, { "epoch": 0.31, "learning_rate": 1.5349333333333333e-07, "logps/chosen": -256.5119934082031, "logps/rejected": -360.2528991699219, "loss": 0.0091, "losses/dpo": 0.00052739551756531, "losses/sft": 0.5359795689582825, "losses/total": 0.00052739551756531, "ref_logps/chosen": -251.36111450195312, "ref_logps/rejected": -251.81686401367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5150899291038513, "rewards/margins": 10.328511238098145, "rewards/rejected": -10.84360122680664, "step": 1289 }, { "epoch": 0.31, "learning_rate": 1.5344e-07, "logps/chosen": -228.3255157470703, "logps/rejected": -339.68743896484375, "loss": 0.0088, "losses/dpo": 0.00016446723020635545, "losses/sft": 0.700842559337616, "losses/total": 0.00016446723020635545, "ref_logps/chosen": -222.60345458984375, "ref_logps/rejected": -235.6214141845703, "rewards/accuracies": 1.0, "rewards/chosen": -0.5722060203552246, "rewards/margins": 9.834396362304688, "rewards/rejected": -10.40660285949707, "step": 1290 }, { "epoch": 0.31, "learning_rate": 1.5338666666666665e-07, "logps/chosen": -235.36402893066406, "logps/rejected": -354.257080078125, "loss": 0.0038, "losses/dpo": 0.00015273148892447352, "losses/sft": 1.325525164604187, "losses/total": 0.00015273148892447352, "ref_logps/chosen": -229.77865600585938, "ref_logps/rejected": -247.94178771972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5585370659828186, "rewards/margins": 10.072990417480469, "rewards/rejected": -10.631528854370117, "step": 1291 }, { "epoch": 0.31, "learning_rate": 1.5333333333333333e-07, "logps/chosen": -260.75201416015625, "logps/rejected": -351.27764892578125, "loss": 0.0029, "losses/dpo": 8.17891941551352e-06, "losses/sft": 0.8457173109054565, "losses/total": 8.17891941551352e-06, "ref_logps/chosen": -255.81732177734375, "ref_logps/rejected": -250.59326171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.49346989393234253, "rewards/margins": 9.574970245361328, "rewards/rejected": -10.068441390991211, "step": 1292 }, { "epoch": 0.31, "learning_rate": 1.5327999999999998e-07, "logps/chosen": -251.7550048828125, "logps/rejected": -329.593505859375, "loss": 0.0071, "losses/dpo": 1.0062118462883518e-06, "losses/sft": 0.5221445560455322, "losses/total": 1.0062118462883518e-06, "ref_logps/chosen": -247.72787475585938, "ref_logps/rejected": -227.810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4027140736579895, "rewards/margins": 9.775579452514648, "rewards/rejected": -10.17829418182373, "step": 1293 }, { "epoch": 0.31, "learning_rate": 1.5322666666666666e-07, "logps/chosen": -255.58096313476562, "logps/rejected": -317.950439453125, "loss": 0.0296, "losses/dpo": 3.7699555832659826e-05, "losses/sft": 0.4631650447845459, "losses/total": 3.7699555832659826e-05, "ref_logps/chosen": -249.63638305664062, "ref_logps/rejected": -219.5851593017578, "rewards/accuracies": 1.0, "rewards/chosen": -0.5944580435752869, "rewards/margins": 9.242073059082031, "rewards/rejected": -9.836530685424805, "step": 1294 }, { "epoch": 0.31, "learning_rate": 1.5317333333333333e-07, "logps/chosen": -245.626220703125, "logps/rejected": -326.2213134765625, "loss": 0.0154, "losses/dpo": 5.258342298475327e-06, "losses/sft": 0.6526766419410706, "losses/total": 5.258342298475327e-06, "ref_logps/chosen": -238.69911193847656, "ref_logps/rejected": -231.2916717529297, "rewards/accuracies": 1.0, "rewards/chosen": -0.6927112936973572, "rewards/margins": 8.800251960754395, "rewards/rejected": -9.492963790893555, "step": 1295 }, { "epoch": 0.31, "learning_rate": 1.5311999999999998e-07, "logps/chosen": -211.61305236816406, "logps/rejected": -308.6951904296875, "loss": 0.0087, "losses/dpo": 0.046848949044942856, "losses/sft": 0.5539349913597107, "losses/total": 0.046848949044942856, "ref_logps/chosen": -206.15234375, "ref_logps/rejected": -216.8296661376953, "rewards/accuracies": 1.0, "rewards/chosen": -0.5460677146911621, "rewards/margins": 8.640485763549805, "rewards/rejected": -9.186553955078125, "step": 1296 }, { "epoch": 0.31, "learning_rate": 1.5306666666666666e-07, "logps/chosen": -235.23162841796875, "logps/rejected": -336.7865905761719, "loss": 0.0072, "losses/dpo": 1.46121647048858e-05, "losses/sft": 0.4833112061023712, "losses/total": 1.46121647048858e-05, "ref_logps/chosen": -229.885009765625, "ref_logps/rejected": -233.5688018798828, "rewards/accuracies": 1.0, "rewards/chosen": -0.5346603989601135, "rewards/margins": 9.787118911743164, "rewards/rejected": -10.321779251098633, "step": 1297 }, { "epoch": 0.31, "learning_rate": 1.5301333333333334e-07, "logps/chosen": -246.1136932373047, "logps/rejected": -337.009521484375, "loss": 0.0026, "losses/dpo": 0.0001445898087695241, "losses/sft": 0.45870938897132874, "losses/total": 0.0001445898087695241, "ref_logps/chosen": -241.4662628173828, "ref_logps/rejected": -229.3301239013672, "rewards/accuracies": 1.0, "rewards/chosen": -0.4647427201271057, "rewards/margins": 10.30319595336914, "rewards/rejected": -10.767938613891602, "step": 1298 }, { "epoch": 0.31, "learning_rate": 1.5296e-07, "logps/chosen": -223.00796508789062, "logps/rejected": -329.49627685546875, "loss": 0.004, "losses/dpo": 1.7511707710582414e-06, "losses/sft": 0.6824342012405396, "losses/total": 1.7511707710582414e-06, "ref_logps/chosen": -217.08712768554688, "ref_logps/rejected": -233.92071533203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5920823812484741, "rewards/margins": 8.965474128723145, "rewards/rejected": -9.55755615234375, "step": 1299 }, { "epoch": 0.31, "learning_rate": 1.5290666666666664e-07, "logps/chosen": -230.68936157226562, "logps/rejected": -321.9729309082031, "loss": 0.0022, "losses/dpo": 2.131664814442047e-06, "losses/sft": 0.7833679914474487, "losses/total": 2.131664814442047e-06, "ref_logps/chosen": -225.09979248046875, "ref_logps/rejected": -222.62368774414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5589572191238403, "rewards/margins": 9.375967979431152, "rewards/rejected": -9.934925079345703, "step": 1300 }, { "epoch": 0.31, "learning_rate": 1.5285333333333332e-07, "logps/chosen": -193.1736602783203, "logps/rejected": -324.4103088378906, "loss": 0.0055, "losses/dpo": 6.118703458923846e-05, "losses/sft": 0.8143848180770874, "losses/total": 6.118703458923846e-05, "ref_logps/chosen": -189.0751953125, "ref_logps/rejected": -225.34963989257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.40984654426574707, "rewards/margins": 9.496220588684082, "rewards/rejected": -9.906067848205566, "step": 1301 }, { "epoch": 0.31, "learning_rate": 1.528e-07, "logps/chosen": -208.52011108398438, "logps/rejected": -362.22442626953125, "loss": 0.0092, "losses/dpo": 4.101758804608835e-06, "losses/sft": 0.4975624680519104, "losses/total": 4.101758804608835e-06, "ref_logps/chosen": -201.93515014648438, "ref_logps/rejected": -247.55123901367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.6584964394569397, "rewards/margins": 10.808821678161621, "rewards/rejected": -11.467317581176758, "step": 1302 }, { "epoch": 0.31, "learning_rate": 1.5274666666666667e-07, "logps/chosen": -190.6588134765625, "logps/rejected": -308.794677734375, "loss": 0.0038, "losses/dpo": 0.0018243517260998487, "losses/sft": 0.47973963618278503, "losses/total": 0.0018243517260998487, "ref_logps/chosen": -186.5093994140625, "ref_logps/rejected": -217.48873901367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.41494038701057434, "rewards/margins": 8.715649604797363, "rewards/rejected": -9.130590438842773, "step": 1303 }, { "epoch": 0.31, "learning_rate": 1.5269333333333332e-07, "logps/chosen": -252.20474243164062, "logps/rejected": -302.2943115234375, "loss": 0.0138, "losses/dpo": 2.5590921381990483e-07, "losses/sft": 0.5937131643295288, "losses/total": 2.5590921381990483e-07, "ref_logps/chosen": -248.1339111328125, "ref_logps/rejected": -206.79757690429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.4070812463760376, "rewards/margins": 9.142594337463379, "rewards/rejected": -9.549674987792969, "step": 1304 }, { "epoch": 0.31, "learning_rate": 1.5264e-07, "logps/chosen": -236.3704376220703, "logps/rejected": -309.48602294921875, "loss": 0.017, "losses/dpo": 1.414618509443244e-05, "losses/sft": 0.9118052124977112, "losses/total": 1.414618509443244e-05, "ref_logps/chosen": -229.7910614013672, "ref_logps/rejected": -213.1536865234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6579374670982361, "rewards/margins": 8.97529411315918, "rewards/rejected": -9.633232116699219, "step": 1305 }, { "epoch": 0.31, "learning_rate": 1.5258666666666667e-07, "logps/chosen": -252.3428192138672, "logps/rejected": -330.0506591796875, "loss": 0.0015, "losses/dpo": 3.922760356545041e-07, "losses/sft": 0.702403724193573, "losses/total": 3.922760356545041e-07, "ref_logps/chosen": -246.045166015625, "ref_logps/rejected": -224.1416015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6297670602798462, "rewards/margins": 9.961140632629395, "rewards/rejected": -10.59090805053711, "step": 1306 }, { "epoch": 0.31, "learning_rate": 1.5253333333333335e-07, "logps/chosen": -236.02906799316406, "logps/rejected": -320.8851623535156, "loss": 0.0052, "losses/dpo": 6.101759481680347e-06, "losses/sft": 0.6467117071151733, "losses/total": 6.101759481680347e-06, "ref_logps/chosen": -230.3201904296875, "ref_logps/rejected": -225.7181854248047, "rewards/accuracies": 1.0, "rewards/chosen": -0.5708914995193481, "rewards/margins": 8.945807456970215, "rewards/rejected": -9.516698837280273, "step": 1307 }, { "epoch": 0.31, "learning_rate": 1.5248e-07, "logps/chosen": -227.37594604492188, "logps/rejected": -297.282958984375, "loss": 0.0054, "losses/dpo": 0.00013940791541244835, "losses/sft": 0.6184750199317932, "losses/total": 0.00013940791541244835, "ref_logps/chosen": -221.20657348632812, "ref_logps/rejected": -202.5924072265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6169359683990479, "rewards/margins": 8.852117538452148, "rewards/rejected": -9.469053268432617, "step": 1308 }, { "epoch": 0.31, "learning_rate": 1.5242666666666665e-07, "logps/chosen": -259.95037841796875, "logps/rejected": -346.23748779296875, "loss": 0.0011, "losses/dpo": 6.461365131826824e-08, "losses/sft": 0.6751222014427185, "losses/total": 6.461365131826824e-08, "ref_logps/chosen": -254.4333953857422, "ref_logps/rejected": -239.50389099121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5516977906227112, "rewards/margins": 10.121663093566895, "rewards/rejected": -10.673360824584961, "step": 1309 }, { "epoch": 0.31, "learning_rate": 1.5237333333333333e-07, "logps/chosen": -225.81509399414062, "logps/rejected": -310.97698974609375, "loss": 0.0165, "losses/dpo": 8.325488352056709e-07, "losses/sft": 1.157738447189331, "losses/total": 8.325488352056709e-07, "ref_logps/chosen": -220.91635131835938, "ref_logps/rejected": -214.94134521484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.4898724853992462, "rewards/margins": 9.113692283630371, "rewards/rejected": -9.603565216064453, "step": 1310 }, { "epoch": 0.31, "learning_rate": 1.5232e-07, "logps/chosen": -245.93173217773438, "logps/rejected": -329.5175476074219, "loss": 0.0034, "losses/dpo": 1.2402182392179384e-06, "losses/sft": 0.49123603105545044, "losses/total": 1.2402182392179384e-06, "ref_logps/chosen": -242.0838623046875, "ref_logps/rejected": -232.03256225585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.38478851318359375, "rewards/margins": 9.36370849609375, "rewards/rejected": -9.74849796295166, "step": 1311 }, { "epoch": 0.31, "learning_rate": 1.5226666666666665e-07, "logps/chosen": -210.93576049804688, "logps/rejected": -305.65985107421875, "loss": 0.0072, "losses/dpo": 5.8203863773087505e-06, "losses/sft": 0.4779006540775299, "losses/total": 5.8203863773087505e-06, "ref_logps/chosen": -207.04925537109375, "ref_logps/rejected": -214.227294921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.38864967226982117, "rewards/margins": 8.754607200622559, "rewards/rejected": -9.143257141113281, "step": 1312 }, { "epoch": 0.32, "learning_rate": 1.5221333333333333e-07, "logps/chosen": -251.13864135742188, "logps/rejected": -343.784912109375, "loss": 0.0237, "losses/dpo": 0.00029614008963108063, "losses/sft": 0.5745328664779663, "losses/total": 0.00029614008963108063, "ref_logps/chosen": -246.51022338867188, "ref_logps/rejected": -244.03614807128906, "rewards/accuracies": 1.0, "rewards/chosen": -0.4628409147262573, "rewards/margins": 9.512039184570312, "rewards/rejected": -9.974879264831543, "step": 1313 }, { "epoch": 0.32, "learning_rate": 1.5216e-07, "logps/chosen": -214.8599090576172, "logps/rejected": -283.093994140625, "loss": 0.0155, "losses/dpo": 1.7688583966446458e-06, "losses/sft": 0.5861110687255859, "losses/total": 1.7688583966446458e-06, "ref_logps/chosen": -209.94520568847656, "ref_logps/rejected": -196.98043823242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.4914720356464386, "rewards/margins": 8.119882583618164, "rewards/rejected": -8.61135482788086, "step": 1314 }, { "epoch": 0.32, "learning_rate": 1.5210666666666666e-07, "logps/chosen": -230.83502197265625, "logps/rejected": -332.2359313964844, "loss": 0.0026, "losses/dpo": 2.1391029747519497e-07, "losses/sft": 0.7189926505088806, "losses/total": 2.1391029747519497e-07, "ref_logps/chosen": -224.62265014648438, "ref_logps/rejected": -232.75308227539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.6212350726127625, "rewards/margins": 9.327048301696777, "rewards/rejected": -9.948283195495605, "step": 1315 }, { "epoch": 0.32, "learning_rate": 1.5205333333333333e-07, "logps/chosen": -206.64967346191406, "logps/rejected": -319.9766540527344, "loss": 0.0108, "losses/dpo": 4.304169749502762e-07, "losses/sft": 0.4727171063423157, "losses/total": 4.304169749502762e-07, "ref_logps/chosen": -203.42715454101562, "ref_logps/rejected": -223.42489624023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.3222518563270569, "rewards/margins": 9.332921981811523, "rewards/rejected": -9.655174255371094, "step": 1316 }, { "epoch": 0.32, "learning_rate": 1.5199999999999998e-07, "logps/chosen": -230.9103240966797, "logps/rejected": -326.6207580566406, "loss": 0.0098, "losses/dpo": 8.297298336401582e-06, "losses/sft": 0.47253483533859253, "losses/total": 8.297298336401582e-06, "ref_logps/chosen": -226.24205017089844, "ref_logps/rejected": -234.6693572998047, "rewards/accuracies": 1.0, "rewards/chosen": -0.46682634949684143, "rewards/margins": 8.728316307067871, "rewards/rejected": -9.19514274597168, "step": 1317 }, { "epoch": 0.32, "learning_rate": 1.5194666666666666e-07, "logps/chosen": -236.05026245117188, "logps/rejected": -329.57861328125, "loss": 0.0039, "losses/dpo": 1.1171128466713753e-09, "losses/sft": 0.45705217123031616, "losses/total": 1.1171128466713753e-09, "ref_logps/chosen": -231.115478515625, "ref_logps/rejected": -235.88648986816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.49347949028015137, "rewards/margins": 8.875731468200684, "rewards/rejected": -9.369211196899414, "step": 1318 }, { "epoch": 0.32, "learning_rate": 1.518933333333333e-07, "logps/chosen": -294.1067810058594, "logps/rejected": -363.97454833984375, "loss": 0.0005, "losses/dpo": 2.7089740797237027e-06, "losses/sft": 0.6897270679473877, "losses/total": 2.7089740797237027e-06, "ref_logps/chosen": -288.0840148925781, "ref_logps/rejected": -253.01124572753906, "rewards/accuracies": 1.0, "rewards/chosen": -0.6022762060165405, "rewards/margins": 10.494056701660156, "rewards/rejected": -11.096332550048828, "step": 1319 }, { "epoch": 0.32, "learning_rate": 1.5184e-07, "logps/chosen": -213.37680053710938, "logps/rejected": -324.4110412597656, "loss": 0.0125, "losses/dpo": 6.407702312571928e-05, "losses/sft": 0.6923926472663879, "losses/total": 6.407702312571928e-05, "ref_logps/chosen": -209.041015625, "ref_logps/rejected": -230.4123077392578, "rewards/accuracies": 1.0, "rewards/chosen": -0.43358075618743896, "rewards/margins": 8.966291427612305, "rewards/rejected": -9.399872779846191, "step": 1320 }, { "epoch": 0.32, "learning_rate": 1.5178666666666667e-07, "logps/chosen": -240.91232299804688, "logps/rejected": -324.2810363769531, "loss": 0.0038, "losses/dpo": 2.0064732098035165e-08, "losses/sft": 0.9102341532707214, "losses/total": 2.0064732098035165e-08, "ref_logps/chosen": -233.44346618652344, "ref_logps/rejected": -225.1798858642578, "rewards/accuracies": 1.0, "rewards/chosen": -0.7468844652175903, "rewards/margins": 9.16323184967041, "rewards/rejected": -9.910116195678711, "step": 1321 }, { "epoch": 0.32, "learning_rate": 1.5173333333333334e-07, "logps/chosen": -236.95046997070312, "logps/rejected": -326.59136962890625, "loss": 0.0015, "losses/dpo": 1.3327356100489851e-05, "losses/sft": 0.6157123446464539, "losses/total": 1.3327356100489851e-05, "ref_logps/chosen": -232.6039581298828, "ref_logps/rejected": -230.85313415527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.43464961647987366, "rewards/margins": 9.139175415039062, "rewards/rejected": -9.57382583618164, "step": 1322 }, { "epoch": 0.32, "learning_rate": 1.5168e-07, "logps/chosen": -239.36996459960938, "logps/rejected": -342.3965759277344, "loss": 0.0049, "losses/dpo": 1.182572830771278e-07, "losses/sft": 0.6722537875175476, "losses/total": 1.182572830771278e-07, "ref_logps/chosen": -234.58419799804688, "ref_logps/rejected": -238.0987548828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.4785771369934082, "rewards/margins": 9.95120620727539, "rewards/rejected": -10.429783821105957, "step": 1323 }, { "epoch": 0.32, "learning_rate": 1.5162666666666667e-07, "logps/chosen": -211.11343383789062, "logps/rejected": -305.05462646484375, "loss": 0.0058, "losses/dpo": 0.0002413806359982118, "losses/sft": 0.5278406739234924, "losses/total": 0.0002413806359982118, "ref_logps/chosen": -205.68115234375, "ref_logps/rejected": -208.2910919189453, "rewards/accuracies": 1.0, "rewards/chosen": -0.543229341506958, "rewards/margins": 9.133125305175781, "rewards/rejected": -9.67635440826416, "step": 1324 }, { "epoch": 0.32, "learning_rate": 1.5157333333333332e-07, "logps/chosen": -241.63555908203125, "logps/rejected": -344.95233154296875, "loss": 0.0035, "losses/dpo": 7.721744623268023e-06, "losses/sft": 0.46581852436065674, "losses/total": 7.721744623268023e-06, "ref_logps/chosen": -237.01129150390625, "ref_logps/rejected": -241.87257385253906, "rewards/accuracies": 1.0, "rewards/chosen": -0.4624273478984833, "rewards/margins": 9.845549583435059, "rewards/rejected": -10.307976722717285, "step": 1325 }, { "epoch": 0.32, "learning_rate": 1.5152e-07, "logps/chosen": -191.4515380859375, "logps/rejected": -320.37017822265625, "loss": 0.0036, "losses/dpo": 1.426557901140768e-05, "losses/sft": 0.6086558103561401, "losses/total": 1.426557901140768e-05, "ref_logps/chosen": -187.97447204589844, "ref_logps/rejected": -226.7220458984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.34770604968070984, "rewards/margins": 9.017108917236328, "rewards/rejected": -9.364814758300781, "step": 1326 }, { "epoch": 0.32, "learning_rate": 1.5146666666666665e-07, "logps/chosen": -244.56680297851562, "logps/rejected": -259.4893798828125, "loss": 0.0121, "losses/dpo": 0.00016505199891980737, "losses/sft": 0.9789283871650696, "losses/total": 0.00016505199891980737, "ref_logps/chosen": -238.81907653808594, "ref_logps/rejected": -180.06430053710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5747740268707275, "rewards/margins": 7.367734909057617, "rewards/rejected": -7.942508697509766, "step": 1327 }, { "epoch": 0.32, "learning_rate": 1.5141333333333332e-07, "logps/chosen": -222.956787109375, "logps/rejected": -340.3514404296875, "loss": 0.0029, "losses/dpo": 1.69185307186126e-06, "losses/sft": 0.6913164258003235, "losses/total": 1.69185307186126e-06, "ref_logps/chosen": -220.02935791015625, "ref_logps/rejected": -234.31361389160156, "rewards/accuracies": 1.0, "rewards/chosen": -0.29274141788482666, "rewards/margins": 10.311040878295898, "rewards/rejected": -10.603781700134277, "step": 1328 }, { "epoch": 0.32, "learning_rate": 1.5136e-07, "logps/chosen": -197.9213409423828, "logps/rejected": -290.48065185546875, "loss": 0.0405, "losses/dpo": 0.00027099568978883326, "losses/sft": 0.549614429473877, "losses/total": 0.00027099568978883326, "ref_logps/chosen": -194.26657104492188, "ref_logps/rejected": -201.44070434570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.3654772639274597, "rewards/margins": 8.538519859313965, "rewards/rejected": -8.903997421264648, "step": 1329 }, { "epoch": 0.32, "learning_rate": 1.5130666666666665e-07, "logps/chosen": -211.89515686035156, "logps/rejected": -311.41204833984375, "loss": 0.0107, "losses/dpo": 0.005545780993998051, "losses/sft": 0.6002678275108337, "losses/total": 0.005545780993998051, "ref_logps/chosen": -207.41586303710938, "ref_logps/rejected": -216.34219360351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.44793108105659485, "rewards/margins": 9.059053421020508, "rewards/rejected": -9.50698471069336, "step": 1330 }, { "epoch": 0.32, "learning_rate": 1.5125333333333333e-07, "logps/chosen": -187.5830078125, "logps/rejected": -303.37689208984375, "loss": 0.0051, "losses/dpo": 7.217481083898747e-07, "losses/sft": 0.5362374186515808, "losses/total": 7.217481083898747e-07, "ref_logps/chosen": -184.16946411132812, "ref_logps/rejected": -205.94955444335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.34135425090789795, "rewards/margins": 9.40138053894043, "rewards/rejected": -9.742735862731934, "step": 1331 }, { "epoch": 0.32, "learning_rate": 1.512e-07, "logps/chosen": -231.88389587402344, "logps/rejected": -321.1219482421875, "loss": 0.0042, "losses/dpo": 2.183232209063135e-05, "losses/sft": 0.629075288772583, "losses/total": 2.183232209063135e-05, "ref_logps/chosen": -225.90003967285156, "ref_logps/rejected": -216.30201721191406, "rewards/accuracies": 1.0, "rewards/chosen": -0.5983853936195374, "rewards/margins": 9.883609771728516, "rewards/rejected": -10.48199462890625, "step": 1332 }, { "epoch": 0.32, "learning_rate": 1.5114666666666665e-07, "logps/chosen": -222.7227783203125, "logps/rejected": -302.441162109375, "loss": 0.0021, "losses/dpo": 6.096227934904164e-06, "losses/sft": 0.46439307928085327, "losses/total": 6.096227934904164e-06, "ref_logps/chosen": -216.13804626464844, "ref_logps/rejected": -211.02682495117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.6584718227386475, "rewards/margins": 8.482962608337402, "rewards/rejected": -9.141434669494629, "step": 1333 }, { "epoch": 0.32, "learning_rate": 1.510933333333333e-07, "logps/chosen": -239.250732421875, "logps/rejected": -345.3768310546875, "loss": 0.0014, "losses/dpo": 2.2777018671149563e-07, "losses/sft": 0.4013851583003998, "losses/total": 2.2777018671149563e-07, "ref_logps/chosen": -232.83102416992188, "ref_logps/rejected": -242.9510955810547, "rewards/accuracies": 1.0, "rewards/chosen": -0.6419718265533447, "rewards/margins": 9.600603103637695, "rewards/rejected": -10.242574691772461, "step": 1334 }, { "epoch": 0.32, "learning_rate": 1.5103999999999998e-07, "logps/chosen": -200.071533203125, "logps/rejected": -313.4945068359375, "loss": 0.0059, "losses/dpo": 6.123678303993074e-06, "losses/sft": 0.6816984415054321, "losses/total": 6.123678303993074e-06, "ref_logps/chosen": -195.4429931640625, "ref_logps/rejected": -210.55917358398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4628535509109497, "rewards/margins": 9.830678939819336, "rewards/rejected": -10.293533325195312, "step": 1335 }, { "epoch": 0.32, "learning_rate": 1.5098666666666666e-07, "logps/chosen": -217.78219604492188, "logps/rejected": -323.8787536621094, "loss": 0.0039, "losses/dpo": 1.3861068509868346e-05, "losses/sft": 0.6598237752914429, "losses/total": 1.3861068509868346e-05, "ref_logps/chosen": -210.7811279296875, "ref_logps/rejected": -222.03988647460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7001043558120728, "rewards/margins": 9.483781814575195, "rewards/rejected": -10.183886528015137, "step": 1336 }, { "epoch": 0.32, "learning_rate": 1.5093333333333333e-07, "logps/chosen": -230.3267059326172, "logps/rejected": -340.95159912109375, "loss": 0.0043, "losses/dpo": 1.946460542967543e-06, "losses/sft": 0.6809713840484619, "losses/total": 1.946460542967543e-06, "ref_logps/chosen": -222.650390625, "ref_logps/rejected": -235.1551513671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7676323652267456, "rewards/margins": 9.81201171875, "rewards/rejected": -10.579644203186035, "step": 1337 }, { "epoch": 0.32, "learning_rate": 1.5087999999999999e-07, "logps/chosen": -219.04647827148438, "logps/rejected": -351.7901306152344, "loss": 0.0063, "losses/dpo": 3.1339884571934817e-06, "losses/sft": 0.42418816685676575, "losses/total": 3.1339884571934817e-06, "ref_logps/chosen": -213.57168579101562, "ref_logps/rejected": -245.20700073242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5474772453308105, "rewards/margins": 10.110837936401367, "rewards/rejected": -10.658315658569336, "step": 1338 }, { "epoch": 0.32, "learning_rate": 1.5082666666666666e-07, "logps/chosen": -215.87576293945312, "logps/rejected": -312.63348388671875, "loss": 0.0134, "losses/dpo": 2.409726448604488e-06, "losses/sft": 0.3499062657356262, "losses/total": 2.409726448604488e-06, "ref_logps/chosen": -210.05709838867188, "ref_logps/rejected": -210.7677001953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5818673372268677, "rewards/margins": 9.604711532592773, "rewards/rejected": -10.186579704284668, "step": 1339 }, { "epoch": 0.32, "learning_rate": 1.5077333333333334e-07, "logps/chosen": -226.13162231445312, "logps/rejected": -327.1390380859375, "loss": 0.0068, "losses/dpo": 1.164430756261936e-07, "losses/sft": 0.6765455007553101, "losses/total": 1.164430756261936e-07, "ref_logps/chosen": -221.947021484375, "ref_logps/rejected": -226.82626342773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4184606969356537, "rewards/margins": 9.612813949584961, "rewards/rejected": -10.031274795532227, "step": 1340 }, { "epoch": 0.32, "learning_rate": 1.5072000000000002e-07, "logps/chosen": -231.3577880859375, "logps/rejected": -346.37835693359375, "loss": 0.0008, "losses/dpo": 1.1501340395625448e-06, "losses/sft": 0.49578893184661865, "losses/total": 1.1501340395625448e-06, "ref_logps/chosen": -225.327392578125, "ref_logps/rejected": -234.97618103027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6030397415161133, "rewards/margins": 10.537178039550781, "rewards/rejected": -11.140217781066895, "step": 1341 }, { "epoch": 0.32, "learning_rate": 1.5066666666666667e-07, "logps/chosen": -191.6284637451172, "logps/rejected": -301.5350341796875, "loss": 0.0063, "losses/dpo": 1.6425914850515255e-07, "losses/sft": 0.682041585445404, "losses/total": 1.6425914850515255e-07, "ref_logps/chosen": -186.00656127929688, "ref_logps/rejected": -213.7534637451172, "rewards/accuracies": 1.0, "rewards/chosen": -0.5621905326843262, "rewards/margins": 8.215967178344727, "rewards/rejected": -8.778158187866211, "step": 1342 }, { "epoch": 0.32, "learning_rate": 1.5061333333333332e-07, "logps/chosen": -210.64706420898438, "logps/rejected": -299.7415771484375, "loss": 0.0081, "losses/dpo": 3.452171029039164e-07, "losses/sft": 1.017959713935852, "losses/total": 3.452171029039164e-07, "ref_logps/chosen": -207.9378662109375, "ref_logps/rejected": -203.98858642578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.2709197998046875, "rewards/margins": 9.304381370544434, "rewards/rejected": -9.575300216674805, "step": 1343 }, { "epoch": 0.32, "learning_rate": 1.5056e-07, "logps/chosen": -210.30258178710938, "logps/rejected": -296.6068115234375, "loss": 0.0033, "losses/dpo": 1.0029665645561181e-05, "losses/sft": 0.9093440175056458, "losses/total": 1.0029665645561181e-05, "ref_logps/chosen": -206.2042694091797, "ref_logps/rejected": -208.4755859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.409829318523407, "rewards/margins": 8.403292655944824, "rewards/rejected": -8.813121795654297, "step": 1344 }, { "epoch": 0.32, "learning_rate": 1.5050666666666667e-07, "logps/chosen": -256.589599609375, "logps/rejected": -339.770751953125, "loss": 0.005, "losses/dpo": 2.1219018890406005e-05, "losses/sft": 0.6782197952270508, "losses/total": 2.1219018890406005e-05, "ref_logps/chosen": -249.3812255859375, "ref_logps/rejected": -235.55856323242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7208377718925476, "rewards/margins": 9.7003812789917, "rewards/rejected": -10.421218872070312, "step": 1345 }, { "epoch": 0.32, "learning_rate": 1.5045333333333332e-07, "logps/chosen": -243.18263244628906, "logps/rejected": -326.2312316894531, "loss": 0.012, "losses/dpo": 2.232172391813947e-06, "losses/sft": 0.5772309899330139, "losses/total": 2.232172391813947e-06, "ref_logps/chosen": -236.6880340576172, "ref_logps/rejected": -227.48379516601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6494592428207397, "rewards/margins": 9.2252836227417, "rewards/rejected": -9.874743461608887, "step": 1346 }, { "epoch": 0.32, "learning_rate": 1.504e-07, "logps/chosen": -191.0936279296875, "logps/rejected": -321.9908447265625, "loss": 0.0045, "losses/dpo": 1.1701549738063477e-06, "losses/sft": 0.44043731689453125, "losses/total": 1.1701549738063477e-06, "ref_logps/chosen": -189.13400268554688, "ref_logps/rejected": -224.3729248046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.195962056517601, "rewards/margins": 9.565831184387207, "rewards/rejected": -9.76179313659668, "step": 1347 }, { "epoch": 0.32, "learning_rate": 1.5034666666666667e-07, "logps/chosen": -191.34693908691406, "logps/rejected": -298.30029296875, "loss": 0.0069, "losses/dpo": 1.5926264040899696e-06, "losses/sft": 0.696051299571991, "losses/total": 1.5926264040899696e-06, "ref_logps/chosen": -185.76031494140625, "ref_logps/rejected": -204.68069458007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.5586625337600708, "rewards/margins": 8.803296089172363, "rewards/rejected": -9.361959457397461, "step": 1348 }, { "epoch": 0.32, "learning_rate": 1.5029333333333332e-07, "logps/chosen": -207.64144897460938, "logps/rejected": -313.13958740234375, "loss": 0.0041, "losses/dpo": 2.4806226065265946e-05, "losses/sft": 0.5272203683853149, "losses/total": 2.4806226065265946e-05, "ref_logps/chosen": -202.299072265625, "ref_logps/rejected": -218.24766540527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.5342371463775635, "rewards/margins": 8.954954147338867, "rewards/rejected": -9.489191055297852, "step": 1349 }, { "epoch": 0.32, "learning_rate": 1.5024e-07, "logps/chosen": -264.94219970703125, "logps/rejected": -334.4981689453125, "loss": 0.0079, "losses/dpo": 9.419585694558918e-05, "losses/sft": 0.6323447227478027, "losses/total": 9.419585694558918e-05, "ref_logps/chosen": -259.34710693359375, "ref_logps/rejected": -233.81393432617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5595054030418396, "rewards/margins": 9.508918762207031, "rewards/rejected": -10.068424224853516, "step": 1350 }, { "epoch": 0.32, "learning_rate": 1.5018666666666665e-07, "logps/chosen": -206.85952758789062, "logps/rejected": -324.07904052734375, "loss": 0.0062, "losses/dpo": 5.389882517192746e-06, "losses/sft": 0.4695289134979248, "losses/total": 5.389882517192746e-06, "ref_logps/chosen": -200.00619506835938, "ref_logps/rejected": -225.241943359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6853328943252563, "rewards/margins": 9.19837760925293, "rewards/rejected": -9.883710861206055, "step": 1351 }, { "epoch": 0.32, "learning_rate": 1.5013333333333333e-07, "logps/chosen": -219.67608642578125, "logps/rejected": -316.2841796875, "loss": 0.0118, "losses/dpo": 2.1879714040551335e-06, "losses/sft": 0.5131600499153137, "losses/total": 2.1879714040551335e-06, "ref_logps/chosen": -215.0205078125, "ref_logps/rejected": -216.0992889404297, "rewards/accuracies": 1.0, "rewards/chosen": -0.46555858850479126, "rewards/margins": 9.55292797088623, "rewards/rejected": -10.018486022949219, "step": 1352 }, { "epoch": 0.32, "learning_rate": 1.5007999999999998e-07, "logps/chosen": -202.90138244628906, "logps/rejected": -295.889892578125, "loss": 0.0259, "losses/dpo": 1.973478674699436e-06, "losses/sft": 1.2290613651275635, "losses/total": 1.973478674699436e-06, "ref_logps/chosen": -197.9677734375, "ref_logps/rejected": -207.61563110351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.4933605492115021, "rewards/margins": 8.334064483642578, "rewards/rejected": -8.827425003051758, "step": 1353 }, { "epoch": 0.32, "learning_rate": 1.5002666666666665e-07, "logps/chosen": -210.6041259765625, "logps/rejected": -318.3199768066406, "loss": 0.0123, "losses/dpo": 3.932280833396362e-06, "losses/sft": 0.6257404685020447, "losses/total": 3.932280833396362e-06, "ref_logps/chosen": -206.41909790039062, "ref_logps/rejected": -227.68197631835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.41850388050079346, "rewards/margins": 8.645296096801758, "rewards/rejected": -9.063799858093262, "step": 1354 }, { "epoch": 0.33, "learning_rate": 1.4997333333333333e-07, "logps/chosen": -235.23065185546875, "logps/rejected": -332.8733215332031, "loss": 0.0089, "losses/dpo": 0.0023605774622410536, "losses/sft": 0.5530644059181213, "losses/total": 0.0023605774622410536, "ref_logps/chosen": -228.45867919921875, "ref_logps/rejected": -237.48985290527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6771982908248901, "rewards/margins": 8.861150741577148, "rewards/rejected": -9.538349151611328, "step": 1355 }, { "epoch": 0.33, "learning_rate": 1.4992e-07, "logps/chosen": -262.990478515625, "logps/rejected": -346.90753173828125, "loss": 0.0059, "losses/dpo": 9.955811037798412e-06, "losses/sft": 0.528274655342102, "losses/total": 9.955811037798412e-06, "ref_logps/chosen": -257.5215148925781, "ref_logps/rejected": -242.46707153320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.5468965172767639, "rewards/margins": 9.897148132324219, "rewards/rejected": -10.444045066833496, "step": 1356 }, { "epoch": 0.33, "learning_rate": 1.4986666666666666e-07, "logps/chosen": -196.39703369140625, "logps/rejected": -287.8876647949219, "loss": 0.0057, "losses/dpo": 6.963084160815924e-06, "losses/sft": 0.8069180846214294, "losses/total": 6.963084160815924e-06, "ref_logps/chosen": -190.68792724609375, "ref_logps/rejected": -192.03472900390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5709101557731628, "rewards/margins": 9.014384269714355, "rewards/rejected": -9.585294723510742, "step": 1357 }, { "epoch": 0.33, "learning_rate": 1.4981333333333334e-07, "logps/chosen": -234.97824096679688, "logps/rejected": -317.21087646484375, "loss": 0.0075, "losses/dpo": 0.00023257524298969656, "losses/sft": 0.6589913368225098, "losses/total": 0.00023257524298969656, "ref_logps/chosen": -230.3460235595703, "ref_logps/rejected": -220.89047241210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.46322065591812134, "rewards/margins": 9.168817520141602, "rewards/rejected": -9.632037162780762, "step": 1358 }, { "epoch": 0.33, "learning_rate": 1.4975999999999999e-07, "logps/chosen": -209.46047973632812, "logps/rejected": -307.39825439453125, "loss": 0.0065, "losses/dpo": 1.9364961190149188e-05, "losses/sft": 0.5845063328742981, "losses/total": 1.9364961190149188e-05, "ref_logps/chosen": -205.7923583984375, "ref_logps/rejected": -215.46182250976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.36681413650512695, "rewards/margins": 8.826828956604004, "rewards/rejected": -9.193643569946289, "step": 1359 }, { "epoch": 0.33, "learning_rate": 1.4970666666666666e-07, "logps/chosen": -231.78518676757812, "logps/rejected": -307.77923583984375, "loss": 0.0083, "losses/dpo": 0.00018703937530517578, "losses/sft": 0.6537184119224548, "losses/total": 0.00018703937530517578, "ref_logps/chosen": -226.21058654785156, "ref_logps/rejected": -214.77389526367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5574622750282288, "rewards/margins": 8.743070602416992, "rewards/rejected": -9.300533294677734, "step": 1360 }, { "epoch": 0.33, "learning_rate": 1.496533333333333e-07, "logps/chosen": -221.1462860107422, "logps/rejected": -323.049560546875, "loss": 0.0026, "losses/dpo": 2.0979409498522728e-07, "losses/sft": 0.6343128085136414, "losses/total": 2.0979409498522728e-07, "ref_logps/chosen": -215.46351623535156, "ref_logps/rejected": -220.0496826171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5682787895202637, "rewards/margins": 9.731707572937012, "rewards/rejected": -10.29998779296875, "step": 1361 }, { "epoch": 0.33, "learning_rate": 1.496e-07, "logps/chosen": -227.81838989257812, "logps/rejected": -349.4015197753906, "loss": 0.0014, "losses/dpo": 1.431782129657222e-05, "losses/sft": 0.4105115234851837, "losses/total": 1.431782129657222e-05, "ref_logps/chosen": -222.96072387695312, "ref_logps/rejected": -250.07435607910156, "rewards/accuracies": 1.0, "rewards/chosen": -0.48576468229293823, "rewards/margins": 9.44694995880127, "rewards/rejected": -9.93271541595459, "step": 1362 }, { "epoch": 0.33, "learning_rate": 1.4954666666666667e-07, "logps/chosen": -205.59527587890625, "logps/rejected": -326.17730712890625, "loss": 0.0096, "losses/dpo": 0.00010775440750876442, "losses/sft": 0.5006946921348572, "losses/total": 0.00010775440750876442, "ref_logps/chosen": -201.82261657714844, "ref_logps/rejected": -225.42568969726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.37726444005966187, "rewards/margins": 9.697896957397461, "rewards/rejected": -10.07516098022461, "step": 1363 }, { "epoch": 0.33, "learning_rate": 1.4949333333333332e-07, "logps/chosen": -230.55307006835938, "logps/rejected": -336.6698303222656, "loss": 0.0161, "losses/dpo": 3.5898569876735564e-06, "losses/sft": 0.5458938479423523, "losses/total": 3.5898569876735564e-06, "ref_logps/chosen": -224.38046264648438, "ref_logps/rejected": -233.59317016601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6172577142715454, "rewards/margins": 9.690407752990723, "rewards/rejected": -10.307665824890137, "step": 1364 }, { "epoch": 0.33, "learning_rate": 1.4944e-07, "logps/chosen": -221.36834716796875, "logps/rejected": -304.78509521484375, "loss": 0.0025, "losses/dpo": 3.4575148788462684e-07, "losses/sft": 0.5425221920013428, "losses/total": 3.4575148788462684e-07, "ref_logps/chosen": -212.90762329101562, "ref_logps/rejected": -212.81320190429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8460723161697388, "rewards/margins": 8.351120948791504, "rewards/rejected": -9.197192192077637, "step": 1365 }, { "epoch": 0.33, "learning_rate": 1.4938666666666667e-07, "logps/chosen": -261.5478210449219, "logps/rejected": -320.822998046875, "loss": 0.0093, "losses/dpo": 7.597088824695675e-06, "losses/sft": 0.573924720287323, "losses/total": 7.597088824695675e-06, "ref_logps/chosen": -252.5909881591797, "ref_logps/rejected": -220.604736328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8956854343414307, "rewards/margins": 9.126142501831055, "rewards/rejected": -10.021827697753906, "step": 1366 }, { "epoch": 0.33, "learning_rate": 1.4933333333333332e-07, "logps/chosen": -242.2112274169922, "logps/rejected": -307.6710205078125, "loss": 0.0044, "losses/dpo": 0.001693839323706925, "losses/sft": 0.5658240914344788, "losses/total": 0.001693839323706925, "ref_logps/chosen": -234.54925537109375, "ref_logps/rejected": -217.89080810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7661959528923035, "rewards/margins": 8.211827278137207, "rewards/rejected": -8.978023529052734, "step": 1367 }, { "epoch": 0.33, "learning_rate": 1.4927999999999997e-07, "logps/chosen": -187.80072021484375, "logps/rejected": -297.7024230957031, "loss": 0.0157, "losses/dpo": 6.947010842850432e-05, "losses/sft": 0.39203715324401855, "losses/total": 6.947010842850432e-05, "ref_logps/chosen": -184.30886840820312, "ref_logps/rejected": -207.09458923339844, "rewards/accuracies": 1.0, "rewards/chosen": -0.34918662905693054, "rewards/margins": 8.71159839630127, "rewards/rejected": -9.060785293579102, "step": 1368 }, { "epoch": 0.33, "learning_rate": 1.4922666666666665e-07, "logps/chosen": -273.1112060546875, "logps/rejected": -319.75299072265625, "loss": 0.0095, "losses/dpo": 3.384762385394424e-05, "losses/sft": 0.5690529346466064, "losses/total": 3.384762385394424e-05, "ref_logps/chosen": -265.5378112792969, "ref_logps/rejected": -219.05447387695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.7573384046554565, "rewards/margins": 9.312512397766113, "rewards/rejected": -10.069851875305176, "step": 1369 }, { "epoch": 0.33, "learning_rate": 1.4917333333333332e-07, "logps/chosen": -242.8868408203125, "logps/rejected": -352.4659118652344, "loss": 0.002, "losses/dpo": 3.3542226447025314e-05, "losses/sft": 0.6414245963096619, "losses/total": 3.3542226447025314e-05, "ref_logps/chosen": -238.11805725097656, "ref_logps/rejected": -240.7495880126953, "rewards/accuracies": 1.0, "rewards/chosen": -0.47687819600105286, "rewards/margins": 10.694755554199219, "rewards/rejected": -11.171632766723633, "step": 1370 }, { "epoch": 0.33, "learning_rate": 1.4912e-07, "logps/chosen": -231.61041259765625, "logps/rejected": -335.570068359375, "loss": 0.0022, "losses/dpo": 7.283454124262789e-06, "losses/sft": 0.5845365524291992, "losses/total": 7.283454124262789e-06, "ref_logps/chosen": -226.0439453125, "ref_logps/rejected": -227.76268005371094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5566458702087402, "rewards/margins": 10.22409439086914, "rewards/rejected": -10.780740737915039, "step": 1371 }, { "epoch": 0.33, "learning_rate": 1.4906666666666665e-07, "logps/chosen": -214.07867431640625, "logps/rejected": -330.9697265625, "loss": 0.0029, "losses/dpo": 6.625780315516749e-07, "losses/sft": 0.514330267906189, "losses/total": 6.625780315516749e-07, "ref_logps/chosen": -206.58436584472656, "ref_logps/rejected": -226.4656524658203, "rewards/accuracies": 1.0, "rewards/chosen": -0.749430775642395, "rewards/margins": 9.700973510742188, "rewards/rejected": -10.450404167175293, "step": 1372 }, { "epoch": 0.33, "learning_rate": 1.4901333333333333e-07, "logps/chosen": -247.19459533691406, "logps/rejected": -338.5703125, "loss": 0.002, "losses/dpo": 2.6970950784743764e-05, "losses/sft": 0.5669020414352417, "losses/total": 2.6970950784743764e-05, "ref_logps/chosen": -241.54049682617188, "ref_logps/rejected": -236.59568786621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5654078722000122, "rewards/margins": 9.632054328918457, "rewards/rejected": -10.19746208190918, "step": 1373 }, { "epoch": 0.33, "learning_rate": 1.4896e-07, "logps/chosen": -219.7584228515625, "logps/rejected": -297.95001220703125, "loss": 0.0166, "losses/dpo": 1.3092512745060958e-05, "losses/sft": 0.5878767371177673, "losses/total": 1.3092512745060958e-05, "ref_logps/chosen": -213.0048828125, "ref_logps/rejected": -200.85433959960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.6753565073013306, "rewards/margins": 9.034208297729492, "rewards/rejected": -9.709566116333008, "step": 1374 }, { "epoch": 0.33, "learning_rate": 1.4890666666666668e-07, "logps/chosen": -213.73904418945312, "logps/rejected": -306.9911804199219, "loss": 0.0054, "losses/dpo": 8.85876687561904e-08, "losses/sft": 0.690064549446106, "losses/total": 8.85876687561904e-08, "ref_logps/chosen": -209.3993682861328, "ref_logps/rejected": -214.84815979003906, "rewards/accuracies": 1.0, "rewards/chosen": -0.43396899104118347, "rewards/margins": 8.7803316116333, "rewards/rejected": -9.214300155639648, "step": 1375 }, { "epoch": 0.33, "learning_rate": 1.4885333333333333e-07, "logps/chosen": -234.08889770507812, "logps/rejected": -292.6962585449219, "loss": 0.0071, "losses/dpo": 3.289341066192719e-07, "losses/sft": 0.5247437357902527, "losses/total": 3.289341066192719e-07, "ref_logps/chosen": -228.41006469726562, "ref_logps/rejected": -194.78598022460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.5678839683532715, "rewards/margins": 9.22314453125, "rewards/rejected": -9.79102897644043, "step": 1376 }, { "epoch": 0.33, "learning_rate": 1.4879999999999998e-07, "logps/chosen": -267.45111083984375, "logps/rejected": -373.0047912597656, "loss": 0.0025, "losses/dpo": 1.024039647745667e-06, "losses/sft": 0.44401848316192627, "losses/total": 1.024039647745667e-06, "ref_logps/chosen": -261.8041076660156, "ref_logps/rejected": -261.0214538574219, "rewards/accuracies": 1.0, "rewards/chosen": -0.5647010803222656, "rewards/margins": 10.63363265991211, "rewards/rejected": -11.198334693908691, "step": 1377 }, { "epoch": 0.33, "learning_rate": 1.4874666666666666e-07, "logps/chosen": -258.9595947265625, "logps/rejected": -384.92578125, "loss": 0.0005, "losses/dpo": 1.6388908363751398e-07, "losses/sft": 0.7235451936721802, "losses/total": 1.6388908363751398e-07, "ref_logps/chosen": -252.34963989257812, "ref_logps/rejected": -269.93115234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.660993754863739, "rewards/margins": 10.838469505310059, "rewards/rejected": -11.499463081359863, "step": 1378 }, { "epoch": 0.33, "learning_rate": 1.4869333333333334e-07, "logps/chosen": -260.44024658203125, "logps/rejected": -317.8164367675781, "loss": 0.0034, "losses/dpo": 4.1179500840371475e-05, "losses/sft": 0.8553674817085266, "losses/total": 4.1179500840371475e-05, "ref_logps/chosen": -252.1997833251953, "ref_logps/rejected": -210.5656280517578, "rewards/accuracies": 1.0, "rewards/chosen": -0.8240471482276917, "rewards/margins": 9.901033401489258, "rewards/rejected": -10.725079536437988, "step": 1379 }, { "epoch": 0.33, "learning_rate": 1.4863999999999999e-07, "logps/chosen": -271.73468017578125, "logps/rejected": -356.5469055175781, "loss": 0.0194, "losses/dpo": 1.1646251323327306e-06, "losses/sft": 0.7742062211036682, "losses/total": 1.1646251323327306e-06, "ref_logps/chosen": -262.07916259765625, "ref_logps/rejected": -251.80026245117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9655534625053406, "rewards/margins": 9.509109497070312, "rewards/rejected": -10.474663734436035, "step": 1380 }, { "epoch": 0.33, "learning_rate": 1.4858666666666666e-07, "logps/chosen": -192.8662567138672, "logps/rejected": -298.69805908203125, "loss": 0.0021, "losses/dpo": 5.970950951450504e-07, "losses/sft": 0.5095474720001221, "losses/total": 5.970950951450504e-07, "ref_logps/chosen": -186.58189392089844, "ref_logps/rejected": -204.48397827148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.6284350156784058, "rewards/margins": 8.792973518371582, "rewards/rejected": -9.421407699584961, "step": 1381 }, { "epoch": 0.33, "learning_rate": 1.4853333333333334e-07, "logps/chosen": -249.7978973388672, "logps/rejected": -342.45294189453125, "loss": 0.0078, "losses/dpo": 7.745430963268518e-09, "losses/sft": 0.5017475485801697, "losses/total": 7.745430963268518e-09, "ref_logps/chosen": -244.49339294433594, "ref_logps/rejected": -234.02267456054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5304479598999023, "rewards/margins": 10.312580108642578, "rewards/rejected": -10.843027114868164, "step": 1382 }, { "epoch": 0.33, "learning_rate": 1.4848e-07, "logps/chosen": -229.30255126953125, "logps/rejected": -298.4708557128906, "loss": 0.0085, "losses/dpo": 6.1672230913245585e-06, "losses/sft": 0.9385590553283691, "losses/total": 6.1672230913245585e-06, "ref_logps/chosen": -223.30038452148438, "ref_logps/rejected": -206.22625732421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6002172231674194, "rewards/margins": 8.624244689941406, "rewards/rejected": -9.22446060180664, "step": 1383 }, { "epoch": 0.33, "learning_rate": 1.4842666666666667e-07, "logps/chosen": -227.7900848388672, "logps/rejected": -323.6591796875, "loss": 0.0021, "losses/dpo": 1.3744927855441347e-05, "losses/sft": 0.5810624361038208, "losses/total": 1.3744927855441347e-05, "ref_logps/chosen": -223.41427612304688, "ref_logps/rejected": -214.5009765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.43758076429367065, "rewards/margins": 10.478240013122559, "rewards/rejected": -10.91582202911377, "step": 1384 }, { "epoch": 0.33, "learning_rate": 1.4837333333333332e-07, "logps/chosen": -227.75570678710938, "logps/rejected": -312.79437255859375, "loss": 0.0041, "losses/dpo": 7.480200292775407e-05, "losses/sft": 0.6348578333854675, "losses/total": 7.480200292775407e-05, "ref_logps/chosen": -222.07748413085938, "ref_logps/rejected": -215.8572235107422, "rewards/accuracies": 1.0, "rewards/chosen": -0.5678222179412842, "rewards/margins": 9.125892639160156, "rewards/rejected": -9.693714141845703, "step": 1385 }, { "epoch": 0.33, "learning_rate": 1.4832e-07, "logps/chosen": -240.6046600341797, "logps/rejected": -315.52825927734375, "loss": 0.0012, "losses/dpo": 3.751873407509265e-07, "losses/sft": 1.0399531126022339, "losses/total": 3.751873407509265e-07, "ref_logps/chosen": -235.87313842773438, "ref_logps/rejected": -215.51113891601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.47315338253974915, "rewards/margins": 9.528556823730469, "rewards/rejected": -10.001710891723633, "step": 1386 }, { "epoch": 0.33, "learning_rate": 1.4826666666666664e-07, "logps/chosen": -244.64169311523438, "logps/rejected": -335.7666931152344, "loss": 0.0134, "losses/dpo": 1.0215466090812697e-06, "losses/sft": 0.6966581344604492, "losses/total": 1.0215466090812697e-06, "ref_logps/chosen": -238.19161987304688, "ref_logps/rejected": -221.78868103027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6450071334838867, "rewards/margins": 10.75279426574707, "rewards/rejected": -11.397801399230957, "step": 1387 }, { "epoch": 0.33, "learning_rate": 1.4821333333333332e-07, "logps/chosen": -227.23150634765625, "logps/rejected": -304.40533447265625, "loss": 0.0166, "losses/dpo": 5.537429714763675e-08, "losses/sft": 0.5162010788917542, "losses/total": 5.537429714763675e-08, "ref_logps/chosen": -220.5857696533203, "ref_logps/rejected": -211.07980346679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.6645753383636475, "rewards/margins": 8.667976379394531, "rewards/rejected": -9.332551002502441, "step": 1388 }, { "epoch": 0.33, "learning_rate": 1.4816e-07, "logps/chosen": -190.0219268798828, "logps/rejected": -308.26654052734375, "loss": 0.0049, "losses/dpo": 2.9550101316999644e-06, "losses/sft": 0.7551262378692627, "losses/total": 2.9550101316999644e-06, "ref_logps/chosen": -186.40609741210938, "ref_logps/rejected": -208.3712158203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.3615827262401581, "rewards/margins": 9.627948760986328, "rewards/rejected": -9.989530563354492, "step": 1389 }, { "epoch": 0.33, "learning_rate": 1.4810666666666667e-07, "logps/chosen": -247.000244140625, "logps/rejected": -353.6350402832031, "loss": 0.005, "losses/dpo": 8.068331567301357e-07, "losses/sft": 0.5652164816856384, "losses/total": 8.068331567301357e-07, "ref_logps/chosen": -242.5013427734375, "ref_logps/rejected": -241.12451171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4498903751373291, "rewards/margins": 10.801165580749512, "rewards/rejected": -11.251055717468262, "step": 1390 }, { "epoch": 0.33, "learning_rate": 1.4805333333333332e-07, "logps/chosen": -205.4481658935547, "logps/rejected": -289.79998779296875, "loss": 0.0252, "losses/dpo": 6.814025965695691e-08, "losses/sft": 0.9705914258956909, "losses/total": 6.814025965695691e-08, "ref_logps/chosen": -199.9121551513672, "ref_logps/rejected": -198.01193237304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5536011457443237, "rewards/margins": 8.625202178955078, "rewards/rejected": -9.178802490234375, "step": 1391 }, { "epoch": 0.33, "learning_rate": 1.48e-07, "logps/chosen": -206.9844970703125, "logps/rejected": -308.33392333984375, "loss": 0.0031, "losses/dpo": 0.004176395479589701, "losses/sft": 0.5403367280960083, "losses/total": 0.004176395479589701, "ref_logps/chosen": -201.36911010742188, "ref_logps/rejected": -216.16748046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5615397691726685, "rewards/margins": 8.655104637145996, "rewards/rejected": -9.216644287109375, "step": 1392 }, { "epoch": 0.33, "learning_rate": 1.4794666666666665e-07, "logps/chosen": -226.6364288330078, "logps/rejected": -324.74090576171875, "loss": 0.0081, "losses/dpo": 1.7276817061429028e-06, "losses/sft": 0.47629937529563904, "losses/total": 1.7276817061429028e-06, "ref_logps/chosen": -221.72357177734375, "ref_logps/rejected": -223.32994079589844, "rewards/accuracies": 1.0, "rewards/chosen": -0.4912845492362976, "rewards/margins": 9.649810791015625, "rewards/rejected": -10.141096115112305, "step": 1393 }, { "epoch": 0.33, "learning_rate": 1.4789333333333333e-07, "logps/chosen": -218.7001953125, "logps/rejected": -301.35272216796875, "loss": 0.0079, "losses/dpo": 1.7264881080336636e-07, "losses/sft": 0.5018095374107361, "losses/total": 1.7264881080336636e-07, "ref_logps/chosen": -210.98728942871094, "ref_logps/rejected": -206.63865661621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.7712904810905457, "rewards/margins": 8.700117111206055, "rewards/rejected": -9.471406936645508, "step": 1394 }, { "epoch": 0.33, "learning_rate": 1.4783999999999998e-07, "logps/chosen": -245.23153686523438, "logps/rejected": -341.9892272949219, "loss": 0.0092, "losses/dpo": 1.3967011227578041e-06, "losses/sft": 0.8042377829551697, "losses/total": 1.3967011227578041e-06, "ref_logps/chosen": -237.6697540283203, "ref_logps/rejected": -236.67742919921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7561777830123901, "rewards/margins": 9.77500057220459, "rewards/rejected": -10.53117847442627, "step": 1395 }, { "epoch": 0.34, "learning_rate": 1.4778666666666666e-07, "logps/chosen": -246.2534942626953, "logps/rejected": -323.0847473144531, "loss": 0.0034, "losses/dpo": 4.23372803197708e-05, "losses/sft": 0.7828828692436218, "losses/total": 4.23372803197708e-05, "ref_logps/chosen": -239.45782470703125, "ref_logps/rejected": -219.66859436035156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6795660853385925, "rewards/margins": 9.662050247192383, "rewards/rejected": -10.3416166305542, "step": 1396 }, { "epoch": 0.34, "learning_rate": 1.4773333333333333e-07, "logps/chosen": -183.57412719726562, "logps/rejected": -290.72119140625, "loss": 0.0076, "losses/dpo": 7.135131454560906e-06, "losses/sft": 0.47452545166015625, "losses/total": 7.135131454560906e-06, "ref_logps/chosen": -179.52926635742188, "ref_logps/rejected": -190.98529052734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.40448638796806335, "rewards/margins": 9.56910514831543, "rewards/rejected": -9.973592758178711, "step": 1397 }, { "epoch": 0.34, "learning_rate": 1.4767999999999998e-07, "logps/chosen": -225.97128295898438, "logps/rejected": -313.9520568847656, "loss": 0.0189, "losses/dpo": 3.2163657124328893e-06, "losses/sft": 0.48592886328697205, "losses/total": 3.2163657124328893e-06, "ref_logps/chosen": -219.92416381835938, "ref_logps/rejected": -214.449462890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.604712963104248, "rewards/margins": 9.34554672241211, "rewards/rejected": -9.950260162353516, "step": 1398 }, { "epoch": 0.34, "learning_rate": 1.4762666666666666e-07, "logps/chosen": -282.30523681640625, "logps/rejected": -363.6171875, "loss": 0.0103, "losses/dpo": 1.9395977801650588e-07, "losses/sft": 0.881365180015564, "losses/total": 1.9395977801650588e-07, "ref_logps/chosen": -276.4660949707031, "ref_logps/rejected": -255.8371124267578, "rewards/accuracies": 1.0, "rewards/chosen": -0.5839158892631531, "rewards/margins": 10.194091796875, "rewards/rejected": -10.778007507324219, "step": 1399 }, { "epoch": 0.34, "learning_rate": 1.4757333333333334e-07, "logps/chosen": -252.45338439941406, "logps/rejected": -322.43536376953125, "loss": 0.0048, "losses/dpo": 0.0007537341443821788, "losses/sft": 0.6713389754295349, "losses/total": 0.0007537341443821788, "ref_logps/chosen": -244.38134765625, "ref_logps/rejected": -224.3592529296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8072034120559692, "rewards/margins": 9.000408172607422, "rewards/rejected": -9.807612419128418, "step": 1400 }, { "epoch": 0.34, "learning_rate": 1.4752e-07, "logps/chosen": -290.375244140625, "logps/rejected": -369.4338684082031, "loss": 0.0166, "losses/dpo": 3.3654256981208164e-07, "losses/sft": 0.5755988359451294, "losses/total": 3.3654256981208164e-07, "ref_logps/chosen": -283.0096130371094, "ref_logps/rejected": -250.4342498779297, "rewards/accuracies": 1.0, "rewards/chosen": -0.736564576625824, "rewards/margins": 11.163396835327148, "rewards/rejected": -11.899961471557617, "step": 1401 }, { "epoch": 0.34, "learning_rate": 1.4746666666666664e-07, "logps/chosen": -201.8477020263672, "logps/rejected": -302.2115783691406, "loss": 0.0189, "losses/dpo": 2.8666436264757067e-05, "losses/sft": 0.5956442356109619, "losses/total": 2.8666436264757067e-05, "ref_logps/chosen": -196.76947021484375, "ref_logps/rejected": -205.2494659423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.5078233480453491, "rewards/margins": 9.188386917114258, "rewards/rejected": -9.696211814880371, "step": 1402 }, { "epoch": 0.34, "learning_rate": 1.4741333333333331e-07, "logps/chosen": -218.92724609375, "logps/rejected": -326.93817138671875, "loss": 0.003, "losses/dpo": 1.8229708302897052e-06, "losses/sft": 0.5809467434883118, "losses/total": 1.8229708302897052e-06, "ref_logps/chosen": -214.09603881835938, "ref_logps/rejected": -223.99440002441406, "rewards/accuracies": 1.0, "rewards/chosen": -0.4831210970878601, "rewards/margins": 9.811254501342773, "rewards/rejected": -10.294374465942383, "step": 1403 }, { "epoch": 0.34, "learning_rate": 1.4736e-07, "logps/chosen": -258.2540283203125, "logps/rejected": -355.65997314453125, "loss": 0.0018, "losses/dpo": 3.668509862109204e-06, "losses/sft": 0.5465099811553955, "losses/total": 3.668509862109204e-06, "ref_logps/chosen": -251.37667846679688, "ref_logps/rejected": -251.00648498535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6877383589744568, "rewards/margins": 9.777606964111328, "rewards/rejected": -10.46534538269043, "step": 1404 }, { "epoch": 0.34, "learning_rate": 1.4730666666666667e-07, "logps/chosen": -216.09568786621094, "logps/rejected": -319.7093505859375, "loss": 0.0102, "losses/dpo": 1.9349796431811228e-08, "losses/sft": 0.5509861707687378, "losses/total": 1.9349796431811228e-08, "ref_logps/chosen": -213.35455322265625, "ref_logps/rejected": -219.9917449951172, "rewards/accuracies": 1.0, "rewards/chosen": -0.27411457896232605, "rewards/margins": 9.697647094726562, "rewards/rejected": -9.971761703491211, "step": 1405 }, { "epoch": 0.34, "learning_rate": 1.4725333333333332e-07, "logps/chosen": -231.06814575195312, "logps/rejected": -327.9761962890625, "loss": 0.011, "losses/dpo": 6.027102017469588e-08, "losses/sft": 0.7009085416793823, "losses/total": 6.027102017469588e-08, "ref_logps/chosen": -224.63995361328125, "ref_logps/rejected": -221.14889526367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.6428195238113403, "rewards/margins": 10.039910316467285, "rewards/rejected": -10.682729721069336, "step": 1406 }, { "epoch": 0.34, "learning_rate": 1.472e-07, "logps/chosen": -198.86013793945312, "logps/rejected": -305.11065673828125, "loss": 0.005, "losses/dpo": 3.4414027183515827e-09, "losses/sft": 0.5905900597572327, "losses/total": 3.4414027183515827e-09, "ref_logps/chosen": -194.38502502441406, "ref_logps/rejected": -213.23057556152344, "rewards/accuracies": 1.0, "rewards/chosen": -0.44751065969467163, "rewards/margins": 8.740495681762695, "rewards/rejected": -9.188007354736328, "step": 1407 }, { "epoch": 0.34, "learning_rate": 1.4714666666666667e-07, "logps/chosen": -190.73699951171875, "logps/rejected": -298.34393310546875, "loss": 0.0041, "losses/dpo": 0.0025460217148065567, "losses/sft": 0.49678361415863037, "losses/total": 0.0025460217148065567, "ref_logps/chosen": -187.507568359375, "ref_logps/rejected": -200.79928588867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.3229426443576813, "rewards/margins": 9.431520462036133, "rewards/rejected": -9.754463195800781, "step": 1408 }, { "epoch": 0.34, "learning_rate": 1.4709333333333335e-07, "logps/chosen": -228.0816650390625, "logps/rejected": -317.5784606933594, "loss": 0.0016, "losses/dpo": 2.414169557596324e-07, "losses/sft": 0.9243432283401489, "losses/total": 2.414169557596324e-07, "ref_logps/chosen": -222.62387084960938, "ref_logps/rejected": -212.05673217773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5457786321640015, "rewards/margins": 10.006396293640137, "rewards/rejected": -10.55217456817627, "step": 1409 }, { "epoch": 0.34, "learning_rate": 1.4704e-07, "logps/chosen": -209.8715057373047, "logps/rejected": -314.6310729980469, "loss": 0.0084, "losses/dpo": 0.002437870018184185, "losses/sft": 0.6618226170539856, "losses/total": 0.002437870018184185, "ref_logps/chosen": -204.41595458984375, "ref_logps/rejected": -220.12814331054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5455543994903564, "rewards/margins": 8.904739379882812, "rewards/rejected": -9.450292587280273, "step": 1410 }, { "epoch": 0.34, "learning_rate": 1.4698666666666665e-07, "logps/chosen": -219.9127655029297, "logps/rejected": -319.072265625, "loss": 0.0157, "losses/dpo": 1.5378509488073178e-05, "losses/sft": 0.8130466341972351, "losses/total": 1.5378509488073178e-05, "ref_logps/chosen": -213.3060302734375, "ref_logps/rejected": -218.2029266357422, "rewards/accuracies": 1.0, "rewards/chosen": -0.6606738567352295, "rewards/margins": 9.426263809204102, "rewards/rejected": -10.086936950683594, "step": 1411 }, { "epoch": 0.34, "learning_rate": 1.4693333333333333e-07, "logps/chosen": -257.4957275390625, "logps/rejected": -338.45849609375, "loss": 0.0027, "losses/dpo": 4.697237272921484e-06, "losses/sft": 0.5715996623039246, "losses/total": 4.697237272921484e-06, "ref_logps/chosen": -250.46343994140625, "ref_logps/rejected": -228.156005859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.703227162361145, "rewards/margins": 10.32702350616455, "rewards/rejected": -11.030250549316406, "step": 1412 }, { "epoch": 0.34, "learning_rate": 1.4688e-07, "logps/chosen": -227.3498077392578, "logps/rejected": -320.8736572265625, "loss": 0.006, "losses/dpo": 1.0904839200520655e-06, "losses/sft": 0.5297654271125793, "losses/total": 1.0904839200520655e-06, "ref_logps/chosen": -220.7515411376953, "ref_logps/rejected": -222.3735809326172, "rewards/accuracies": 1.0, "rewards/chosen": -0.6598272323608398, "rewards/margins": 9.190179824829102, "rewards/rejected": -9.850007057189941, "step": 1413 }, { "epoch": 0.34, "learning_rate": 1.4682666666666665e-07, "logps/chosen": -239.26089477539062, "logps/rejected": -353.3898010253906, "loss": 0.003, "losses/dpo": 3.9250804206858447e-07, "losses/sft": 0.47071871161460876, "losses/total": 3.9250804206858447e-07, "ref_logps/chosen": -233.9123077392578, "ref_logps/rejected": -242.4366455078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.534855842590332, "rewards/margins": 10.56046199798584, "rewards/rejected": -11.095317840576172, "step": 1414 }, { "epoch": 0.34, "learning_rate": 1.4677333333333333e-07, "logps/chosen": -188.21243286132812, "logps/rejected": -278.20355224609375, "loss": 0.0141, "losses/dpo": 6.587196548935026e-05, "losses/sft": 0.4548940360546112, "losses/total": 6.587196548935026e-05, "ref_logps/chosen": -185.127685546875, "ref_logps/rejected": -194.3684844970703, "rewards/accuracies": 1.0, "rewards/chosen": -0.3084757328033447, "rewards/margins": 8.075031280517578, "rewards/rejected": -8.383506774902344, "step": 1415 }, { "epoch": 0.34, "learning_rate": 1.4672e-07, "logps/chosen": -236.94711303710938, "logps/rejected": -304.28143310546875, "loss": 0.0088, "losses/dpo": 1.0650811418599915e-05, "losses/sft": 0.8863261342048645, "losses/total": 1.0650811418599915e-05, "ref_logps/chosen": -229.82717895507812, "ref_logps/rejected": -205.15768432617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7119917869567871, "rewards/margins": 9.200384140014648, "rewards/rejected": -9.912376403808594, "step": 1416 }, { "epoch": 0.34, "learning_rate": 1.4666666666666666e-07, "logps/chosen": -215.87684631347656, "logps/rejected": -317.4160461425781, "loss": 0.0055, "losses/dpo": 0.0001632666535442695, "losses/sft": 0.7035332918167114, "losses/total": 0.0001632666535442695, "ref_logps/chosen": -211.17105102539062, "ref_logps/rejected": -216.02468872070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.47057968378067017, "rewards/margins": 9.668559074401855, "rewards/rejected": -10.139138221740723, "step": 1417 }, { "epoch": 0.34, "learning_rate": 1.4661333333333333e-07, "logps/chosen": -212.53549194335938, "logps/rejected": -303.6109619140625, "loss": 0.0075, "losses/dpo": 9.248578862752765e-07, "losses/sft": 0.6680861115455627, "losses/total": 9.248578862752765e-07, "ref_logps/chosen": -206.64599609375, "ref_logps/rejected": -203.39813232421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5889511108398438, "rewards/margins": 9.432332038879395, "rewards/rejected": -10.021283149719238, "step": 1418 }, { "epoch": 0.34, "learning_rate": 1.4655999999999998e-07, "logps/chosen": -220.75338745117188, "logps/rejected": -307.9001159667969, "loss": 0.0106, "losses/dpo": 6.836598186055198e-05, "losses/sft": 0.5866129398345947, "losses/total": 6.836598186055198e-05, "ref_logps/chosen": -213.02523803710938, "ref_logps/rejected": -211.17379760742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7728152871131897, "rewards/margins": 8.899816513061523, "rewards/rejected": -9.67263126373291, "step": 1419 }, { "epoch": 0.34, "learning_rate": 1.4650666666666666e-07, "logps/chosen": -275.6752014160156, "logps/rejected": -333.82354736328125, "loss": 0.009, "losses/dpo": 0.00036748452112078667, "losses/sft": 0.4837472140789032, "losses/total": 0.00036748452112078667, "ref_logps/chosen": -268.6759033203125, "ref_logps/rejected": -232.21710205078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6999331116676331, "rewards/margins": 9.460711479187012, "rewards/rejected": -10.16064453125, "step": 1420 }, { "epoch": 0.34, "learning_rate": 1.464533333333333e-07, "logps/chosen": -263.4283752441406, "logps/rejected": -312.1733093261719, "loss": 0.0111, "losses/dpo": 9.699951624497771e-05, "losses/sft": 0.547009289264679, "losses/total": 9.699951624497771e-05, "ref_logps/chosen": -257.53546142578125, "ref_logps/rejected": -213.4813690185547, "rewards/accuracies": 1.0, "rewards/chosen": -0.5892921686172485, "rewards/margins": 9.279902458190918, "rewards/rejected": -9.869194030761719, "step": 1421 }, { "epoch": 0.34, "learning_rate": 1.464e-07, "logps/chosen": -233.66629028320312, "logps/rejected": -310.92730712890625, "loss": 0.0128, "losses/dpo": 9.554316875437507e-07, "losses/sft": 0.5335099697113037, "losses/total": 9.554316875437507e-07, "ref_logps/chosen": -229.48971557617188, "ref_logps/rejected": -210.9820556640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4176557660102844, "rewards/margins": 9.576872825622559, "rewards/rejected": -9.994528770446777, "step": 1422 }, { "epoch": 0.34, "learning_rate": 1.4634666666666666e-07, "logps/chosen": -257.6926574707031, "logps/rejected": -316.01812744140625, "loss": 0.0028, "losses/dpo": 3.2933389348954734e-08, "losses/sft": 1.0025657415390015, "losses/total": 3.2933389348954734e-08, "ref_logps/chosen": -250.50686645507812, "ref_logps/rejected": -219.12649536132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.718576967716217, "rewards/margins": 8.970588684082031, "rewards/rejected": -9.689165115356445, "step": 1423 }, { "epoch": 0.34, "learning_rate": 1.4629333333333334e-07, "logps/chosen": -222.94081115722656, "logps/rejected": -304.22210693359375, "loss": 0.0122, "losses/dpo": 0.00025044457288458943, "losses/sft": 0.508246898651123, "losses/total": 0.00025044457288458943, "ref_logps/chosen": -216.97926330566406, "ref_logps/rejected": -213.48974609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5961551666259766, "rewards/margins": 8.477084159851074, "rewards/rejected": -9.07323932647705, "step": 1424 }, { "epoch": 0.34, "learning_rate": 1.4624e-07, "logps/chosen": -211.33238220214844, "logps/rejected": -293.65533447265625, "loss": 0.0081, "losses/dpo": 7.309407834554804e-08, "losses/sft": 1.0720634460449219, "losses/total": 7.309407834554804e-08, "ref_logps/chosen": -207.31207275390625, "ref_logps/rejected": -201.5588836669922, "rewards/accuracies": 1.0, "rewards/chosen": -0.40203219652175903, "rewards/margins": 8.807612419128418, "rewards/rejected": -9.20964527130127, "step": 1425 }, { "epoch": 0.34, "learning_rate": 1.4618666666666667e-07, "logps/chosen": -239.98486328125, "logps/rejected": -320.8073425292969, "loss": 0.0022, "losses/dpo": 0.001624310272745788, "losses/sft": 0.6859579086303711, "losses/total": 0.001624310272745788, "ref_logps/chosen": -233.5338592529297, "ref_logps/rejected": -223.6310577392578, "rewards/accuracies": 1.0, "rewards/chosen": -0.6451014876365662, "rewards/margins": 9.072526931762695, "rewards/rejected": -9.717628479003906, "step": 1426 }, { "epoch": 0.34, "learning_rate": 1.4613333333333332e-07, "logps/chosen": -250.49063110351562, "logps/rejected": -353.6320495605469, "loss": 0.0077, "losses/dpo": 1.2110034504075884e-06, "losses/sft": 0.4823315441608429, "losses/total": 1.2110034504075884e-06, "ref_logps/chosen": -240.64987182617188, "ref_logps/rejected": -239.94815063476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9840735197067261, "rewards/margins": 10.384315490722656, "rewards/rejected": -11.368389129638672, "step": 1427 }, { "epoch": 0.34, "learning_rate": 1.4608e-07, "logps/chosen": -238.51571655273438, "logps/rejected": -310.5906982421875, "loss": 0.0065, "losses/dpo": 1.6474497215313022e-06, "losses/sft": 0.5723968744277954, "losses/total": 1.6474497215313022e-06, "ref_logps/chosen": -233.6089324951172, "ref_logps/rejected": -203.3208465576172, "rewards/accuracies": 1.0, "rewards/chosen": -0.4906788766384125, "rewards/margins": 10.236308097839355, "rewards/rejected": -10.7269868850708, "step": 1428 }, { "epoch": 0.34, "learning_rate": 1.4602666666666665e-07, "logps/chosen": -244.91827392578125, "logps/rejected": -344.94390869140625, "loss": 0.0063, "losses/dpo": 2.966422925965162e-06, "losses/sft": 0.4837718605995178, "losses/total": 2.966422925965162e-06, "ref_logps/chosen": -239.55764770507812, "ref_logps/rejected": -242.9442138671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.5360649228096008, "rewards/margins": 9.663904190063477, "rewards/rejected": -10.199968338012695, "step": 1429 }, { "epoch": 0.34, "learning_rate": 1.4597333333333332e-07, "logps/chosen": -226.67837524414062, "logps/rejected": -309.166748046875, "loss": 0.0246, "losses/dpo": 0.00015017236000858247, "losses/sft": 0.5626065731048584, "losses/total": 0.00015017236000858247, "ref_logps/chosen": -221.06141662597656, "ref_logps/rejected": -202.68319702148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5616952180862427, "rewards/margins": 10.086663246154785, "rewards/rejected": -10.648358345031738, "step": 1430 }, { "epoch": 0.34, "learning_rate": 1.4592e-07, "logps/chosen": -212.03289794921875, "logps/rejected": -323.694580078125, "loss": 0.0111, "losses/dpo": 1.985797553061275e-06, "losses/sft": 0.5399279594421387, "losses/total": 1.985797553061275e-06, "ref_logps/chosen": -206.5635528564453, "ref_logps/rejected": -211.0391082763672, "rewards/accuracies": 1.0, "rewards/chosen": -0.5469346046447754, "rewards/margins": 10.718612670898438, "rewards/rejected": -11.265546798706055, "step": 1431 }, { "epoch": 0.34, "learning_rate": 1.4586666666666665e-07, "logps/chosen": -196.55291748046875, "logps/rejected": -311.8075866699219, "loss": 0.0025, "losses/dpo": 2.254584978800267e-05, "losses/sft": 0.5309191942214966, "losses/total": 2.254584978800267e-05, "ref_logps/chosen": -190.97097778320312, "ref_logps/rejected": -206.3571319580078, "rewards/accuracies": 1.0, "rewards/chosen": -0.558194637298584, "rewards/margins": 9.986852645874023, "rewards/rejected": -10.545047760009766, "step": 1432 }, { "epoch": 0.34, "learning_rate": 1.4581333333333333e-07, "logps/chosen": -246.53443908691406, "logps/rejected": -320.748779296875, "loss": 0.0044, "losses/dpo": 1.3396593168124582e-08, "losses/sft": 0.6043810844421387, "losses/total": 1.3396593168124582e-08, "ref_logps/chosen": -240.22222900390625, "ref_logps/rejected": -217.8199462890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6312206983566284, "rewards/margins": 9.661659240722656, "rewards/rejected": -10.292879104614258, "step": 1433 }, { "epoch": 0.34, "learning_rate": 1.4576e-07, "logps/chosen": -223.0400390625, "logps/rejected": -342.60748291015625, "loss": 0.0063, "losses/dpo": 6.333117994472559e-07, "losses/sft": 0.6458010673522949, "losses/total": 6.333117994472559e-07, "ref_logps/chosen": -215.02639770507812, "ref_logps/rejected": -238.65260314941406, "rewards/accuracies": 1.0, "rewards/chosen": -0.8013646602630615, "rewards/margins": 9.594125747680664, "rewards/rejected": -10.395490646362305, "step": 1434 }, { "epoch": 0.34, "learning_rate": 1.4570666666666668e-07, "logps/chosen": -213.1579132080078, "logps/rejected": -338.8310546875, "loss": 0.0065, "losses/dpo": 9.475101592215651e-07, "losses/sft": 0.5614927411079407, "losses/total": 9.475101592215651e-07, "ref_logps/chosen": -205.34298706054688, "ref_logps/rejected": -230.34719848632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7814919948577881, "rewards/margins": 10.066890716552734, "rewards/rejected": -10.848382949829102, "step": 1435 }, { "epoch": 0.34, "learning_rate": 1.4565333333333333e-07, "logps/chosen": -212.17832946777344, "logps/rejected": -284.8927307128906, "loss": 0.0252, "losses/dpo": 3.573608466922451e-07, "losses/sft": 0.6383061408996582, "losses/total": 3.573608466922451e-07, "ref_logps/chosen": -207.60467529296875, "ref_logps/rejected": -195.42196655273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.45736393332481384, "rewards/margins": 8.489712715148926, "rewards/rejected": -8.947076797485352, "step": 1436 }, { "epoch": 0.34, "learning_rate": 1.4559999999999998e-07, "logps/chosen": -297.0775451660156, "logps/rejected": -377.64013671875, "loss": 0.0022, "losses/dpo": 2.941005732282065e-07, "losses/sft": 0.5119259357452393, "losses/total": 2.941005732282065e-07, "ref_logps/chosen": -288.8704833984375, "ref_logps/rejected": -254.8582305908203, "rewards/accuracies": 1.0, "rewards/chosen": -0.8207057118415833, "rewards/margins": 11.457483291625977, "rewards/rejected": -12.278188705444336, "step": 1437 }, { "epoch": 0.35, "learning_rate": 1.4554666666666666e-07, "logps/chosen": -252.49871826171875, "logps/rejected": -343.63800048828125, "loss": 0.0076, "losses/dpo": 2.4612534232915095e-08, "losses/sft": 0.8855902552604675, "losses/total": 2.4612534232915095e-08, "ref_logps/chosen": -245.14102172851562, "ref_logps/rejected": -229.6265869140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.735770046710968, "rewards/margins": 10.665369033813477, "rewards/rejected": -11.401138305664062, "step": 1438 }, { "epoch": 0.35, "learning_rate": 1.4549333333333333e-07, "logps/chosen": -219.16741943359375, "logps/rejected": -327.2855224609375, "loss": 0.0127, "losses/dpo": 0.0006991161499172449, "losses/sft": 0.48705172538757324, "losses/total": 0.0006991161499172449, "ref_logps/chosen": -210.84619140625, "ref_logps/rejected": -226.29647827148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8321226239204407, "rewards/margins": 9.266780853271484, "rewards/rejected": -10.09890365600586, "step": 1439 }, { "epoch": 0.35, "learning_rate": 1.4543999999999998e-07, "logps/chosen": -237.37240600585938, "logps/rejected": -386.46209716796875, "loss": 0.003, "losses/dpo": 2.6544712608544963e-12, "losses/sft": 0.6845740079879761, "losses/total": 2.6544712608544963e-12, "ref_logps/chosen": -230.480712890625, "ref_logps/rejected": -260.390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6891684532165527, "rewards/margins": 11.917978286743164, "rewards/rejected": -12.607145309448242, "step": 1440 }, { "epoch": 0.35, "learning_rate": 1.4538666666666666e-07, "logps/chosen": -260.29449462890625, "logps/rejected": -358.50714111328125, "loss": 0.0075, "losses/dpo": 1.4424228538700845e-05, "losses/sft": 0.6305594444274902, "losses/total": 1.4424228538700845e-05, "ref_logps/chosen": -251.57484436035156, "ref_logps/rejected": -254.61468505859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8719642758369446, "rewards/margins": 9.517280578613281, "rewards/rejected": -10.389245986938477, "step": 1441 }, { "epoch": 0.35, "learning_rate": 1.4533333333333334e-07, "logps/chosen": -258.00213623046875, "logps/rejected": -337.5716552734375, "loss": 0.0161, "losses/dpo": 1.1071784683736041e-05, "losses/sft": 0.46021926403045654, "losses/total": 1.1071784683736041e-05, "ref_logps/chosen": -248.19869995117188, "ref_logps/rejected": -228.71798706054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.9803428649902344, "rewards/margins": 9.905023574829102, "rewards/rejected": -10.885366439819336, "step": 1442 }, { "epoch": 0.35, "learning_rate": 1.4528000000000001e-07, "logps/chosen": -207.7420196533203, "logps/rejected": -330.0391845703125, "loss": 0.0009, "losses/dpo": 1.2150613656558562e-05, "losses/sft": 0.5180935263633728, "losses/total": 1.2150613656558562e-05, "ref_logps/chosen": -199.99212646484375, "ref_logps/rejected": -223.36697387695312, "rewards/accuracies": 1.0, "rewards/chosen": -0.7749890685081482, "rewards/margins": 9.892231941223145, "rewards/rejected": -10.667221069335938, "step": 1443 }, { "epoch": 0.35, "learning_rate": 1.4522666666666666e-07, "logps/chosen": -233.486083984375, "logps/rejected": -352.0641174316406, "loss": 0.0128, "losses/dpo": 1.1145240932819434e-05, "losses/sft": 0.9884800910949707, "losses/total": 1.1145240932819434e-05, "ref_logps/chosen": -227.3741912841797, "ref_logps/rejected": -244.83950805664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.6111904382705688, "rewards/margins": 10.111271858215332, "rewards/rejected": -10.722461700439453, "step": 1444 }, { "epoch": 0.35, "learning_rate": 1.4517333333333332e-07, "logps/chosen": -182.13214111328125, "logps/rejected": -303.51654052734375, "loss": 0.0041, "losses/dpo": 8.196855196729302e-05, "losses/sft": 0.5659289360046387, "losses/total": 8.196855196729302e-05, "ref_logps/chosen": -176.83070373535156, "ref_logps/rejected": -207.10829162597656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5301447510719299, "rewards/margins": 9.110679626464844, "rewards/rejected": -9.640825271606445, "step": 1445 }, { "epoch": 0.35, "learning_rate": 1.4512e-07, "logps/chosen": -240.314208984375, "logps/rejected": -314.7841796875, "loss": 0.0053, "losses/dpo": 4.179699317319319e-06, "losses/sft": 0.6071019172668457, "losses/total": 4.179699317319319e-06, "ref_logps/chosen": -233.65325927734375, "ref_logps/rejected": -209.7173309326172, "rewards/accuracies": 1.0, "rewards/chosen": -0.6660956144332886, "rewards/margins": 9.840585708618164, "rewards/rejected": -10.506681442260742, "step": 1446 }, { "epoch": 0.35, "learning_rate": 1.4506666666666667e-07, "logps/chosen": -201.727783203125, "logps/rejected": -310.60845947265625, "loss": 0.0062, "losses/dpo": 5.7833658502204344e-05, "losses/sft": 0.5350317358970642, "losses/total": 5.7833658502204344e-05, "ref_logps/chosen": -196.07546997070312, "ref_logps/rejected": -206.53585815429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5652338266372681, "rewards/margins": 9.84202766418457, "rewards/rejected": -10.40726089477539, "step": 1447 }, { "epoch": 0.35, "learning_rate": 1.4501333333333332e-07, "logps/chosen": -233.847412109375, "logps/rejected": -337.3805847167969, "loss": 0.0017, "losses/dpo": 2.970784862554865e-07, "losses/sft": 0.618369460105896, "losses/total": 2.970784862554865e-07, "ref_logps/chosen": -224.44195556640625, "ref_logps/rejected": -230.94345092773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.9405454397201538, "rewards/margins": 9.703166961669922, "rewards/rejected": -10.643712997436523, "step": 1448 }, { "epoch": 0.35, "learning_rate": 1.4496e-07, "logps/chosen": -259.2063293457031, "logps/rejected": -319.96826171875, "loss": 0.0098, "losses/dpo": 3.8930500068090623e-07, "losses/sft": 0.5846023559570312, "losses/total": 3.8930500068090623e-07, "ref_logps/chosen": -252.65219116210938, "ref_logps/rejected": -215.52699279785156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6554161906242371, "rewards/margins": 9.788711547851562, "rewards/rejected": -10.444128036499023, "step": 1449 }, { "epoch": 0.35, "learning_rate": 1.4490666666666667e-07, "logps/chosen": -235.39064025878906, "logps/rejected": -327.94830322265625, "loss": 0.001, "losses/dpo": 5.774276360170916e-06, "losses/sft": 0.40449386835098267, "losses/total": 5.774276360170916e-06, "ref_logps/chosen": -228.15869140625, "ref_logps/rejected": -221.74635314941406, "rewards/accuracies": 1.0, "rewards/chosen": -0.7231958508491516, "rewards/margins": 9.89699935913086, "rewards/rejected": -10.620195388793945, "step": 1450 }, { "epoch": 0.35, "learning_rate": 1.4485333333333332e-07, "logps/chosen": -210.03720092773438, "logps/rejected": -281.6422424316406, "loss": 0.0118, "losses/dpo": 3.234253017581068e-05, "losses/sft": 0.6210136413574219, "losses/total": 3.234253017581068e-05, "ref_logps/chosen": -201.6185302734375, "ref_logps/rejected": -195.73223876953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8418682813644409, "rewards/margins": 7.749133110046387, "rewards/rejected": -8.591001510620117, "step": 1451 }, { "epoch": 0.35, "learning_rate": 1.448e-07, "logps/chosen": -252.65890502929688, "logps/rejected": -359.6452331542969, "loss": 0.0027, "losses/dpo": 2.9825488923052035e-07, "losses/sft": 0.4466572701931, "losses/total": 2.9825488923052035e-07, "ref_logps/chosen": -246.36683654785156, "ref_logps/rejected": -246.39706420898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.6292061805725098, "rewards/margins": 10.695610046386719, "rewards/rejected": -11.324816703796387, "step": 1452 }, { "epoch": 0.35, "learning_rate": 1.4474666666666665e-07, "logps/chosen": -257.3125915527344, "logps/rejected": -348.47802734375, "loss": 0.0012, "losses/dpo": 1.5831919881748036e-06, "losses/sft": 0.7457790374755859, "losses/total": 1.5831919881748036e-06, "ref_logps/chosen": -252.38670349121094, "ref_logps/rejected": -233.8638153076172, "rewards/accuracies": 1.0, "rewards/chosen": -0.49258893728256226, "rewards/margins": 10.968830108642578, "rewards/rejected": -11.461419105529785, "step": 1453 }, { "epoch": 0.35, "learning_rate": 1.4469333333333333e-07, "logps/chosen": -222.0502166748047, "logps/rejected": -310.92962646484375, "loss": 0.0091, "losses/dpo": 2.2354531665769173e-06, "losses/sft": 0.5629758834838867, "losses/total": 2.2354531665769173e-06, "ref_logps/chosen": -214.9460906982422, "ref_logps/rejected": -209.7250213623047, "rewards/accuracies": 1.0, "rewards/chosen": -0.7104127407073975, "rewards/margins": 9.41004753112793, "rewards/rejected": -10.120460510253906, "step": 1454 }, { "epoch": 0.35, "learning_rate": 1.4463999999999998e-07, "logps/chosen": -192.57325744628906, "logps/rejected": -318.1990661621094, "loss": 0.0091, "losses/dpo": 3.937297606171342e-06, "losses/sft": 0.5091164708137512, "losses/total": 3.937297606171342e-06, "ref_logps/chosen": -185.1997833251953, "ref_logps/rejected": -219.37814331054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.737348735332489, "rewards/margins": 9.144742965698242, "rewards/rejected": -9.882091522216797, "step": 1455 }, { "epoch": 0.35, "learning_rate": 1.4458666666666665e-07, "logps/chosen": -259.0926513671875, "logps/rejected": -348.51336669921875, "loss": 0.008, "losses/dpo": 1.715666257950943e-06, "losses/sft": 0.493832528591156, "losses/total": 1.715666257950943e-06, "ref_logps/chosen": -253.25506591796875, "ref_logps/rejected": -234.8424072265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5837606191635132, "rewards/margins": 10.783336639404297, "rewards/rejected": -11.367095947265625, "step": 1456 }, { "epoch": 0.35, "learning_rate": 1.4453333333333333e-07, "logps/chosen": -217.6943359375, "logps/rejected": -271.29010009765625, "loss": 0.0132, "losses/dpo": 0.0022544756066054106, "losses/sft": 0.5640822052955627, "losses/total": 0.0022544756066054106, "ref_logps/chosen": -211.96112060546875, "ref_logps/rejected": -186.53904724121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.5733213424682617, "rewards/margins": 7.901782035827637, "rewards/rejected": -8.475103378295898, "step": 1457 }, { "epoch": 0.35, "learning_rate": 1.4448e-07, "logps/chosen": -247.9178466796875, "logps/rejected": -358.721435546875, "loss": 0.0037, "losses/dpo": 8.345326023118105e-06, "losses/sft": 0.7447223663330078, "losses/total": 8.345326023118105e-06, "ref_logps/chosen": -243.0440673828125, "ref_logps/rejected": -253.33114624023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.4873790144920349, "rewards/margins": 10.05164909362793, "rewards/rejected": -10.53902816772461, "step": 1458 }, { "epoch": 0.35, "learning_rate": 1.4442666666666666e-07, "logps/chosen": -220.2882843017578, "logps/rejected": -318.8404235839844, "loss": 0.0124, "losses/dpo": 1.968584051326161e-08, "losses/sft": 0.6808552145957947, "losses/total": 1.968584051326161e-08, "ref_logps/chosen": -213.8272705078125, "ref_logps/rejected": -215.1128692626953, "rewards/accuracies": 1.0, "rewards/chosen": -0.646101713180542, "rewards/margins": 9.726655006408691, "rewards/rejected": -10.372756958007812, "step": 1459 }, { "epoch": 0.35, "learning_rate": 1.4437333333333333e-07, "logps/chosen": -268.8165283203125, "logps/rejected": -336.8822021484375, "loss": 0.0009, "losses/dpo": 1.1878507955032092e-08, "losses/sft": 0.4603321850299835, "losses/total": 1.1878507955032092e-08, "ref_logps/chosen": -263.67083740234375, "ref_logps/rejected": -223.53480529785156, "rewards/accuracies": 1.0, "rewards/chosen": -0.5145692825317383, "rewards/margins": 10.820169448852539, "rewards/rejected": -11.334739685058594, "step": 1460 }, { "epoch": 0.35, "learning_rate": 1.4431999999999998e-07, "logps/chosen": -224.57333374023438, "logps/rejected": -331.98199462890625, "loss": 0.0055, "losses/dpo": 2.356266850256361e-05, "losses/sft": 0.5844607949256897, "losses/total": 2.356266850256361e-05, "ref_logps/chosen": -214.86834716796875, "ref_logps/rejected": -230.43011474609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9704992771148682, "rewards/margins": 9.184691429138184, "rewards/rejected": -10.155190467834473, "step": 1461 }, { "epoch": 0.35, "learning_rate": 1.4426666666666666e-07, "logps/chosen": -252.06192016601562, "logps/rejected": -332.75445556640625, "loss": 0.0067, "losses/dpo": 0.0005925782606936991, "losses/sft": 0.41766002774238586, "losses/total": 0.0005925782606936991, "ref_logps/chosen": -246.38787841796875, "ref_logps/rejected": -229.96392822265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5674034953117371, "rewards/margins": 9.711651802062988, "rewards/rejected": -10.279054641723633, "step": 1462 }, { "epoch": 0.35, "learning_rate": 1.442133333333333e-07, "logps/chosen": -234.4608154296875, "logps/rejected": -298.03863525390625, "loss": 0.0117, "losses/dpo": 0.0037546942476183176, "losses/sft": 0.649596095085144, "losses/total": 0.0037546942476183176, "ref_logps/chosen": -227.79434204101562, "ref_logps/rejected": -204.7378387451172, "rewards/accuracies": 1.0, "rewards/chosen": -0.6666460633277893, "rewards/margins": 8.663431167602539, "rewards/rejected": -9.330076217651367, "step": 1463 }, { "epoch": 0.35, "learning_rate": 1.4416e-07, "logps/chosen": -246.34725952148438, "logps/rejected": -323.44482421875, "loss": 0.0083, "losses/dpo": 6.315297650871798e-05, "losses/sft": 0.6308480501174927, "losses/total": 6.315297650871798e-05, "ref_logps/chosen": -238.71661376953125, "ref_logps/rejected": -218.51959228515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7630626559257507, "rewards/margins": 9.729460716247559, "rewards/rejected": -10.492523193359375, "step": 1464 }, { "epoch": 0.35, "learning_rate": 1.4410666666666667e-07, "logps/chosen": -270.3271484375, "logps/rejected": -348.28643798828125, "loss": 0.0038, "losses/dpo": 1.2689125696851988e-06, "losses/sft": 0.8431488275527954, "losses/total": 1.2689125696851988e-06, "ref_logps/chosen": -265.0185546875, "ref_logps/rejected": -244.21522521972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.5308613181114197, "rewards/margins": 9.87625789642334, "rewards/rejected": -10.407118797302246, "step": 1465 }, { "epoch": 0.35, "learning_rate": 1.4405333333333334e-07, "logps/chosen": -221.2462158203125, "logps/rejected": -350.1430358886719, "loss": 0.0036, "losses/dpo": 0.00017833837773650885, "losses/sft": 0.6260501742362976, "losses/total": 0.00017833837773650885, "ref_logps/chosen": -216.46644592285156, "ref_logps/rejected": -237.20684814453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.47797685861587524, "rewards/margins": 10.815644264221191, "rewards/rejected": -11.293621063232422, "step": 1466 }, { "epoch": 0.35, "learning_rate": 1.44e-07, "logps/chosen": -253.61947631835938, "logps/rejected": -350.2420654296875, "loss": 0.0013, "losses/dpo": 1.4783730875933543e-05, "losses/sft": 0.6133124828338623, "losses/total": 1.4783730875933543e-05, "ref_logps/chosen": -246.69033813476562, "ref_logps/rejected": -237.99697875976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6929130554199219, "rewards/margins": 10.531598091125488, "rewards/rejected": -11.224512100219727, "step": 1467 }, { "epoch": 0.35, "learning_rate": 1.4394666666666667e-07, "logps/chosen": -271.99810791015625, "logps/rejected": -344.9281311035156, "loss": 0.0041, "losses/dpo": 4.9018385652743746e-06, "losses/sft": 0.6664102077484131, "losses/total": 4.9018385652743746e-06, "ref_logps/chosen": -264.1300354003906, "ref_logps/rejected": -242.3258514404297, "rewards/accuracies": 1.0, "rewards/chosen": -0.7868065237998962, "rewards/margins": 9.473420143127441, "rewards/rejected": -10.260226249694824, "step": 1468 }, { "epoch": 0.35, "learning_rate": 1.4389333333333335e-07, "logps/chosen": -217.64840698242188, "logps/rejected": -326.0164794921875, "loss": 0.0078, "losses/dpo": 0.0001409163960488513, "losses/sft": 0.7112181186676025, "losses/total": 0.0001409163960488513, "ref_logps/chosen": -213.941650390625, "ref_logps/rejected": -232.3435516357422, "rewards/accuracies": 1.0, "rewards/chosen": -0.3706788718700409, "rewards/margins": 8.996617317199707, "rewards/rejected": -9.367295265197754, "step": 1469 }, { "epoch": 0.35, "learning_rate": 1.4384e-07, "logps/chosen": -222.2257080078125, "logps/rejected": -339.9845886230469, "loss": 0.0032, "losses/dpo": 1.9996687115053646e-05, "losses/sft": 0.5202810168266296, "losses/total": 1.9996687115053646e-05, "ref_logps/chosen": -217.0369110107422, "ref_logps/rejected": -231.02554321289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5188787579536438, "rewards/margins": 10.37702751159668, "rewards/rejected": -10.895906448364258, "step": 1470 }, { "epoch": 0.35, "learning_rate": 1.4378666666666665e-07, "logps/chosen": -174.61851501464844, "logps/rejected": -275.8467102050781, "loss": 0.0209, "losses/dpo": 1.4417448255699128e-05, "losses/sft": 0.6139398813247681, "losses/total": 1.4417448255699128e-05, "ref_logps/chosen": -168.23867797851562, "ref_logps/rejected": -188.06439208984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6379849314689636, "rewards/margins": 8.14024543762207, "rewards/rejected": -8.778229713439941, "step": 1471 }, { "epoch": 0.35, "learning_rate": 1.4373333333333332e-07, "logps/chosen": -240.7684783935547, "logps/rejected": -318.9577331542969, "loss": 0.0083, "losses/dpo": 0.004330902360379696, "losses/sft": 0.6805931329727173, "losses/total": 0.004330902360379696, "ref_logps/chosen": -234.7780303955078, "ref_logps/rejected": -220.54981994628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.599044919013977, "rewards/margins": 9.241745948791504, "rewards/rejected": -9.840790748596191, "step": 1472 }, { "epoch": 0.35, "learning_rate": 1.4368e-07, "logps/chosen": -204.7841033935547, "logps/rejected": -350.958984375, "loss": 0.0028, "losses/dpo": 3.1365865993393527e-07, "losses/sft": 0.5322738289833069, "losses/total": 3.1365865993393527e-07, "ref_logps/chosen": -200.33676147460938, "ref_logps/rejected": -241.2957305908203, "rewards/accuracies": 1.0, "rewards/chosen": -0.4447348117828369, "rewards/margins": 10.52159309387207, "rewards/rejected": -10.966327667236328, "step": 1473 }, { "epoch": 0.35, "learning_rate": 1.4362666666666665e-07, "logps/chosen": -259.1224365234375, "logps/rejected": -372.37457275390625, "loss": 0.0215, "losses/dpo": 1.0632178693015248e-08, "losses/sft": 0.5868707299232483, "losses/total": 1.0632178693015248e-08, "ref_logps/chosen": -251.0389404296875, "ref_logps/rejected": -250.43148803710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8083509206771851, "rewards/margins": 11.385958671569824, "rewards/rejected": -12.19430923461914, "step": 1474 }, { "epoch": 0.35, "learning_rate": 1.4357333333333333e-07, "logps/chosen": -259.83001708984375, "logps/rejected": -345.9395751953125, "loss": 0.0107, "losses/dpo": 1.9171552594343666e-06, "losses/sft": 0.7650961875915527, "losses/total": 1.9171552594343666e-06, "ref_logps/chosen": -252.67251586914062, "ref_logps/rejected": -235.96029663085938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7157505750656128, "rewards/margins": 10.282175064086914, "rewards/rejected": -10.997926712036133, "step": 1475 }, { "epoch": 0.35, "learning_rate": 1.4352e-07, "logps/chosen": -236.74192810058594, "logps/rejected": -335.04180908203125, "loss": 0.0095, "losses/dpo": 5.005034608984715e-09, "losses/sft": 0.5908104181289673, "losses/total": 5.005034608984715e-09, "ref_logps/chosen": -231.27688598632812, "ref_logps/rejected": -222.61392211914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5465042591094971, "rewards/margins": 10.696287155151367, "rewards/rejected": -11.242792129516602, "step": 1476 }, { "epoch": 0.35, "learning_rate": 1.4346666666666668e-07, "logps/chosen": -229.33087158203125, "logps/rejected": -304.4952392578125, "loss": 0.0042, "losses/dpo": 1.8505716070649214e-06, "losses/sft": 0.6942999362945557, "losses/total": 1.8505716070649214e-06, "ref_logps/chosen": -223.37777709960938, "ref_logps/rejected": -201.98150634765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5953083038330078, "rewards/margins": 9.656062126159668, "rewards/rejected": -10.25136947631836, "step": 1477 }, { "epoch": 0.35, "learning_rate": 1.4341333333333333e-07, "logps/chosen": -223.10427856445312, "logps/rejected": -331.0835266113281, "loss": 0.0213, "losses/dpo": 1.2678596249315888e-05, "losses/sft": 0.5518537759780884, "losses/total": 1.2678596249315888e-05, "ref_logps/chosen": -214.82781982421875, "ref_logps/rejected": -228.94461059570312, "rewards/accuracies": 1.0, "rewards/chosen": -0.8276466131210327, "rewards/margins": 9.38624382019043, "rewards/rejected": -10.21389102935791, "step": 1478 }, { "epoch": 0.35, "learning_rate": 1.4335999999999998e-07, "logps/chosen": -222.74142456054688, "logps/rejected": -315.46551513671875, "loss": 0.0081, "losses/dpo": 1.1272002211626386e-06, "losses/sft": 0.6505651473999023, "losses/total": 1.1272002211626386e-06, "ref_logps/chosen": -215.65084838867188, "ref_logps/rejected": -210.27484130859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7090578079223633, "rewards/margins": 9.810007095336914, "rewards/rejected": -10.519065856933594, "step": 1479 }, { "epoch": 0.36, "learning_rate": 1.4330666666666666e-07, "logps/chosen": -221.96063232421875, "logps/rejected": -318.70263671875, "loss": 0.0022, "losses/dpo": 1.3495889561454533e-07, "losses/sft": 0.7006264328956604, "losses/total": 1.3495889561454533e-07, "ref_logps/chosen": -214.79222106933594, "ref_logps/rejected": -212.44253540039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.7168385982513428, "rewards/margins": 9.909174919128418, "rewards/rejected": -10.62601375579834, "step": 1480 }, { "epoch": 0.36, "learning_rate": 1.4325333333333333e-07, "logps/chosen": -225.74989318847656, "logps/rejected": -335.6937255859375, "loss": 0.0097, "losses/dpo": 4.5021130063105375e-05, "losses/sft": 0.4766395688056946, "losses/total": 4.5021130063105375e-05, "ref_logps/chosen": -219.06240844726562, "ref_logps/rejected": -236.2933349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6687490344047546, "rewards/margins": 9.271289825439453, "rewards/rejected": -9.94003963470459, "step": 1481 }, { "epoch": 0.36, "learning_rate": 1.4319999999999999e-07, "logps/chosen": -266.0519714355469, "logps/rejected": -352.47979736328125, "loss": 0.0175, "losses/dpo": 1.5144564713409636e-05, "losses/sft": 0.7283242344856262, "losses/total": 1.5144564713409636e-05, "ref_logps/chosen": -255.2186737060547, "ref_logps/rejected": -240.75753784179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.0833289623260498, "rewards/margins": 10.088899612426758, "rewards/rejected": -11.17222785949707, "step": 1482 }, { "epoch": 0.36, "learning_rate": 1.4314666666666666e-07, "logps/chosen": -233.2843017578125, "logps/rejected": -311.14849853515625, "loss": 0.0097, "losses/dpo": 1.0387559996161144e-05, "losses/sft": 0.6665297150611877, "losses/total": 1.0387559996161144e-05, "ref_logps/chosen": -226.94088745117188, "ref_logps/rejected": -213.2476806640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6343424916267395, "rewards/margins": 9.155740737915039, "rewards/rejected": -9.790083885192871, "step": 1483 }, { "epoch": 0.36, "learning_rate": 1.4309333333333334e-07, "logps/chosen": -215.77862548828125, "logps/rejected": -335.7238464355469, "loss": 0.001, "losses/dpo": 0.0005777772748842835, "losses/sft": 0.631263017654419, "losses/total": 0.0005777772748842835, "ref_logps/chosen": -212.16305541992188, "ref_logps/rejected": -225.69554138183594, "rewards/accuracies": 1.0, "rewards/chosen": -0.36155664920806885, "rewards/margins": 10.641273498535156, "rewards/rejected": -11.002830505371094, "step": 1484 }, { "epoch": 0.36, "learning_rate": 1.4304e-07, "logps/chosen": -226.22036743164062, "logps/rejected": -350.22509765625, "loss": 0.0012, "losses/dpo": 0.0022386498749256134, "losses/sft": 0.5453124642372131, "losses/total": 0.0022386498749256134, "ref_logps/chosen": -219.79421997070312, "ref_logps/rejected": -235.5967254638672, "rewards/accuracies": 1.0, "rewards/chosen": -0.6426164507865906, "rewards/margins": 10.82022476196289, "rewards/rejected": -11.462841033935547, "step": 1485 }, { "epoch": 0.36, "learning_rate": 1.4298666666666667e-07, "logps/chosen": -216.23866271972656, "logps/rejected": -321.682861328125, "loss": 0.0036, "losses/dpo": 7.826442924852017e-06, "losses/sft": 0.3811686336994171, "losses/total": 7.826442924852017e-06, "ref_logps/chosen": -210.23294067382812, "ref_logps/rejected": -215.76377868652344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6005726456642151, "rewards/margins": 9.9913330078125, "rewards/rejected": -10.59190559387207, "step": 1486 }, { "epoch": 0.36, "learning_rate": 1.4293333333333332e-07, "logps/chosen": -254.4693603515625, "logps/rejected": -330.5685729980469, "loss": 0.0127, "losses/dpo": 3.782362455240218e-06, "losses/sft": 0.5082125663757324, "losses/total": 3.782362455240218e-06, "ref_logps/chosen": -249.34286499023438, "ref_logps/rejected": -224.0869598388672, "rewards/accuracies": 1.0, "rewards/chosen": -0.5126489400863647, "rewards/margins": 10.13551139831543, "rewards/rejected": -10.648160934448242, "step": 1487 }, { "epoch": 0.36, "learning_rate": 1.4288e-07, "logps/chosen": -240.77291870117188, "logps/rejected": -360.09423828125, "loss": 0.0024, "losses/dpo": 1.2246522373970947e-07, "losses/sft": 0.4284639358520508, "losses/total": 1.2246522373970947e-07, "ref_logps/chosen": -232.203125, "ref_logps/rejected": -246.55003356933594, "rewards/accuracies": 1.0, "rewards/chosen": -0.8569800853729248, "rewards/margins": 10.497442245483398, "rewards/rejected": -11.354421615600586, "step": 1488 }, { "epoch": 0.36, "learning_rate": 1.4282666666666664e-07, "logps/chosen": -206.07192993164062, "logps/rejected": -319.885009765625, "loss": 0.0041, "losses/dpo": 6.998494427534752e-06, "losses/sft": 0.6927555203437805, "losses/total": 6.998494427534752e-06, "ref_logps/chosen": -200.3787841796875, "ref_logps/rejected": -212.8870849609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5693140029907227, "rewards/margins": 10.130477905273438, "rewards/rejected": -10.699792861938477, "step": 1489 }, { "epoch": 0.36, "learning_rate": 1.4277333333333332e-07, "logps/chosen": -184.76646423339844, "logps/rejected": -317.686767578125, "loss": 0.0142, "losses/dpo": 9.062659955816343e-06, "losses/sft": 0.48922356963157654, "losses/total": 9.062659955816343e-06, "ref_logps/chosen": -180.48983764648438, "ref_logps/rejected": -220.48129272460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.4276616871356964, "rewards/margins": 9.29288387298584, "rewards/rejected": -9.720545768737793, "step": 1490 }, { "epoch": 0.36, "learning_rate": 1.4272e-07, "logps/chosen": -244.09947204589844, "logps/rejected": -348.50732421875, "loss": 0.0035, "losses/dpo": 3.2653775861035683e-07, "losses/sft": 0.5626056790351868, "losses/total": 3.2653775861035683e-07, "ref_logps/chosen": -238.33917236328125, "ref_logps/rejected": -233.57568359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.576030433177948, "rewards/margins": 10.917131423950195, "rewards/rejected": -11.493162155151367, "step": 1491 }, { "epoch": 0.36, "learning_rate": 1.4266666666666667e-07, "logps/chosen": -240.6680908203125, "logps/rejected": -314.60540771484375, "loss": 0.0092, "losses/dpo": 0.00012675618927460164, "losses/sft": 0.8592473268508911, "losses/total": 0.00012675618927460164, "ref_logps/chosen": -232.87188720703125, "ref_logps/rejected": -214.8539581298828, "rewards/accuracies": 1.0, "rewards/chosen": -0.7796199917793274, "rewards/margins": 9.19552993774414, "rewards/rejected": -9.975149154663086, "step": 1492 }, { "epoch": 0.36, "learning_rate": 1.4261333333333332e-07, "logps/chosen": -205.99411010742188, "logps/rejected": -341.9647521972656, "loss": 0.0049, "losses/dpo": 6.011837285768706e-06, "losses/sft": 0.5229544639587402, "losses/total": 6.011837285768706e-06, "ref_logps/chosen": -200.22850036621094, "ref_logps/rejected": -226.142822265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5765622854232788, "rewards/margins": 11.005631446838379, "rewards/rejected": -11.582194328308105, "step": 1493 }, { "epoch": 0.36, "learning_rate": 1.4256e-07, "logps/chosen": -274.6077880859375, "logps/rejected": -323.2153015136719, "loss": 0.0038, "losses/dpo": 1.347559361875028e-07, "losses/sft": 0.6077597737312317, "losses/total": 1.347559361875028e-07, "ref_logps/chosen": -266.0644836425781, "ref_logps/rejected": -218.95318603515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8543321490287781, "rewards/margins": 9.571880340576172, "rewards/rejected": -10.426212310791016, "step": 1494 }, { "epoch": 0.36, "learning_rate": 1.4250666666666665e-07, "logps/chosen": -225.5706787109375, "logps/rejected": -301.88494873046875, "loss": 0.007, "losses/dpo": 0.010037797503173351, "losses/sft": 0.9123319387435913, "losses/total": 0.010037797503173351, "ref_logps/chosen": -218.26959228515625, "ref_logps/rejected": -205.42010498046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7301077842712402, "rewards/margins": 8.916374206542969, "rewards/rejected": -9.646482467651367, "step": 1495 }, { "epoch": 0.36, "learning_rate": 1.4245333333333333e-07, "logps/chosen": -220.91534423828125, "logps/rejected": -332.08587646484375, "loss": 0.0256, "losses/dpo": 6.910443630658847e-07, "losses/sft": 0.6760058403015137, "losses/total": 6.910443630658847e-07, "ref_logps/chosen": -214.59547424316406, "ref_logps/rejected": -232.6492462158203, "rewards/accuracies": 1.0, "rewards/chosen": -0.6319862604141235, "rewards/margins": 9.311677932739258, "rewards/rejected": -9.94366455078125, "step": 1496 }, { "epoch": 0.36, "learning_rate": 1.4239999999999998e-07, "logps/chosen": -240.8857879638672, "logps/rejected": -325.31756591796875, "loss": 0.0131, "losses/dpo": 4.2624860725481994e-06, "losses/sft": 0.46396559476852417, "losses/total": 4.2624860725481994e-06, "ref_logps/chosen": -235.22906494140625, "ref_logps/rejected": -226.9189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5656735897064209, "rewards/margins": 9.274191856384277, "rewards/rejected": -9.839864730834961, "step": 1497 }, { "epoch": 0.36, "learning_rate": 1.4234666666666665e-07, "logps/chosen": -197.61019897460938, "logps/rejected": -305.1240234375, "loss": 0.0208, "losses/dpo": 2.1308490527616186e-09, "losses/sft": 0.4814634323120117, "losses/total": 2.1308490527616186e-09, "ref_logps/chosen": -193.49081420898438, "ref_logps/rejected": -202.6080780029297, "rewards/accuracies": 1.0, "rewards/chosen": -0.41193726658821106, "rewards/margins": 9.839656829833984, "rewards/rejected": -10.251593589782715, "step": 1498 }, { "epoch": 0.36, "learning_rate": 1.4229333333333333e-07, "logps/chosen": -211.0356903076172, "logps/rejected": -320.0618591308594, "loss": 0.0078, "losses/dpo": 7.461781024176162e-06, "losses/sft": 0.5822683572769165, "losses/total": 7.461781024176162e-06, "ref_logps/chosen": -205.3186798095703, "ref_logps/rejected": -224.87054443359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5717012882232666, "rewards/margins": 8.947431564331055, "rewards/rejected": -9.519133567810059, "step": 1499 }, { "epoch": 0.36, "learning_rate": 1.4224e-07, "logps/chosen": -238.602294921875, "logps/rejected": -322.6343994140625, "loss": 0.005, "losses/dpo": 1.4067460085698258e-07, "losses/sft": 0.5274151563644409, "losses/total": 1.4067460085698258e-07, "ref_logps/chosen": -232.65081787109375, "ref_logps/rejected": -224.81515502929688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5951484441757202, "rewards/margins": 9.186773300170898, "rewards/rejected": -9.78192138671875, "step": 1500 }, { "epoch": 0.36, "learning_rate": 1.4218666666666666e-07, "logps/chosen": -228.57583618164062, "logps/rejected": -340.29425048828125, "loss": 0.0036, "losses/dpo": 2.651187891444806e-08, "losses/sft": 0.5697094798088074, "losses/total": 2.651187891444806e-08, "ref_logps/chosen": -222.13980102539062, "ref_logps/rejected": -228.8609161376953, "rewards/accuracies": 1.0, "rewards/chosen": -0.6436043977737427, "rewards/margins": 10.49972915649414, "rewards/rejected": -11.143333435058594, "step": 1501 }, { "epoch": 0.36, "learning_rate": 1.4213333333333334e-07, "logps/chosen": -244.51361083984375, "logps/rejected": -314.909912109375, "loss": 0.0157, "losses/dpo": 2.4552771265007323e-06, "losses/sft": 0.7970927357673645, "losses/total": 2.4552771265007323e-06, "ref_logps/chosen": -238.04486083984375, "ref_logps/rejected": -215.42015075683594, "rewards/accuracies": 1.0, "rewards/chosen": -0.6468748450279236, "rewards/margins": 9.302104949951172, "rewards/rejected": -9.948979377746582, "step": 1502 }, { "epoch": 0.36, "learning_rate": 1.4208e-07, "logps/chosen": -227.15829467773438, "logps/rejected": -346.9058837890625, "loss": 0.0008, "losses/dpo": 4.5773640522384085e-06, "losses/sft": 0.6736599206924438, "losses/total": 4.5773640522384085e-06, "ref_logps/chosen": -221.35662841796875, "ref_logps/rejected": -235.18844604492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5801687836647034, "rewards/margins": 10.591578483581543, "rewards/rejected": -11.171747207641602, "step": 1503 }, { "epoch": 0.36, "learning_rate": 1.4202666666666666e-07, "logps/chosen": -227.72686767578125, "logps/rejected": -333.6646728515625, "loss": 0.0012, "losses/dpo": 1.1200883818673901e-05, "losses/sft": 0.5681672096252441, "losses/total": 1.1200883818673901e-05, "ref_logps/chosen": -221.51622009277344, "ref_logps/rejected": -225.13668823242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.6210631132125854, "rewards/margins": 10.231735229492188, "rewards/rejected": -10.852798461914062, "step": 1504 }, { "epoch": 0.36, "learning_rate": 1.419733333333333e-07, "logps/chosen": -266.3826904296875, "logps/rejected": -314.62945556640625, "loss": 0.0158, "losses/dpo": 6.45265681669116e-05, "losses/sft": 0.4253169596195221, "losses/total": 6.45265681669116e-05, "ref_logps/chosen": -259.6979064941406, "ref_logps/rejected": -207.99822998046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6684780716896057, "rewards/margins": 9.994644165039062, "rewards/rejected": -10.663122177124023, "step": 1505 }, { "epoch": 0.36, "learning_rate": 1.4192e-07, "logps/chosen": -258.39263916015625, "logps/rejected": -326.7524719238281, "loss": 0.0021, "losses/dpo": 5.80316111609136e-07, "losses/sft": 0.5520685911178589, "losses/total": 5.80316111609136e-07, "ref_logps/chosen": -251.37655639648438, "ref_logps/rejected": -222.2986297607422, "rewards/accuracies": 1.0, "rewards/chosen": -0.7016090154647827, "rewards/margins": 9.7437744140625, "rewards/rejected": -10.445383071899414, "step": 1506 }, { "epoch": 0.36, "learning_rate": 1.4186666666666667e-07, "logps/chosen": -248.0201416015625, "logps/rejected": -343.075439453125, "loss": 0.0093, "losses/dpo": 5.203523301133828e-07, "losses/sft": 1.1523702144622803, "losses/total": 5.203523301133828e-07, "ref_logps/chosen": -240.8671875, "ref_logps/rejected": -227.35430908203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7152938842773438, "rewards/margins": 10.856819152832031, "rewards/rejected": -11.572113037109375, "step": 1507 }, { "epoch": 0.36, "learning_rate": 1.4181333333333332e-07, "logps/chosen": -225.25277709960938, "logps/rejected": -339.0182800292969, "loss": 0.0017, "losses/dpo": 1.5441772802660125e-06, "losses/sft": 0.6166609525680542, "losses/total": 1.5441772802660125e-06, "ref_logps/chosen": -218.15444946289062, "ref_logps/rejected": -227.6317138671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7098323106765747, "rewards/margins": 10.428824424743652, "rewards/rejected": -11.138656616210938, "step": 1508 }, { "epoch": 0.36, "learning_rate": 1.4176e-07, "logps/chosen": -223.82907104492188, "logps/rejected": -330.11065673828125, "loss": 0.0007, "losses/dpo": 4.0211080687413414e-08, "losses/sft": 0.4971695840358734, "losses/total": 4.0211080687413414e-08, "ref_logps/chosen": -217.9696044921875, "ref_logps/rejected": -220.59072875976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.5859447717666626, "rewards/margins": 10.366047859191895, "rewards/rejected": -10.95199203491211, "step": 1509 }, { "epoch": 0.36, "learning_rate": 1.4170666666666667e-07, "logps/chosen": -273.04449462890625, "logps/rejected": -382.62274169921875, "loss": 0.0026, "losses/dpo": 7.034727786958683e-07, "losses/sft": 0.5394922494888306, "losses/total": 7.034727786958683e-07, "ref_logps/chosen": -262.2506103515625, "ref_logps/rejected": -255.12423706054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.0793877840042114, "rewards/margins": 11.670461654663086, "rewards/rejected": -12.749849319458008, "step": 1510 }, { "epoch": 0.36, "learning_rate": 1.4165333333333335e-07, "logps/chosen": -205.40533447265625, "logps/rejected": -304.77728271484375, "loss": 0.0017, "losses/dpo": 0.00017551380733493716, "losses/sft": 0.6923593878746033, "losses/total": 0.00017551380733493716, "ref_logps/chosen": -198.3401641845703, "ref_logps/rejected": -205.763671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7065176963806152, "rewards/margins": 9.194842338562012, "rewards/rejected": -9.901359558105469, "step": 1511 }, { "epoch": 0.36, "learning_rate": 1.416e-07, "logps/chosen": -204.19725036621094, "logps/rejected": -289.2185363769531, "loss": 0.0087, "losses/dpo": 0.0016182443359866738, "losses/sft": 0.6443734169006348, "losses/total": 0.0016182443359866738, "ref_logps/chosen": -199.1477508544922, "ref_logps/rejected": -186.88864135742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5049501657485962, "rewards/margins": 9.72804069519043, "rewards/rejected": -10.232990264892578, "step": 1512 }, { "epoch": 0.36, "learning_rate": 1.4154666666666665e-07, "logps/chosen": -275.0159912109375, "logps/rejected": -339.41009521484375, "loss": 0.0038, "losses/dpo": 1.313697430305183e-05, "losses/sft": 0.7045270800590515, "losses/total": 1.313697430305183e-05, "ref_logps/chosen": -267.49267578125, "ref_logps/rejected": -221.1214599609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7523311972618103, "rewards/margins": 11.076532363891602, "rewards/rejected": -11.828863143920898, "step": 1513 }, { "epoch": 0.36, "learning_rate": 1.4149333333333332e-07, "logps/chosen": -263.62255859375, "logps/rejected": -352.9039001464844, "loss": 0.0036, "losses/dpo": 7.963744792505167e-06, "losses/sft": 0.7623440027236938, "losses/total": 7.963744792505167e-06, "ref_logps/chosen": -253.68934631347656, "ref_logps/rejected": -238.82955932617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9933245778083801, "rewards/margins": 10.414111137390137, "rewards/rejected": -11.40743637084961, "step": 1514 }, { "epoch": 0.36, "learning_rate": 1.4144e-07, "logps/chosen": -191.80850219726562, "logps/rejected": -303.4111328125, "loss": 0.0056, "losses/dpo": 1.4549832485499792e-06, "losses/sft": 1.0593281984329224, "losses/total": 1.4549832485499792e-06, "ref_logps/chosen": -187.07357788085938, "ref_logps/rejected": -207.78030395507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.47349250316619873, "rewards/margins": 9.089591026306152, "rewards/rejected": -9.563082695007324, "step": 1515 }, { "epoch": 0.36, "learning_rate": 1.4138666666666665e-07, "logps/chosen": -235.71270751953125, "logps/rejected": -331.0447998046875, "loss": 0.0083, "losses/dpo": 1.0199401003774256e-05, "losses/sft": 0.7256459593772888, "losses/total": 1.0199401003774256e-05, "ref_logps/chosen": -228.17210388183594, "ref_logps/rejected": -224.147216796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7540603280067444, "rewards/margins": 9.93569564819336, "rewards/rejected": -10.689756393432617, "step": 1516 }, { "epoch": 0.36, "learning_rate": 1.4133333333333333e-07, "logps/chosen": -260.0462646484375, "logps/rejected": -325.5519104003906, "loss": 0.0313, "losses/dpo": 4.795991799255717e-07, "losses/sft": 0.5529530048370361, "losses/total": 4.795991799255717e-07, "ref_logps/chosen": -249.6402587890625, "ref_logps/rejected": -218.7259063720703, "rewards/accuracies": 1.0, "rewards/chosen": -1.040601372718811, "rewards/margins": 9.641998291015625, "rewards/rejected": -10.682600021362305, "step": 1517 }, { "epoch": 0.36, "learning_rate": 1.4128e-07, "logps/chosen": -281.5762939453125, "logps/rejected": -373.22430419921875, "loss": 0.0004, "losses/dpo": 1.2951599273947068e-05, "losses/sft": 0.6839383840560913, "losses/total": 1.2951599273947068e-05, "ref_logps/chosen": -272.9964599609375, "ref_logps/rejected": -260.3486633300781, "rewards/accuracies": 1.0, "rewards/chosen": -0.8579832315444946, "rewards/margins": 10.429584503173828, "rewards/rejected": -11.287567138671875, "step": 1518 }, { "epoch": 0.36, "learning_rate": 1.4122666666666666e-07, "logps/chosen": -225.65206909179688, "logps/rejected": -372.09075927734375, "loss": 0.0011, "losses/dpo": 3.132690835627727e-05, "losses/sft": 0.7150185704231262, "losses/total": 3.132690835627727e-05, "ref_logps/chosen": -216.65206909179688, "ref_logps/rejected": -253.97642517089844, "rewards/accuracies": 1.0, "rewards/chosen": -0.899998664855957, "rewards/margins": 10.911434173583984, "rewards/rejected": -11.811432838439941, "step": 1519 }, { "epoch": 0.36, "learning_rate": 1.4117333333333333e-07, "logps/chosen": -201.0170135498047, "logps/rejected": -307.4895935058594, "loss": 0.0049, "losses/dpo": 0.00021449766063597053, "losses/sft": 0.5972175002098083, "losses/total": 0.00021449766063597053, "ref_logps/chosen": -196.3013153076172, "ref_logps/rejected": -210.39193725585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.47156834602355957, "rewards/margins": 9.238195419311523, "rewards/rejected": -9.709762573242188, "step": 1520 }, { "epoch": 0.37, "learning_rate": 1.4111999999999998e-07, "logps/chosen": -244.40895080566406, "logps/rejected": -349.62310791015625, "loss": 0.005, "losses/dpo": 3.226129774702713e-05, "losses/sft": 0.4665714502334595, "losses/total": 3.226129774702713e-05, "ref_logps/chosen": -236.68482971191406, "ref_logps/rejected": -235.32785034179688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7724133133888245, "rewards/margins": 10.657112121582031, "rewards/rejected": -11.429524421691895, "step": 1521 }, { "epoch": 0.37, "learning_rate": 1.4106666666666666e-07, "logps/chosen": -207.7756805419922, "logps/rejected": -288.006103515625, "loss": 0.0038, "losses/dpo": 3.3151456591440365e-05, "losses/sft": 0.8299016952514648, "losses/total": 3.3151456591440365e-05, "ref_logps/chosen": -203.28619384765625, "ref_logps/rejected": -186.92897033691406, "rewards/accuracies": 1.0, "rewards/chosen": -0.44894981384277344, "rewards/margins": 9.658763885498047, "rewards/rejected": -10.10771369934082, "step": 1522 }, { "epoch": 0.37, "learning_rate": 1.410133333333333e-07, "logps/chosen": -220.35595703125, "logps/rejected": -324.23638916015625, "loss": 0.0073, "losses/dpo": 1.5877008081588428e-06, "losses/sft": 0.6830856204032898, "losses/total": 1.5877008081588428e-06, "ref_logps/chosen": -216.2698974609375, "ref_logps/rejected": -218.99285888671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.40860626101493835, "rewards/margins": 10.115744590759277, "rewards/rejected": -10.524351119995117, "step": 1523 }, { "epoch": 0.37, "learning_rate": 1.4095999999999999e-07, "logps/chosen": -205.26739501953125, "logps/rejected": -335.4703369140625, "loss": 0.0078, "losses/dpo": 1.636912566027604e-05, "losses/sft": 0.5092082619667053, "losses/total": 1.636912566027604e-05, "ref_logps/chosen": -198.69322204589844, "ref_logps/rejected": -225.4385528564453, "rewards/accuracies": 1.0, "rewards/chosen": -0.6574169993400574, "rewards/margins": 10.345763206481934, "rewards/rejected": -11.003180503845215, "step": 1524 }, { "epoch": 0.37, "learning_rate": 1.4090666666666666e-07, "logps/chosen": -223.21844482421875, "logps/rejected": -327.79327392578125, "loss": 0.0086, "losses/dpo": 2.778055296914772e-08, "losses/sft": 0.6466891169548035, "losses/total": 2.778055296914772e-08, "ref_logps/chosen": -217.74114990234375, "ref_logps/rejected": -217.2318878173828, "rewards/accuracies": 1.0, "rewards/chosen": -0.5477282404899597, "rewards/margins": 10.50840950012207, "rewards/rejected": -11.05613899230957, "step": 1525 }, { "epoch": 0.37, "learning_rate": 1.4085333333333334e-07, "logps/chosen": -217.7742156982422, "logps/rejected": -346.85321044921875, "loss": 0.0019, "losses/dpo": 4.1329801092615526e-07, "losses/sft": 0.46885618567466736, "losses/total": 4.1329801092615526e-07, "ref_logps/chosen": -212.76034545898438, "ref_logps/rejected": -244.98255920410156, "rewards/accuracies": 1.0, "rewards/chosen": -0.5013879537582397, "rewards/margins": 9.685676574707031, "rewards/rejected": -10.187065124511719, "step": 1526 }, { "epoch": 0.37, "learning_rate": 1.408e-07, "logps/chosen": -264.85113525390625, "logps/rejected": -340.79437255859375, "loss": 0.0117, "losses/dpo": 0.00012994941789656878, "losses/sft": 0.5204294323921204, "losses/total": 0.00012994941789656878, "ref_logps/chosen": -259.75372314453125, "ref_logps/rejected": -232.59384155273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5097417831420898, "rewards/margins": 10.31031322479248, "rewards/rejected": -10.82005500793457, "step": 1527 }, { "epoch": 0.37, "learning_rate": 1.4074666666666667e-07, "logps/chosen": -259.66973876953125, "logps/rejected": -353.23236083984375, "loss": 0.0012, "losses/dpo": 6.974033749429509e-05, "losses/sft": 0.7376701235771179, "losses/total": 6.974033749429509e-05, "ref_logps/chosen": -250.26858520507812, "ref_logps/rejected": -243.19017028808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.9401150941848755, "rewards/margins": 10.064105033874512, "rewards/rejected": -11.004220008850098, "step": 1528 }, { "epoch": 0.37, "learning_rate": 1.4069333333333332e-07, "logps/chosen": -242.14828491210938, "logps/rejected": -341.43463134765625, "loss": 0.0049, "losses/dpo": 2.5636356326685927e-07, "losses/sft": 0.7026312351226807, "losses/total": 2.5636356326685927e-07, "ref_logps/chosen": -233.17037963867188, "ref_logps/rejected": -230.23435974121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.8977915048599243, "rewards/margins": 10.222238540649414, "rewards/rejected": -11.12002944946289, "step": 1529 }, { "epoch": 0.37, "learning_rate": 1.4064e-07, "logps/chosen": -232.96054077148438, "logps/rejected": -335.05145263671875, "loss": 0.0095, "losses/dpo": 1.1764893315557856e-05, "losses/sft": 0.5152624845504761, "losses/total": 1.1764893315557856e-05, "ref_logps/chosen": -224.25314331054688, "ref_logps/rejected": -226.24908447265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8707382678985596, "rewards/margins": 10.009498596191406, "rewards/rejected": -10.880237579345703, "step": 1530 }, { "epoch": 0.37, "learning_rate": 1.4058666666666664e-07, "logps/chosen": -227.7447967529297, "logps/rejected": -328.4507751464844, "loss": 0.0128, "losses/dpo": 1.1161377187818289e-05, "losses/sft": 0.8186542987823486, "losses/total": 1.1161377187818289e-05, "ref_logps/chosen": -219.62429809570312, "ref_logps/rejected": -229.2470703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8120507001876831, "rewards/margins": 9.108318328857422, "rewards/rejected": -9.920368194580078, "step": 1531 }, { "epoch": 0.37, "learning_rate": 1.4053333333333332e-07, "logps/chosen": -246.36614990234375, "logps/rejected": -337.4794921875, "loss": 0.0142, "losses/dpo": 3.297173023497635e-08, "losses/sft": 0.5092642903327942, "losses/total": 3.297173023497635e-08, "ref_logps/chosen": -238.63870239257812, "ref_logps/rejected": -230.8739776611328, "rewards/accuracies": 1.0, "rewards/chosen": -0.7727440595626831, "rewards/margins": 9.887805938720703, "rewards/rejected": -10.660550117492676, "step": 1532 }, { "epoch": 0.37, "learning_rate": 1.4048e-07, "logps/chosen": -209.6246337890625, "logps/rejected": -337.2496337890625, "loss": 0.0062, "losses/dpo": 5.332714536621097e-08, "losses/sft": 1.003100872039795, "losses/total": 5.332714536621097e-08, "ref_logps/chosen": -203.3134002685547, "ref_logps/rejected": -231.02090454101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.631123423576355, "rewards/margins": 9.991748809814453, "rewards/rejected": -10.622872352600098, "step": 1533 }, { "epoch": 0.37, "learning_rate": 1.4042666666666667e-07, "logps/chosen": -259.8731384277344, "logps/rejected": -348.00677490234375, "loss": 0.0025, "losses/dpo": 1.7620986909605563e-05, "losses/sft": 0.706089973449707, "losses/total": 1.7620986909605563e-05, "ref_logps/chosen": -252.74948120117188, "ref_logps/rejected": -241.85104370117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7123661041259766, "rewards/margins": 9.903204917907715, "rewards/rejected": -10.615570068359375, "step": 1534 }, { "epoch": 0.37, "learning_rate": 1.4037333333333333e-07, "logps/chosen": -248.6246337890625, "logps/rejected": -317.7672119140625, "loss": 0.0084, "losses/dpo": 0.0004179201787337661, "losses/sft": 0.4762216806411743, "losses/total": 0.0004179201787337661, "ref_logps/chosen": -237.89266967773438, "ref_logps/rejected": -221.27206420898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0731984376907349, "rewards/margins": 8.576314926147461, "rewards/rejected": -9.649513244628906, "step": 1535 }, { "epoch": 0.37, "learning_rate": 1.4032e-07, "logps/chosen": -214.88302612304688, "logps/rejected": -349.6990966796875, "loss": 0.0009, "losses/dpo": 6.694107668181459e-08, "losses/sft": 0.527130663394928, "losses/total": 6.694107668181459e-08, "ref_logps/chosen": -208.21603393554688, "ref_logps/rejected": -235.42080688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6666967868804932, "rewards/margins": 10.761133193969727, "rewards/rejected": -11.42782974243164, "step": 1536 }, { "epoch": 0.37, "learning_rate": 1.4026666666666668e-07, "logps/chosen": -212.0635986328125, "logps/rejected": -308.80902099609375, "loss": 0.0167, "losses/dpo": 0.0005264076171442866, "losses/sft": 0.6387202739715576, "losses/total": 0.0005264076171442866, "ref_logps/chosen": -205.12522888183594, "ref_logps/rejected": -212.1842803955078, "rewards/accuracies": 1.0, "rewards/chosen": -0.6938366293907166, "rewards/margins": 8.968635559082031, "rewards/rejected": -9.662471771240234, "step": 1537 }, { "epoch": 0.37, "learning_rate": 1.4021333333333333e-07, "logps/chosen": -222.4747772216797, "logps/rejected": -313.5005798339844, "loss": 0.0075, "losses/dpo": 1.5404177702293964e-06, "losses/sft": 0.5631265640258789, "losses/total": 1.5404177702293964e-06, "ref_logps/chosen": -217.41543579101562, "ref_logps/rejected": -212.30325317382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.5059360265731812, "rewards/margins": 9.613797187805176, "rewards/rejected": -10.119733810424805, "step": 1538 }, { "epoch": 0.37, "learning_rate": 1.4015999999999998e-07, "logps/chosen": -234.02920532226562, "logps/rejected": -361.3059387207031, "loss": 0.0018, "losses/dpo": 1.1807122746176901e-07, "losses/sft": 0.7019343376159668, "losses/total": 1.1807122746176901e-07, "ref_logps/chosen": -228.72647094726562, "ref_logps/rejected": -241.6686553955078, "rewards/accuracies": 1.0, "rewards/chosen": -0.5302718281745911, "rewards/margins": 11.433456420898438, "rewards/rejected": -11.963728904724121, "step": 1539 }, { "epoch": 0.37, "learning_rate": 1.4010666666666666e-07, "logps/chosen": -256.2060241699219, "logps/rejected": -346.5419006347656, "loss": 0.0089, "losses/dpo": 1.0479838238097727e-05, "losses/sft": 0.5890544652938843, "losses/total": 1.0479838238097727e-05, "ref_logps/chosen": -247.144775390625, "ref_logps/rejected": -234.96929931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9061262607574463, "rewards/margins": 10.25113582611084, "rewards/rejected": -11.157261848449707, "step": 1540 }, { "epoch": 0.37, "learning_rate": 1.4005333333333333e-07, "logps/chosen": -213.92098999023438, "logps/rejected": -308.40191650390625, "loss": 0.0086, "losses/dpo": 1.1557724777233602e-09, "losses/sft": 0.8756837844848633, "losses/total": 1.1557724777233602e-09, "ref_logps/chosen": -206.39451599121094, "ref_logps/rejected": -205.21743774414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.752646803855896, "rewards/margins": 9.565801620483398, "rewards/rejected": -10.318449020385742, "step": 1541 }, { "epoch": 0.37, "learning_rate": 1.3999999999999998e-07, "logps/chosen": -247.05352783203125, "logps/rejected": -322.9078369140625, "loss": 0.004, "losses/dpo": 8.236509074777132e-07, "losses/sft": 0.6147072911262512, "losses/total": 8.236509074777132e-07, "ref_logps/chosen": -240.92543029785156, "ref_logps/rejected": -221.54281616210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.6128100156784058, "rewards/margins": 9.523693084716797, "rewards/rejected": -10.136503219604492, "step": 1542 }, { "epoch": 0.37, "learning_rate": 1.3994666666666666e-07, "logps/chosen": -267.3838806152344, "logps/rejected": -372.0719299316406, "loss": 0.0044, "losses/dpo": 4.7520874346673736e-08, "losses/sft": 0.5480204820632935, "losses/total": 4.7520874346673736e-08, "ref_logps/chosen": -260.49774169921875, "ref_logps/rejected": -252.53286743164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.6886155009269714, "rewards/margins": 11.265291213989258, "rewards/rejected": -11.953906059265137, "step": 1543 }, { "epoch": 0.37, "learning_rate": 1.3989333333333334e-07, "logps/chosen": -255.6183319091797, "logps/rejected": -334.8689880371094, "loss": 0.0036, "losses/dpo": 2.7458403906166495e-07, "losses/sft": 1.0701372623443604, "losses/total": 2.7458403906166495e-07, "ref_logps/chosen": -247.71978759765625, "ref_logps/rejected": -228.65174865722656, "rewards/accuracies": 1.0, "rewards/chosen": -0.7898560762405396, "rewards/margins": 9.831868171691895, "rewards/rejected": -10.621724128723145, "step": 1544 }, { "epoch": 0.37, "learning_rate": 1.3984000000000001e-07, "logps/chosen": -255.83804321289062, "logps/rejected": -359.5570983886719, "loss": 0.0075, "losses/dpo": 8.799910347079276e-07, "losses/sft": 0.715891420841217, "losses/total": 8.799910347079276e-07, "ref_logps/chosen": -250.2465057373047, "ref_logps/rejected": -240.72503662109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5591551065444946, "rewards/margins": 11.324052810668945, "rewards/rejected": -11.883207321166992, "step": 1545 }, { "epoch": 0.37, "learning_rate": 1.3978666666666666e-07, "logps/chosen": -232.672607421875, "logps/rejected": -324.7984619140625, "loss": 0.0229, "losses/dpo": 7.795975761837326e-06, "losses/sft": 0.48132947087287903, "losses/total": 7.795975761837326e-06, "ref_logps/chosen": -224.57669067382812, "ref_logps/rejected": -221.16360473632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8095927238464355, "rewards/margins": 9.553895950317383, "rewards/rejected": -10.36348819732666, "step": 1546 }, { "epoch": 0.37, "learning_rate": 1.3973333333333331e-07, "logps/chosen": -216.60174560546875, "logps/rejected": -313.26177978515625, "loss": 0.0029, "losses/dpo": 3.300307071185671e-05, "losses/sft": 0.5498090386390686, "losses/total": 3.300307071185671e-05, "ref_logps/chosen": -209.35305786132812, "ref_logps/rejected": -202.91387939453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7248697280883789, "rewards/margins": 10.309919357299805, "rewards/rejected": -11.034789085388184, "step": 1547 }, { "epoch": 0.37, "learning_rate": 1.3968e-07, "logps/chosen": -254.98794555664062, "logps/rejected": -339.9261779785156, "loss": 0.0072, "losses/dpo": 5.8814544345864306e-09, "losses/sft": 0.5060728192329407, "losses/total": 5.8814544345864306e-09, "ref_logps/chosen": -247.29666137695312, "ref_logps/rejected": -228.76205444335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7691307067871094, "rewards/margins": 10.347283363342285, "rewards/rejected": -11.116414070129395, "step": 1548 }, { "epoch": 0.37, "learning_rate": 1.3962666666666667e-07, "logps/chosen": -206.8220977783203, "logps/rejected": -303.5121154785156, "loss": 0.0052, "losses/dpo": 4.2813218215087545e-07, "losses/sft": 0.4147228002548218, "losses/total": 4.2813218215087545e-07, "ref_logps/chosen": -201.2056884765625, "ref_logps/rejected": -199.00982666015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5616393089294434, "rewards/margins": 9.888589859008789, "rewards/rejected": -10.45022964477539, "step": 1549 }, { "epoch": 0.37, "learning_rate": 1.3957333333333332e-07, "logps/chosen": -207.0588836669922, "logps/rejected": -335.2640075683594, "loss": 0.0014, "losses/dpo": 1.6463690144519205e-06, "losses/sft": 0.7087132930755615, "losses/total": 1.6463690144519205e-06, "ref_logps/chosen": -201.1288299560547, "ref_logps/rejected": -221.45480346679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5930064916610718, "rewards/margins": 10.787914276123047, "rewards/rejected": -11.38092041015625, "step": 1550 }, { "epoch": 0.37, "learning_rate": 1.3952e-07, "logps/chosen": -226.71705627441406, "logps/rejected": -327.3498229980469, "loss": 0.025, "losses/dpo": 5.267096092609336e-09, "losses/sft": 0.6170794367790222, "losses/total": 5.267096092609336e-09, "ref_logps/chosen": -220.58709716796875, "ref_logps/rejected": -223.47439575195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.6129963397979736, "rewards/margins": 9.774545669555664, "rewards/rejected": -10.387542724609375, "step": 1551 }, { "epoch": 0.37, "learning_rate": 1.3946666666666667e-07, "logps/chosen": -209.6733856201172, "logps/rejected": -325.600830078125, "loss": 0.0076, "losses/dpo": 1.848467036325019e-05, "losses/sft": 0.4913882315158844, "losses/total": 1.848467036325019e-05, "ref_logps/chosen": -204.07867431640625, "ref_logps/rejected": -221.10032653808594, "rewards/accuracies": 1.0, "rewards/chosen": -0.5594706535339355, "rewards/margins": 9.890580177307129, "rewards/rejected": -10.450050354003906, "step": 1552 }, { "epoch": 0.37, "learning_rate": 1.3941333333333332e-07, "logps/chosen": -263.6995544433594, "logps/rejected": -344.38360595703125, "loss": 0.01, "losses/dpo": 2.3992088316049376e-08, "losses/sft": 1.2388907670974731, "losses/total": 2.3992088316049376e-08, "ref_logps/chosen": -256.85430908203125, "ref_logps/rejected": -229.98101806640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6845248341560364, "rewards/margins": 10.755733489990234, "rewards/rejected": -11.440258026123047, "step": 1553 }, { "epoch": 0.37, "learning_rate": 1.3936e-07, "logps/chosen": -202.83641052246094, "logps/rejected": -320.655517578125, "loss": 0.006, "losses/dpo": 7.603508129250258e-05, "losses/sft": 0.6193313598632812, "losses/total": 7.603508129250258e-05, "ref_logps/chosen": -196.80450439453125, "ref_logps/rejected": -211.97116088867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.603190004825592, "rewards/margins": 10.265243530273438, "rewards/rejected": -10.86843490600586, "step": 1554 }, { "epoch": 0.37, "learning_rate": 1.3930666666666665e-07, "logps/chosen": -209.60226440429688, "logps/rejected": -314.0016174316406, "loss": 0.0087, "losses/dpo": 4.4812694000029296e-08, "losses/sft": 0.5390483736991882, "losses/total": 4.4812694000029296e-08, "ref_logps/chosen": -199.60446166992188, "ref_logps/rejected": -209.59100341796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9997797012329102, "rewards/margins": 9.441282272338867, "rewards/rejected": -10.441061019897461, "step": 1555 }, { "epoch": 0.37, "learning_rate": 1.3925333333333333e-07, "logps/chosen": -264.8408508300781, "logps/rejected": -385.6496276855469, "loss": 0.0077, "losses/dpo": 6.6282921373783665e-09, "losses/sft": 0.6683164238929749, "losses/total": 6.6282921373783665e-09, "ref_logps/chosen": -258.6723327636719, "ref_logps/rejected": -256.980224609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6168506741523743, "rewards/margins": 12.250089645385742, "rewards/rejected": -12.866941452026367, "step": 1556 }, { "epoch": 0.37, "learning_rate": 1.3919999999999998e-07, "logps/chosen": -255.86660766601562, "logps/rejected": -363.748779296875, "loss": 0.0325, "losses/dpo": 6.647257411174223e-09, "losses/sft": 0.7613983750343323, "losses/total": 6.647257411174223e-09, "ref_logps/chosen": -246.26527404785156, "ref_logps/rejected": -244.44093322753906, "rewards/accuracies": 1.0, "rewards/chosen": -0.9601328372955322, "rewards/margins": 10.970653533935547, "rewards/rejected": -11.930787086486816, "step": 1557 }, { "epoch": 0.37, "learning_rate": 1.3914666666666665e-07, "logps/chosen": -220.6186065673828, "logps/rejected": -359.685546875, "loss": 0.0073, "losses/dpo": 1.0018706753101014e-08, "losses/sft": 1.0214909315109253, "losses/total": 1.0018706753101014e-08, "ref_logps/chosen": -212.27235412597656, "ref_logps/rejected": -242.27069091796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8346239328384399, "rewards/margins": 10.9068603515625, "rewards/rejected": -11.741485595703125, "step": 1558 }, { "epoch": 0.37, "learning_rate": 1.3909333333333333e-07, "logps/chosen": -172.05706787109375, "logps/rejected": -300.5081787109375, "loss": 0.0062, "losses/dpo": 0.00010974781616823748, "losses/sft": 0.7716121077537537, "losses/total": 0.00010974781616823748, "ref_logps/chosen": -167.49493408203125, "ref_logps/rejected": -200.89077758789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.45621395111083984, "rewards/margins": 9.50552749633789, "rewards/rejected": -9.961740493774414, "step": 1559 }, { "epoch": 0.37, "learning_rate": 1.3904e-07, "logps/chosen": -263.64178466796875, "logps/rejected": -347.3362731933594, "loss": 0.0077, "losses/dpo": 3.565815404726891e-06, "losses/sft": 0.6273267269134521, "losses/total": 3.565815404726891e-06, "ref_logps/chosen": -253.71337890625, "ref_logps/rejected": -228.5351104736328, "rewards/accuracies": 1.0, "rewards/chosen": -0.9928382039070129, "rewards/margins": 10.887277603149414, "rewards/rejected": -11.880115509033203, "step": 1560 }, { "epoch": 0.37, "learning_rate": 1.3898666666666666e-07, "logps/chosen": -251.23263549804688, "logps/rejected": -318.9164123535156, "loss": 0.0038, "losses/dpo": 2.104215809595189e-06, "losses/sft": 0.6805963516235352, "losses/total": 2.104215809595189e-06, "ref_logps/chosen": -243.5592803955078, "ref_logps/rejected": -207.4833984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7673352956771851, "rewards/margins": 10.375967025756836, "rewards/rejected": -11.143302917480469, "step": 1561 }, { "epoch": 0.37, "learning_rate": 1.3893333333333333e-07, "logps/chosen": -261.4281921386719, "logps/rejected": -361.3069152832031, "loss": 0.0012, "losses/dpo": 4.049702795327903e-07, "losses/sft": 0.5804170966148376, "losses/total": 4.049702795327903e-07, "ref_logps/chosen": -254.212158203125, "ref_logps/rejected": -250.421630859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7216023206710815, "rewards/margins": 10.366928100585938, "rewards/rejected": -11.088529586791992, "step": 1562 }, { "epoch": 0.38, "learning_rate": 1.3888e-07, "logps/chosen": -243.7784881591797, "logps/rejected": -340.2841491699219, "loss": 0.0033, "losses/dpo": 1.5250143405864947e-06, "losses/sft": 0.9121972918510437, "losses/total": 1.5250143405864947e-06, "ref_logps/chosen": -236.95252990722656, "ref_logps/rejected": -233.0753173828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6825960278511047, "rewards/margins": 10.038289070129395, "rewards/rejected": -10.720885276794434, "step": 1563 }, { "epoch": 0.38, "learning_rate": 1.3882666666666666e-07, "logps/chosen": -205.21435546875, "logps/rejected": -327.06878662109375, "loss": 0.0054, "losses/dpo": 0.0029615722596645355, "losses/sft": 0.5059714317321777, "losses/total": 0.0029615722596645355, "ref_logps/chosen": -195.8858642578125, "ref_logps/rejected": -215.1387939453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9328485727310181, "rewards/margins": 10.260148048400879, "rewards/rejected": -11.192996978759766, "step": 1564 }, { "epoch": 0.38, "learning_rate": 1.387733333333333e-07, "logps/chosen": -266.64739990234375, "logps/rejected": -338.52783203125, "loss": 0.0046, "losses/dpo": 9.380731285091315e-07, "losses/sft": 1.2527351379394531, "losses/total": 9.380731285091315e-07, "ref_logps/chosen": -257.4656677246094, "ref_logps/rejected": -226.83973693847656, "rewards/accuracies": 1.0, "rewards/chosen": -0.9181762337684631, "rewards/margins": 10.250633239746094, "rewards/rejected": -11.16880989074707, "step": 1565 }, { "epoch": 0.38, "learning_rate": 1.3872e-07, "logps/chosen": -211.83380126953125, "logps/rejected": -379.27728271484375, "loss": 0.0088, "losses/dpo": 1.0902930114298215e-08, "losses/sft": 0.3995422124862671, "losses/total": 1.0902930114298215e-08, "ref_logps/chosen": -204.96340942382812, "ref_logps/rejected": -258.34381103515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6870391368865967, "rewards/margins": 11.406305313110352, "rewards/rejected": -12.093345642089844, "step": 1566 }, { "epoch": 0.38, "learning_rate": 1.3866666666666666e-07, "logps/chosen": -206.44216918945312, "logps/rejected": -311.9850158691406, "loss": 0.0119, "losses/dpo": 4.414115664985729e-06, "losses/sft": 0.8077772855758667, "losses/total": 4.414115664985729e-06, "ref_logps/chosen": -200.71237182617188, "ref_logps/rejected": -204.17178344726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.572981595993042, "rewards/margins": 10.20833969116211, "rewards/rejected": -10.78132152557373, "step": 1567 }, { "epoch": 0.38, "learning_rate": 1.3861333333333334e-07, "logps/chosen": -309.0090637207031, "logps/rejected": -388.24444580078125, "loss": 0.0015, "losses/dpo": 5.836267291670083e-07, "losses/sft": 0.6708596348762512, "losses/total": 5.836267291670083e-07, "ref_logps/chosen": -297.236083984375, "ref_logps/rejected": -265.32586669921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.177300214767456, "rewards/margins": 11.114559173583984, "rewards/rejected": -12.291858673095703, "step": 1568 }, { "epoch": 0.38, "learning_rate": 1.3856e-07, "logps/chosen": -256.4053955078125, "logps/rejected": -337.1115417480469, "loss": 0.0055, "losses/dpo": 2.661197129327775e-07, "losses/sft": 0.6587148904800415, "losses/total": 2.661197129327775e-07, "ref_logps/chosen": -247.09738159179688, "ref_logps/rejected": -214.44174194335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.9308009147644043, "rewards/margins": 11.33617877960205, "rewards/rejected": -12.266980171203613, "step": 1569 }, { "epoch": 0.38, "learning_rate": 1.3850666666666667e-07, "logps/chosen": -194.1435546875, "logps/rejected": -329.9981994628906, "loss": 0.0087, "losses/dpo": 8.793487722869031e-06, "losses/sft": 1.029355764389038, "losses/total": 8.793487722869031e-06, "ref_logps/chosen": -188.03976440429688, "ref_logps/rejected": -222.80917358398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.6103799343109131, "rewards/margins": 10.108521461486816, "rewards/rejected": -10.718900680541992, "step": 1570 }, { "epoch": 0.38, "learning_rate": 1.3845333333333334e-07, "logps/chosen": -281.54241943359375, "logps/rejected": -363.7070007324219, "loss": 0.0024, "losses/dpo": 1.2218849860801129e-06, "losses/sft": 0.532374382019043, "losses/total": 1.2218849860801129e-06, "ref_logps/chosen": -270.300537109375, "ref_logps/rejected": -243.05160522460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.124185562133789, "rewards/margins": 10.94135570526123, "rewards/rejected": -12.065542221069336, "step": 1571 }, { "epoch": 0.38, "learning_rate": 1.384e-07, "logps/chosen": -217.3929901123047, "logps/rejected": -311.7751159667969, "loss": 0.0021, "losses/dpo": 4.847345280722948e-07, "losses/sft": 0.5422823429107666, "losses/total": 4.847345280722948e-07, "ref_logps/chosen": -212.31480407714844, "ref_logps/rejected": -212.3646697998047, "rewards/accuracies": 1.0, "rewards/chosen": -0.507819414138794, "rewards/margins": 9.433223724365234, "rewards/rejected": -9.941043853759766, "step": 1572 }, { "epoch": 0.38, "learning_rate": 1.3834666666666665e-07, "logps/chosen": -239.3110809326172, "logps/rejected": -368.5772705078125, "loss": 0.0019, "losses/dpo": 1.0258712791255675e-05, "losses/sft": 0.480547696352005, "losses/total": 1.0258712791255675e-05, "ref_logps/chosen": -232.12786865234375, "ref_logps/rejected": -255.445068359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7183209657669067, "rewards/margins": 10.594902038574219, "rewards/rejected": -11.31322193145752, "step": 1573 }, { "epoch": 0.38, "learning_rate": 1.3829333333333332e-07, "logps/chosen": -211.45245361328125, "logps/rejected": -326.2103271484375, "loss": 0.0051, "losses/dpo": 8.485056355311826e-07, "losses/sft": 0.5441992282867432, "losses/total": 8.485056355311826e-07, "ref_logps/chosen": -203.00355529785156, "ref_logps/rejected": -218.49020385742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.8448891639709473, "rewards/margins": 9.9271240234375, "rewards/rejected": -10.772012710571289, "step": 1574 }, { "epoch": 0.38, "learning_rate": 1.3824e-07, "logps/chosen": -245.73370361328125, "logps/rejected": -328.6727294921875, "loss": 0.009, "losses/dpo": 1.2873219020548277e-05, "losses/sft": 1.0591257810592651, "losses/total": 1.2873219020548277e-05, "ref_logps/chosen": -237.91796875, "ref_logps/rejected": -228.634521484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7815746665000916, "rewards/margins": 9.222244262695312, "rewards/rejected": -10.00381851196289, "step": 1575 }, { "epoch": 0.38, "learning_rate": 1.3818666666666665e-07, "logps/chosen": -247.8858642578125, "logps/rejected": -345.36962890625, "loss": 0.0011, "losses/dpo": 1.3476572348736227e-05, "losses/sft": 0.698279857635498, "losses/total": 1.3476572348736227e-05, "ref_logps/chosen": -239.8536376953125, "ref_logps/rejected": -232.28558349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8032213449478149, "rewards/margins": 10.505182266235352, "rewards/rejected": -11.308403968811035, "step": 1576 }, { "epoch": 0.38, "learning_rate": 1.3813333333333333e-07, "logps/chosen": -197.80140686035156, "logps/rejected": -313.492431640625, "loss": 0.0135, "losses/dpo": 3.571298350379948e-07, "losses/sft": 0.700874924659729, "losses/total": 3.571298350379948e-07, "ref_logps/chosen": -191.68051147460938, "ref_logps/rejected": -208.9837646484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6120904088020325, "rewards/margins": 9.838777542114258, "rewards/rejected": -10.450867652893066, "step": 1577 }, { "epoch": 0.38, "learning_rate": 1.3808e-07, "logps/chosen": -267.7080383300781, "logps/rejected": -360.89605712890625, "loss": 0.0015, "losses/dpo": 4.0286130342792603e-07, "losses/sft": 0.47645652294158936, "losses/total": 4.0286130342792603e-07, "ref_logps/chosen": -260.46697998046875, "ref_logps/rejected": -245.14984130859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7241044044494629, "rewards/margins": 10.850517272949219, "rewards/rejected": -11.574623107910156, "step": 1578 }, { "epoch": 0.38, "learning_rate": 1.3802666666666668e-07, "logps/chosen": -246.2270965576172, "logps/rejected": -353.4976501464844, "loss": 0.0054, "losses/dpo": 1.3608369044959545e-06, "losses/sft": 0.6422908902168274, "losses/total": 1.3608369044959545e-06, "ref_logps/chosen": -238.9520263671875, "ref_logps/rejected": -235.65354919433594, "rewards/accuracies": 1.0, "rewards/chosen": -0.7275054454803467, "rewards/margins": 11.056905746459961, "rewards/rejected": -11.78441047668457, "step": 1579 }, { "epoch": 0.38, "learning_rate": 1.3797333333333333e-07, "logps/chosen": -223.104736328125, "logps/rejected": -318.63934326171875, "loss": 0.0025, "losses/dpo": 6.916963712910729e-08, "losses/sft": 1.0957410335540771, "losses/total": 6.916963712910729e-08, "ref_logps/chosen": -217.73550415039062, "ref_logps/rejected": -211.96792602539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5369223356246948, "rewards/margins": 10.130218505859375, "rewards/rejected": -10.66714096069336, "step": 1580 }, { "epoch": 0.38, "learning_rate": 1.3791999999999998e-07, "logps/chosen": -242.75428771972656, "logps/rejected": -330.2203674316406, "loss": 0.0049, "losses/dpo": 1.7020601035255822e-06, "losses/sft": 0.7368544340133667, "losses/total": 1.7020601035255822e-06, "ref_logps/chosen": -235.85763549804688, "ref_logps/rejected": -219.13787841796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6896644830703735, "rewards/margins": 10.418583869934082, "rewards/rejected": -11.10824966430664, "step": 1581 }, { "epoch": 0.38, "learning_rate": 1.3786666666666666e-07, "logps/chosen": -201.73532104492188, "logps/rejected": -333.470458984375, "loss": 0.0047, "losses/dpo": 1.7514452110845014e-07, "losses/sft": 0.6310455203056335, "losses/total": 1.7514452110845014e-07, "ref_logps/chosen": -194.95068359375, "ref_logps/rejected": -216.97006225585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.6784628629684448, "rewards/margins": 10.971576690673828, "rewards/rejected": -11.650038719177246, "step": 1582 }, { "epoch": 0.38, "learning_rate": 1.3781333333333333e-07, "logps/chosen": -201.78433227539062, "logps/rejected": -328.59161376953125, "loss": 0.0042, "losses/dpo": 3.303808071564163e-08, "losses/sft": 0.6009325981140137, "losses/total": 3.303808071564163e-08, "ref_logps/chosen": -195.34474182128906, "ref_logps/rejected": -217.09922790527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.6439610123634338, "rewards/margins": 10.505274772644043, "rewards/rejected": -11.149235725402832, "step": 1583 }, { "epoch": 0.38, "learning_rate": 1.3775999999999998e-07, "logps/chosen": -239.3408203125, "logps/rejected": -362.3538818359375, "loss": 0.0027, "losses/dpo": 8.198519196866982e-09, "losses/sft": 0.6009755730628967, "losses/total": 8.198519196866982e-09, "ref_logps/chosen": -232.0980224609375, "ref_logps/rejected": -247.95370483398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.72427898645401, "rewards/margins": 10.715740203857422, "rewards/rejected": -11.440018653869629, "step": 1584 }, { "epoch": 0.38, "learning_rate": 1.3770666666666666e-07, "logps/chosen": -223.9459228515625, "logps/rejected": -349.13458251953125, "loss": 0.0037, "losses/dpo": 5.945914267613261e-07, "losses/sft": 0.5437568426132202, "losses/total": 5.945914267613261e-07, "ref_logps/chosen": -220.1001739501953, "ref_logps/rejected": -234.79644775390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.38457387685775757, "rewards/margins": 11.049240112304688, "rewards/rejected": -11.433815002441406, "step": 1585 }, { "epoch": 0.38, "learning_rate": 1.3765333333333334e-07, "logps/chosen": -261.78155517578125, "logps/rejected": -339.37921142578125, "loss": 0.0054, "losses/dpo": 9.716072391086072e-10, "losses/sft": 0.6410601139068604, "losses/total": 9.716072391086072e-10, "ref_logps/chosen": -252.17129516601562, "ref_logps/rejected": -226.3738250732422, "rewards/accuracies": 1.0, "rewards/chosen": -0.9610264897346497, "rewards/margins": 10.339510917663574, "rewards/rejected": -11.300537109375, "step": 1586 }, { "epoch": 0.38, "learning_rate": 1.376e-07, "logps/chosen": -222.2466278076172, "logps/rejected": -307.890380859375, "loss": 0.0216, "losses/dpo": 1.8843833458959125e-05, "losses/sft": 0.37542596459388733, "losses/total": 1.8843833458959125e-05, "ref_logps/chosen": -215.55718994140625, "ref_logps/rejected": -205.25433349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6689449548721313, "rewards/margins": 9.594657897949219, "rewards/rejected": -10.263603210449219, "step": 1587 }, { "epoch": 0.38, "learning_rate": 1.3754666666666666e-07, "logps/chosen": -218.3048553466797, "logps/rejected": -324.6606750488281, "loss": 0.0025, "losses/dpo": 9.532719559501857e-06, "losses/sft": 0.6126608848571777, "losses/total": 9.532719559501857e-06, "ref_logps/chosen": -212.67544555664062, "ref_logps/rejected": -214.37554931640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5629421472549438, "rewards/margins": 10.465567588806152, "rewards/rejected": -11.028511047363281, "step": 1588 }, { "epoch": 0.38, "learning_rate": 1.3749333333333332e-07, "logps/chosen": -255.9825439453125, "logps/rejected": -358.3914794921875, "loss": 0.0004, "losses/dpo": 6.445152848755242e-06, "losses/sft": 0.8428224325180054, "losses/total": 6.445152848755242e-06, "ref_logps/chosen": -247.7510528564453, "ref_logps/rejected": -233.96963500976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8231498599052429, "rewards/margins": 11.619034767150879, "rewards/rejected": -12.442184448242188, "step": 1589 }, { "epoch": 0.38, "learning_rate": 1.3744e-07, "logps/chosen": -241.70513916015625, "logps/rejected": -326.5845031738281, "loss": 0.0089, "losses/dpo": 6.485309859272093e-05, "losses/sft": 0.5542730689048767, "losses/total": 6.485309859272093e-05, "ref_logps/chosen": -233.85601806640625, "ref_logps/rejected": -218.9845733642578, "rewards/accuracies": 1.0, "rewards/chosen": -0.7849125862121582, "rewards/margins": 9.975080490112305, "rewards/rejected": -10.759993553161621, "step": 1590 }, { "epoch": 0.38, "learning_rate": 1.3738666666666664e-07, "logps/chosen": -196.43780517578125, "logps/rejected": -297.6552429199219, "loss": 0.0176, "losses/dpo": 0.0008997645345516503, "losses/sft": 0.44541722536087036, "losses/total": 0.0008997645345516503, "ref_logps/chosen": -188.37030029296875, "ref_logps/rejected": -196.9237823486328, "rewards/accuracies": 1.0, "rewards/chosen": -0.8067507743835449, "rewards/margins": 9.26639461517334, "rewards/rejected": -10.073145866394043, "step": 1591 }, { "epoch": 0.38, "learning_rate": 1.3733333333333332e-07, "logps/chosen": -215.57205200195312, "logps/rejected": -319.0796203613281, "loss": 0.0094, "losses/dpo": 1.686007976786641e-06, "losses/sft": 0.9389691352844238, "losses/total": 1.686007976786641e-06, "ref_logps/chosen": -211.09458923339844, "ref_logps/rejected": -214.15255737304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.4477464556694031, "rewards/margins": 10.044957160949707, "rewards/rejected": -10.49270248413086, "step": 1592 }, { "epoch": 0.38, "learning_rate": 1.3728e-07, "logps/chosen": -225.4596405029297, "logps/rejected": -309.9024353027344, "loss": 0.0088, "losses/dpo": 0.00014596003165934235, "losses/sft": 0.5975419282913208, "losses/total": 0.00014596003165934235, "ref_logps/chosen": -216.66311645507812, "ref_logps/rejected": -209.29629516601562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8796529769897461, "rewards/margins": 9.180963516235352, "rewards/rejected": -10.060616493225098, "step": 1593 }, { "epoch": 0.38, "learning_rate": 1.3722666666666667e-07, "logps/chosen": -255.7378387451172, "logps/rejected": -347.64300537109375, "loss": 0.0029, "losses/dpo": 1.738094965730852e-06, "losses/sft": 0.6284354329109192, "losses/total": 1.738094965730852e-06, "ref_logps/chosen": -251.28823852539062, "ref_logps/rejected": -233.0321044921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.4449610114097595, "rewards/margins": 11.016129493713379, "rewards/rejected": -11.461091041564941, "step": 1594 }, { "epoch": 0.38, "learning_rate": 1.3717333333333332e-07, "logps/chosen": -284.9016418457031, "logps/rejected": -374.1834411621094, "loss": 0.0015, "losses/dpo": 6.636525426984008e-07, "losses/sft": 0.5889742374420166, "losses/total": 6.636525426984008e-07, "ref_logps/chosen": -274.5487060546875, "ref_logps/rejected": -250.74270629882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.035293459892273, "rewards/margins": 11.308778762817383, "rewards/rejected": -12.344071388244629, "step": 1595 }, { "epoch": 0.38, "learning_rate": 1.3712e-07, "logps/chosen": -203.51292419433594, "logps/rejected": -321.67303466796875, "loss": 0.0024, "losses/dpo": 1.0001154805650003e-05, "losses/sft": 0.4783094823360443, "losses/total": 1.0001154805650003e-05, "ref_logps/chosen": -196.95054626464844, "ref_logps/rejected": -218.9255828857422, "rewards/accuracies": 1.0, "rewards/chosen": -0.656238317489624, "rewards/margins": 9.618509292602539, "rewards/rejected": -10.274746894836426, "step": 1596 }, { "epoch": 0.38, "learning_rate": 1.3706666666666668e-07, "logps/chosen": -270.67999267578125, "logps/rejected": -337.07373046875, "loss": 0.0069, "losses/dpo": 2.478102032910101e-05, "losses/sft": 0.6013079881668091, "losses/total": 2.478102032910101e-05, "ref_logps/chosen": -263.88873291015625, "ref_logps/rejected": -221.4441680908203, "rewards/accuracies": 1.0, "rewards/chosen": -0.6791263818740845, "rewards/margins": 10.883827209472656, "rewards/rejected": -11.56295394897461, "step": 1597 }, { "epoch": 0.38, "learning_rate": 1.3701333333333333e-07, "logps/chosen": -277.8985595703125, "logps/rejected": -334.75506591796875, "loss": 0.0056, "losses/dpo": 7.76840442995308e-06, "losses/sft": 0.6172342896461487, "losses/total": 7.76840442995308e-06, "ref_logps/chosen": -269.57098388671875, "ref_logps/rejected": -229.31336975097656, "rewards/accuracies": 1.0, "rewards/chosen": -0.8327605128288269, "rewards/margins": 9.711409568786621, "rewards/rejected": -10.544169425964355, "step": 1598 }, { "epoch": 0.38, "learning_rate": 1.3695999999999998e-07, "logps/chosen": -198.52359008789062, "logps/rejected": -328.2916259765625, "loss": 0.0003, "losses/dpo": 3.231128857805743e-07, "losses/sft": 0.8188125491142273, "losses/total": 3.231128857805743e-07, "ref_logps/chosen": -193.06680297851562, "ref_logps/rejected": -216.9424285888672, "rewards/accuracies": 1.0, "rewards/chosen": -0.5456788539886475, "rewards/margins": 10.589242935180664, "rewards/rejected": -11.13492202758789, "step": 1599 }, { "epoch": 0.38, "learning_rate": 1.3690666666666665e-07, "logps/chosen": -265.3850402832031, "logps/rejected": -355.04132080078125, "loss": 0.0012, "losses/dpo": 1.4725351604738535e-08, "losses/sft": 0.596403956413269, "losses/total": 1.4725351604738535e-08, "ref_logps/chosen": -258.033203125, "ref_logps/rejected": -236.6210479736328, "rewards/accuracies": 1.0, "rewards/chosen": -0.7351804375648499, "rewards/margins": 11.106849670410156, "rewards/rejected": -11.842029571533203, "step": 1600 }, { "epoch": 0.38, "learning_rate": 1.3685333333333333e-07, "logps/chosen": -245.43626403808594, "logps/rejected": -324.00323486328125, "loss": 0.0043, "losses/dpo": 2.733633266416291e-07, "losses/sft": 0.6141360998153687, "losses/total": 2.733633266416291e-07, "ref_logps/chosen": -234.9955596923828, "ref_logps/rejected": -210.87025451660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0440706014633179, "rewards/margins": 10.269230842590332, "rewards/rejected": -11.313302040100098, "step": 1601 }, { "epoch": 0.38, "learning_rate": 1.368e-07, "logps/chosen": -254.62362670898438, "logps/rejected": -353.85980224609375, "loss": 0.0025, "losses/dpo": 1.7136639144155197e-05, "losses/sft": 0.4054460823535919, "losses/total": 1.7136639144155197e-05, "ref_logps/chosen": -246.75054931640625, "ref_logps/rejected": -232.744140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7873096466064453, "rewards/margins": 11.324256896972656, "rewards/rejected": -12.111567497253418, "step": 1602 }, { "epoch": 0.38, "learning_rate": 1.3674666666666666e-07, "logps/chosen": -182.94378662109375, "logps/rejected": -272.27093505859375, "loss": 0.0063, "losses/dpo": 5.706980300601572e-05, "losses/sft": 0.5124471783638, "losses/total": 5.706980300601572e-05, "ref_logps/chosen": -177.90383911132812, "ref_logps/rejected": -182.06546020507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.5039960741996765, "rewards/margins": 8.516550064086914, "rewards/rejected": -9.020545959472656, "step": 1603 }, { "epoch": 0.38, "learning_rate": 1.3669333333333333e-07, "logps/chosen": -206.90524291992188, "logps/rejected": -309.79638671875, "loss": 0.0048, "losses/dpo": 0.0001609196770004928, "losses/sft": 0.496870756149292, "losses/total": 0.0001609196770004928, "ref_logps/chosen": -201.4677276611328, "ref_logps/rejected": -203.28721618652344, "rewards/accuracies": 1.0, "rewards/chosen": -0.5437516570091248, "rewards/margins": 10.107168197631836, "rewards/rejected": -10.650919914245605, "step": 1604 }, { "epoch": 0.39, "learning_rate": 1.3664e-07, "logps/chosen": -209.5691680908203, "logps/rejected": -329.1486511230469, "loss": 0.0037, "losses/dpo": 4.290361175662838e-06, "losses/sft": 0.6267937421798706, "losses/total": 4.290361175662838e-06, "ref_logps/chosen": -202.08154296875, "ref_logps/rejected": -220.78993225097656, "rewards/accuracies": 1.0, "rewards/chosen": -0.7487601637840271, "rewards/margins": 10.087109565734863, "rewards/rejected": -10.835870742797852, "step": 1605 }, { "epoch": 0.39, "learning_rate": 1.3658666666666666e-07, "logps/chosen": -228.5716552734375, "logps/rejected": -337.9482421875, "loss": 0.0028, "losses/dpo": 7.013705527469938e-08, "losses/sft": 0.4577954113483429, "losses/total": 7.013705527469938e-08, "ref_logps/chosen": -220.5902862548828, "ref_logps/rejected": -225.5953369140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.798136293888092, "rewards/margins": 10.437156677246094, "rewards/rejected": -11.2352933883667, "step": 1606 }, { "epoch": 0.39, "learning_rate": 1.365333333333333e-07, "logps/chosen": -224.07138061523438, "logps/rejected": -333.76971435546875, "loss": 0.0057, "losses/dpo": 1.2794856729669846e-07, "losses/sft": 0.6517574787139893, "losses/total": 1.2794856729669846e-07, "ref_logps/chosen": -216.8384246826172, "ref_logps/rejected": -219.18223571777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.7232968211174011, "rewards/margins": 10.735450744628906, "rewards/rejected": -11.458747863769531, "step": 1607 }, { "epoch": 0.39, "learning_rate": 1.3648e-07, "logps/chosen": -224.2782440185547, "logps/rejected": -333.04132080078125, "loss": 0.0052, "losses/dpo": 8.304108689571876e-08, "losses/sft": 0.6929293274879456, "losses/total": 8.304108689571876e-08, "ref_logps/chosen": -218.01918029785156, "ref_logps/rejected": -226.6568145751953, "rewards/accuracies": 1.0, "rewards/chosen": -0.6259068250656128, "rewards/margins": 10.012544631958008, "rewards/rejected": -10.63845157623291, "step": 1608 }, { "epoch": 0.39, "learning_rate": 1.3642666666666667e-07, "logps/chosen": -186.5635528564453, "logps/rejected": -286.0999755859375, "loss": 0.0168, "losses/dpo": 1.0547981219133362e-05, "losses/sft": 0.6317347288131714, "losses/total": 1.0547981219133362e-05, "ref_logps/chosen": -181.69180297851562, "ref_logps/rejected": -191.37994384765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.4871741533279419, "rewards/margins": 8.984831809997559, "rewards/rejected": -9.472005844116211, "step": 1609 }, { "epoch": 0.39, "learning_rate": 1.3637333333333332e-07, "logps/chosen": -242.23544311523438, "logps/rejected": -351.8688049316406, "loss": 0.005, "losses/dpo": 0.00014822532830294222, "losses/sft": 0.5376024842262268, "losses/total": 0.00014822532830294222, "ref_logps/chosen": -235.35238647460938, "ref_logps/rejected": -230.854248046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6883032917976379, "rewards/margins": 11.413152694702148, "rewards/rejected": -12.101455688476562, "step": 1610 }, { "epoch": 0.39, "learning_rate": 1.3632e-07, "logps/chosen": -260.0902404785156, "logps/rejected": -341.7730712890625, "loss": 0.0055, "losses/dpo": 9.93022695183754e-06, "losses/sft": 1.1260113716125488, "losses/total": 9.93022695183754e-06, "ref_logps/chosen": -254.5426025390625, "ref_logps/rejected": -237.41793823242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.5547624826431274, "rewards/margins": 9.88074779510498, "rewards/rejected": -10.435510635375977, "step": 1611 }, { "epoch": 0.39, "learning_rate": 1.3626666666666667e-07, "logps/chosen": -231.3826446533203, "logps/rejected": -328.0938720703125, "loss": 0.0021, "losses/dpo": 9.640306188885006e-07, "losses/sft": 0.6554036140441895, "losses/total": 9.640306188885006e-07, "ref_logps/chosen": -223.96554565429688, "ref_logps/rejected": -217.66268920898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7417099475860596, "rewards/margins": 10.301410675048828, "rewards/rejected": -11.043121337890625, "step": 1612 }, { "epoch": 0.39, "learning_rate": 1.3621333333333335e-07, "logps/chosen": -222.84283447265625, "logps/rejected": -318.5834655761719, "loss": 0.0122, "losses/dpo": 1.1821263115052716e-06, "losses/sft": 0.657528281211853, "losses/total": 1.1821263115052716e-06, "ref_logps/chosen": -215.64955139160156, "ref_logps/rejected": -214.5614013671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7193289399147034, "rewards/margins": 9.682876586914062, "rewards/rejected": -10.402206420898438, "step": 1613 }, { "epoch": 0.39, "learning_rate": 1.3616e-07, "logps/chosen": -237.15948486328125, "logps/rejected": -329.2806701660156, "loss": 0.0052, "losses/dpo": 1.8667270751393517e-07, "losses/sft": 0.622744619846344, "losses/total": 1.8667270751393517e-07, "ref_logps/chosen": -228.26129150390625, "ref_logps/rejected": -219.60617065429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8898187279701233, "rewards/margins": 10.077630043029785, "rewards/rejected": -10.967449188232422, "step": 1614 }, { "epoch": 0.39, "learning_rate": 1.3610666666666665e-07, "logps/chosen": -231.59902954101562, "logps/rejected": -325.5088806152344, "loss": 0.0103, "losses/dpo": 4.790816774402629e-07, "losses/sft": 0.6077550649642944, "losses/total": 4.790816774402629e-07, "ref_logps/chosen": -221.94073486328125, "ref_logps/rejected": -219.61729431152344, "rewards/accuracies": 1.0, "rewards/chosen": -0.9658291935920715, "rewards/margins": 9.623329162597656, "rewards/rejected": -10.589158058166504, "step": 1615 }, { "epoch": 0.39, "learning_rate": 1.3605333333333332e-07, "logps/chosen": -269.988037109375, "logps/rejected": -355.39898681640625, "loss": 0.0018, "losses/dpo": 1.741368715890701e-09, "losses/sft": 0.9171624779701233, "losses/total": 1.741368715890701e-09, "ref_logps/chosen": -260.76531982421875, "ref_logps/rejected": -237.7608642578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9222714900970459, "rewards/margins": 10.84154224395752, "rewards/rejected": -11.763813018798828, "step": 1616 }, { "epoch": 0.39, "learning_rate": 1.36e-07, "logps/chosen": -198.88998413085938, "logps/rejected": -319.466552734375, "loss": 0.0089, "losses/dpo": 2.6019566234936065e-07, "losses/sft": 0.6762941479682922, "losses/total": 2.6019566234936065e-07, "ref_logps/chosen": -194.59066772460938, "ref_logps/rejected": -207.69847106933594, "rewards/accuracies": 1.0, "rewards/chosen": -0.42993366718292236, "rewards/margins": 10.746874809265137, "rewards/rejected": -11.176809310913086, "step": 1617 }, { "epoch": 0.39, "learning_rate": 1.3594666666666665e-07, "logps/chosen": -256.9709167480469, "logps/rejected": -350.8877868652344, "loss": 0.0029, "losses/dpo": 2.7096013582195155e-05, "losses/sft": 0.5374083518981934, "losses/total": 2.7096013582195155e-05, "ref_logps/chosen": -248.4164581298828, "ref_logps/rejected": -237.84951782226562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8554449081420898, "rewards/margins": 10.448384284973145, "rewards/rejected": -11.303829193115234, "step": 1618 }, { "epoch": 0.39, "learning_rate": 1.3589333333333333e-07, "logps/chosen": -215.4047393798828, "logps/rejected": -308.8810729980469, "loss": 0.0098, "losses/dpo": 0.00032509860466234386, "losses/sft": 0.7186345458030701, "losses/total": 0.00032509860466234386, "ref_logps/chosen": -210.15383911132812, "ref_logps/rejected": -218.0213165283203, "rewards/accuracies": 1.0, "rewards/chosen": -0.5250910520553589, "rewards/margins": 8.560883522033691, "rewards/rejected": -9.08597469329834, "step": 1619 }, { "epoch": 0.39, "learning_rate": 1.3584e-07, "logps/chosen": -244.33758544921875, "logps/rejected": -334.9512939453125, "loss": 0.0048, "losses/dpo": 1.7113330752493994e-09, "losses/sft": 0.7659023404121399, "losses/total": 1.7113330752493994e-09, "ref_logps/chosen": -236.101806640625, "ref_logps/rejected": -223.415771484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8235794305801392, "rewards/margins": 10.329971313476562, "rewards/rejected": -11.153550148010254, "step": 1620 }, { "epoch": 0.39, "learning_rate": 1.3578666666666668e-07, "logps/chosen": -237.1492156982422, "logps/rejected": -370.3252868652344, "loss": 0.0097, "losses/dpo": 9.620009222999215e-06, "losses/sft": 0.7198572754859924, "losses/total": 9.620009222999215e-06, "ref_logps/chosen": -228.43826293945312, "ref_logps/rejected": -249.52792358398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8710935115814209, "rewards/margins": 11.208642959594727, "rewards/rejected": -12.079736709594727, "step": 1621 }, { "epoch": 0.39, "learning_rate": 1.3573333333333333e-07, "logps/chosen": -222.56712341308594, "logps/rejected": -332.64642333984375, "loss": 0.0113, "losses/dpo": 8.766195946918742e-07, "losses/sft": 0.5119972229003906, "losses/total": 8.766195946918742e-07, "ref_logps/chosen": -216.70758056640625, "ref_logps/rejected": -217.21231079101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.5859540104866028, "rewards/margins": 10.95745849609375, "rewards/rejected": -11.543413162231445, "step": 1622 }, { "epoch": 0.39, "learning_rate": 1.3567999999999998e-07, "logps/chosen": -261.22271728515625, "logps/rejected": -314.139892578125, "loss": 0.0053, "losses/dpo": 2.994649184984155e-05, "losses/sft": 1.0584324598312378, "losses/total": 2.994649184984155e-05, "ref_logps/chosen": -253.7513427734375, "ref_logps/rejected": -207.3902587890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.747137725353241, "rewards/margins": 9.927825927734375, "rewards/rejected": -10.674962997436523, "step": 1623 }, { "epoch": 0.39, "learning_rate": 1.3562666666666666e-07, "logps/chosen": -208.36798095703125, "logps/rejected": -311.147705078125, "loss": 0.013, "losses/dpo": 5.5021917688691246e-08, "losses/sft": 0.986161470413208, "losses/total": 5.5021917688691246e-08, "ref_logps/chosen": -203.32223510742188, "ref_logps/rejected": -205.8411865234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5045740604400635, "rewards/margins": 10.026079177856445, "rewards/rejected": -10.53065299987793, "step": 1624 }, { "epoch": 0.39, "learning_rate": 1.355733333333333e-07, "logps/chosen": -235.7683563232422, "logps/rejected": -310.41961669921875, "loss": 0.009, "losses/dpo": 1.2311983255131054e-07, "losses/sft": 0.41536909341812134, "losses/total": 1.2311983255131054e-07, "ref_logps/chosen": -227.07275390625, "ref_logps/rejected": -207.11700439453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8695603609085083, "rewards/margins": 9.460699081420898, "rewards/rejected": -10.330259323120117, "step": 1625 }, { "epoch": 0.39, "learning_rate": 1.3551999999999999e-07, "logps/chosen": -197.73826599121094, "logps/rejected": -326.7109375, "loss": 0.0015, "losses/dpo": 2.374061836007968e-09, "losses/sft": 0.5238254070281982, "losses/total": 2.374061836007968e-09, "ref_logps/chosen": -192.40057373046875, "ref_logps/rejected": -213.181640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.533769428730011, "rewards/margins": 10.819158554077148, "rewards/rejected": -11.352928161621094, "step": 1626 }, { "epoch": 0.39, "learning_rate": 1.3546666666666666e-07, "logps/chosen": -222.34768676757812, "logps/rejected": -298.6003723144531, "loss": 0.0027, "losses/dpo": 6.631039184412657e-08, "losses/sft": 1.136812448501587, "losses/total": 6.631039184412657e-08, "ref_logps/chosen": -215.66412353515625, "ref_logps/rejected": -194.82928466796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6683578491210938, "rewards/margins": 9.708749771118164, "rewards/rejected": -10.377107620239258, "step": 1627 }, { "epoch": 0.39, "learning_rate": 1.3541333333333334e-07, "logps/chosen": -213.7616424560547, "logps/rejected": -329.9759216308594, "loss": 0.0058, "losses/dpo": 2.3506785851168388e-07, "losses/sft": 0.9024686813354492, "losses/total": 2.3506785851168388e-07, "ref_logps/chosen": -207.53395080566406, "ref_logps/rejected": -220.41424560546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.622769296169281, "rewards/margins": 10.333398818969727, "rewards/rejected": -10.956168174743652, "step": 1628 }, { "epoch": 0.39, "learning_rate": 1.3536e-07, "logps/chosen": -229.17823791503906, "logps/rejected": -328.19244384765625, "loss": 0.0046, "losses/dpo": 2.553436502239492e-07, "losses/sft": 0.4685404598712921, "losses/total": 2.553436502239492e-07, "ref_logps/chosen": -221.80368041992188, "ref_logps/rejected": -217.76077270507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7374565005302429, "rewards/margins": 10.30571174621582, "rewards/rejected": -11.043169021606445, "step": 1629 }, { "epoch": 0.39, "learning_rate": 1.3530666666666667e-07, "logps/chosen": -232.747314453125, "logps/rejected": -327.50079345703125, "loss": 0.0044, "losses/dpo": 1.0463888422407308e-08, "losses/sft": 1.0163546800613403, "losses/total": 1.0463888422407308e-08, "ref_logps/chosen": -224.55258178710938, "ref_logps/rejected": -215.67916870117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.8194743394851685, "rewards/margins": 10.362689971923828, "rewards/rejected": -11.182164192199707, "step": 1630 }, { "epoch": 0.39, "learning_rate": 1.3525333333333334e-07, "logps/chosen": -209.49510192871094, "logps/rejected": -310.08148193359375, "loss": 0.0095, "losses/dpo": 2.2082469513406977e-05, "losses/sft": 0.599573016166687, "losses/total": 2.2082469513406977e-05, "ref_logps/chosen": -203.94476318359375, "ref_logps/rejected": -209.10020446777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.5550339221954346, "rewards/margins": 9.543095588684082, "rewards/rejected": -10.098129272460938, "step": 1631 }, { "epoch": 0.39, "learning_rate": 1.352e-07, "logps/chosen": -189.64434814453125, "logps/rejected": -333.09393310546875, "loss": 0.0009, "losses/dpo": 1.74061341340348e-07, "losses/sft": 0.6870396733283997, "losses/total": 1.74061341340348e-07, "ref_logps/chosen": -181.90391540527344, "ref_logps/rejected": -222.87083435058594, "rewards/accuracies": 1.0, "rewards/chosen": -0.7740446925163269, "rewards/margins": 10.248268127441406, "rewards/rejected": -11.02231216430664, "step": 1632 }, { "epoch": 0.39, "learning_rate": 1.3514666666666664e-07, "logps/chosen": -230.4002685546875, "logps/rejected": -331.8632507324219, "loss": 0.0019, "losses/dpo": 3.297104558441788e-05, "losses/sft": 0.5816875696182251, "losses/total": 3.297104558441788e-05, "ref_logps/chosen": -220.80224609375, "ref_logps/rejected": -216.4334259033203, "rewards/accuracies": 1.0, "rewards/chosen": -0.9598024487495422, "rewards/margins": 10.583179473876953, "rewards/rejected": -11.54298210144043, "step": 1633 }, { "epoch": 0.39, "learning_rate": 1.3509333333333332e-07, "logps/chosen": -227.3974151611328, "logps/rejected": -304.175048828125, "loss": 0.0035, "losses/dpo": 3.1633830985811073e-06, "losses/sft": 0.47701743245124817, "losses/total": 3.1633830985811073e-06, "ref_logps/chosen": -220.08961486816406, "ref_logps/rejected": -206.89866638183594, "rewards/accuracies": 1.0, "rewards/chosen": -0.7307800054550171, "rewards/margins": 8.996858596801758, "rewards/rejected": -9.727638244628906, "step": 1634 }, { "epoch": 0.39, "learning_rate": 1.3504e-07, "logps/chosen": -290.0552978515625, "logps/rejected": -377.06353759765625, "loss": 0.0021, "losses/dpo": 8.004298592823034e-07, "losses/sft": 0.9544532895088196, "losses/total": 8.004298592823034e-07, "ref_logps/chosen": -280.4367370605469, "ref_logps/rejected": -251.6654052734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9618557095527649, "rewards/margins": 11.577956199645996, "rewards/rejected": -12.539812088012695, "step": 1635 }, { "epoch": 0.39, "learning_rate": 1.3498666666666667e-07, "logps/chosen": -245.26332092285156, "logps/rejected": -359.4231262207031, "loss": 0.001, "losses/dpo": 2.438805211113504e-07, "losses/sft": 0.5634419322013855, "losses/total": 2.438805211113504e-07, "ref_logps/chosen": -239.23834228515625, "ref_logps/rejected": -236.5060577392578, "rewards/accuracies": 1.0, "rewards/chosen": -0.6024961471557617, "rewards/margins": 11.689210891723633, "rewards/rejected": -12.291707992553711, "step": 1636 }, { "epoch": 0.39, "learning_rate": 1.3493333333333332e-07, "logps/chosen": -210.1586456298828, "logps/rejected": -354.1416320800781, "loss": 0.0014, "losses/dpo": 8.542577234038617e-06, "losses/sft": 0.5636595487594604, "losses/total": 8.542577234038617e-06, "ref_logps/chosen": -200.0314483642578, "ref_logps/rejected": -221.3777313232422, "rewards/accuracies": 1.0, "rewards/chosen": -1.012717604637146, "rewards/margins": 12.263675689697266, "rewards/rejected": -13.276392936706543, "step": 1637 }, { "epoch": 0.39, "learning_rate": 1.3488e-07, "logps/chosen": -222.40945434570312, "logps/rejected": -321.48187255859375, "loss": 0.0005, "losses/dpo": 7.795284545863979e-06, "losses/sft": 1.0515578985214233, "losses/total": 7.795284545863979e-06, "ref_logps/chosen": -216.47927856445312, "ref_logps/rejected": -210.17636108398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.5930193066596985, "rewards/margins": 10.537528991699219, "rewards/rejected": -11.130547523498535, "step": 1638 }, { "epoch": 0.39, "learning_rate": 1.3482666666666668e-07, "logps/chosen": -246.01161193847656, "logps/rejected": -349.7083740234375, "loss": 0.0091, "losses/dpo": 1.4681978427688591e-05, "losses/sft": 0.6113961338996887, "losses/total": 1.4681978427688591e-05, "ref_logps/chosen": -238.94680786132812, "ref_logps/rejected": -229.2204132080078, "rewards/accuracies": 1.0, "rewards/chosen": -0.7064822316169739, "rewards/margins": 11.342313766479492, "rewards/rejected": -12.048795700073242, "step": 1639 }, { "epoch": 0.39, "learning_rate": 1.3477333333333333e-07, "logps/chosen": -204.68826293945312, "logps/rejected": -327.04864501953125, "loss": 0.0158, "losses/dpo": 1.0503705283326781e-07, "losses/sft": 0.5750355124473572, "losses/total": 1.0503705283326781e-07, "ref_logps/chosen": -200.49322509765625, "ref_logps/rejected": -215.5583953857422, "rewards/accuracies": 1.0, "rewards/chosen": -0.41950488090515137, "rewards/margins": 10.729520797729492, "rewards/rejected": -11.149024963378906, "step": 1640 }, { "epoch": 0.39, "learning_rate": 1.3471999999999998e-07, "logps/chosen": -265.5396423339844, "logps/rejected": -339.2200622558594, "loss": 0.0075, "losses/dpo": 2.0254506125638727e-06, "losses/sft": 0.5655736327171326, "losses/total": 2.0254506125638727e-06, "ref_logps/chosen": -257.53350830078125, "ref_logps/rejected": -228.03602600097656, "rewards/accuracies": 1.0, "rewards/chosen": -0.8006118535995483, "rewards/margins": 10.317790985107422, "rewards/rejected": -11.118402481079102, "step": 1641 }, { "epoch": 0.39, "learning_rate": 1.3466666666666665e-07, "logps/chosen": -235.35116577148438, "logps/rejected": -373.9952697753906, "loss": 0.0007, "losses/dpo": 0.00020394298189785331, "losses/sft": 0.38936513662338257, "losses/total": 0.00020394298189785331, "ref_logps/chosen": -228.01760864257812, "ref_logps/rejected": -247.52255249023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7333552837371826, "rewards/margins": 11.913917541503906, "rewards/rejected": -12.647273063659668, "step": 1642 }, { "epoch": 0.39, "learning_rate": 1.3461333333333333e-07, "logps/chosen": -221.81056213378906, "logps/rejected": -339.7306823730469, "loss": 0.0017, "losses/dpo": 0.00022424729831982404, "losses/sft": 0.8192914128303528, "losses/total": 0.00022424729831982404, "ref_logps/chosen": -213.77560424804688, "ref_logps/rejected": -225.0491180419922, "rewards/accuracies": 1.0, "rewards/chosen": -0.803497314453125, "rewards/margins": 10.664661407470703, "rewards/rejected": -11.468157768249512, "step": 1643 }, { "epoch": 0.39, "learning_rate": 1.3455999999999998e-07, "logps/chosen": -209.96142578125, "logps/rejected": -328.77325439453125, "loss": 0.0038, "losses/dpo": 1.2152583622082602e-05, "losses/sft": 0.46473655104637146, "losses/total": 1.2152583622082602e-05, "ref_logps/chosen": -201.8105010986328, "ref_logps/rejected": -216.99464416503906, "rewards/accuracies": 1.0, "rewards/chosen": -0.8150913715362549, "rewards/margins": 10.362768173217773, "rewards/rejected": -11.177860260009766, "step": 1644 }, { "epoch": 0.39, "learning_rate": 1.3450666666666666e-07, "logps/chosen": -260.650634765625, "logps/rejected": -369.72698974609375, "loss": 0.0028, "losses/dpo": 1.5547962902928703e-05, "losses/sft": 0.556377649307251, "losses/total": 1.5547962902928703e-05, "ref_logps/chosen": -252.77781677246094, "ref_logps/rejected": -247.93161010742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.787282407283783, "rewards/margins": 11.392257690429688, "rewards/rejected": -12.17953872680664, "step": 1645 }, { "epoch": 0.4, "learning_rate": 1.3445333333333334e-07, "logps/chosen": -213.72412109375, "logps/rejected": -330.6304931640625, "loss": 0.0053, "losses/dpo": 0.00027331183082424104, "losses/sft": 0.3867228627204895, "losses/total": 0.00027331183082424104, "ref_logps/chosen": -209.171142578125, "ref_logps/rejected": -219.45860290527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.45529618859291077, "rewards/margins": 10.66189193725586, "rewards/rejected": -11.1171875, "step": 1646 }, { "epoch": 0.4, "learning_rate": 1.344e-07, "logps/chosen": -225.33169555664062, "logps/rejected": -355.5501708984375, "loss": 0.0027, "losses/dpo": 2.6019714027825103e-07, "losses/sft": 0.5699301958084106, "losses/total": 2.6019714027825103e-07, "ref_logps/chosen": -216.36312866210938, "ref_logps/rejected": -238.52342224121094, "rewards/accuracies": 1.0, "rewards/chosen": -0.8968559503555298, "rewards/margins": 10.805816650390625, "rewards/rejected": -11.702672958374023, "step": 1647 }, { "epoch": 0.4, "learning_rate": 1.3434666666666666e-07, "logps/chosen": -251.49441528320312, "logps/rejected": -364.48480224609375, "loss": 0.0023, "losses/dpo": 3.894920610036934e-06, "losses/sft": 0.5504505634307861, "losses/total": 3.894920610036934e-06, "ref_logps/chosen": -244.045654296875, "ref_logps/rejected": -243.87234497070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.7448769807815552, "rewards/margins": 11.316370964050293, "rewards/rejected": -12.061247825622559, "step": 1648 }, { "epoch": 0.4, "learning_rate": 1.342933333333333e-07, "logps/chosen": -260.792236328125, "logps/rejected": -350.50848388671875, "loss": 0.0113, "losses/dpo": 3.947719596908428e-05, "losses/sft": 0.6239089369773865, "losses/total": 3.947719596908428e-05, "ref_logps/chosen": -253.98829650878906, "ref_logps/rejected": -236.83615112304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.6803958415985107, "rewards/margins": 10.686838150024414, "rewards/rejected": -11.367234230041504, "step": 1649 }, { "epoch": 0.4, "learning_rate": 1.3424e-07, "logps/chosen": -264.6005859375, "logps/rejected": -337.09930419921875, "loss": 0.0164, "losses/dpo": 1.5869662917111782e-08, "losses/sft": 1.1285085678100586, "losses/total": 1.5869662917111782e-08, "ref_logps/chosen": -256.47869873046875, "ref_logps/rejected": -216.12289428710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8121875524520874, "rewards/margins": 11.285453796386719, "rewards/rejected": -12.097640991210938, "step": 1650 }, { "epoch": 0.4, "learning_rate": 1.3418666666666667e-07, "logps/chosen": -262.2591247558594, "logps/rejected": -369.673583984375, "loss": 0.0036, "losses/dpo": 9.643137133252822e-08, "losses/sft": 1.0100467205047607, "losses/total": 9.643137133252822e-08, "ref_logps/chosen": -255.731689453125, "ref_logps/rejected": -248.78562927246094, "rewards/accuracies": 1.0, "rewards/chosen": -0.6527448892593384, "rewards/margins": 11.436050415039062, "rewards/rejected": -12.088796615600586, "step": 1651 }, { "epoch": 0.4, "learning_rate": 1.3413333333333332e-07, "logps/chosen": -205.9734649658203, "logps/rejected": -298.6405334472656, "loss": 0.016, "losses/dpo": 1.476547481615853e-06, "losses/sft": 0.5859718918800354, "losses/total": 1.476547481615853e-06, "ref_logps/chosen": -201.98971557617188, "ref_logps/rejected": -200.5868682861328, "rewards/accuracies": 1.0, "rewards/chosen": -0.3983747363090515, "rewards/margins": 9.406993865966797, "rewards/rejected": -9.805368423461914, "step": 1652 }, { "epoch": 0.4, "learning_rate": 1.3408e-07, "logps/chosen": -227.7519073486328, "logps/rejected": -332.36737060546875, "loss": 0.0033, "losses/dpo": 2.0369890307847527e-07, "losses/sft": 0.5547525882720947, "losses/total": 2.0369890307847527e-07, "ref_logps/chosen": -222.42050170898438, "ref_logps/rejected": -224.47145080566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.5331405401229858, "rewards/margins": 10.256450653076172, "rewards/rejected": -10.789592742919922, "step": 1653 }, { "epoch": 0.4, "learning_rate": 1.3402666666666667e-07, "logps/chosen": -252.640380859375, "logps/rejected": -351.25091552734375, "loss": 0.0013, "losses/dpo": 1.0400791339293392e-08, "losses/sft": 0.7358278036117554, "losses/total": 1.0400791339293392e-08, "ref_logps/chosen": -245.1177978515625, "ref_logps/rejected": -229.46298217773438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7522565126419067, "rewards/margins": 11.426536560058594, "rewards/rejected": -12.178792953491211, "step": 1654 }, { "epoch": 0.4, "learning_rate": 1.3397333333333335e-07, "logps/chosen": -226.81565856933594, "logps/rejected": -352.7840576171875, "loss": 0.0048, "losses/dpo": 1.175491206595325e-06, "losses/sft": 0.5863975286483765, "losses/total": 1.175491206595325e-06, "ref_logps/chosen": -218.3415069580078, "ref_logps/rejected": -232.04588317871094, "rewards/accuracies": 1.0, "rewards/chosen": -0.8474150896072388, "rewards/margins": 11.226400375366211, "rewards/rejected": -12.07381534576416, "step": 1655 }, { "epoch": 0.4, "learning_rate": 1.3392e-07, "logps/chosen": -231.69757080078125, "logps/rejected": -324.24749755859375, "loss": 0.0027, "losses/dpo": 6.613801360799698e-06, "losses/sft": 1.104254126548767, "losses/total": 6.613801360799698e-06, "ref_logps/chosen": -224.2322540283203, "ref_logps/rejected": -214.26144409179688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7465289831161499, "rewards/margins": 10.252074241638184, "rewards/rejected": -10.998603820800781, "step": 1656 }, { "epoch": 0.4, "learning_rate": 1.3386666666666667e-07, "logps/chosen": -218.895751953125, "logps/rejected": -316.77630615234375, "loss": 0.0019, "losses/dpo": 5.748641562774992e-09, "losses/sft": 0.6885779500007629, "losses/total": 5.748641562774992e-09, "ref_logps/chosen": -211.66119384765625, "ref_logps/rejected": -211.72422790527344, "rewards/accuracies": 1.0, "rewards/chosen": -0.7234548330307007, "rewards/margins": 9.781753540039062, "rewards/rejected": -10.505208969116211, "step": 1657 }, { "epoch": 0.4, "learning_rate": 1.3381333333333332e-07, "logps/chosen": -229.28346252441406, "logps/rejected": -329.55706787109375, "loss": 0.0071, "losses/dpo": 0.0001475795143051073, "losses/sft": 0.6552784442901611, "losses/total": 0.0001475795143051073, "ref_logps/chosen": -222.46975708007812, "ref_logps/rejected": -216.94412231445312, "rewards/accuracies": 1.0, "rewards/chosen": -0.681369423866272, "rewards/margins": 10.579927444458008, "rewards/rejected": -11.261297225952148, "step": 1658 }, { "epoch": 0.4, "learning_rate": 1.3375999999999997e-07, "logps/chosen": -231.67135620117188, "logps/rejected": -329.65350341796875, "loss": 0.0049, "losses/dpo": 9.963059710571542e-07, "losses/sft": 0.6974037885665894, "losses/total": 9.963059710571542e-07, "ref_logps/chosen": -223.28675842285156, "ref_logps/rejected": -218.02639770507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8384597897529602, "rewards/margins": 10.324252128601074, "rewards/rejected": -11.162712097167969, "step": 1659 }, { "epoch": 0.4, "learning_rate": 1.3370666666666665e-07, "logps/chosen": -205.6829833984375, "logps/rejected": -331.2532958984375, "loss": 0.0077, "losses/dpo": 6.673691927971959e-07, "losses/sft": 0.8920798301696777, "losses/total": 6.673691927971959e-07, "ref_logps/chosen": -198.69529724121094, "ref_logps/rejected": -217.35940551757812, "rewards/accuracies": 1.0, "rewards/chosen": -0.6987694501876831, "rewards/margins": 10.690617561340332, "rewards/rejected": -11.389387130737305, "step": 1660 }, { "epoch": 0.4, "learning_rate": 1.3365333333333333e-07, "logps/chosen": -212.9503936767578, "logps/rejected": -325.91558837890625, "loss": 0.004, "losses/dpo": 3.850365828839131e-06, "losses/sft": 0.5337419509887695, "losses/total": 3.850365828839131e-06, "ref_logps/chosen": -206.0360565185547, "ref_logps/rejected": -214.51361083984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6914350390434265, "rewards/margins": 10.448762893676758, "rewards/rejected": -11.14019775390625, "step": 1661 }, { "epoch": 0.4, "learning_rate": 1.336e-07, "logps/chosen": -181.4792938232422, "logps/rejected": -283.22601318359375, "loss": 0.0134, "losses/dpo": 6.458325970015721e-06, "losses/sft": 0.47199496626853943, "losses/total": 6.458325970015721e-06, "ref_logps/chosen": -174.09979248046875, "ref_logps/rejected": -185.43780517578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7379491925239563, "rewards/margins": 9.040871620178223, "rewards/rejected": -9.778820991516113, "step": 1662 }, { "epoch": 0.4, "learning_rate": 1.3354666666666666e-07, "logps/chosen": -222.53848266601562, "logps/rejected": -319.9932556152344, "loss": 0.0035, "losses/dpo": 3.9777411586783273e-08, "losses/sft": 1.0268787145614624, "losses/total": 3.9777411586783273e-08, "ref_logps/chosen": -215.694091796875, "ref_logps/rejected": -214.43017578125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6844381093978882, "rewards/margins": 9.871870040893555, "rewards/rejected": -10.55630874633789, "step": 1663 }, { "epoch": 0.4, "learning_rate": 1.3349333333333333e-07, "logps/chosen": -263.06829833984375, "logps/rejected": -378.04144287109375, "loss": 0.0007, "losses/dpo": 2.4660842257162585e-08, "losses/sft": 0.8386070728302002, "losses/total": 2.4660842257162585e-08, "ref_logps/chosen": -255.85057067871094, "ref_logps/rejected": -251.25210571289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.7217724919319153, "rewards/margins": 11.957160949707031, "rewards/rejected": -12.678934097290039, "step": 1664 }, { "epoch": 0.4, "learning_rate": 1.3344e-07, "logps/chosen": -261.06744384765625, "logps/rejected": -383.07073974609375, "loss": 0.0085, "losses/dpo": 1.9084003724856302e-05, "losses/sft": 0.9786036014556885, "losses/total": 1.9084003724856302e-05, "ref_logps/chosen": -252.22952270507812, "ref_logps/rejected": -258.2283935546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8837917447090149, "rewards/margins": 11.600443840026855, "rewards/rejected": -12.484235763549805, "step": 1665 }, { "epoch": 0.4, "learning_rate": 1.3338666666666666e-07, "logps/chosen": -207.72451782226562, "logps/rejected": -321.71282958984375, "loss": 0.0034, "losses/dpo": 1.521062245046778e-07, "losses/sft": 0.4341101348400116, "losses/total": 1.521062245046778e-07, "ref_logps/chosen": -199.93609619140625, "ref_logps/rejected": -211.2515869140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7788406014442444, "rewards/margins": 10.26728630065918, "rewards/rejected": -11.046126365661621, "step": 1666 }, { "epoch": 0.4, "learning_rate": 1.333333333333333e-07, "logps/chosen": -230.05967712402344, "logps/rejected": -307.46630859375, "loss": 0.008, "losses/dpo": 9.197439965191734e-08, "losses/sft": 0.6405110359191895, "losses/total": 9.197439965191734e-08, "ref_logps/chosen": -222.2997589111328, "ref_logps/rejected": -199.91615295410156, "rewards/accuracies": 1.0, "rewards/chosen": -0.7759928703308105, "rewards/margins": 9.979020118713379, "rewards/rejected": -10.755012512207031, "step": 1667 }, { "epoch": 0.4, "learning_rate": 1.3327999999999999e-07, "logps/chosen": -210.69528198242188, "logps/rejected": -313.02825927734375, "loss": 0.0074, "losses/dpo": 6.320685770333512e-06, "losses/sft": 0.6675708889961243, "losses/total": 6.320685770333512e-06, "ref_logps/chosen": -204.60089111328125, "ref_logps/rejected": -211.3963623046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6094383001327515, "rewards/margins": 9.553750991821289, "rewards/rejected": -10.163188934326172, "step": 1668 }, { "epoch": 0.4, "learning_rate": 1.3322666666666666e-07, "logps/chosen": -220.16668701171875, "logps/rejected": -325.28076171875, "loss": 0.0072, "losses/dpo": 1.6610391639915179e-06, "losses/sft": 0.49909770488739014, "losses/total": 1.6610391639915179e-06, "ref_logps/chosen": -213.50830078125, "ref_logps/rejected": -212.6533203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6658400893211365, "rewards/margins": 10.596902847290039, "rewards/rejected": -11.26274299621582, "step": 1669 }, { "epoch": 0.4, "learning_rate": 1.3317333333333334e-07, "logps/chosen": -256.48876953125, "logps/rejected": -315.287353515625, "loss": 0.0094, "losses/dpo": 2.42465034716588e-06, "losses/sft": 0.917437732219696, "losses/total": 2.42465034716588e-06, "ref_logps/chosen": -245.23294067382812, "ref_logps/rejected": -202.79421997070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1255824565887451, "rewards/margins": 10.12373161315918, "rewards/rejected": -11.249313354492188, "step": 1670 }, { "epoch": 0.4, "learning_rate": 1.3312e-07, "logps/chosen": -247.49661254882812, "logps/rejected": -364.3367004394531, "loss": 0.0012, "losses/dpo": 1.2121763575123623e-06, "losses/sft": 1.053962230682373, "losses/total": 1.2121763575123623e-06, "ref_logps/chosen": -240.10670471191406, "ref_logps/rejected": -248.84573364257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7389898300170898, "rewards/margins": 10.810108184814453, "rewards/rejected": -11.549097061157227, "step": 1671 }, { "epoch": 0.4, "learning_rate": 1.3306666666666667e-07, "logps/chosen": -276.97369384765625, "logps/rejected": -374.12451171875, "loss": 0.0029, "losses/dpo": 9.079317919713503e-08, "losses/sft": 0.4331565797328949, "losses/total": 9.079317919713503e-08, "ref_logps/chosen": -267.87872314453125, "ref_logps/rejected": -249.90016174316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.909498393535614, "rewards/margins": 11.512935638427734, "rewards/rejected": -12.422435760498047, "step": 1672 }, { "epoch": 0.4, "learning_rate": 1.3301333333333334e-07, "logps/chosen": -206.94265747070312, "logps/rejected": -310.7410888671875, "loss": 0.009, "losses/dpo": 1.2424267879396211e-05, "losses/sft": 0.4589279294013977, "losses/total": 1.2424267879396211e-05, "ref_logps/chosen": -199.00164794921875, "ref_logps/rejected": -202.91329956054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7940995693206787, "rewards/margins": 9.988677978515625, "rewards/rejected": -10.782777786254883, "step": 1673 }, { "epoch": 0.4, "learning_rate": 1.3296e-07, "logps/chosen": -226.48486328125, "logps/rejected": -349.053955078125, "loss": 0.0011, "losses/dpo": 2.7664940716931596e-05, "losses/sft": 0.7288601994514465, "losses/total": 2.7664940716931596e-05, "ref_logps/chosen": -218.6662139892578, "ref_logps/rejected": -226.72647094726562, "rewards/accuracies": 1.0, "rewards/chosen": -0.7818642854690552, "rewards/margins": 11.450883865356445, "rewards/rejected": -12.232748031616211, "step": 1674 }, { "epoch": 0.4, "learning_rate": 1.3290666666666664e-07, "logps/chosen": -234.79473876953125, "logps/rejected": -339.640380859375, "loss": 0.0005, "losses/dpo": 3.1371220643450215e-07, "losses/sft": 0.7134463787078857, "losses/total": 3.1371220643450215e-07, "ref_logps/chosen": -225.79653930664062, "ref_logps/rejected": -221.53213500976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8998212814331055, "rewards/margins": 10.911004066467285, "rewards/rejected": -11.81082534790039, "step": 1675 }, { "epoch": 0.4, "learning_rate": 1.3285333333333332e-07, "logps/chosen": -225.01339721679688, "logps/rejected": -303.49957275390625, "loss": 0.0058, "losses/dpo": 1.668243385211099e-06, "losses/sft": 1.2908246517181396, "losses/total": 1.668243385211099e-06, "ref_logps/chosen": -216.24424743652344, "ref_logps/rejected": -192.45761108398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8769149780273438, "rewards/margins": 10.22728157043457, "rewards/rejected": -11.104196548461914, "step": 1676 }, { "epoch": 0.4, "learning_rate": 1.328e-07, "logps/chosen": -215.99771118164062, "logps/rejected": -305.880859375, "loss": 0.0045, "losses/dpo": 3.1059617100481773e-08, "losses/sft": 0.6365119814872742, "losses/total": 3.1059617100481773e-08, "ref_logps/chosen": -208.05299377441406, "ref_logps/rejected": -191.88424682617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7944716215133667, "rewards/margins": 10.60519027709961, "rewards/rejected": -11.399662017822266, "step": 1677 }, { "epoch": 0.4, "learning_rate": 1.3274666666666665e-07, "logps/chosen": -249.7139434814453, "logps/rejected": -333.6048583984375, "loss": 0.0029, "losses/dpo": 7.617685696459375e-06, "losses/sft": 0.6926211714744568, "losses/total": 7.617685696459375e-06, "ref_logps/chosen": -239.55804443359375, "ref_logps/rejected": -220.135986328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0155913829803467, "rewards/margins": 10.331295013427734, "rewards/rejected": -11.34688663482666, "step": 1678 }, { "epoch": 0.4, "learning_rate": 1.3269333333333333e-07, "logps/chosen": -208.32891845703125, "logps/rejected": -309.081787109375, "loss": 0.0074, "losses/dpo": 1.5800941355337272e-07, "losses/sft": 0.8124882578849792, "losses/total": 1.5800941355337272e-07, "ref_logps/chosen": -197.41207885742188, "ref_logps/rejected": -200.46694946289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.091685175895691, "rewards/margins": 9.769798278808594, "rewards/rejected": -10.86148452758789, "step": 1679 }, { "epoch": 0.4, "learning_rate": 1.3264e-07, "logps/chosen": -220.68983459472656, "logps/rejected": -368.85009765625, "loss": 0.0026, "losses/dpo": 3.1772691500009387e-07, "losses/sft": 0.5996146202087402, "losses/total": 3.1772691500009387e-07, "ref_logps/chosen": -212.79696655273438, "ref_logps/rejected": -242.1004638671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7892859578132629, "rewards/margins": 11.885675430297852, "rewards/rejected": -12.67496109008789, "step": 1680 }, { "epoch": 0.4, "learning_rate": 1.3258666666666668e-07, "logps/chosen": -270.399169921875, "logps/rejected": -371.6824645996094, "loss": 0.0022, "losses/dpo": 2.323120497749187e-06, "losses/sft": 1.0781254768371582, "losses/total": 2.323120497749187e-06, "ref_logps/chosen": -258.69305419921875, "ref_logps/rejected": -241.32333374023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1706119775772095, "rewards/margins": 11.865303039550781, "rewards/rejected": -13.035913467407227, "step": 1681 }, { "epoch": 0.4, "learning_rate": 1.3253333333333333e-07, "logps/chosen": -193.95176696777344, "logps/rejected": -304.7308349609375, "loss": 0.0058, "losses/dpo": 1.3609824236482382e-05, "losses/sft": 0.6380676031112671, "losses/total": 1.3609824236482382e-05, "ref_logps/chosen": -185.80105590820312, "ref_logps/rejected": -198.47947692871094, "rewards/accuracies": 1.0, "rewards/chosen": -0.8150713443756104, "rewards/margins": 9.810064315795898, "rewards/rejected": -10.62513542175293, "step": 1682 }, { "epoch": 0.4, "learning_rate": 1.3247999999999998e-07, "logps/chosen": -244.1621856689453, "logps/rejected": -341.8277893066406, "loss": 0.0057, "losses/dpo": 9.067404107554466e-07, "losses/sft": 0.593812108039856, "losses/total": 9.067404107554466e-07, "ref_logps/chosen": -237.0030975341797, "ref_logps/rejected": -215.2578582763672, "rewards/accuracies": 1.0, "rewards/chosen": -0.7159087657928467, "rewards/margins": 11.941084861755371, "rewards/rejected": -12.656993865966797, "step": 1683 }, { "epoch": 0.4, "learning_rate": 1.3242666666666666e-07, "logps/chosen": -261.61614990234375, "logps/rejected": -329.0920715332031, "loss": 0.0044, "losses/dpo": 1.7402029470758862e-06, "losses/sft": 0.8179904818534851, "losses/total": 1.7402029470758862e-06, "ref_logps/chosen": -251.51036071777344, "ref_logps/rejected": -213.47003173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0105793476104736, "rewards/margins": 10.551624298095703, "rewards/rejected": -11.562204360961914, "step": 1684 }, { "epoch": 0.4, "learning_rate": 1.3237333333333333e-07, "logps/chosen": -222.46803283691406, "logps/rejected": -331.50311279296875, "loss": 0.0012, "losses/dpo": 2.3202255761134438e-07, "losses/sft": 0.48262935876846313, "losses/total": 2.3202255761134438e-07, "ref_logps/chosen": -215.28424072265625, "ref_logps/rejected": -212.89096069335938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7183786630630493, "rewards/margins": 11.14283561706543, "rewards/rejected": -11.861213684082031, "step": 1685 }, { "epoch": 0.4, "learning_rate": 1.3231999999999998e-07, "logps/chosen": -208.01675415039062, "logps/rejected": -319.21044921875, "loss": 0.0053, "losses/dpo": 9.41619315852904e-09, "losses/sft": 0.807633101940155, "losses/total": 9.41619315852904e-09, "ref_logps/chosen": -200.1741943359375, "ref_logps/rejected": -206.2841796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7842559218406677, "rewards/margins": 10.508370399475098, "rewards/rejected": -11.29262638092041, "step": 1686 }, { "epoch": 0.4, "learning_rate": 1.3226666666666666e-07, "logps/chosen": -208.80294799804688, "logps/rejected": -295.33428955078125, "loss": 0.0161, "losses/dpo": 5.681807238033798e-07, "losses/sft": 0.9790213108062744, "losses/total": 5.681807238033798e-07, "ref_logps/chosen": -200.47283935546875, "ref_logps/rejected": -201.65536499023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8330106735229492, "rewards/margins": 8.534881591796875, "rewards/rejected": -9.367892265319824, "step": 1687 }, { "epoch": 0.41, "learning_rate": 1.3221333333333334e-07, "logps/chosen": -276.984130859375, "logps/rejected": -364.2191162109375, "loss": 0.0069, "losses/dpo": 1.0098488018028817e-10, "losses/sft": 0.4626176655292511, "losses/total": 1.0098488018028817e-10, "ref_logps/chosen": -269.2478332519531, "ref_logps/rejected": -234.61936950683594, "rewards/accuracies": 1.0, "rewards/chosen": -0.7736303806304932, "rewards/margins": 12.186344146728516, "rewards/rejected": -12.95997428894043, "step": 1688 }, { "epoch": 0.41, "learning_rate": 1.3216000000000001e-07, "logps/chosen": -252.05526733398438, "logps/rejected": -357.67608642578125, "loss": 0.0041, "losses/dpo": 5.340807547327131e-07, "losses/sft": 0.527899444103241, "losses/total": 5.340807547327131e-07, "ref_logps/chosen": -242.89463806152344, "ref_logps/rejected": -235.26760864257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.9160635471343994, "rewards/margins": 11.324789047241211, "rewards/rejected": -12.240852355957031, "step": 1689 }, { "epoch": 0.41, "learning_rate": 1.3210666666666666e-07, "logps/chosen": -223.46286010742188, "logps/rejected": -297.606689453125, "loss": 0.0098, "losses/dpo": 9.097626389120705e-06, "losses/sft": 1.0492918491363525, "losses/total": 9.097626389120705e-06, "ref_logps/chosen": -216.73834228515625, "ref_logps/rejected": -195.76803588867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.6724509000778198, "rewards/margins": 9.511417388916016, "rewards/rejected": -10.183868408203125, "step": 1690 }, { "epoch": 0.41, "learning_rate": 1.3205333333333334e-07, "logps/chosen": -251.7647705078125, "logps/rejected": -356.3741455078125, "loss": 0.0027, "losses/dpo": 1.076530111276952e-06, "losses/sft": 0.7387228608131409, "losses/total": 1.076530111276952e-06, "ref_logps/chosen": -244.43527221679688, "ref_logps/rejected": -236.84368896484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7329519987106323, "rewards/margins": 11.2200927734375, "rewards/rejected": -11.953044891357422, "step": 1691 }, { "epoch": 0.41, "learning_rate": 1.32e-07, "logps/chosen": -242.4263153076172, "logps/rejected": -354.8089294433594, "loss": 0.0025, "losses/dpo": 0.0004952008021064103, "losses/sft": 0.4893952012062073, "losses/total": 0.0004952008021064103, "ref_logps/chosen": -234.30120849609375, "ref_logps/rejected": -239.82778930664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8125119209289551, "rewards/margins": 10.685601234436035, "rewards/rejected": -11.498113632202148, "step": 1692 }, { "epoch": 0.41, "learning_rate": 1.3194666666666664e-07, "logps/chosen": -236.65084838867188, "logps/rejected": -300.8330078125, "loss": 0.0094, "losses/dpo": 4.9773789214668795e-06, "losses/sft": 0.7372027039527893, "losses/total": 4.9773789214668795e-06, "ref_logps/chosen": -228.00167846679688, "ref_logps/rejected": -190.0402374267578, "rewards/accuracies": 1.0, "rewards/chosen": -0.8649178743362427, "rewards/margins": 10.214357376098633, "rewards/rejected": -11.079275131225586, "step": 1693 }, { "epoch": 0.41, "learning_rate": 1.3189333333333332e-07, "logps/chosen": -220.21920776367188, "logps/rejected": -293.84490966796875, "loss": 0.0194, "losses/dpo": 3.823226961685577e-06, "losses/sft": 1.0120047330856323, "losses/total": 3.823226961685577e-06, "ref_logps/chosen": -214.95013427734375, "ref_logps/rejected": -190.5931396484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5269076824188232, "rewards/margins": 9.798271179199219, "rewards/rejected": -10.325179100036621, "step": 1694 }, { "epoch": 0.41, "learning_rate": 1.3184e-07, "logps/chosen": -271.163818359375, "logps/rejected": -356.34698486328125, "loss": 0.0017, "losses/dpo": 0.0001007541359285824, "losses/sft": 0.5009057521820068, "losses/total": 0.0001007541359285824, "ref_logps/chosen": -261.66326904296875, "ref_logps/rejected": -233.1558837890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9500560164451599, "rewards/margins": 11.369053840637207, "rewards/rejected": -12.319109916687012, "step": 1695 }, { "epoch": 0.41, "learning_rate": 1.3178666666666667e-07, "logps/chosen": -172.66009521484375, "logps/rejected": -318.693115234375, "loss": 0.0051, "losses/dpo": 8.127315595629625e-06, "losses/sft": 0.7280281186103821, "losses/total": 8.127315595629625e-06, "ref_logps/chosen": -165.20309448242188, "ref_logps/rejected": -208.29153442382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7456995248794556, "rewards/margins": 10.294459342956543, "rewards/rejected": -11.040159225463867, "step": 1696 }, { "epoch": 0.41, "learning_rate": 1.3173333333333332e-07, "logps/chosen": -229.73367309570312, "logps/rejected": -340.7404479980469, "loss": 0.0028, "losses/dpo": 2.684436424260639e-07, "losses/sft": 1.0379102230072021, "losses/total": 2.684436424260639e-07, "ref_logps/chosen": -220.2022705078125, "ref_logps/rejected": -221.72393798828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9531382918357849, "rewards/margins": 10.948513984680176, "rewards/rejected": -11.901653289794922, "step": 1697 }, { "epoch": 0.41, "learning_rate": 1.3168e-07, "logps/chosen": -213.57534790039062, "logps/rejected": -297.0679931640625, "loss": 0.0079, "losses/dpo": 1.9645405302526342e-08, "losses/sft": 0.6703286170959473, "losses/total": 1.9645405302526342e-08, "ref_logps/chosen": -205.30116271972656, "ref_logps/rejected": -189.84307861328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8274191617965698, "rewards/margins": 9.89507007598877, "rewards/rejected": -10.722489356994629, "step": 1698 }, { "epoch": 0.41, "learning_rate": 1.3162666666666668e-07, "logps/chosen": -254.80958557128906, "logps/rejected": -356.8031311035156, "loss": 0.0012, "losses/dpo": 2.415699782432057e-06, "losses/sft": 0.5230137705802917, "losses/total": 2.415699782432057e-06, "ref_logps/chosen": -246.06350708007812, "ref_logps/rejected": -229.0286102294922, "rewards/accuracies": 1.0, "rewards/chosen": -0.8746089935302734, "rewards/margins": 11.90284252166748, "rewards/rejected": -12.77745246887207, "step": 1699 }, { "epoch": 0.41, "learning_rate": 1.3157333333333333e-07, "logps/chosen": -211.37103271484375, "logps/rejected": -319.653076171875, "loss": 0.017, "losses/dpo": 6.434603960769891e-08, "losses/sft": 1.0673364400863647, "losses/total": 6.434603960769891e-08, "ref_logps/chosen": -203.3714599609375, "ref_logps/rejected": -212.69436645507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.799956738948822, "rewards/margins": 9.895915031433105, "rewards/rejected": -10.695871353149414, "step": 1700 }, { "epoch": 0.41, "learning_rate": 1.3151999999999998e-07, "logps/chosen": -255.23370361328125, "logps/rejected": -349.60223388671875, "loss": 0.0007, "losses/dpo": 0.0001485422981204465, "losses/sft": 0.46176907420158386, "losses/total": 0.0001485422981204465, "ref_logps/chosen": -248.29409790039062, "ref_logps/rejected": -230.83412170410156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6939592957496643, "rewards/margins": 11.182851791381836, "rewards/rejected": -11.876811027526855, "step": 1701 }, { "epoch": 0.41, "learning_rate": 1.3146666666666665e-07, "logps/chosen": -240.3405303955078, "logps/rejected": -353.09918212890625, "loss": 0.003, "losses/dpo": 6.464041035769696e-08, "losses/sft": 0.5951628088951111, "losses/total": 6.464041035769696e-08, "ref_logps/chosen": -230.36866760253906, "ref_logps/rejected": -218.98483276367188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9971863031387329, "rewards/margins": 12.414250373840332, "rewards/rejected": -13.411436080932617, "step": 1702 }, { "epoch": 0.41, "learning_rate": 1.3141333333333333e-07, "logps/chosen": -229.60858154296875, "logps/rejected": -363.5487365722656, "loss": 0.0009, "losses/dpo": 5.594758931692922e-07, "losses/sft": 0.6497723460197449, "losses/total": 5.594758931692922e-07, "ref_logps/chosen": -220.92591857910156, "ref_logps/rejected": -237.52133178710938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8682657480239868, "rewards/margins": 11.734475135803223, "rewards/rejected": -12.602741241455078, "step": 1703 }, { "epoch": 0.41, "learning_rate": 1.3136e-07, "logps/chosen": -228.78585815429688, "logps/rejected": -320.1768493652344, "loss": 0.0129, "losses/dpo": 0.37323829531669617, "losses/sft": 0.927075982093811, "losses/total": 0.37323829531669617, "ref_logps/chosen": -221.57598876953125, "ref_logps/rejected": -202.66757202148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7209867835044861, "rewards/margins": 11.029939651489258, "rewards/rejected": -11.750926971435547, "step": 1704 }, { "epoch": 0.41, "learning_rate": 1.3130666666666666e-07, "logps/chosen": -296.2284851074219, "logps/rejected": -392.5995788574219, "loss": 0.0002, "losses/dpo": 6.602546363865258e-06, "losses/sft": 0.5671896934509277, "losses/total": 6.602546363865258e-06, "ref_logps/chosen": -286.1260681152344, "ref_logps/rejected": -267.58758544921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0102407932281494, "rewards/margins": 11.490959167480469, "rewards/rejected": -12.501199722290039, "step": 1705 }, { "epoch": 0.41, "learning_rate": 1.3125333333333333e-07, "logps/chosen": -215.3690948486328, "logps/rejected": -323.1369323730469, "loss": 0.0067, "losses/dpo": 9.126102668233216e-06, "losses/sft": 0.7199310064315796, "losses/total": 9.126102668233216e-06, "ref_logps/chosen": -209.13250732421875, "ref_logps/rejected": -209.00283813476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.6236591935157776, "rewards/margins": 10.789752006530762, "rewards/rejected": -11.413412094116211, "step": 1706 }, { "epoch": 0.41, "learning_rate": 1.312e-07, "logps/chosen": -207.45819091796875, "logps/rejected": -341.74639892578125, "loss": 0.001, "losses/dpo": 1.7542846535434364e-06, "losses/sft": 0.9422776103019714, "losses/total": 1.7542846535434364e-06, "ref_logps/chosen": -201.35546875, "ref_logps/rejected": -227.3302001953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6102726459503174, "rewards/margins": 10.831348419189453, "rewards/rejected": -11.441619873046875, "step": 1707 }, { "epoch": 0.41, "learning_rate": 1.3114666666666666e-07, "logps/chosen": -207.11720275878906, "logps/rejected": -296.7895202636719, "loss": 0.0073, "losses/dpo": 3.3168944355566055e-05, "losses/sft": 0.453104704618454, "losses/total": 3.3168944355566055e-05, "ref_logps/chosen": -200.47000122070312, "ref_logps/rejected": -197.34725952148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.6647195816040039, "rewards/margins": 9.279504776000977, "rewards/rejected": -9.944223403930664, "step": 1708 }, { "epoch": 0.41, "learning_rate": 1.310933333333333e-07, "logps/chosen": -217.21148681640625, "logps/rejected": -327.57568359375, "loss": 0.01, "losses/dpo": 4.024190275231376e-05, "losses/sft": 0.5844886898994446, "losses/total": 4.024190275231376e-05, "ref_logps/chosen": -209.41128540039062, "ref_logps/rejected": -211.81939697265625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7800216674804688, "rewards/margins": 10.795608520507812, "rewards/rejected": -11.575630187988281, "step": 1709 }, { "epoch": 0.41, "learning_rate": 1.3104e-07, "logps/chosen": -262.1650695800781, "logps/rejected": -349.84210205078125, "loss": 0.0007, "losses/dpo": 5.648510068567703e-07, "losses/sft": 0.6155184507369995, "losses/total": 5.648510068567703e-07, "ref_logps/chosen": -253.2959747314453, "ref_logps/rejected": -225.72003173828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8869084119796753, "rewards/margins": 11.525300025939941, "rewards/rejected": -12.412208557128906, "step": 1710 }, { "epoch": 0.41, "learning_rate": 1.3098666666666666e-07, "logps/chosen": -263.14129638671875, "logps/rejected": -348.73846435546875, "loss": 0.0088, "losses/dpo": 1.1682894235365282e-11, "losses/sft": 0.5694514513015747, "losses/total": 1.1682894235365282e-11, "ref_logps/chosen": -255.64102172851562, "ref_logps/rejected": -231.1090850830078, "rewards/accuracies": 1.0, "rewards/chosen": -0.7500290870666504, "rewards/margins": 11.012908935546875, "rewards/rejected": -11.762937545776367, "step": 1711 }, { "epoch": 0.41, "learning_rate": 1.3093333333333331e-07, "logps/chosen": -188.6166229248047, "logps/rejected": -303.18597412109375, "loss": 0.0103, "losses/dpo": 6.568110499927116e-09, "losses/sft": 0.9553936719894409, "losses/total": 6.568110499927116e-09, "ref_logps/chosen": -182.63819885253906, "ref_logps/rejected": -193.92349243164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5978437662124634, "rewards/margins": 10.32840347290039, "rewards/rejected": -10.926247596740723, "step": 1712 }, { "epoch": 0.41, "learning_rate": 1.3088e-07, "logps/chosen": -268.44305419921875, "logps/rejected": -378.4866943359375, "loss": 0.0054, "losses/dpo": 3.3121308661065996e-05, "losses/sft": 0.5844837427139282, "losses/total": 3.3121308661065996e-05, "ref_logps/chosen": -258.67071533203125, "ref_logps/rejected": -242.82406616210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.9772318601608276, "rewards/margins": 12.589031219482422, "rewards/rejected": -13.566262245178223, "step": 1713 }, { "epoch": 0.41, "learning_rate": 1.3082666666666667e-07, "logps/chosen": -225.26947021484375, "logps/rejected": -318.9975280761719, "loss": 0.0136, "losses/dpo": 2.2881962649989873e-05, "losses/sft": 0.6625933647155762, "losses/total": 2.2881962649989873e-05, "ref_logps/chosen": -216.8372344970703, "ref_logps/rejected": -205.68582153320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.8432222604751587, "rewards/margins": 10.487948417663574, "rewards/rejected": -11.331170082092285, "step": 1714 }, { "epoch": 0.41, "learning_rate": 1.3077333333333334e-07, "logps/chosen": -194.95420837402344, "logps/rejected": -306.3331604003906, "loss": 0.0037, "losses/dpo": 0.00015077085117809474, "losses/sft": 0.6237480640411377, "losses/total": 0.00015077085117809474, "ref_logps/chosen": -186.63650512695312, "ref_logps/rejected": -204.01287841796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8317692875862122, "rewards/margins": 9.400259017944336, "rewards/rejected": -10.232028007507324, "step": 1715 }, { "epoch": 0.41, "learning_rate": 1.3072e-07, "logps/chosen": -261.7964782714844, "logps/rejected": -362.5114440917969, "loss": 0.0058, "losses/dpo": 5.616570888378192e-07, "losses/sft": 0.6239914894104004, "losses/total": 5.616570888378192e-07, "ref_logps/chosen": -251.6476593017578, "ref_logps/rejected": -236.58212280273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0148818492889404, "rewards/margins": 11.57805061340332, "rewards/rejected": -12.592931747436523, "step": 1716 }, { "epoch": 0.41, "learning_rate": 1.3066666666666665e-07, "logps/chosen": -232.71478271484375, "logps/rejected": -327.56494140625, "loss": 0.0033, "losses/dpo": 1.2138166027853003e-07, "losses/sft": 0.6527310609817505, "losses/total": 1.2138166027853003e-07, "ref_logps/chosen": -226.15191650390625, "ref_logps/rejected": -209.222412109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6562860608100891, "rewards/margins": 11.177966117858887, "rewards/rejected": -11.83425235748291, "step": 1717 }, { "epoch": 0.41, "learning_rate": 1.3061333333333332e-07, "logps/chosen": -266.31890869140625, "logps/rejected": -316.976318359375, "loss": 0.0034, "losses/dpo": 5.4460615501739085e-05, "losses/sft": 0.5425869226455688, "losses/total": 5.4460615501739085e-05, "ref_logps/chosen": -254.7071533203125, "ref_logps/rejected": -207.70742797851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1611757278442383, "rewards/margins": 9.765708923339844, "rewards/rejected": -10.926883697509766, "step": 1718 }, { "epoch": 0.41, "learning_rate": 1.3056e-07, "logps/chosen": -251.49807739257812, "logps/rejected": -389.843505859375, "loss": 0.0077, "losses/dpo": 7.2032626121654175e-06, "losses/sft": 0.5111322402954102, "losses/total": 7.2032626121654175e-06, "ref_logps/chosen": -240.2254638671875, "ref_logps/rejected": -257.5206298828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1272602081298828, "rewards/margins": 12.105026245117188, "rewards/rejected": -13.23228645324707, "step": 1719 }, { "epoch": 0.41, "learning_rate": 1.3050666666666665e-07, "logps/chosen": -248.7764129638672, "logps/rejected": -359.9925537109375, "loss": 0.0006, "losses/dpo": 1.3748089031651034e-06, "losses/sft": 0.6465226411819458, "losses/total": 1.3748089031651034e-06, "ref_logps/chosen": -238.20721435546875, "ref_logps/rejected": -232.46926879882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.0569192171096802, "rewards/margins": 11.695412635803223, "rewards/rejected": -12.752331733703613, "step": 1720 }, { "epoch": 0.41, "learning_rate": 1.3045333333333333e-07, "logps/chosen": -219.80494689941406, "logps/rejected": -334.77117919921875, "loss": 0.003, "losses/dpo": 4.631261276699661e-07, "losses/sft": 0.5596633553504944, "losses/total": 4.631261276699661e-07, "ref_logps/chosen": -209.6774139404297, "ref_logps/rejected": -219.99050903320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0127530097961426, "rewards/margins": 10.465316772460938, "rewards/rejected": -11.478070259094238, "step": 1721 }, { "epoch": 0.41, "learning_rate": 1.304e-07, "logps/chosen": -214.12611389160156, "logps/rejected": -337.82012939453125, "loss": 0.0017, "losses/dpo": 6.372337338689249e-06, "losses/sft": 0.668906033039093, "losses/total": 6.372337338689249e-06, "ref_logps/chosen": -205.0568389892578, "ref_logps/rejected": -219.85006713867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9069278836250305, "rewards/margins": 10.890080451965332, "rewards/rejected": -11.797008514404297, "step": 1722 }, { "epoch": 0.41, "learning_rate": 1.3034666666666668e-07, "logps/chosen": -235.8888397216797, "logps/rejected": -355.4464416503906, "loss": 0.0024, "losses/dpo": 0.00021123638725839555, "losses/sft": 0.680051863193512, "losses/total": 0.00021123638725839555, "ref_logps/chosen": -228.88607788085938, "ref_logps/rejected": -236.15383911132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7002745866775513, "rewards/margins": 11.228986740112305, "rewards/rejected": -11.92926025390625, "step": 1723 }, { "epoch": 0.41, "learning_rate": 1.3029333333333333e-07, "logps/chosen": -289.8806457519531, "logps/rejected": -382.94329833984375, "loss": 0.0041, "losses/dpo": 6.046465159670333e-07, "losses/sft": 0.6737843751907349, "losses/total": 6.046465159670333e-07, "ref_logps/chosen": -281.31610107421875, "ref_logps/rejected": -254.03421020507812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8564544916152954, "rewards/margins": 12.034455299377441, "rewards/rejected": -12.890910148620605, "step": 1724 }, { "epoch": 0.41, "learning_rate": 1.3024e-07, "logps/chosen": -219.63311767578125, "logps/rejected": -327.16424560546875, "loss": 0.0041, "losses/dpo": 9.559129239278263e-07, "losses/sft": 0.6698132157325745, "losses/total": 9.559129239278263e-07, "ref_logps/chosen": -213.7523651123047, "ref_logps/rejected": -213.18243408203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.588074803352356, "rewards/margins": 10.810104370117188, "rewards/rejected": -11.398178100585938, "step": 1725 }, { "epoch": 0.41, "learning_rate": 1.3018666666666666e-07, "logps/chosen": -224.45098876953125, "logps/rejected": -320.42864990234375, "loss": 0.0074, "losses/dpo": 3.925360033463221e-06, "losses/sft": 0.5520684719085693, "losses/total": 3.925360033463221e-06, "ref_logps/chosen": -218.87881469726562, "ref_logps/rejected": -210.6466522216797, "rewards/accuracies": 1.0, "rewards/chosen": -0.5572171211242676, "rewards/margins": 10.420981407165527, "rewards/rejected": -10.978199005126953, "step": 1726 }, { "epoch": 0.41, "learning_rate": 1.301333333333333e-07, "logps/chosen": -220.18911743164062, "logps/rejected": -326.26104736328125, "loss": 0.0035, "losses/dpo": 4.7179955231513304e-07, "losses/sft": 0.6167248487472534, "losses/total": 4.7179955231513304e-07, "ref_logps/chosen": -212.0715789794922, "ref_logps/rejected": -215.044189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8117536306381226, "rewards/margins": 10.309932708740234, "rewards/rejected": -11.121686935424805, "step": 1727 }, { "epoch": 0.41, "learning_rate": 1.3007999999999998e-07, "logps/chosen": -231.69354248046875, "logps/rejected": -343.8896789550781, "loss": 0.0103, "losses/dpo": 2.2143138878050195e-08, "losses/sft": 0.46720603108406067, "losses/total": 2.2143138878050195e-08, "ref_logps/chosen": -223.62379455566406, "ref_logps/rejected": -226.96607971191406, "rewards/accuracies": 1.0, "rewards/chosen": -0.8069745302200317, "rewards/margins": 10.885385513305664, "rewards/rejected": -11.692359924316406, "step": 1728 }, { "epoch": 0.41, "learning_rate": 1.3002666666666666e-07, "logps/chosen": -257.13031005859375, "logps/rejected": -351.23406982421875, "loss": 0.0005, "losses/dpo": 3.30899933942419e-06, "losses/sft": 0.5078000426292419, "losses/total": 3.30899933942419e-06, "ref_logps/chosen": -244.52418518066406, "ref_logps/rejected": -229.90550231933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2606127262115479, "rewards/margins": 10.872243881225586, "rewards/rejected": -12.132857322692871, "step": 1729 }, { "epoch": 0.42, "learning_rate": 1.2997333333333334e-07, "logps/chosen": -203.2530517578125, "logps/rejected": -345.75140380859375, "loss": 0.0022, "losses/dpo": 3.120212355156582e-08, "losses/sft": 0.4619942903518677, "losses/total": 3.120212355156582e-08, "ref_logps/chosen": -196.24404907226562, "ref_logps/rejected": -225.53199768066406, "rewards/accuracies": 1.0, "rewards/chosen": -0.7008994817733765, "rewards/margins": 11.321039199829102, "rewards/rejected": -12.02193832397461, "step": 1730 }, { "epoch": 0.42, "learning_rate": 1.2992e-07, "logps/chosen": -269.6754455566406, "logps/rejected": -380.16461181640625, "loss": 0.0023, "losses/dpo": 6.553909770445898e-08, "losses/sft": 0.5871347188949585, "losses/total": 6.553909770445898e-08, "ref_logps/chosen": -261.506591796875, "ref_logps/rejected": -258.57135009765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8168826103210449, "rewards/margins": 11.34244155883789, "rewards/rejected": -12.159324645996094, "step": 1731 }, { "epoch": 0.42, "learning_rate": 1.2986666666666666e-07, "logps/chosen": -266.3621826171875, "logps/rejected": -347.165283203125, "loss": 0.0048, "losses/dpo": 9.180443885270506e-05, "losses/sft": 0.7900704741477966, "losses/total": 9.180443885270506e-05, "ref_logps/chosen": -257.67694091796875, "ref_logps/rejected": -227.77134704589844, "rewards/accuracies": 1.0, "rewards/chosen": -0.8685258626937866, "rewards/margins": 11.070870399475098, "rewards/rejected": -11.939395904541016, "step": 1732 }, { "epoch": 0.42, "learning_rate": 1.2981333333333334e-07, "logps/chosen": -234.4033203125, "logps/rejected": -338.8565673828125, "loss": 0.0008, "losses/dpo": 4.5362453420239035e-06, "losses/sft": 0.5530555248260498, "losses/total": 4.5362453420239035e-06, "ref_logps/chosen": -228.84185791015625, "ref_logps/rejected": -223.39224243164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5561482906341553, "rewards/margins": 10.990283966064453, "rewards/rejected": -11.546432495117188, "step": 1733 }, { "epoch": 0.42, "learning_rate": 1.2976e-07, "logps/chosen": -230.94439697265625, "logps/rejected": -368.1627197265625, "loss": 0.0017, "losses/dpo": 9.403645526617765e-05, "losses/sft": 0.6865830421447754, "losses/total": 9.403645526617765e-05, "ref_logps/chosen": -221.43649291992188, "ref_logps/rejected": -246.1226043701172, "rewards/accuracies": 1.0, "rewards/chosen": -0.9507906436920166, "rewards/margins": 11.25322151184082, "rewards/rejected": -12.204011917114258, "step": 1734 }, { "epoch": 0.42, "learning_rate": 1.2970666666666664e-07, "logps/chosen": -223.21669006347656, "logps/rejected": -351.9604797363281, "loss": 0.0053, "losses/dpo": 8.219786451491018e-08, "losses/sft": 0.5742188692092896, "losses/total": 8.219786451491018e-08, "ref_logps/chosen": -214.3001251220703, "ref_logps/rejected": -223.44342041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.891658365726471, "rewards/margins": 11.96004867553711, "rewards/rejected": -12.851707458496094, "step": 1735 }, { "epoch": 0.42, "learning_rate": 1.2965333333333332e-07, "logps/chosen": -227.7704620361328, "logps/rejected": -284.4403991699219, "loss": 0.0021, "losses/dpo": 2.0546713130897842e-07, "losses/sft": 0.9440471529960632, "losses/total": 2.0546713130897842e-07, "ref_logps/chosen": -218.01495361328125, "ref_logps/rejected": -174.59896850585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.9755522012710571, "rewards/margins": 10.008589744567871, "rewards/rejected": -10.984142303466797, "step": 1736 }, { "epoch": 0.42, "learning_rate": 1.296e-07, "logps/chosen": -202.79148864746094, "logps/rejected": -317.72607421875, "loss": 0.0162, "losses/dpo": 9.447500701753597e-07, "losses/sft": 0.5619181394577026, "losses/total": 9.447500701753597e-07, "ref_logps/chosen": -194.7307891845703, "ref_logps/rejected": -203.62367248535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.8060685992240906, "rewards/margins": 10.604168891906738, "rewards/rejected": -11.410238265991211, "step": 1737 }, { "epoch": 0.42, "learning_rate": 1.2954666666666667e-07, "logps/chosen": -274.41400146484375, "logps/rejected": -365.9372863769531, "loss": 0.0026, "losses/dpo": 2.8561430553963874e-06, "losses/sft": 0.7502959966659546, "losses/total": 2.8561430553963874e-06, "ref_logps/chosen": -260.9273681640625, "ref_logps/rejected": -238.82760620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3486602306365967, "rewards/margins": 11.362306594848633, "rewards/rejected": -12.710967063903809, "step": 1738 }, { "epoch": 0.42, "learning_rate": 1.2949333333333332e-07, "logps/chosen": -229.73544311523438, "logps/rejected": -347.8154602050781, "loss": 0.004, "losses/dpo": 5.969531684968388e-06, "losses/sft": 0.7853083610534668, "losses/total": 5.969531684968388e-06, "ref_logps/chosen": -219.20692443847656, "ref_logps/rejected": -215.70767211914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0528510808944702, "rewards/margins": 12.157927513122559, "rewards/rejected": -13.210777282714844, "step": 1739 }, { "epoch": 0.42, "learning_rate": 1.2944e-07, "logps/chosen": -248.75997924804688, "logps/rejected": -331.90283203125, "loss": 0.005, "losses/dpo": 7.252585055539384e-05, "losses/sft": 0.4153175950050354, "losses/total": 7.252585055539384e-05, "ref_logps/chosen": -238.68711853027344, "ref_logps/rejected": -219.3727569580078, "rewards/accuracies": 1.0, "rewards/chosen": -1.0072870254516602, "rewards/margins": 10.245721817016602, "rewards/rejected": -11.253008842468262, "step": 1740 }, { "epoch": 0.42, "learning_rate": 1.2938666666666668e-07, "logps/chosen": -223.84307861328125, "logps/rejected": -309.2828369140625, "loss": 0.0046, "losses/dpo": 3.202563547688442e-08, "losses/sft": 0.9038119316101074, "losses/total": 3.202563547688442e-08, "ref_logps/chosen": -217.86477661132812, "ref_logps/rejected": -203.82437133789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.5978270173072815, "rewards/margins": 9.948020935058594, "rewards/rejected": -10.54584789276123, "step": 1741 }, { "epoch": 0.42, "learning_rate": 1.2933333333333333e-07, "logps/chosen": -253.03871154785156, "logps/rejected": -343.7923583984375, "loss": 0.0044, "losses/dpo": 1.1496459819682059e-06, "losses/sft": 0.591376543045044, "losses/total": 1.1496459819682059e-06, "ref_logps/chosen": -245.93707275390625, "ref_logps/rejected": -220.86962890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7101619243621826, "rewards/margins": 11.582109451293945, "rewards/rejected": -12.292271614074707, "step": 1742 }, { "epoch": 0.42, "learning_rate": 1.2927999999999998e-07, "logps/chosen": -247.9513397216797, "logps/rejected": -353.59173583984375, "loss": 0.0185, "losses/dpo": 9.90223103514154e-09, "losses/sft": 1.258918285369873, "losses/total": 9.90223103514154e-09, "ref_logps/chosen": -241.1789093017578, "ref_logps/rejected": -232.68826293945312, "rewards/accuracies": 1.0, "rewards/chosen": -0.6772443056106567, "rewards/margins": 11.413101196289062, "rewards/rejected": -12.09034538269043, "step": 1743 }, { "epoch": 0.42, "learning_rate": 1.2922666666666665e-07, "logps/chosen": -232.6693572998047, "logps/rejected": -322.9033203125, "loss": 0.0022, "losses/dpo": 3.2404793159912515e-07, "losses/sft": 0.6572991013526917, "losses/total": 3.2404793159912515e-07, "ref_logps/chosen": -224.86862182617188, "ref_logps/rejected": -204.65457153320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.7800739407539368, "rewards/margins": 11.044800758361816, "rewards/rejected": -11.824874877929688, "step": 1744 }, { "epoch": 0.42, "learning_rate": 1.2917333333333333e-07, "logps/chosen": -250.17288208007812, "logps/rejected": -351.1876220703125, "loss": 0.0071, "losses/dpo": 2.730090216118697e-07, "losses/sft": 0.4650020897388458, "losses/total": 2.730090216118697e-07, "ref_logps/chosen": -240.87954711914062, "ref_logps/rejected": -232.98330688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9293322563171387, "rewards/margins": 10.891100883483887, "rewards/rejected": -11.820432662963867, "step": 1745 }, { "epoch": 0.42, "learning_rate": 1.2911999999999998e-07, "logps/chosen": -239.9639434814453, "logps/rejected": -326.2256774902344, "loss": 0.0008, "losses/dpo": 5.9876867453567684e-05, "losses/sft": 0.8199329376220703, "losses/total": 5.9876867453567684e-05, "ref_logps/chosen": -229.96482849121094, "ref_logps/rejected": -209.17088317871094, "rewards/accuracies": 1.0, "rewards/chosen": -0.9999101161956787, "rewards/margins": 10.705568313598633, "rewards/rejected": -11.70547866821289, "step": 1746 }, { "epoch": 0.42, "learning_rate": 1.2906666666666666e-07, "logps/chosen": -277.8196105957031, "logps/rejected": -357.07855224609375, "loss": 0.0038, "losses/dpo": 7.897912837506738e-06, "losses/sft": 0.584273636341095, "losses/total": 7.897912837506738e-06, "ref_logps/chosen": -270.4485168457031, "ref_logps/rejected": -246.38143920898438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7371122241020203, "rewards/margins": 10.332595825195312, "rewards/rejected": -11.069709777832031, "step": 1747 }, { "epoch": 0.42, "learning_rate": 1.2901333333333333e-07, "logps/chosen": -229.5089874267578, "logps/rejected": -325.8025817871094, "loss": 0.0117, "losses/dpo": 1.8387670934316702e-05, "losses/sft": 0.624566912651062, "losses/total": 1.8387670934316702e-05, "ref_logps/chosen": -220.69998168945312, "ref_logps/rejected": -211.64871215820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.880899965763092, "rewards/margins": 10.534486770629883, "rewards/rejected": -11.415385246276855, "step": 1748 }, { "epoch": 0.42, "learning_rate": 1.2896e-07, "logps/chosen": -203.8064727783203, "logps/rejected": -351.8326110839844, "loss": 0.0064, "losses/dpo": 8.908356903702952e-06, "losses/sft": 0.7957072854042053, "losses/total": 8.908356903702952e-06, "ref_logps/chosen": -193.80703735351562, "ref_logps/rejected": -233.24462890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9999444484710693, "rewards/margins": 10.85885238647461, "rewards/rejected": -11.858797073364258, "step": 1749 }, { "epoch": 0.42, "learning_rate": 1.2890666666666666e-07, "logps/chosen": -217.33741760253906, "logps/rejected": -340.67864990234375, "loss": 0.0036, "losses/dpo": 7.305506733246148e-05, "losses/sft": 0.4429379403591156, "losses/total": 7.305506733246148e-05, "ref_logps/chosen": -208.99098205566406, "ref_logps/rejected": -224.56752014160156, "rewards/accuracies": 1.0, "rewards/chosen": -0.8346443772315979, "rewards/margins": 10.776466369628906, "rewards/rejected": -11.61111068725586, "step": 1750 }, { "epoch": 0.42, "learning_rate": 1.288533333333333e-07, "logps/chosen": -226.69076538085938, "logps/rejected": -319.3272705078125, "loss": 0.0054, "losses/dpo": 6.173873316583922e-06, "losses/sft": 0.6400348544120789, "losses/total": 6.173873316583922e-06, "ref_logps/chosen": -219.46006774902344, "ref_logps/rejected": -210.07220458984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.723071813583374, "rewards/margins": 10.202436447143555, "rewards/rejected": -10.925508499145508, "step": 1751 }, { "epoch": 0.42, "learning_rate": 1.288e-07, "logps/chosen": -243.66571044921875, "logps/rejected": -294.72137451171875, "loss": 0.0084, "losses/dpo": 1.0316074217087134e-08, "losses/sft": 0.6029734015464783, "losses/total": 1.0316074217087134e-08, "ref_logps/chosen": -237.39437866210938, "ref_logps/rejected": -185.83660888671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6271363496780396, "rewards/margins": 10.261338233947754, "rewards/rejected": -10.88847541809082, "step": 1752 }, { "epoch": 0.42, "learning_rate": 1.2874666666666667e-07, "logps/chosen": -199.11427307128906, "logps/rejected": -330.2720947265625, "loss": 0.0045, "losses/dpo": 3.4039273941743886e-06, "losses/sft": 0.5350415706634521, "losses/total": 3.4039273941743886e-06, "ref_logps/chosen": -192.31903076171875, "ref_logps/rejected": -217.23707580566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.6795231103897095, "rewards/margins": 10.623977661132812, "rewards/rejected": -11.30350112915039, "step": 1753 }, { "epoch": 0.42, "learning_rate": 1.2869333333333332e-07, "logps/chosen": -256.30828857421875, "logps/rejected": -371.6541748046875, "loss": 0.0016, "losses/dpo": 2.5571807782398537e-08, "losses/sft": 0.7025841474533081, "losses/total": 2.5571807782398537e-08, "ref_logps/chosen": -250.65762329101562, "ref_logps/rejected": -237.03585815429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.5650684237480164, "rewards/margins": 12.896764755249023, "rewards/rejected": -13.461832046508789, "step": 1754 }, { "epoch": 0.42, "learning_rate": 1.2864e-07, "logps/chosen": -227.79879760742188, "logps/rejected": -334.4470520019531, "loss": 0.01, "losses/dpo": 1.601772758874631e-08, "losses/sft": 1.0567631721496582, "losses/total": 1.601772758874631e-08, "ref_logps/chosen": -217.627685546875, "ref_logps/rejected": -219.73297119140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0171113014221191, "rewards/margins": 10.454296112060547, "rewards/rejected": -11.471406936645508, "step": 1755 }, { "epoch": 0.42, "learning_rate": 1.2858666666666667e-07, "logps/chosen": -282.08624267578125, "logps/rejected": -382.776611328125, "loss": 0.0043, "losses/dpo": 1.0783802295577516e-08, "losses/sft": 0.5973285436630249, "losses/total": 1.0783802295577516e-08, "ref_logps/chosen": -267.8763427734375, "ref_logps/rejected": -246.31053161621094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4209870100021362, "rewards/margins": 12.22562313079834, "rewards/rejected": -13.646610260009766, "step": 1756 }, { "epoch": 0.42, "learning_rate": 1.2853333333333335e-07, "logps/chosen": -246.91262817382812, "logps/rejected": -373.74554443359375, "loss": 0.0037, "losses/dpo": 7.246570021379739e-05, "losses/sft": 0.38051193952560425, "losses/total": 7.246570021379739e-05, "ref_logps/chosen": -239.4517822265625, "ref_logps/rejected": -246.54139709472656, "rewards/accuracies": 1.0, "rewards/chosen": -0.7460848093032837, "rewards/margins": 11.974329948425293, "rewards/rejected": -12.720415115356445, "step": 1757 }, { "epoch": 0.42, "learning_rate": 1.2848e-07, "logps/chosen": -219.49490356445312, "logps/rejected": -337.59320068359375, "loss": 0.0016, "losses/dpo": 1.3863662218227546e-07, "losses/sft": 0.48721426725387573, "losses/total": 1.3863662218227546e-07, "ref_logps/chosen": -212.09869384765625, "ref_logps/rejected": -219.63888549804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7396190166473389, "rewards/margins": 11.055815696716309, "rewards/rejected": -11.795434951782227, "step": 1758 }, { "epoch": 0.42, "learning_rate": 1.2842666666666667e-07, "logps/chosen": -223.0382843017578, "logps/rejected": -315.78045654296875, "loss": 0.0023, "losses/dpo": 6.657519975306059e-07, "losses/sft": 0.678282618522644, "losses/total": 6.657519975306059e-07, "ref_logps/chosen": -215.55613708496094, "ref_logps/rejected": -201.7598114013672, "rewards/accuracies": 1.0, "rewards/chosen": -0.7482137084007263, "rewards/margins": 10.653850555419922, "rewards/rejected": -11.402064323425293, "step": 1759 }, { "epoch": 0.42, "learning_rate": 1.2837333333333332e-07, "logps/chosen": -272.36163330078125, "logps/rejected": -372.84503173828125, "loss": 0.0062, "losses/dpo": 4.930860608709509e-09, "losses/sft": 0.5583232045173645, "losses/total": 4.930860608709509e-09, "ref_logps/chosen": -260.8491516113281, "ref_logps/rejected": -242.6705780029297, "rewards/accuracies": 1.0, "rewards/chosen": -1.1512506008148193, "rewards/margins": 11.866192817687988, "rewards/rejected": -13.017443656921387, "step": 1760 }, { "epoch": 0.42, "learning_rate": 1.2831999999999997e-07, "logps/chosen": -278.37066650390625, "logps/rejected": -347.4755859375, "loss": 0.0035, "losses/dpo": 4.628519718608004e-07, "losses/sft": 0.5134592056274414, "losses/total": 4.628519718608004e-07, "ref_logps/chosen": -269.5782470703125, "ref_logps/rejected": -225.44894409179688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8792443871498108, "rewards/margins": 11.323419570922852, "rewards/rejected": -12.202665328979492, "step": 1761 }, { "epoch": 0.42, "learning_rate": 1.2826666666666665e-07, "logps/chosen": -241.74429321289062, "logps/rejected": -310.07891845703125, "loss": 0.0012, "losses/dpo": 8.562265563227811e-09, "losses/sft": 0.6292972564697266, "losses/total": 8.562265563227811e-09, "ref_logps/chosen": -233.6403350830078, "ref_logps/rejected": -203.55810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8103947639465332, "rewards/margins": 9.841686248779297, "rewards/rejected": -10.652080535888672, "step": 1762 }, { "epoch": 0.42, "learning_rate": 1.2821333333333333e-07, "logps/chosen": -217.15867614746094, "logps/rejected": -340.6344909667969, "loss": 0.0039, "losses/dpo": 9.792312994250096e-06, "losses/sft": 0.5328431129455566, "losses/total": 9.792312994250096e-06, "ref_logps/chosen": -209.4368896484375, "ref_logps/rejected": -220.477294921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7721790075302124, "rewards/margins": 11.243539810180664, "rewards/rejected": -12.01572036743164, "step": 1763 }, { "epoch": 0.42, "learning_rate": 1.2816e-07, "logps/chosen": -273.73974609375, "logps/rejected": -366.29595947265625, "loss": 0.0025, "losses/dpo": 3.2165866059585824e-07, "losses/sft": 0.7161983847618103, "losses/total": 3.2165866059585824e-07, "ref_logps/chosen": -261.7313537597656, "ref_logps/rejected": -235.21612548828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2008386850357056, "rewards/margins": 11.907146453857422, "rewards/rejected": -13.107985496520996, "step": 1764 }, { "epoch": 0.42, "learning_rate": 1.2810666666666665e-07, "logps/chosen": -247.23333740234375, "logps/rejected": -323.97564697265625, "loss": 0.0042, "losses/dpo": 7.048099881501457e-09, "losses/sft": 0.5252187848091125, "losses/total": 7.048099881501457e-09, "ref_logps/chosen": -237.4349365234375, "ref_logps/rejected": -210.043212890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9798421859741211, "rewards/margins": 10.413402557373047, "rewards/rejected": -11.393243789672852, "step": 1765 }, { "epoch": 0.42, "learning_rate": 1.2805333333333333e-07, "logps/chosen": -237.7022247314453, "logps/rejected": -349.2378234863281, "loss": 0.0012, "losses/dpo": 1.7794160385165014e-06, "losses/sft": 0.8201975226402283, "losses/total": 1.7794160385165014e-06, "ref_logps/chosen": -228.15087890625, "ref_logps/rejected": -221.10769653320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.9551342725753784, "rewards/margins": 11.857881546020508, "rewards/rejected": -12.81301498413086, "step": 1766 }, { "epoch": 0.42, "learning_rate": 1.28e-07, "logps/chosen": -286.241943359375, "logps/rejected": -395.16668701171875, "loss": 0.0004, "losses/dpo": 4.3773273006308955e-08, "losses/sft": 0.485348641872406, "losses/total": 4.3773273006308955e-08, "ref_logps/chosen": -278.420654296875, "ref_logps/rejected": -257.62164306640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7821289300918579, "rewards/margins": 12.972373962402344, "rewards/rejected": -13.75450325012207, "step": 1767 }, { "epoch": 0.42, "learning_rate": 1.2794666666666666e-07, "logps/chosen": -254.836181640625, "logps/rejected": -356.64227294921875, "loss": 0.0015, "losses/dpo": 3.3909188346115116e-07, "losses/sft": 0.4591365456581116, "losses/total": 3.3909188346115116e-07, "ref_logps/chosen": -245.0163116455078, "ref_logps/rejected": -227.2821807861328, "rewards/accuracies": 1.0, "rewards/chosen": -0.9819843769073486, "rewards/margins": 11.954023361206055, "rewards/rejected": -12.93600845336914, "step": 1768 }, { "epoch": 0.42, "learning_rate": 1.278933333333333e-07, "logps/chosen": -266.4690856933594, "logps/rejected": -379.77264404296875, "loss": 0.0022, "losses/dpo": 3.666223165055271e-07, "losses/sft": 0.7034691572189331, "losses/total": 3.666223165055271e-07, "ref_logps/chosen": -256.139404296875, "ref_logps/rejected": -253.9795684814453, "rewards/accuracies": 1.0, "rewards/chosen": -1.0329644680023193, "rewards/margins": 11.54633903503418, "rewards/rejected": -12.579303741455078, "step": 1769 }, { "epoch": 0.42, "learning_rate": 1.2783999999999999e-07, "logps/chosen": -237.98194885253906, "logps/rejected": -374.32598876953125, "loss": 0.0025, "losses/dpo": 2.8617516818485456e-06, "losses/sft": 0.6255119442939758, "losses/total": 2.8617516818485456e-06, "ref_logps/chosen": -228.2210693359375, "ref_logps/rejected": -237.55508422851562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9760887026786804, "rewards/margins": 12.701004028320312, "rewards/rejected": -13.677093505859375, "step": 1770 }, { "epoch": 0.43, "learning_rate": 1.2778666666666666e-07, "logps/chosen": -240.0262908935547, "logps/rejected": -345.8621826171875, "loss": 0.0006, "losses/dpo": 3.15268061656937e-10, "losses/sft": 0.9010862112045288, "losses/total": 3.15268061656937e-10, "ref_logps/chosen": -236.11483764648438, "ref_logps/rejected": -231.97726440429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.39114585518836975, "rewards/margins": 10.997346878051758, "rewards/rejected": -11.388492584228516, "step": 1771 }, { "epoch": 0.43, "learning_rate": 1.2773333333333334e-07, "logps/chosen": -284.5218505859375, "logps/rejected": -369.197509765625, "loss": 0.0016, "losses/dpo": 6.683314524025263e-08, "losses/sft": 0.4128999710083008, "losses/total": 6.683314524025263e-08, "ref_logps/chosen": -272.3862609863281, "ref_logps/rejected": -239.8302001953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2135595083236694, "rewards/margins": 11.723170280456543, "rewards/rejected": -12.936729431152344, "step": 1772 }, { "epoch": 0.43, "learning_rate": 1.2768e-07, "logps/chosen": -248.52720642089844, "logps/rejected": -341.24139404296875, "loss": 0.0014, "losses/dpo": 2.697748300306557e-07, "losses/sft": 0.4856548309326172, "losses/total": 2.697748300306557e-07, "ref_logps/chosen": -238.91941833496094, "ref_logps/rejected": -222.32705688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9607812166213989, "rewards/margins": 10.930654525756836, "rewards/rejected": -11.891435623168945, "step": 1773 }, { "epoch": 0.43, "learning_rate": 1.2762666666666667e-07, "logps/chosen": -233.5127716064453, "logps/rejected": -319.80914306640625, "loss": 0.0023, "losses/dpo": 0.0007779118604958057, "losses/sft": 0.6213931441307068, "losses/total": 0.0007779118604958057, "ref_logps/chosen": -224.930908203125, "ref_logps/rejected": -207.73886108398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8581857681274414, "rewards/margins": 10.348844528198242, "rewards/rejected": -11.207029342651367, "step": 1774 }, { "epoch": 0.43, "learning_rate": 1.2757333333333334e-07, "logps/chosen": -223.8974151611328, "logps/rejected": -328.61334228515625, "loss": 0.0029, "losses/dpo": 2.0216102711856365e-06, "losses/sft": 0.6484996676445007, "losses/total": 2.0216102711856365e-06, "ref_logps/chosen": -216.91049194335938, "ref_logps/rejected": -214.8035430908203, "rewards/accuracies": 1.0, "rewards/chosen": -0.698691189289093, "rewards/margins": 10.682287216186523, "rewards/rejected": -11.380977630615234, "step": 1775 }, { "epoch": 0.43, "learning_rate": 1.2752e-07, "logps/chosen": -225.2972412109375, "logps/rejected": -337.3575439453125, "loss": 0.0073, "losses/dpo": 5.587379092730771e-08, "losses/sft": 0.4877385199069977, "losses/total": 5.587379092730771e-08, "ref_logps/chosen": -219.22259521484375, "ref_logps/rejected": -217.62527465820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.607463002204895, "rewards/margins": 11.365762710571289, "rewards/rejected": -11.973224639892578, "step": 1776 }, { "epoch": 0.43, "learning_rate": 1.2746666666666664e-07, "logps/chosen": -244.26620483398438, "logps/rejected": -328.4784851074219, "loss": 0.0038, "losses/dpo": 5.476881028698699e-07, "losses/sft": 0.6925702691078186, "losses/total": 5.476881028698699e-07, "ref_logps/chosen": -237.32540893554688, "ref_logps/rejected": -213.7164306640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6940792798995972, "rewards/margins": 10.782125473022461, "rewards/rejected": -11.476205825805664, "step": 1777 }, { "epoch": 0.43, "learning_rate": 1.2741333333333332e-07, "logps/chosen": -230.2350311279297, "logps/rejected": -336.2851867675781, "loss": 0.0025, "losses/dpo": 8.661196261527948e-06, "losses/sft": 0.7342872619628906, "losses/total": 8.661196261527948e-06, "ref_logps/chosen": -221.3883514404297, "ref_logps/rejected": -220.37998962402344, "rewards/accuracies": 1.0, "rewards/chosen": -0.8846703767776489, "rewards/margins": 10.705848693847656, "rewards/rejected": -11.590518951416016, "step": 1778 }, { "epoch": 0.43, "learning_rate": 1.2736e-07, "logps/chosen": -243.2621307373047, "logps/rejected": -378.450439453125, "loss": 0.0017, "losses/dpo": 5.385999202189851e-09, "losses/sft": 0.6766730546951294, "losses/total": 5.385999202189851e-09, "ref_logps/chosen": -234.4365234375, "ref_logps/rejected": -247.98744201660156, "rewards/accuracies": 1.0, "rewards/chosen": -0.8825603723526001, "rewards/margins": 12.163738250732422, "rewards/rejected": -13.04629898071289, "step": 1779 }, { "epoch": 0.43, "learning_rate": 1.2730666666666665e-07, "logps/chosen": -233.01101684570312, "logps/rejected": -321.7800598144531, "loss": 0.0048, "losses/dpo": 7.504900736421405e-07, "losses/sft": 0.694031834602356, "losses/total": 7.504900736421405e-07, "ref_logps/chosen": -224.1347198486328, "ref_logps/rejected": -209.27731323242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.8876276612281799, "rewards/margins": 10.362646102905273, "rewards/rejected": -11.250272750854492, "step": 1780 }, { "epoch": 0.43, "learning_rate": 1.2725333333333332e-07, "logps/chosen": -201.17742919921875, "logps/rejected": -345.2278747558594, "loss": 0.001, "losses/dpo": 3.852801455650479e-06, "losses/sft": 0.6652323603630066, "losses/total": 3.852801455650479e-06, "ref_logps/chosen": -194.4175567626953, "ref_logps/rejected": -218.794189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6759871244430542, "rewards/margins": 11.967381477355957, "rewards/rejected": -12.643369674682617, "step": 1781 }, { "epoch": 0.43, "learning_rate": 1.272e-07, "logps/chosen": -214.78622436523438, "logps/rejected": -291.024658203125, "loss": 0.0058, "losses/dpo": 4.2145723000430735e-07, "losses/sft": 0.6393559575080872, "losses/total": 4.2145723000430735e-07, "ref_logps/chosen": -207.1971435546875, "ref_logps/rejected": -184.74952697753906, "rewards/accuracies": 1.0, "rewards/chosen": -0.7589088082313538, "rewards/margins": 9.868603706359863, "rewards/rejected": -10.627511978149414, "step": 1782 }, { "epoch": 0.43, "learning_rate": 1.2714666666666668e-07, "logps/chosen": -269.2264099121094, "logps/rejected": -343.85260009765625, "loss": 0.0085, "losses/dpo": 1.0704807351658019e-07, "losses/sft": 0.774749219417572, "losses/total": 1.0704807351658019e-07, "ref_logps/chosen": -259.7374572753906, "ref_logps/rejected": -226.6303253173828, "rewards/accuracies": 1.0, "rewards/chosen": -0.9488940834999084, "rewards/margins": 10.773334503173828, "rewards/rejected": -11.72222900390625, "step": 1783 }, { "epoch": 0.43, "learning_rate": 1.2709333333333333e-07, "logps/chosen": -235.90625, "logps/rejected": -363.53167724609375, "loss": 0.0069, "losses/dpo": 4.601113090529907e-08, "losses/sft": 0.5799727439880371, "losses/total": 4.601113090529907e-08, "ref_logps/chosen": -230.32496643066406, "ref_logps/rejected": -242.92474365234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.5581280589103699, "rewards/margins": 11.502564430236816, "rewards/rejected": -12.06069278717041, "step": 1784 }, { "epoch": 0.43, "learning_rate": 1.2704e-07, "logps/chosen": -208.52279663085938, "logps/rejected": -306.67218017578125, "loss": 0.0221, "losses/dpo": 4.920182732348621e-07, "losses/sft": 1.0836936235427856, "losses/total": 4.920182732348621e-07, "ref_logps/chosen": -197.35614013671875, "ref_logps/rejected": -198.93106079101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1166653633117676, "rewards/margins": 9.657448768615723, "rewards/rejected": -10.774113655090332, "step": 1785 }, { "epoch": 0.43, "learning_rate": 1.2698666666666665e-07, "logps/chosen": -228.42221069335938, "logps/rejected": -352.1236877441406, "loss": 0.0089, "losses/dpo": 0.0006635939935222268, "losses/sft": 0.6217408180236816, "losses/total": 0.0006635939935222268, "ref_logps/chosen": -217.72247314453125, "ref_logps/rejected": -222.9981689453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0699726343154907, "rewards/margins": 11.842577934265137, "rewards/rejected": -12.912550926208496, "step": 1786 }, { "epoch": 0.43, "learning_rate": 1.2693333333333333e-07, "logps/chosen": -232.49472045898438, "logps/rejected": -352.7695617675781, "loss": 0.0009, "losses/dpo": 8.106763971227338e-08, "losses/sft": 0.4985452890396118, "losses/total": 8.106763971227338e-08, "ref_logps/chosen": -224.36105346679688, "ref_logps/rejected": -223.75979614257812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8133639693260193, "rewards/margins": 12.08761215209961, "rewards/rejected": -12.900976181030273, "step": 1787 }, { "epoch": 0.43, "learning_rate": 1.2687999999999998e-07, "logps/chosen": -253.51339721679688, "logps/rejected": -386.762451171875, "loss": 0.0015, "losses/dpo": 4.668794417739264e-08, "losses/sft": 0.6658831834793091, "losses/total": 4.668794417739264e-08, "ref_logps/chosen": -243.9519805908203, "ref_logps/rejected": -257.71527099609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9561412334442139, "rewards/margins": 11.948575019836426, "rewards/rejected": -12.904716491699219, "step": 1788 }, { "epoch": 0.43, "learning_rate": 1.2682666666666666e-07, "logps/chosen": -261.2837829589844, "logps/rejected": -345.08612060546875, "loss": 0.0025, "losses/dpo": 5.930211877114289e-11, "losses/sft": 0.7337092757225037, "losses/total": 5.930211877114289e-11, "ref_logps/chosen": -251.4923553466797, "ref_logps/rejected": -219.65489196777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.9791433215141296, "rewards/margins": 11.563980102539062, "rewards/rejected": -12.543123245239258, "step": 1789 }, { "epoch": 0.43, "learning_rate": 1.2677333333333334e-07, "logps/chosen": -220.6180419921875, "logps/rejected": -330.76031494140625, "loss": 0.0074, "losses/dpo": 5.6231943545981267e-08, "losses/sft": 0.5279763340950012, "losses/total": 5.6231943545981267e-08, "ref_logps/chosen": -209.39303588867188, "ref_logps/rejected": -213.05807495117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1225018501281738, "rewards/margins": 10.647721290588379, "rewards/rejected": -11.770222663879395, "step": 1790 }, { "epoch": 0.43, "learning_rate": 1.2672e-07, "logps/chosen": -229.26901245117188, "logps/rejected": -322.1302185058594, "loss": 0.0039, "losses/dpo": 3.5752447047343594e-07, "losses/sft": 0.4171372354030609, "losses/total": 3.5752447047343594e-07, "ref_logps/chosen": -220.35552978515625, "ref_logps/rejected": -206.95152282714844, "rewards/accuracies": 1.0, "rewards/chosen": -0.8913471698760986, "rewards/margins": 10.6265230178833, "rewards/rejected": -11.51786994934082, "step": 1791 }, { "epoch": 0.43, "learning_rate": 1.2666666666666666e-07, "logps/chosen": -280.03759765625, "logps/rejected": -389.20416259765625, "loss": 0.0006, "losses/dpo": 3.3043622948980556e-08, "losses/sft": 0.5417138934135437, "losses/total": 3.3043622948980556e-08, "ref_logps/chosen": -270.56756591796875, "ref_logps/rejected": -248.21949768066406, "rewards/accuracies": 1.0, "rewards/chosen": -0.9470021724700928, "rewards/margins": 13.151468276977539, "rewards/rejected": -14.098470687866211, "step": 1792 }, { "epoch": 0.43, "learning_rate": 1.2661333333333334e-07, "logps/chosen": -228.5651092529297, "logps/rejected": -330.4697570800781, "loss": 0.0008, "losses/dpo": 2.933258656412363e-06, "losses/sft": 0.7325915098190308, "losses/total": 2.933258656412363e-06, "ref_logps/chosen": -217.91317749023438, "ref_logps/rejected": -216.09043884277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.0651932954788208, "rewards/margins": 10.372737884521484, "rewards/rejected": -11.437931060791016, "step": 1793 }, { "epoch": 0.43, "learning_rate": 1.2656e-07, "logps/chosen": -214.50439453125, "logps/rejected": -329.7393798828125, "loss": 0.0103, "losses/dpo": 3.836131872958504e-05, "losses/sft": 0.6144011616706848, "losses/total": 3.836131872958504e-05, "ref_logps/chosen": -206.07386779785156, "ref_logps/rejected": -214.8900909423828, "rewards/accuracies": 1.0, "rewards/chosen": -0.8430529832839966, "rewards/margins": 10.641876220703125, "rewards/rejected": -11.484929084777832, "step": 1794 }, { "epoch": 0.43, "learning_rate": 1.2650666666666664e-07, "logps/chosen": -263.8465576171875, "logps/rejected": -324.7881164550781, "loss": 0.0046, "losses/dpo": 7.24466588053474e-07, "losses/sft": 0.6497805118560791, "losses/total": 7.24466588053474e-07, "ref_logps/chosen": -254.1267852783203, "ref_logps/rejected": -213.75074768066406, "rewards/accuracies": 1.0, "rewards/chosen": -0.9719799757003784, "rewards/margins": 10.131755828857422, "rewards/rejected": -11.10373592376709, "step": 1795 }, { "epoch": 0.43, "learning_rate": 1.2645333333333332e-07, "logps/chosen": -249.20242309570312, "logps/rejected": -358.5970458984375, "loss": 0.0029, "losses/dpo": 0.00040180524229072034, "losses/sft": 0.6857947111129761, "losses/total": 0.00040180524229072034, "ref_logps/chosen": -238.7411346435547, "ref_logps/rejected": -235.72479248046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0461289882659912, "rewards/margins": 11.241094589233398, "rewards/rejected": -12.287223815917969, "step": 1796 }, { "epoch": 0.43, "learning_rate": 1.264e-07, "logps/chosen": -253.90231323242188, "logps/rejected": -373.0710754394531, "loss": 0.0006, "losses/dpo": 0.004742021206766367, "losses/sft": 0.41426876187324524, "losses/total": 0.004742021206766367, "ref_logps/chosen": -242.89974975585938, "ref_logps/rejected": -241.91175842285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.100256085395813, "rewards/margins": 12.015674591064453, "rewards/rejected": -13.11592960357666, "step": 1797 }, { "epoch": 0.43, "learning_rate": 1.2634666666666667e-07, "logps/chosen": -254.88494873046875, "logps/rejected": -378.26666259765625, "loss": 0.0159, "losses/dpo": 5.509890797839034e-06, "losses/sft": 0.7157798409461975, "losses/total": 5.509890797839034e-06, "ref_logps/chosen": -245.150390625, "ref_logps/rejected": -251.3236083984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9734558463096619, "rewards/margins": 11.72085189819336, "rewards/rejected": -12.694307327270508, "step": 1798 }, { "epoch": 0.43, "learning_rate": 1.2629333333333332e-07, "logps/chosen": -249.67849731445312, "logps/rejected": -341.9849548339844, "loss": 0.0503, "losses/dpo": 5.8927867030433845e-06, "losses/sft": 0.7357516288757324, "losses/total": 5.8927867030433845e-06, "ref_logps/chosen": -240.408935546875, "ref_logps/rejected": -214.78378295898438, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9269583225250244, "rewards/margins": 11.793161392211914, "rewards/rejected": -12.72011947631836, "step": 1799 }, { "epoch": 0.43, "learning_rate": 1.2624e-07, "logps/chosen": -222.2318878173828, "logps/rejected": -356.24835205078125, "loss": 0.0017, "losses/dpo": 1.4341135283757467e-05, "losses/sft": 0.645513653755188, "losses/total": 1.4341135283757467e-05, "ref_logps/chosen": -212.9261474609375, "ref_logps/rejected": -225.787841796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9305728673934937, "rewards/margins": 12.115480422973633, "rewards/rejected": -13.046051979064941, "step": 1800 }, { "epoch": 0.43, "learning_rate": 1.2618666666666667e-07, "logps/chosen": -258.5447082519531, "logps/rejected": -372.162841796875, "loss": 0.0006, "losses/dpo": 5.046010119258426e-05, "losses/sft": 0.6992858052253723, "losses/total": 5.046010119258426e-05, "ref_logps/chosen": -250.43850708007812, "ref_logps/rejected": -245.23309326171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8106197714805603, "rewards/margins": 11.882354736328125, "rewards/rejected": -12.692974090576172, "step": 1801 }, { "epoch": 0.43, "learning_rate": 1.2613333333333332e-07, "logps/chosen": -204.56394958496094, "logps/rejected": -346.84014892578125, "loss": 0.0083, "losses/dpo": 7.653293323528487e-07, "losses/sft": 0.44336453080177307, "losses/total": 7.653293323528487e-07, "ref_logps/chosen": -197.58193969726562, "ref_logps/rejected": -228.90652465820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.698201060295105, "rewards/margins": 11.095160484313965, "rewards/rejected": -11.79336166381836, "step": 1802 }, { "epoch": 0.43, "learning_rate": 1.2607999999999997e-07, "logps/chosen": -198.209716796875, "logps/rejected": -285.2567138671875, "loss": 0.0209, "losses/dpo": 0.003467055968940258, "losses/sft": 0.5445650815963745, "losses/total": 0.003467055968940258, "ref_logps/chosen": -191.94631958007812, "ref_logps/rejected": -182.8399658203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6263400316238403, "rewards/margins": 9.615336418151855, "rewards/rejected": -10.241676330566406, "step": 1803 }, { "epoch": 0.43, "learning_rate": 1.2602666666666665e-07, "logps/chosen": -278.4189758300781, "logps/rejected": -380.6470031738281, "loss": 0.0008, "losses/dpo": 2.7685271675181866e-07, "losses/sft": 0.6580249667167664, "losses/total": 2.7685271675181866e-07, "ref_logps/chosen": -266.7761535644531, "ref_logps/rejected": -244.19252014160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.1642827987670898, "rewards/margins": 12.481165885925293, "rewards/rejected": -13.645448684692383, "step": 1804 }, { "epoch": 0.43, "learning_rate": 1.2597333333333333e-07, "logps/chosen": -252.4886474609375, "logps/rejected": -353.8536376953125, "loss": 0.0013, "losses/dpo": 4.562415597320069e-06, "losses/sft": 0.523838222026825, "losses/total": 4.562415597320069e-06, "ref_logps/chosen": -242.31399536132812, "ref_logps/rejected": -222.97088623046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0174667835235596, "rewards/margins": 12.070809364318848, "rewards/rejected": -13.088276863098145, "step": 1805 }, { "epoch": 0.43, "learning_rate": 1.2592e-07, "logps/chosen": -236.09912109375, "logps/rejected": -346.88299560546875, "loss": 0.0008, "losses/dpo": 5.151781223844409e-09, "losses/sft": 0.6670408248901367, "losses/total": 5.151781223844409e-09, "ref_logps/chosen": -227.37405395507812, "ref_logps/rejected": -222.63720703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8725069165229797, "rewards/margins": 11.552072525024414, "rewards/rejected": -12.424579620361328, "step": 1806 }, { "epoch": 0.43, "learning_rate": 1.2586666666666666e-07, "logps/chosen": -209.51177978515625, "logps/rejected": -324.0050354003906, "loss": 0.0021, "losses/dpo": 0.00014320702757686377, "losses/sft": 0.9960373640060425, "losses/total": 0.00014320702757686377, "ref_logps/chosen": -199.52239990234375, "ref_logps/rejected": -204.8788604736328, "rewards/accuracies": 1.0, "rewards/chosen": -0.9989374876022339, "rewards/margins": 10.913679122924805, "rewards/rejected": -11.912616729736328, "step": 1807 }, { "epoch": 0.43, "learning_rate": 1.2581333333333333e-07, "logps/chosen": -235.7449188232422, "logps/rejected": -330.30517578125, "loss": 0.001, "losses/dpo": 3.189283961546607e-05, "losses/sft": 0.49395567178726196, "losses/total": 3.189283961546607e-05, "ref_logps/chosen": -226.70416259765625, "ref_logps/rejected": -213.0565185546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9040740132331848, "rewards/margins": 10.820792198181152, "rewards/rejected": -11.72486686706543, "step": 1808 }, { "epoch": 0.43, "learning_rate": 1.2576e-07, "logps/chosen": -253.3575439453125, "logps/rejected": -344.9772033691406, "loss": 0.0022, "losses/dpo": 0.0001183570857392624, "losses/sft": 0.5840930938720703, "losses/total": 0.0001183570857392624, "ref_logps/chosen": -239.560302734375, "ref_logps/rejected": -220.18878173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3797252178192139, "rewards/margins": 11.099117279052734, "rewards/rejected": -12.478841781616211, "step": 1809 }, { "epoch": 0.43, "learning_rate": 1.2570666666666666e-07, "logps/chosen": -226.79600524902344, "logps/rejected": -339.18408203125, "loss": 0.0056, "losses/dpo": 3.398886576633231e-07, "losses/sft": 0.5878095030784607, "losses/total": 3.398886576633231e-07, "ref_logps/chosen": -216.6297149658203, "ref_logps/rejected": -216.0518341064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.0166280269622803, "rewards/margins": 11.296597480773926, "rewards/rejected": -12.313224792480469, "step": 1810 }, { "epoch": 0.43, "learning_rate": 1.256533333333333e-07, "logps/chosen": -272.01190185546875, "logps/rejected": -387.2762451171875, "loss": 0.0088, "losses/dpo": 4.9560826482775155e-06, "losses/sft": 0.44089922308921814, "losses/total": 4.9560826482775155e-06, "ref_logps/chosen": -262.925048828125, "ref_logps/rejected": -253.3086395263672, "rewards/accuracies": 1.0, "rewards/chosen": -0.9086846113204956, "rewards/margins": 12.488075256347656, "rewards/rejected": -13.396760940551758, "step": 1811 }, { "epoch": 0.43, "learning_rate": 1.2559999999999999e-07, "logps/chosen": -247.05056762695312, "logps/rejected": -341.43511962890625, "loss": 0.0008, "losses/dpo": 3.1429939554072917e-05, "losses/sft": 0.7131778597831726, "losses/total": 3.1429939554072917e-05, "ref_logps/chosen": -239.27999877929688, "ref_logps/rejected": -220.50006103515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.777056097984314, "rewards/margins": 11.316450119018555, "rewards/rejected": -12.093505859375, "step": 1812 }, { "epoch": 0.44, "learning_rate": 1.2554666666666666e-07, "logps/chosen": -185.66790771484375, "logps/rejected": -315.5682678222656, "loss": 0.0053, "losses/dpo": 0.0002729315892793238, "losses/sft": 0.3570828437805176, "losses/total": 0.0002729315892793238, "ref_logps/chosen": -177.80210876464844, "ref_logps/rejected": -198.6695556640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7865803241729736, "rewards/margins": 10.903290748596191, "rewards/rejected": -11.689870834350586, "step": 1813 }, { "epoch": 0.44, "learning_rate": 1.2549333333333331e-07, "logps/chosen": -242.44027709960938, "logps/rejected": -335.6617431640625, "loss": 0.0059, "losses/dpo": 3.2532861951040104e-06, "losses/sft": 0.8929413557052612, "losses/total": 3.2532861951040104e-06, "ref_logps/chosen": -234.70010375976562, "ref_logps/rejected": -222.72427368164062, "rewards/accuracies": 1.0, "rewards/chosen": -0.7740152478218079, "rewards/margins": 10.519731521606445, "rewards/rejected": -11.293746948242188, "step": 1814 }, { "epoch": 0.44, "learning_rate": 1.2544e-07, "logps/chosen": -233.92388916015625, "logps/rejected": -370.3973388671875, "loss": 0.0027, "losses/dpo": 7.863008067943156e-05, "losses/sft": 0.6571695804595947, "losses/total": 7.863008067943156e-05, "ref_logps/chosen": -226.68161010742188, "ref_logps/rejected": -230.61874389648438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7242278456687927, "rewards/margins": 13.253632545471191, "rewards/rejected": -13.977861404418945, "step": 1815 }, { "epoch": 0.44, "learning_rate": 1.2538666666666667e-07, "logps/chosen": -236.783203125, "logps/rejected": -356.3064270019531, "loss": 0.0017, "losses/dpo": 8.532952122664028e-09, "losses/sft": 0.6395910978317261, "losses/total": 8.532952122664028e-09, "ref_logps/chosen": -227.222412109375, "ref_logps/rejected": -228.62347412109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9560770988464355, "rewards/margins": 11.812217712402344, "rewards/rejected": -12.768294334411621, "step": 1816 }, { "epoch": 0.44, "learning_rate": 1.2533333333333334e-07, "logps/chosen": -237.59271240234375, "logps/rejected": -356.74176025390625, "loss": 0.0027, "losses/dpo": 4.00481026119337e-09, "losses/sft": 0.5755846500396729, "losses/total": 4.00481026119337e-09, "ref_logps/chosen": -223.12103271484375, "ref_logps/rejected": -222.2262420654297, "rewards/accuracies": 1.0, "rewards/chosen": -1.4471676349639893, "rewards/margins": 12.004383087158203, "rewards/rejected": -13.451550483703613, "step": 1817 }, { "epoch": 0.44, "learning_rate": 1.2528e-07, "logps/chosen": -268.85894775390625, "logps/rejected": -336.1976318359375, "loss": 0.0064, "losses/dpo": 5.483753398038971e-07, "losses/sft": 0.6882116794586182, "losses/total": 5.483753398038971e-07, "ref_logps/chosen": -260.8477783203125, "ref_logps/rejected": -227.79116821289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8011158108711243, "rewards/margins": 10.039532661437988, "rewards/rejected": -10.840648651123047, "step": 1818 }, { "epoch": 0.44, "learning_rate": 1.2522666666666667e-07, "logps/chosen": -253.0308837890625, "logps/rejected": -332.35211181640625, "loss": 0.011, "losses/dpo": 3.8655988987557066e-07, "losses/sft": 0.4298733174800873, "losses/total": 3.8655988987557066e-07, "ref_logps/chosen": -245.061767578125, "ref_logps/rejected": -223.27291870117188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7969119548797607, "rewards/margins": 10.111007690429688, "rewards/rejected": -10.907918930053711, "step": 1819 }, { "epoch": 0.44, "learning_rate": 1.2517333333333332e-07, "logps/chosen": -263.73895263671875, "logps/rejected": -356.96240234375, "loss": 0.0018, "losses/dpo": 5.4497917290063924e-08, "losses/sft": 0.42059609293937683, "losses/total": 5.4497917290063924e-08, "ref_logps/chosen": -251.42462158203125, "ref_logps/rejected": -230.2659912109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2314321994781494, "rewards/margins": 11.438206672668457, "rewards/rejected": -12.669638633728027, "step": 1820 }, { "epoch": 0.44, "learning_rate": 1.2512e-07, "logps/chosen": -246.19979858398438, "logps/rejected": -337.45977783203125, "loss": 0.0061, "losses/dpo": 6.046387625247007e-06, "losses/sft": 0.7376959919929504, "losses/total": 6.046387625247007e-06, "ref_logps/chosen": -236.88633728027344, "ref_logps/rejected": -215.48703002929688, "rewards/accuracies": 1.0, "rewards/chosen": -0.9313475489616394, "rewards/margins": 11.265926361083984, "rewards/rejected": -12.197274208068848, "step": 1821 }, { "epoch": 0.44, "learning_rate": 1.2506666666666665e-07, "logps/chosen": -227.11151123046875, "logps/rejected": -307.2287292480469, "loss": 0.0194, "losses/dpo": 7.911649504421803e-07, "losses/sft": 0.5293012261390686, "losses/total": 7.911649504421803e-07, "ref_logps/chosen": -220.55612182617188, "ref_logps/rejected": -199.200927734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6555399894714355, "rewards/margins": 10.147241592407227, "rewards/rejected": -10.80278205871582, "step": 1822 }, { "epoch": 0.44, "learning_rate": 1.2501333333333333e-07, "logps/chosen": -190.97738647460938, "logps/rejected": -366.3949279785156, "loss": 0.0048, "losses/dpo": 4.813618659227359e-08, "losses/sft": 0.9817265868186951, "losses/total": 4.813618659227359e-08, "ref_logps/chosen": -182.11419677734375, "ref_logps/rejected": -240.0228271484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8863183259963989, "rewards/margins": 11.750889778137207, "rewards/rejected": -12.63720703125, "step": 1823 }, { "epoch": 0.44, "learning_rate": 1.2496e-07, "logps/chosen": -248.56365966796875, "logps/rejected": -348.7693176269531, "loss": 0.0016, "losses/dpo": 1.604105818842072e-05, "losses/sft": 0.7042554616928101, "losses/total": 1.604105818842072e-05, "ref_logps/chosen": -239.35604858398438, "ref_logps/rejected": -224.08506774902344, "rewards/accuracies": 1.0, "rewards/chosen": -0.9207597970962524, "rewards/margins": 11.547664642333984, "rewards/rejected": -12.468424797058105, "step": 1824 }, { "epoch": 0.44, "learning_rate": 1.2490666666666668e-07, "logps/chosen": -210.028564453125, "logps/rejected": -311.19158935546875, "loss": 0.0083, "losses/dpo": 5.066759101168827e-08, "losses/sft": 0.6070703268051147, "losses/total": 5.066759101168827e-08, "ref_logps/chosen": -202.57789611816406, "ref_logps/rejected": -196.76229858398438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7450662851333618, "rewards/margins": 10.69786262512207, "rewards/rejected": -11.442928314208984, "step": 1825 }, { "epoch": 0.44, "learning_rate": 1.2485333333333333e-07, "logps/chosen": -266.1163330078125, "logps/rejected": -338.20550537109375, "loss": 0.0014, "losses/dpo": 3.7213369523669826e-06, "losses/sft": 0.8245658278465271, "losses/total": 3.7213369523669826e-06, "ref_logps/chosen": -254.59640502929688, "ref_logps/rejected": -220.6442108154297, "rewards/accuracies": 1.0, "rewards/chosen": -1.1519932746887207, "rewards/margins": 10.604135513305664, "rewards/rejected": -11.756128311157227, "step": 1826 }, { "epoch": 0.44, "learning_rate": 1.248e-07, "logps/chosen": -255.09121704101562, "logps/rejected": -370.22308349609375, "loss": 0.0011, "losses/dpo": 0.002787813311442733, "losses/sft": 0.7170009016990662, "losses/total": 0.002787813311442733, "ref_logps/chosen": -245.43043518066406, "ref_logps/rejected": -236.0711669921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9660761952400208, "rewards/margins": 12.449114799499512, "rewards/rejected": -13.415191650390625, "step": 1827 }, { "epoch": 0.44, "learning_rate": 1.2474666666666666e-07, "logps/chosen": -240.115966796875, "logps/rejected": -375.0596923828125, "loss": 0.0003, "losses/dpo": 6.356595986289904e-05, "losses/sft": 0.8538117408752441, "losses/total": 6.356595986289904e-05, "ref_logps/chosen": -233.10238647460938, "ref_logps/rejected": -236.91575622558594, "rewards/accuracies": 1.0, "rewards/chosen": -0.7013580799102783, "rewards/margins": 13.113035202026367, "rewards/rejected": -13.814393997192383, "step": 1828 }, { "epoch": 0.44, "learning_rate": 1.246933333333333e-07, "logps/chosen": -187.75271606445312, "logps/rejected": -324.44482421875, "loss": 0.0026, "losses/dpo": 4.509743575908942e-06, "losses/sft": 0.6024181246757507, "losses/total": 4.509743575908942e-06, "ref_logps/chosen": -180.5428009033203, "ref_logps/rejected": -201.1892852783203, "rewards/accuracies": 1.0, "rewards/chosen": -0.7209903001785278, "rewards/margins": 11.604562759399414, "rewards/rejected": -12.325552940368652, "step": 1829 }, { "epoch": 0.44, "learning_rate": 1.2463999999999998e-07, "logps/chosen": -278.6962890625, "logps/rejected": -347.1845703125, "loss": 0.0018, "losses/dpo": 6.102890779402514e-07, "losses/sft": 0.6264070868492126, "losses/total": 6.102890779402514e-07, "ref_logps/chosen": -265.5384826660156, "ref_logps/rejected": -219.27197265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3157813549041748, "rewards/margins": 11.475480079650879, "rewards/rejected": -12.791261672973633, "step": 1830 }, { "epoch": 0.44, "learning_rate": 1.2458666666666666e-07, "logps/chosen": -227.51187133789062, "logps/rejected": -327.64752197265625, "loss": 0.0145, "losses/dpo": 2.994288479385432e-06, "losses/sft": 0.5671288967132568, "losses/total": 2.994288479385432e-06, "ref_logps/chosen": -217.88145446777344, "ref_logps/rejected": -206.3650360107422, "rewards/accuracies": 1.0, "rewards/chosen": -0.9630416631698608, "rewards/margins": 11.165206909179688, "rewards/rejected": -12.12824821472168, "step": 1831 }, { "epoch": 0.44, "learning_rate": 1.2453333333333334e-07, "logps/chosen": -211.28182983398438, "logps/rejected": -347.70404052734375, "loss": 0.0028, "losses/dpo": 1.4800193959274566e-08, "losses/sft": 0.4518166482448578, "losses/total": 1.4800193959274566e-08, "ref_logps/chosen": -204.19772338867188, "ref_logps/rejected": -215.8790283203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.708410382270813, "rewards/margins": 12.474090576171875, "rewards/rejected": -13.182500839233398, "step": 1832 }, { "epoch": 0.44, "learning_rate": 1.2448e-07, "logps/chosen": -300.2029113769531, "logps/rejected": -408.99127197265625, "loss": 0.0004, "losses/dpo": 1.3047203538008034e-05, "losses/sft": 0.3701799213886261, "losses/total": 1.3047203538008034e-05, "ref_logps/chosen": -291.5400390625, "ref_logps/rejected": -274.6697082519531, "rewards/accuracies": 1.0, "rewards/chosen": -0.8662875890731812, "rewards/margins": 12.565866470336914, "rewards/rejected": -13.432153701782227, "step": 1833 }, { "epoch": 0.44, "learning_rate": 1.2442666666666666e-07, "logps/chosen": -247.1068878173828, "logps/rejected": -339.2921142578125, "loss": 0.0019, "losses/dpo": 1.6450788962174556e-06, "losses/sft": 1.2653452157974243, "losses/total": 1.6450788962174556e-06, "ref_logps/chosen": -237.57080078125, "ref_logps/rejected": -217.16943359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9536086320877075, "rewards/margins": 11.258663177490234, "rewards/rejected": -12.212270736694336, "step": 1834 }, { "epoch": 0.44, "learning_rate": 1.2437333333333334e-07, "logps/chosen": -223.1592254638672, "logps/rejected": -320.26031494140625, "loss": 0.0055, "losses/dpo": 1.1486147286632331e-06, "losses/sft": 0.5623924136161804, "losses/total": 1.1486147286632331e-06, "ref_logps/chosen": -214.922607421875, "ref_logps/rejected": -208.75845336914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8236609697341919, "rewards/margins": 10.326526641845703, "rewards/rejected": -11.150187492370605, "step": 1835 }, { "epoch": 0.44, "learning_rate": 1.2432e-07, "logps/chosen": -200.49847412109375, "logps/rejected": -345.50775146484375, "loss": 0.0009, "losses/dpo": 1.584893084327632e-06, "losses/sft": 0.4504598081111908, "losses/total": 1.584893084327632e-06, "ref_logps/chosen": -194.71829223632812, "ref_logps/rejected": -220.5909423828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5780174136161804, "rewards/margins": 11.91366195678711, "rewards/rejected": -12.491680145263672, "step": 1836 }, { "epoch": 0.44, "learning_rate": 1.2426666666666664e-07, "logps/chosen": -231.9527130126953, "logps/rejected": -325.041748046875, "loss": 0.0127, "losses/dpo": 8.608167689772017e-08, "losses/sft": 0.707618236541748, "losses/total": 8.608167689772017e-08, "ref_logps/chosen": -225.0233612060547, "ref_logps/rejected": -208.4730224609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6929347515106201, "rewards/margins": 10.963939666748047, "rewards/rejected": -11.656874656677246, "step": 1837 }, { "epoch": 0.44, "learning_rate": 1.2421333333333332e-07, "logps/chosen": -261.0279541015625, "logps/rejected": -349.5958557128906, "loss": 0.022, "losses/dpo": 8.960332706919871e-06, "losses/sft": 0.9654547572135925, "losses/total": 8.960332706919871e-06, "ref_logps/chosen": -255.41429138183594, "ref_logps/rejected": -231.92848205566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.5613677501678467, "rewards/margins": 11.205370903015137, "rewards/rejected": -11.766737937927246, "step": 1838 }, { "epoch": 0.44, "learning_rate": 1.2416e-07, "logps/chosen": -234.348876953125, "logps/rejected": -354.40789794921875, "loss": 0.0016, "losses/dpo": 8.556299690098967e-06, "losses/sft": 0.5912672877311707, "losses/total": 8.556299690098967e-06, "ref_logps/chosen": -224.7012939453125, "ref_logps/rejected": -229.68939208984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9647588729858398, "rewards/margins": 11.507091522216797, "rewards/rejected": -12.471850395202637, "step": 1839 }, { "epoch": 0.44, "learning_rate": 1.2410666666666667e-07, "logps/chosen": -217.29324340820312, "logps/rejected": -365.3048400878906, "loss": 0.0008, "losses/dpo": 7.929169321130303e-09, "losses/sft": 0.5792378187179565, "losses/total": 7.929169321130303e-09, "ref_logps/chosen": -208.03851318359375, "ref_logps/rejected": -234.53582763671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9254735708236694, "rewards/margins": 12.151429176330566, "rewards/rejected": -13.076903343200684, "step": 1840 }, { "epoch": 0.44, "learning_rate": 1.2405333333333332e-07, "logps/chosen": -249.4271240234375, "logps/rejected": -348.33392333984375, "loss": 0.004, "losses/dpo": 1.4529900909110438e-05, "losses/sft": 0.46862050890922546, "losses/total": 1.4529900909110438e-05, "ref_logps/chosen": -241.7603759765625, "ref_logps/rejected": -219.86070251464844, "rewards/accuracies": 1.0, "rewards/chosen": -0.7666757106781006, "rewards/margins": 12.080648422241211, "rewards/rejected": -12.847323417663574, "step": 1841 }, { "epoch": 0.44, "learning_rate": 1.24e-07, "logps/chosen": -227.10906982421875, "logps/rejected": -358.616943359375, "loss": 0.0013, "losses/dpo": 4.7808704550789116e-08, "losses/sft": 0.8015053868293762, "losses/total": 4.7808704550789116e-08, "ref_logps/chosen": -218.65150451660156, "ref_logps/rejected": -225.8321990966797, "rewards/accuracies": 1.0, "rewards/chosen": -0.8457548022270203, "rewards/margins": 12.432722091674805, "rewards/rejected": -13.27847671508789, "step": 1842 }, { "epoch": 0.44, "learning_rate": 1.2394666666666668e-07, "logps/chosen": -236.16403198242188, "logps/rejected": -380.4720458984375, "loss": 0.0011, "losses/dpo": 5.860360488441074e-06, "losses/sft": 0.6126073002815247, "losses/total": 5.860360488441074e-06, "ref_logps/chosen": -229.22509765625, "ref_logps/rejected": -249.61822509765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6938946843147278, "rewards/margins": 12.391489028930664, "rewards/rejected": -13.085382461547852, "step": 1843 }, { "epoch": 0.44, "learning_rate": 1.2389333333333333e-07, "logps/chosen": -257.45587158203125, "logps/rejected": -365.25555419921875, "loss": 0.0019, "losses/dpo": 5.000882993044797e-06, "losses/sft": 0.9793967008590698, "losses/total": 5.000882993044797e-06, "ref_logps/chosen": -249.7157745361328, "ref_logps/rejected": -231.69482421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7740093469619751, "rewards/margins": 12.582061767578125, "rewards/rejected": -13.356071472167969, "step": 1844 }, { "epoch": 0.44, "learning_rate": 1.2383999999999998e-07, "logps/chosen": -209.1181640625, "logps/rejected": -330.4139404296875, "loss": 0.0046, "losses/dpo": 1.336049137989903e-07, "losses/sft": 0.5057851076126099, "losses/total": 1.336049137989903e-07, "ref_logps/chosen": -201.8063201904297, "ref_logps/rejected": -213.75534057617188, "rewards/accuracies": 1.0, "rewards/chosen": -0.731183648109436, "rewards/margins": 10.934677124023438, "rewards/rejected": -11.665860176086426, "step": 1845 }, { "epoch": 0.44, "learning_rate": 1.2378666666666665e-07, "logps/chosen": -291.4600524902344, "logps/rejected": -391.3083190917969, "loss": 0.0067, "losses/dpo": 7.39877137334588e-08, "losses/sft": 0.6914785504341125, "losses/total": 7.39877137334588e-08, "ref_logps/chosen": -278.9111022949219, "ref_logps/rejected": -257.29168701171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.254894495010376, "rewards/margins": 12.146768569946289, "rewards/rejected": -13.401664733886719, "step": 1846 }, { "epoch": 0.44, "learning_rate": 1.2373333333333333e-07, "logps/chosen": -217.74981689453125, "logps/rejected": -346.696044921875, "loss": 0.0045, "losses/dpo": 5.725713236870433e-10, "losses/sft": 0.6209825277328491, "losses/total": 5.725713236870433e-10, "ref_logps/chosen": -211.67251586914062, "ref_logps/rejected": -224.93756103515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.6077302694320679, "rewards/margins": 11.568120956420898, "rewards/rejected": -12.175850868225098, "step": 1847 }, { "epoch": 0.44, "learning_rate": 1.2367999999999998e-07, "logps/chosen": -247.1173095703125, "logps/rejected": -334.8271484375, "loss": 0.0018, "losses/dpo": 1.2334052712503762e-07, "losses/sft": 0.7564617991447449, "losses/total": 1.2334052712503762e-07, "ref_logps/chosen": -237.47406005859375, "ref_logps/rejected": -211.84254455566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.964324414730072, "rewards/margins": 11.334138870239258, "rewards/rejected": -12.298462867736816, "step": 1848 }, { "epoch": 0.44, "learning_rate": 1.2362666666666666e-07, "logps/chosen": -211.88201904296875, "logps/rejected": -289.4976806640625, "loss": 0.0062, "losses/dpo": 2.985976133018653e-09, "losses/sft": 0.8684476613998413, "losses/total": 2.985976133018653e-09, "ref_logps/chosen": -201.53111267089844, "ref_logps/rejected": -183.81207275390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0350902080535889, "rewards/margins": 9.533473014831543, "rewards/rejected": -10.568563461303711, "step": 1849 }, { "epoch": 0.44, "learning_rate": 1.2357333333333333e-07, "logps/chosen": -232.650634765625, "logps/rejected": -307.60809326171875, "loss": 0.0037, "losses/dpo": 0.0006026255432516336, "losses/sft": 0.7398252487182617, "losses/total": 0.0006026255432516336, "ref_logps/chosen": -222.56362915039062, "ref_logps/rejected": -198.03591918945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0087000131607056, "rewards/margins": 9.948517799377441, "rewards/rejected": -10.957218170166016, "step": 1850 }, { "epoch": 0.44, "learning_rate": 1.2352e-07, "logps/chosen": -242.63021850585938, "logps/rejected": -368.47747802734375, "loss": 0.0074, "losses/dpo": 7.303450502149644e-07, "losses/sft": 1.1998509168624878, "losses/total": 7.303450502149644e-07, "ref_logps/chosen": -235.07632446289062, "ref_logps/rejected": -245.79542541503906, "rewards/accuracies": 1.0, "rewards/chosen": -0.7553901076316833, "rewards/margins": 11.512816429138184, "rewards/rejected": -12.268205642700195, "step": 1851 }, { "epoch": 0.44, "learning_rate": 1.2346666666666666e-07, "logps/chosen": -224.28668212890625, "logps/rejected": -341.1720886230469, "loss": 0.0003, "losses/dpo": 4.495384473557351e-06, "losses/sft": 0.6030548214912415, "losses/total": 4.495384473557351e-06, "ref_logps/chosen": -216.48451232910156, "ref_logps/rejected": -211.8321990966797, "rewards/accuracies": 1.0, "rewards/chosen": -0.7802174091339111, "rewards/margins": 12.15377140045166, "rewards/rejected": -12.933988571166992, "step": 1852 }, { "epoch": 0.44, "learning_rate": 1.2341333333333334e-07, "logps/chosen": -224.17320251464844, "logps/rejected": -341.93011474609375, "loss": 0.0025, "losses/dpo": 2.2069058104534633e-05, "losses/sft": 0.5597447752952576, "losses/total": 2.2069058104534633e-05, "ref_logps/chosen": -217.67771911621094, "ref_logps/rejected": -215.89968872070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.6495475769042969, "rewards/margins": 11.953495025634766, "rewards/rejected": -12.603042602539062, "step": 1853 }, { "epoch": 0.44, "learning_rate": 1.2336e-07, "logps/chosen": -221.1344757080078, "logps/rejected": -350.2275085449219, "loss": 0.002, "losses/dpo": 1.5644219502064516e-06, "losses/sft": 0.6867809295654297, "losses/total": 1.5644219502064516e-06, "ref_logps/chosen": -211.53817749023438, "ref_logps/rejected": -229.00558471679688, "rewards/accuracies": 1.0, "rewards/chosen": -0.9596281051635742, "rewards/margins": 11.162564277648926, "rewards/rejected": -12.1221923828125, "step": 1854 }, { "epoch": 0.45, "learning_rate": 1.2330666666666666e-07, "logps/chosen": -248.46510314941406, "logps/rejected": -355.5487060546875, "loss": 0.0068, "losses/dpo": 4.209531834931113e-05, "losses/sft": 0.7770125865936279, "losses/total": 4.209531834931113e-05, "ref_logps/chosen": -239.46876525878906, "ref_logps/rejected": -222.35665893554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8996320962905884, "rewards/margins": 12.419577598571777, "rewards/rejected": -13.319210052490234, "step": 1855 }, { "epoch": 0.45, "learning_rate": 1.2325333333333331e-07, "logps/chosen": -219.3441619873047, "logps/rejected": -321.4904479980469, "loss": 0.0046, "losses/dpo": 1.866553816398664e-06, "losses/sft": 1.0554660558700562, "losses/total": 1.866553816398664e-06, "ref_logps/chosen": -208.48773193359375, "ref_logps/rejected": -200.67755126953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0856425762176514, "rewards/margins": 10.995647430419922, "rewards/rejected": -12.081291198730469, "step": 1856 }, { "epoch": 0.45, "learning_rate": 1.232e-07, "logps/chosen": -209.74917602539062, "logps/rejected": -369.1123352050781, "loss": 0.0031, "losses/dpo": 3.3300701034022495e-05, "losses/sft": 0.5819383263587952, "losses/total": 3.3300701034022495e-05, "ref_logps/chosen": -202.2120361328125, "ref_logps/rejected": -237.09371948242188, "rewards/accuracies": 1.0, "rewards/chosen": -0.7537124156951904, "rewards/margins": 12.448148727416992, "rewards/rejected": -13.201861381530762, "step": 1857 }, { "epoch": 0.45, "learning_rate": 1.2314666666666667e-07, "logps/chosen": -301.87060546875, "logps/rejected": -391.10986328125, "loss": 0.0018, "losses/dpo": 4.1092878433346414e-08, "losses/sft": 0.6882864832878113, "losses/total": 4.1092878433346414e-08, "ref_logps/chosen": -288.8249206542969, "ref_logps/rejected": -250.389404296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3045684099197388, "rewards/margins": 12.767475128173828, "rewards/rejected": -14.072042465209961, "step": 1858 }, { "epoch": 0.45, "learning_rate": 1.2309333333333335e-07, "logps/chosen": -241.98773193359375, "logps/rejected": -338.6460876464844, "loss": 0.0019, "losses/dpo": 1.3666126505995635e-05, "losses/sft": 0.3997553288936615, "losses/total": 1.3666126505995635e-05, "ref_logps/chosen": -234.50436401367188, "ref_logps/rejected": -219.90359497070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.748336136341095, "rewards/margins": 11.125913619995117, "rewards/rejected": -11.874249458312988, "step": 1859 }, { "epoch": 0.45, "learning_rate": 1.2304e-07, "logps/chosen": -184.26556396484375, "logps/rejected": -337.7085876464844, "loss": 0.0014, "losses/dpo": 5.95937535430302e-11, "losses/sft": 0.5540234446525574, "losses/total": 5.95937535430302e-11, "ref_logps/chosen": -175.33981323242188, "ref_logps/rejected": -212.79519653320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.8925743103027344, "rewards/margins": 11.598766326904297, "rewards/rejected": -12.491340637207031, "step": 1860 }, { "epoch": 0.45, "learning_rate": 1.2298666666666667e-07, "logps/chosen": -250.95870971679688, "logps/rejected": -351.7044372558594, "loss": 0.0098, "losses/dpo": 2.4081657556962455e-07, "losses/sft": 0.7963506579399109, "losses/total": 2.4081657556962455e-07, "ref_logps/chosen": -241.8060302734375, "ref_logps/rejected": -234.5919189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9152662754058838, "rewards/margins": 10.795982360839844, "rewards/rejected": -11.711248397827148, "step": 1861 }, { "epoch": 0.45, "learning_rate": 1.2293333333333332e-07, "logps/chosen": -230.15042114257812, "logps/rejected": -369.87542724609375, "loss": 0.0026, "losses/dpo": 1.6144315395649755e-06, "losses/sft": 0.4524995684623718, "losses/total": 1.6144315395649755e-06, "ref_logps/chosen": -222.29055786132812, "ref_logps/rejected": -237.90721130371094, "rewards/accuracies": 1.0, "rewards/chosen": -0.7859860062599182, "rewards/margins": 12.410835266113281, "rewards/rejected": -13.196821212768555, "step": 1862 }, { "epoch": 0.45, "learning_rate": 1.2287999999999997e-07, "logps/chosen": -209.0176239013672, "logps/rejected": -319.08587646484375, "loss": 0.0043, "losses/dpo": 1.5528075891779736e-07, "losses/sft": 0.5612272024154663, "losses/total": 1.5528075891779736e-07, "ref_logps/chosen": -200.05055236816406, "ref_logps/rejected": -205.55380249023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8967093825340271, "rewards/margins": 10.456498146057129, "rewards/rejected": -11.353206634521484, "step": 1863 }, { "epoch": 0.45, "learning_rate": 1.2282666666666665e-07, "logps/chosen": -258.71514892578125, "logps/rejected": -355.992919921875, "loss": 0.004, "losses/dpo": 3.418801270527183e-06, "losses/sft": 0.8297433853149414, "losses/total": 3.418801270527183e-06, "ref_logps/chosen": -252.49850463867188, "ref_logps/rejected": -238.3271026611328, "rewards/accuracies": 1.0, "rewards/chosen": -0.6216636300086975, "rewards/margins": 11.144918441772461, "rewards/rejected": -11.766580581665039, "step": 1864 }, { "epoch": 0.45, "learning_rate": 1.2277333333333333e-07, "logps/chosen": -191.52670288085938, "logps/rejected": -332.4504699707031, "loss": 0.0082, "losses/dpo": 2.369202957197558e-05, "losses/sft": 0.5651504397392273, "losses/total": 2.369202957197558e-05, "ref_logps/chosen": -186.1783905029297, "ref_logps/rejected": -215.40362548828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.5348312258720398, "rewards/margins": 11.169853210449219, "rewards/rejected": -11.70468521118164, "step": 1865 }, { "epoch": 0.45, "learning_rate": 1.2272e-07, "logps/chosen": -223.8539581298828, "logps/rejected": -318.6564636230469, "loss": 0.0122, "losses/dpo": 1.146577051258646e-05, "losses/sft": 0.7311921119689941, "losses/total": 1.146577051258646e-05, "ref_logps/chosen": -215.2113800048828, "ref_logps/rejected": -211.82830810546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8642578125, "rewards/margins": 9.81855583190918, "rewards/rejected": -10.682812690734863, "step": 1866 }, { "epoch": 0.45, "learning_rate": 1.2266666666666665e-07, "logps/chosen": -231.5670166015625, "logps/rejected": -377.87371826171875, "loss": 0.0249, "losses/dpo": 2.9708578949794173e-05, "losses/sft": 0.5997775793075562, "losses/total": 2.9708578949794173e-05, "ref_logps/chosen": -223.54409790039062, "ref_logps/rejected": -248.7375946044922, "rewards/accuracies": 0.96875, "rewards/chosen": -0.802291989326477, "rewards/margins": 12.111321449279785, "rewards/rejected": -12.913612365722656, "step": 1867 }, { "epoch": 0.45, "learning_rate": 1.2261333333333333e-07, "logps/chosen": -246.83425903320312, "logps/rejected": -336.54632568359375, "loss": 0.0042, "losses/dpo": 1.3300892476308945e-07, "losses/sft": 0.6804583668708801, "losses/total": 1.3300892476308945e-07, "ref_logps/chosen": -238.39231872558594, "ref_logps/rejected": -202.4905242919922, "rewards/accuracies": 1.0, "rewards/chosen": -0.8441923260688782, "rewards/margins": 12.561387062072754, "rewards/rejected": -13.405579566955566, "step": 1868 }, { "epoch": 0.45, "learning_rate": 1.2256e-07, "logps/chosen": -198.6476593017578, "logps/rejected": -336.652099609375, "loss": 0.0013, "losses/dpo": 5.774613214271085e-07, "losses/sft": 0.384067177772522, "losses/total": 5.774613214271085e-07, "ref_logps/chosen": -187.24925231933594, "ref_logps/rejected": -205.06288146972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.139841079711914, "rewards/margins": 12.019082069396973, "rewards/rejected": -13.158923149108887, "step": 1869 }, { "epoch": 0.45, "learning_rate": 1.2250666666666666e-07, "logps/chosen": -174.93862915039062, "logps/rejected": -318.6055908203125, "loss": 0.0095, "losses/dpo": 9.728553322929656e-07, "losses/sft": 0.5589771866798401, "losses/total": 9.728553322929656e-07, "ref_logps/chosen": -168.19476318359375, "ref_logps/rejected": -207.1876678466797, "rewards/accuracies": 1.0, "rewards/chosen": -0.6743859648704529, "rewards/margins": 10.4674072265625, "rewards/rejected": -11.141793251037598, "step": 1870 }, { "epoch": 0.45, "learning_rate": 1.224533333333333e-07, "logps/chosen": -226.9091796875, "logps/rejected": -307.9095458984375, "loss": 0.0014, "losses/dpo": 2.311260232090717e-06, "losses/sft": 0.8001373410224915, "losses/total": 2.311260232090717e-06, "ref_logps/chosen": -218.53839111328125, "ref_logps/rejected": -205.02969360351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8370762467384338, "rewards/margins": 9.450910568237305, "rewards/rejected": -10.287986755371094, "step": 1871 }, { "epoch": 0.45, "learning_rate": 1.2239999999999998e-07, "logps/chosen": -214.5009765625, "logps/rejected": -341.2043151855469, "loss": 0.0023, "losses/dpo": 1.0709743946790695e-05, "losses/sft": 0.6601160168647766, "losses/total": 1.0709743946790695e-05, "ref_logps/chosen": -206.41390991210938, "ref_logps/rejected": -223.42074584960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8087072372436523, "rewards/margins": 10.969650268554688, "rewards/rejected": -11.778358459472656, "step": 1872 }, { "epoch": 0.45, "learning_rate": 1.2234666666666666e-07, "logps/chosen": -221.08058166503906, "logps/rejected": -321.44976806640625, "loss": 0.0056, "losses/dpo": 3.4109825719497167e-07, "losses/sft": 0.6655860543251038, "losses/total": 3.4109825719497167e-07, "ref_logps/chosen": -211.5060272216797, "ref_logps/rejected": -200.09872436523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.9574552774429321, "rewards/margins": 11.17764949798584, "rewards/rejected": -12.13510513305664, "step": 1873 }, { "epoch": 0.45, "learning_rate": 1.2229333333333334e-07, "logps/chosen": -259.23968505859375, "logps/rejected": -375.9814453125, "loss": 0.0013, "losses/dpo": 5.706644601310984e-10, "losses/sft": 0.6337318420410156, "losses/total": 5.706644601310984e-10, "ref_logps/chosen": -247.52499389648438, "ref_logps/rejected": -244.38018798828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1714714765548706, "rewards/margins": 11.988653182983398, "rewards/rejected": -13.160124778747559, "step": 1874 }, { "epoch": 0.45, "learning_rate": 1.2224e-07, "logps/chosen": -227.32003784179688, "logps/rejected": -341.73004150390625, "loss": 0.005, "losses/dpo": 3.382432822718329e-08, "losses/sft": 0.6296247839927673, "losses/total": 3.382432822718329e-08, "ref_logps/chosen": -217.9434814453125, "ref_logps/rejected": -218.38560485839844, "rewards/accuracies": 1.0, "rewards/chosen": -0.9376563429832458, "rewards/margins": 11.39678955078125, "rewards/rejected": -12.33444595336914, "step": 1875 }, { "epoch": 0.45, "learning_rate": 1.2218666666666666e-07, "logps/chosen": -230.93994140625, "logps/rejected": -342.69451904296875, "loss": 0.005, "losses/dpo": 2.818467015330839e-09, "losses/sft": 0.5887678265571594, "losses/total": 2.818467015330839e-09, "ref_logps/chosen": -222.46775817871094, "ref_logps/rejected": -225.17080688476562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8472193479537964, "rewards/margins": 10.905153274536133, "rewards/rejected": -11.752372741699219, "step": 1876 }, { "epoch": 0.45, "learning_rate": 1.2213333333333334e-07, "logps/chosen": -268.2400207519531, "logps/rejected": -359.35223388671875, "loss": 0.002, "losses/dpo": 0.003731525968760252, "losses/sft": 0.761345624923706, "losses/total": 0.003731525968760252, "ref_logps/chosen": -255.42984008789062, "ref_logps/rejected": -232.27749633789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2810180187225342, "rewards/margins": 11.426456451416016, "rewards/rejected": -12.707474708557129, "step": 1877 }, { "epoch": 0.45, "learning_rate": 1.2208e-07, "logps/chosen": -200.3526153564453, "logps/rejected": -300.0743103027344, "loss": 0.0105, "losses/dpo": 1.4225093103448216e-08, "losses/sft": 0.48803839087486267, "losses/total": 1.4225093103448216e-08, "ref_logps/chosen": -194.09588623046875, "ref_logps/rejected": -194.47438049316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.6256755590438843, "rewards/margins": 9.934316635131836, "rewards/rejected": -10.559992790222168, "step": 1878 }, { "epoch": 0.45, "learning_rate": 1.2202666666666664e-07, "logps/chosen": -234.1190185546875, "logps/rejected": -346.3839111328125, "loss": 0.0017, "losses/dpo": 5.0005019147647545e-06, "losses/sft": 0.8544845581054688, "losses/total": 5.0005019147647545e-06, "ref_logps/chosen": -224.14845275878906, "ref_logps/rejected": -228.3046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9970579743385315, "rewards/margins": 10.810863494873047, "rewards/rejected": -11.807921409606934, "step": 1879 }, { "epoch": 0.45, "learning_rate": 1.2197333333333332e-07, "logps/chosen": -278.9553527832031, "logps/rejected": -377.0561218261719, "loss": 0.0007, "losses/dpo": 4.831868238852621e-08, "losses/sft": 0.6262639760971069, "losses/total": 4.831868238852621e-08, "ref_logps/chosen": -265.4246520996094, "ref_logps/rejected": -243.55528259277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3530689477920532, "rewards/margins": 11.997014999389648, "rewards/rejected": -13.35008430480957, "step": 1880 }, { "epoch": 0.45, "learning_rate": 1.2192e-07, "logps/chosen": -216.55287170410156, "logps/rejected": -355.3337707519531, "loss": 0.001, "losses/dpo": 4.18726529005653e-07, "losses/sft": 1.3063080310821533, "losses/total": 4.18726529005653e-07, "ref_logps/chosen": -212.0055389404297, "ref_logps/rejected": -228.0103759765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.45473241806030273, "rewards/margins": 12.277610778808594, "rewards/rejected": -12.732342720031738, "step": 1881 }, { "epoch": 0.45, "learning_rate": 1.2186666666666665e-07, "logps/chosen": -237.05438232421875, "logps/rejected": -292.6409912109375, "loss": 0.0132, "losses/dpo": 4.777632511832053e-06, "losses/sft": 1.069725751876831, "losses/total": 4.777632511832053e-06, "ref_logps/chosen": -226.17701721191406, "ref_logps/rejected": -190.51596069335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.0877364873886108, "rewards/margins": 9.124767303466797, "rewards/rejected": -10.212503433227539, "step": 1882 }, { "epoch": 0.45, "learning_rate": 1.2181333333333332e-07, "logps/chosen": -254.30996704101562, "logps/rejected": -364.67938232421875, "loss": 0.006, "losses/dpo": 4.214856730300198e-08, "losses/sft": 0.40547922253608704, "losses/total": 4.214856730300198e-08, "ref_logps/chosen": -244.81141662597656, "ref_logps/rejected": -237.01507568359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9498540759086609, "rewards/margins": 11.816579818725586, "rewards/rejected": -12.766433715820312, "step": 1883 }, { "epoch": 0.45, "learning_rate": 1.2176e-07, "logps/chosen": -235.62261962890625, "logps/rejected": -369.1015625, "loss": 0.0013, "losses/dpo": 9.80941786110634e-06, "losses/sft": 0.5514762997627258, "losses/total": 9.80941786110634e-06, "ref_logps/chosen": -227.23590087890625, "ref_logps/rejected": -231.774658203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8386707901954651, "rewards/margins": 12.894020080566406, "rewards/rejected": -13.732690811157227, "step": 1884 }, { "epoch": 0.45, "learning_rate": 1.2170666666666668e-07, "logps/chosen": -252.3214111328125, "logps/rejected": -379.8828125, "loss": 0.0008, "losses/dpo": 8.014377606002654e-09, "losses/sft": 0.5577275156974792, "losses/total": 8.014377606002654e-09, "ref_logps/chosen": -240.7374267578125, "ref_logps/rejected": -238.46832275390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1583985090255737, "rewards/margins": 12.983053207397461, "rewards/rejected": -14.141451835632324, "step": 1885 }, { "epoch": 0.45, "learning_rate": 1.2165333333333333e-07, "logps/chosen": -235.62619018554688, "logps/rejected": -334.586669921875, "loss": 0.0081, "losses/dpo": 7.85154099958163e-07, "losses/sft": 0.5376251935958862, "losses/total": 7.85154099958163e-07, "ref_logps/chosen": -228.61306762695312, "ref_logps/rejected": -216.393798828125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7013120651245117, "rewards/margins": 11.117977142333984, "rewards/rejected": -11.81928825378418, "step": 1886 }, { "epoch": 0.45, "learning_rate": 1.216e-07, "logps/chosen": -238.01089477539062, "logps/rejected": -381.94635009765625, "loss": 0.0066, "losses/dpo": 1.6564026736887172e-06, "losses/sft": 0.5154592394828796, "losses/total": 1.6564026736887172e-06, "ref_logps/chosen": -229.57742309570312, "ref_logps/rejected": -250.39508056640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.843346118927002, "rewards/margins": 12.311777114868164, "rewards/rejected": -13.155123710632324, "step": 1887 }, { "epoch": 0.45, "learning_rate": 1.2154666666666665e-07, "logps/chosen": -212.11813354492188, "logps/rejected": -349.6858825683594, "loss": 0.0034, "losses/dpo": 9.89281012664378e-09, "losses/sft": 0.8539053797721863, "losses/total": 9.89281012664378e-09, "ref_logps/chosen": -201.9163818359375, "ref_logps/rejected": -217.76443481445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0201750993728638, "rewards/margins": 12.171968460083008, "rewards/rejected": -13.192143440246582, "step": 1888 }, { "epoch": 0.45, "learning_rate": 1.2149333333333333e-07, "logps/chosen": -215.96627807617188, "logps/rejected": -307.82879638671875, "loss": 0.0029, "losses/dpo": 1.0820172015313378e-11, "losses/sft": 0.5943210124969482, "losses/total": 1.0820172015313378e-11, "ref_logps/chosen": -205.36764526367188, "ref_logps/rejected": -194.99667358398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.059862732887268, "rewards/margins": 10.223350524902344, "rewards/rejected": -11.283212661743164, "step": 1889 }, { "epoch": 0.45, "learning_rate": 1.2143999999999998e-07, "logps/chosen": -248.41787719726562, "logps/rejected": -380.62762451171875, "loss": 0.0011, "losses/dpo": 6.709450644848403e-06, "losses/sft": 0.6454054713249207, "losses/total": 6.709450644848403e-06, "ref_logps/chosen": -238.68849182128906, "ref_logps/rejected": -246.79489135742188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9729374647140503, "rewards/margins": 12.4103364944458, "rewards/rejected": -13.38327407836914, "step": 1890 }, { "epoch": 0.45, "learning_rate": 1.2138666666666666e-07, "logps/chosen": -228.23760986328125, "logps/rejected": -378.41046142578125, "loss": 0.002, "losses/dpo": 0.0001686211471678689, "losses/sft": 0.6746002435684204, "losses/total": 0.0001686211471678689, "ref_logps/chosen": -217.12686157226562, "ref_logps/rejected": -246.093017578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.111077904701233, "rewards/margins": 12.120668411254883, "rewards/rejected": -13.231745719909668, "step": 1891 }, { "epoch": 0.45, "learning_rate": 1.2133333333333333e-07, "logps/chosen": -204.9093780517578, "logps/rejected": -341.372314453125, "loss": 0.0085, "losses/dpo": 1.59958518652914e-10, "losses/sft": 0.7137326002120972, "losses/total": 1.59958518652914e-10, "ref_logps/chosen": -197.81024169921875, "ref_logps/rejected": -215.7466583251953, "rewards/accuracies": 1.0, "rewards/chosen": -0.7099135518074036, "rewards/margins": 11.852653503417969, "rewards/rejected": -12.562566757202148, "step": 1892 }, { "epoch": 0.45, "learning_rate": 1.2128e-07, "logps/chosen": -196.2790069580078, "logps/rejected": -327.5706481933594, "loss": 0.0031, "losses/dpo": 0.00013998462236486375, "losses/sft": 0.6222878694534302, "losses/total": 0.00013998462236486375, "ref_logps/chosen": -190.44183349609375, "ref_logps/rejected": -203.85797119140625, "rewards/accuracies": 1.0, "rewards/chosen": -0.5837181210517883, "rewards/margins": 11.787551879882812, "rewards/rejected": -12.371270179748535, "step": 1893 }, { "epoch": 0.45, "learning_rate": 1.2122666666666666e-07, "logps/chosen": -206.57418823242188, "logps/rejected": -326.40411376953125, "loss": 0.0036, "losses/dpo": 5.117991349834483e-06, "losses/sft": 0.6444863080978394, "losses/total": 5.117991349834483e-06, "ref_logps/chosen": -198.7251739501953, "ref_logps/rejected": -207.22459411621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.7849013209342957, "rewards/margins": 11.133047103881836, "rewards/rejected": -11.917948722839355, "step": 1894 }, { "epoch": 0.45, "learning_rate": 1.2117333333333334e-07, "logps/chosen": -240.37481689453125, "logps/rejected": -343.5999450683594, "loss": 0.0029, "losses/dpo": 7.922966727136327e-10, "losses/sft": 0.6041725873947144, "losses/total": 7.922966727136327e-10, "ref_logps/chosen": -232.98660278320312, "ref_logps/rejected": -219.4482879638672, "rewards/accuracies": 1.0, "rewards/chosen": -0.7388215065002441, "rewards/margins": 11.676342964172363, "rewards/rejected": -12.415164947509766, "step": 1895 }, { "epoch": 0.46, "learning_rate": 1.2112e-07, "logps/chosen": -206.8787841796875, "logps/rejected": -347.18634033203125, "loss": 0.0052, "losses/dpo": 3.8968710924791594e-08, "losses/sft": 0.49971288442611694, "losses/total": 3.8968710924791594e-08, "ref_logps/chosen": -199.4618682861328, "ref_logps/rejected": -220.371826171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7416917085647583, "rewards/margins": 11.939759254455566, "rewards/rejected": -12.681451797485352, "step": 1896 }, { "epoch": 0.46, "learning_rate": 1.2106666666666664e-07, "logps/chosen": -223.58856201171875, "logps/rejected": -321.0335693359375, "loss": 0.003, "losses/dpo": 1.9286626411485486e-06, "losses/sft": 0.782457709312439, "losses/total": 1.9286626411485486e-06, "ref_logps/chosen": -211.73489379882812, "ref_logps/rejected": -211.75344848632812, "rewards/accuracies": 1.0, "rewards/chosen": -1.185365915298462, "rewards/margins": 9.742646217346191, "rewards/rejected": -10.92801284790039, "step": 1897 }, { "epoch": 0.46, "learning_rate": 1.2101333333333332e-07, "logps/chosen": -226.97967529296875, "logps/rejected": -357.9974365234375, "loss": 0.0008, "losses/dpo": 2.5956794615922263e-06, "losses/sft": 0.7967387437820435, "losses/total": 2.5956794615922263e-06, "ref_logps/chosen": -220.2085418701172, "ref_logps/rejected": -228.91717529296875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6771132946014404, "rewards/margins": 12.230912208557129, "rewards/rejected": -12.908024787902832, "step": 1898 }, { "epoch": 0.46, "learning_rate": 1.2096e-07, "logps/chosen": -242.131103515625, "logps/rejected": -333.445556640625, "loss": 0.0008, "losses/dpo": 0.00042337612831033766, "losses/sft": 0.87380051612854, "losses/total": 0.00042337612831033766, "ref_logps/chosen": -234.80665588378906, "ref_logps/rejected": -211.19476318359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.7324440479278564, "rewards/margins": 11.492635726928711, "rewards/rejected": -12.225080490112305, "step": 1899 }, { "epoch": 0.46, "learning_rate": 1.2090666666666667e-07, "logps/chosen": -223.9566192626953, "logps/rejected": -358.98590087890625, "loss": 0.0016, "losses/dpo": 5.628721169159689e-07, "losses/sft": 0.6017212867736816, "losses/total": 5.628721169159689e-07, "ref_logps/chosen": -213.86029052734375, "ref_logps/rejected": -224.94656372070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0096309185028076, "rewards/margins": 12.394306182861328, "rewards/rejected": -13.403937339782715, "step": 1900 }, { "epoch": 0.46, "learning_rate": 1.2085333333333332e-07, "logps/chosen": -238.94119262695312, "logps/rejected": -345.99798583984375, "loss": 0.0038, "losses/dpo": 3.508448420674881e-09, "losses/sft": 0.526235044002533, "losses/total": 3.508448420674881e-09, "ref_logps/chosen": -229.8619842529297, "ref_logps/rejected": -214.60165405273438, "rewards/accuracies": 1.0, "rewards/chosen": -0.9079190492630005, "rewards/margins": 12.231714248657227, "rewards/rejected": -13.139632225036621, "step": 1901 }, { "epoch": 0.46, "learning_rate": 1.208e-07, "logps/chosen": -232.44415283203125, "logps/rejected": -350.07318115234375, "loss": 0.0082, "losses/dpo": 1.8788302895700326e-06, "losses/sft": 0.4431626796722412, "losses/total": 1.8788302895700326e-06, "ref_logps/chosen": -222.66575622558594, "ref_logps/rejected": -221.48158264160156, "rewards/accuracies": 1.0, "rewards/chosen": -0.9778394103050232, "rewards/margins": 11.88132095336914, "rewards/rejected": -12.859161376953125, "step": 1902 }, { "epoch": 0.46, "learning_rate": 1.2074666666666667e-07, "logps/chosen": -247.13632202148438, "logps/rejected": -358.6159362792969, "loss": 0.0054, "losses/dpo": 2.6661888341550366e-07, "losses/sft": 0.45234131813049316, "losses/total": 2.6661888341550366e-07, "ref_logps/chosen": -236.72967529296875, "ref_logps/rejected": -237.15286254882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.040667176246643, "rewards/margins": 11.105640411376953, "rewards/rejected": -12.146307945251465, "step": 1903 }, { "epoch": 0.46, "learning_rate": 1.2069333333333332e-07, "logps/chosen": -258.22216796875, "logps/rejected": -345.7685546875, "loss": 0.004, "losses/dpo": 0.009379583410918713, "losses/sft": 1.015631079673767, "losses/total": 0.009379583410918713, "ref_logps/chosen": -247.93374633789062, "ref_logps/rejected": -226.79254150390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0288416147232056, "rewards/margins": 10.868756294250488, "rewards/rejected": -11.897598266601562, "step": 1904 }, { "epoch": 0.46, "learning_rate": 1.2063999999999997e-07, "logps/chosen": -218.15245056152344, "logps/rejected": -307.27978515625, "loss": 0.0163, "losses/dpo": 1.6529231743334094e-06, "losses/sft": 1.0127036571502686, "losses/total": 1.6529231743334094e-06, "ref_logps/chosen": -208.99057006835938, "ref_logps/rejected": -204.456787109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9161888957023621, "rewards/margins": 9.366113662719727, "rewards/rejected": -10.282302856445312, "step": 1905 }, { "epoch": 0.46, "learning_rate": 1.2058666666666665e-07, "logps/chosen": -240.2444610595703, "logps/rejected": -361.034912109375, "loss": 0.0018, "losses/dpo": 1.0615982546369196e-06, "losses/sft": 0.41108644008636475, "losses/total": 1.0615982546369196e-06, "ref_logps/chosen": -231.69769287109375, "ref_logps/rejected": -231.69508361816406, "rewards/accuracies": 1.0, "rewards/chosen": -0.8546774387359619, "rewards/margins": 12.079306602478027, "rewards/rejected": -12.93398380279541, "step": 1906 }, { "epoch": 0.46, "learning_rate": 1.2053333333333333e-07, "logps/chosen": -241.20945739746094, "logps/rejected": -385.85833740234375, "loss": 0.008, "losses/dpo": 8.49195558316751e-09, "losses/sft": 0.5362125039100647, "losses/total": 8.49195558316751e-09, "ref_logps/chosen": -231.78271484375, "ref_logps/rejected": -246.55746459960938, "rewards/accuracies": 1.0, "rewards/chosen": -0.9426740407943726, "rewards/margins": 12.987417221069336, "rewards/rejected": -13.930089950561523, "step": 1907 }, { "epoch": 0.46, "learning_rate": 1.2048e-07, "logps/chosen": -262.9468994140625, "logps/rejected": -393.7816162109375, "loss": 0.0013, "losses/dpo": 3.9718099742458435e-07, "losses/sft": 0.970291793346405, "losses/total": 3.9718099742458435e-07, "ref_logps/chosen": -255.87484741210938, "ref_logps/rejected": -248.80126953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7072056531906128, "rewards/margins": 13.790830612182617, "rewards/rejected": -14.49803638458252, "step": 1908 }, { "epoch": 0.46, "learning_rate": 1.2042666666666665e-07, "logps/chosen": -241.57736206054688, "logps/rejected": -348.04736328125, "loss": 0.0019, "losses/dpo": 8.706399512448115e-08, "losses/sft": 0.5566457509994507, "losses/total": 8.706399512448115e-08, "ref_logps/chosen": -234.13232421875, "ref_logps/rejected": -225.32345581054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.7445018887519836, "rewards/margins": 11.52789306640625, "rewards/rejected": -12.272394180297852, "step": 1909 }, { "epoch": 0.46, "learning_rate": 1.2037333333333333e-07, "logps/chosen": -188.0975799560547, "logps/rejected": -329.61724853515625, "loss": 0.0053, "losses/dpo": 1.648051147640217e-05, "losses/sft": 0.7375083565711975, "losses/total": 1.648051147640217e-05, "ref_logps/chosen": -176.71876525878906, "ref_logps/rejected": -207.36093139648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1378803253173828, "rewards/margins": 11.087750434875488, "rewards/rejected": -12.225630760192871, "step": 1910 }, { "epoch": 0.46, "learning_rate": 1.2032e-07, "logps/chosen": -206.8844757080078, "logps/rejected": -323.3040771484375, "loss": 0.0007, "losses/dpo": 1.13728754058684e-06, "losses/sft": 0.8776072263717651, "losses/total": 1.13728754058684e-06, "ref_logps/chosen": -198.3836669921875, "ref_logps/rejected": -200.88621520996094, "rewards/accuracies": 1.0, "rewards/chosen": -0.850082516670227, "rewards/margins": 11.391703605651855, "rewards/rejected": -12.241785049438477, "step": 1911 }, { "epoch": 0.46, "learning_rate": 1.2026666666666668e-07, "logps/chosen": -216.23593139648438, "logps/rejected": -368.1744689941406, "loss": 0.0021, "losses/dpo": 2.1573158814902627e-09, "losses/sft": 0.4980510473251343, "losses/total": 2.1573158814902627e-09, "ref_logps/chosen": -209.24319458007812, "ref_logps/rejected": -230.9225311279297, "rewards/accuracies": 1.0, "rewards/chosen": -0.6992721557617188, "rewards/margins": 13.025921821594238, "rewards/rejected": -13.725193977355957, "step": 1912 }, { "epoch": 0.46, "learning_rate": 1.2021333333333334e-07, "logps/chosen": -253.71630859375, "logps/rejected": -363.47998046875, "loss": 0.0007, "losses/dpo": 6.013706752128201e-06, "losses/sft": 0.6358117461204529, "losses/total": 6.013706752128201e-06, "ref_logps/chosen": -245.01531982421875, "ref_logps/rejected": -236.1473388671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8701006174087524, "rewards/margins": 11.863165855407715, "rewards/rejected": -12.733266830444336, "step": 1913 }, { "epoch": 0.46, "learning_rate": 1.2015999999999999e-07, "logps/chosen": -231.61013793945312, "logps/rejected": -322.82916259765625, "loss": 0.0086, "losses/dpo": 2.8030731300532352e-06, "losses/sft": 0.5522704124450684, "losses/total": 2.8030731300532352e-06, "ref_logps/chosen": -220.73348999023438, "ref_logps/rejected": -210.94049072265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0876656770706177, "rewards/margins": 10.101202011108398, "rewards/rejected": -11.188867568969727, "step": 1914 }, { "epoch": 0.46, "learning_rate": 1.2010666666666666e-07, "logps/chosen": -214.07901000976562, "logps/rejected": -338.4092102050781, "loss": 0.0017, "losses/dpo": 8.95790321919776e-07, "losses/sft": 0.5671049356460571, "losses/total": 8.95790321919776e-07, "ref_logps/chosen": -204.91864013671875, "ref_logps/rejected": -214.76953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9160366058349609, "rewards/margins": 11.447933197021484, "rewards/rejected": -12.363969802856445, "step": 1915 }, { "epoch": 0.46, "learning_rate": 1.200533333333333e-07, "logps/chosen": -257.5049133300781, "logps/rejected": -388.1544189453125, "loss": 0.0004, "losses/dpo": 3.2787895065666817e-07, "losses/sft": 0.5539432168006897, "losses/total": 3.2787895065666817e-07, "ref_logps/chosen": -248.29220581054688, "ref_logps/rejected": -253.457763671875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9212706089019775, "rewards/margins": 12.548397064208984, "rewards/rejected": -13.469667434692383, "step": 1916 }, { "epoch": 0.46, "learning_rate": 1.2e-07, "logps/chosen": -239.80014038085938, "logps/rejected": -385.23089599609375, "loss": 0.0035, "losses/dpo": 8.011706853494616e-08, "losses/sft": 1.191268801689148, "losses/total": 8.011706853494616e-08, "ref_logps/chosen": -230.03993225097656, "ref_logps/rejected": -244.3663330078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9760192036628723, "rewards/margins": 13.110435485839844, "rewards/rejected": -14.086454391479492, "step": 1917 }, { "epoch": 0.46, "learning_rate": 1.1994666666666667e-07, "logps/chosen": -265.0579833984375, "logps/rejected": -345.67529296875, "loss": 0.003, "losses/dpo": 4.38431024818442e-09, "losses/sft": 0.7461006045341492, "losses/total": 4.38431024818442e-09, "ref_logps/chosen": -253.17098999023438, "ref_logps/rejected": -213.8243865966797, "rewards/accuracies": 1.0, "rewards/chosen": -1.1886959075927734, "rewards/margins": 11.996394157409668, "rewards/rejected": -13.185089111328125, "step": 1918 }, { "epoch": 0.46, "learning_rate": 1.1989333333333334e-07, "logps/chosen": -224.07359313964844, "logps/rejected": -345.57220458984375, "loss": 0.0012, "losses/dpo": 8.110584894893691e-05, "losses/sft": 0.8144818544387817, "losses/total": 8.110584894893691e-05, "ref_logps/chosen": -214.80477905273438, "ref_logps/rejected": -226.38516235351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9268813729286194, "rewards/margins": 10.9918212890625, "rewards/rejected": -11.91870403289795, "step": 1919 }, { "epoch": 0.46, "learning_rate": 1.1984e-07, "logps/chosen": -277.4112548828125, "logps/rejected": -395.99310302734375, "loss": 0.0004, "losses/dpo": 2.0448299054010022e-08, "losses/sft": 0.5848615169525146, "losses/total": 2.0448299054010022e-08, "ref_logps/chosen": -264.96844482421875, "ref_logps/rejected": -261.13531494140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.244281530380249, "rewards/margins": 12.241497039794922, "rewards/rejected": -13.485777854919434, "step": 1920 }, { "epoch": 0.46, "learning_rate": 1.1978666666666667e-07, "logps/chosen": -259.614501953125, "logps/rejected": -376.3712463378906, "loss": 0.0017, "losses/dpo": 3.545316817898936e-10, "losses/sft": 0.8365771174430847, "losses/total": 3.545316817898936e-10, "ref_logps/chosen": -247.1455078125, "ref_logps/rejected": -242.401611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2468990087509155, "rewards/margins": 12.150064468383789, "rewards/rejected": -13.39696216583252, "step": 1921 }, { "epoch": 0.46, "learning_rate": 1.1973333333333332e-07, "logps/chosen": -240.53868103027344, "logps/rejected": -370.06317138671875, "loss": 0.0004, "losses/dpo": 6.347734782252701e-09, "losses/sft": 0.4217807650566101, "losses/total": 6.347734782252701e-09, "ref_logps/chosen": -229.79302978515625, "ref_logps/rejected": -229.39920043945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0745649337768555, "rewards/margins": 12.991832733154297, "rewards/rejected": -14.066396713256836, "step": 1922 }, { "epoch": 0.46, "learning_rate": 1.1968e-07, "logps/chosen": -228.91741943359375, "logps/rejected": -328.2333984375, "loss": 0.0038, "losses/dpo": 2.76608898275299e-06, "losses/sft": 0.5254502892494202, "losses/total": 2.76608898275299e-06, "ref_logps/chosen": -221.28311157226562, "ref_logps/rejected": -211.7576141357422, "rewards/accuracies": 1.0, "rewards/chosen": -0.7634283900260925, "rewards/margins": 10.884151458740234, "rewards/rejected": -11.647579193115234, "step": 1923 }, { "epoch": 0.46, "learning_rate": 1.1962666666666665e-07, "logps/chosen": -262.8874816894531, "logps/rejected": -347.35546875, "loss": 0.002, "losses/dpo": 2.1086774992795654e-08, "losses/sft": 0.46315887570381165, "losses/total": 2.1086774992795654e-08, "ref_logps/chosen": -249.34132385253906, "ref_logps/rejected": -219.7407989501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3546149730682373, "rewards/margins": 11.406851768493652, "rewards/rejected": -12.761466026306152, "step": 1924 }, { "epoch": 0.46, "learning_rate": 1.1957333333333332e-07, "logps/chosen": -234.54672241210938, "logps/rejected": -376.6729431152344, "loss": 0.0007, "losses/dpo": 0.00010599392408039421, "losses/sft": 0.5401250720024109, "losses/total": 0.00010599392408039421, "ref_logps/chosen": -223.28640747070312, "ref_logps/rejected": -238.608154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1260321140289307, "rewards/margins": 12.68044662475586, "rewards/rejected": -13.806480407714844, "step": 1925 }, { "epoch": 0.46, "learning_rate": 1.1952e-07, "logps/chosen": -234.5955810546875, "logps/rejected": -350.6058349609375, "loss": 0.0028, "losses/dpo": 1.8889715647674166e-05, "losses/sft": 0.7771262526512146, "losses/total": 1.8889715647674166e-05, "ref_logps/chosen": -225.0189971923828, "ref_logps/rejected": -222.5660400390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9576584696769714, "rewards/margins": 11.846319198608398, "rewards/rejected": -12.803977966308594, "step": 1926 }, { "epoch": 0.46, "learning_rate": 1.1946666666666668e-07, "logps/chosen": -200.79673767089844, "logps/rejected": -322.9657287597656, "loss": 0.0012, "losses/dpo": 0.0002447690931148827, "losses/sft": 0.5609487891197205, "losses/total": 0.0002447690931148827, "ref_logps/chosen": -191.70376586914062, "ref_logps/rejected": -206.59616088867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9092963337898254, "rewards/margins": 10.7276611328125, "rewards/rejected": -11.636957168579102, "step": 1927 }, { "epoch": 0.46, "learning_rate": 1.1941333333333333e-07, "logps/chosen": -195.4965057373047, "logps/rejected": -339.22149658203125, "loss": 0.0037, "losses/dpo": 2.732476245270732e-09, "losses/sft": 0.5329421162605286, "losses/total": 2.732476245270732e-09, "ref_logps/chosen": -183.95143127441406, "ref_logps/rejected": -211.4486083984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1545075178146362, "rewards/margins": 11.622782707214355, "rewards/rejected": -12.777290344238281, "step": 1928 }, { "epoch": 0.46, "learning_rate": 1.1936e-07, "logps/chosen": -285.00311279296875, "logps/rejected": -387.78271484375, "loss": 0.0009, "losses/dpo": 5.480103482113918e-08, "losses/sft": 0.6109564304351807, "losses/total": 5.480103482113918e-08, "ref_logps/chosen": -272.75750732421875, "ref_logps/rejected": -249.03286743164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.224560022354126, "rewards/margins": 12.650426864624023, "rewards/rejected": -13.874987602233887, "step": 1929 }, { "epoch": 0.46, "learning_rate": 1.1930666666666666e-07, "logps/chosen": -218.29161071777344, "logps/rejected": -337.7550354003906, "loss": 0.0083, "losses/dpo": 9.294788583247282e-07, "losses/sft": 0.615254819393158, "losses/total": 9.294788583247282e-07, "ref_logps/chosen": -208.38485717773438, "ref_logps/rejected": -210.99722290039062, "rewards/accuracies": 1.0, "rewards/chosen": -0.9906774759292603, "rewards/margins": 11.685105323791504, "rewards/rejected": -12.675783157348633, "step": 1930 }, { "epoch": 0.46, "learning_rate": 1.192533333333333e-07, "logps/chosen": -195.23739624023438, "logps/rejected": -364.49298095703125, "loss": 0.0152, "losses/dpo": 4.576912999709748e-07, "losses/sft": 0.7087901830673218, "losses/total": 4.576912999709748e-07, "ref_logps/chosen": -187.8358154296875, "ref_logps/rejected": -233.10108947753906, "rewards/accuracies": 1.0, "rewards/chosen": -0.7401577830314636, "rewards/margins": 12.399030685424805, "rewards/rejected": -13.139188766479492, "step": 1931 }, { "epoch": 0.46, "learning_rate": 1.192e-07, "logps/chosen": -192.8231201171875, "logps/rejected": -341.163330078125, "loss": 0.0022, "losses/dpo": 0.0004995924537070096, "losses/sft": 0.7594218254089355, "losses/total": 0.0004995924537070096, "ref_logps/chosen": -186.67745971679688, "ref_logps/rejected": -220.1453857421875, "rewards/accuracies": 1.0, "rewards/chosen": -0.6145644783973694, "rewards/margins": 11.487229347229004, "rewards/rejected": -12.101794242858887, "step": 1932 }, { "epoch": 0.46, "learning_rate": 1.1914666666666666e-07, "logps/chosen": -255.36631774902344, "logps/rejected": -364.29168701171875, "loss": 0.013, "losses/dpo": 1.4960444616463064e-07, "losses/sft": 0.5815931558609009, "losses/total": 1.4960444616463064e-07, "ref_logps/chosen": -245.34622192382812, "ref_logps/rejected": -233.49618530273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0020116567611694, "rewards/margins": 12.077536582946777, "rewards/rejected": -13.079548835754395, "step": 1933 }, { "epoch": 0.46, "learning_rate": 1.1909333333333334e-07, "logps/chosen": -232.4889373779297, "logps/rejected": -340.2850036621094, "loss": 0.0017, "losses/dpo": 3.888649846572889e-09, "losses/sft": 0.8707177042961121, "losses/total": 3.888649846572889e-09, "ref_logps/chosen": -220.6761016845703, "ref_logps/rejected": -211.94061279296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1812852621078491, "rewards/margins": 11.653153419494629, "rewards/rejected": -12.834439277648926, "step": 1934 }, { "epoch": 0.46, "learning_rate": 1.1903999999999999e-07, "logps/chosen": -246.6409149169922, "logps/rejected": -357.3848876953125, "loss": 0.0005, "losses/dpo": 1.22211807340733e-09, "losses/sft": 0.6795612573623657, "losses/total": 1.22211807340733e-09, "ref_logps/chosen": -233.32940673828125, "ref_logps/rejected": -222.59463500976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3311512470245361, "rewards/margins": 12.147872924804688, "rewards/rejected": -13.479024887084961, "step": 1935 }, { "epoch": 0.46, "learning_rate": 1.1898666666666666e-07, "logps/chosen": -226.81976318359375, "logps/rejected": -335.5614013671875, "loss": 0.0019, "losses/dpo": 3.1436513392435472e-09, "losses/sft": 0.558568000793457, "losses/total": 3.1436513392435472e-09, "ref_logps/chosen": -218.04673767089844, "ref_logps/rejected": -217.22154235839844, "rewards/accuracies": 1.0, "rewards/chosen": -0.8773036003112793, "rewards/margins": 10.956684112548828, "rewards/rejected": -11.83398723602295, "step": 1936 }, { "epoch": 0.46, "learning_rate": 1.1893333333333333e-07, "logps/chosen": -231.0074005126953, "logps/rejected": -324.1221923828125, "loss": 0.0019, "losses/dpo": 8.301591378767625e-07, "losses/sft": 0.6248851418495178, "losses/total": 8.301591378767625e-07, "ref_logps/chosen": -220.64517211914062, "ref_logps/rejected": -209.54531860351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.0362237691879272, "rewards/margins": 10.421463012695312, "rewards/rejected": -11.457686424255371, "step": 1937 }, { "epoch": 0.47, "learning_rate": 1.1888e-07, "logps/chosen": -287.7477111816406, "logps/rejected": -363.3535461425781, "loss": 0.0006, "losses/dpo": 1.7163492316285556e-07, "losses/sft": 0.5654438734054565, "losses/total": 1.7163492316285556e-07, "ref_logps/chosen": -276.9091796875, "ref_logps/rejected": -222.37115478515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.083853006362915, "rewards/margins": 13.014386177062988, "rewards/rejected": -14.098238945007324, "step": 1938 }, { "epoch": 0.47, "learning_rate": 1.1882666666666665e-07, "logps/chosen": -208.01568603515625, "logps/rejected": -311.8341064453125, "loss": 0.0015, "losses/dpo": 0.0003903149045072496, "losses/sft": 0.4963100850582123, "losses/total": 0.0003903149045072496, "ref_logps/chosen": -201.25454711914062, "ref_logps/rejected": -203.92852783203125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6761125922203064, "rewards/margins": 10.114448547363281, "rewards/rejected": -10.790560722351074, "step": 1939 }, { "epoch": 0.47, "learning_rate": 1.1877333333333333e-07, "logps/chosen": -207.72262573242188, "logps/rejected": -333.5772705078125, "loss": 0.0012, "losses/dpo": 0.001238969387486577, "losses/sft": 0.6084657311439514, "losses/total": 0.001238969387486577, "ref_logps/chosen": -195.77072143554688, "ref_logps/rejected": -211.02964782714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.195190668106079, "rewards/margins": 11.05956745147705, "rewards/rejected": -12.254758834838867, "step": 1940 }, { "epoch": 0.47, "learning_rate": 1.1872e-07, "logps/chosen": -226.16510009765625, "logps/rejected": -332.3839111328125, "loss": 0.0027, "losses/dpo": 2.650895680744725e-07, "losses/sft": 0.6202464699745178, "losses/total": 2.650895680744725e-07, "ref_logps/chosen": -217.76255798339844, "ref_logps/rejected": -213.57278442382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8402537107467651, "rewards/margins": 11.04085922241211, "rewards/rejected": -11.881113052368164, "step": 1941 }, { "epoch": 0.47, "learning_rate": 1.1866666666666667e-07, "logps/chosen": -236.65609741210938, "logps/rejected": -395.2457275390625, "loss": 0.0002, "losses/dpo": 3.969293538830243e-05, "losses/sft": 0.48587754368782043, "losses/total": 3.969293538830243e-05, "ref_logps/chosen": -223.3628692626953, "ref_logps/rejected": -251.48764038085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.329323172569275, "rewards/margins": 13.046485900878906, "rewards/rejected": -14.375808715820312, "step": 1942 }, { "epoch": 0.47, "learning_rate": 1.1861333333333332e-07, "logps/chosen": -257.43988037109375, "logps/rejected": -343.50140380859375, "loss": 0.0179, "losses/dpo": 5.685432370228227e-06, "losses/sft": 0.8336851000785828, "losses/total": 5.685432370228227e-06, "ref_logps/chosen": -244.85202026367188, "ref_logps/rejected": -219.07778930664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2587838172912598, "rewards/margins": 11.183574676513672, "rewards/rejected": -12.44235897064209, "step": 1943 }, { "epoch": 0.47, "learning_rate": 1.1856e-07, "logps/chosen": -215.6351776123047, "logps/rejected": -338.93829345703125, "loss": 0.007, "losses/dpo": 3.664712858153507e-05, "losses/sft": 0.7589095830917358, "losses/total": 3.664712858153507e-05, "ref_logps/chosen": -204.61813354492188, "ref_logps/rejected": -213.5213623046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.101705551147461, "rewards/margins": 11.439987182617188, "rewards/rejected": -12.541692733764648, "step": 1944 }, { "epoch": 0.47, "learning_rate": 1.1850666666666666e-07, "logps/chosen": -264.91729736328125, "logps/rejected": -375.7362976074219, "loss": 0.0003, "losses/dpo": 3.186024696333334e-05, "losses/sft": 0.6607131958007812, "losses/total": 3.186024696333334e-05, "ref_logps/chosen": -255.03475952148438, "ref_logps/rejected": -245.45216369628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.988253116607666, "rewards/margins": 12.040160179138184, "rewards/rejected": -13.028413772583008, "step": 1945 }, { "epoch": 0.47, "learning_rate": 1.1845333333333334e-07, "logps/chosen": -197.38494873046875, "logps/rejected": -363.7577819824219, "loss": 0.006, "losses/dpo": 0.0002229152451036498, "losses/sft": 0.4454825520515442, "losses/total": 0.0002229152451036498, "ref_logps/chosen": -189.73541259765625, "ref_logps/rejected": -228.22573852539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.764952540397644, "rewards/margins": 12.788252830505371, "rewards/rejected": -13.553205490112305, "step": 1946 }, { "epoch": 0.47, "learning_rate": 1.1839999999999999e-07, "logps/chosen": -214.78822326660156, "logps/rejected": -365.65960693359375, "loss": 0.0065, "losses/dpo": 8.814507168608543e-08, "losses/sft": 0.7047910094261169, "losses/total": 8.814507168608543e-08, "ref_logps/chosen": -207.76657104492188, "ref_logps/rejected": -232.49794006347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.7021641731262207, "rewards/margins": 12.614006042480469, "rewards/rejected": -13.316170692443848, "step": 1947 }, { "epoch": 0.47, "learning_rate": 1.1834666666666666e-07, "logps/chosen": -245.23834228515625, "logps/rejected": -314.0120544433594, "loss": 0.0092, "losses/dpo": 4.39654604633688e-06, "losses/sft": 0.6763669848442078, "losses/total": 4.39654604633688e-06, "ref_logps/chosen": -234.4315948486328, "ref_logps/rejected": -202.94305419921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0806739330291748, "rewards/margins": 10.026227951049805, "rewards/rejected": -11.106901168823242, "step": 1948 }, { "epoch": 0.47, "learning_rate": 1.1829333333333333e-07, "logps/chosen": -248.23910522460938, "logps/rejected": -366.2413635253906, "loss": 0.001, "losses/dpo": 1.159556973107101e-06, "losses/sft": 0.45326924324035645, "losses/total": 1.159556973107101e-06, "ref_logps/chosen": -239.52467346191406, "ref_logps/rejected": -234.7431640625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8714436888694763, "rewards/margins": 12.278377532958984, "rewards/rejected": -13.149820327758789, "step": 1949 }, { "epoch": 0.47, "learning_rate": 1.1823999999999998e-07, "logps/chosen": -201.36801147460938, "logps/rejected": -316.69573974609375, "loss": 0.0028, "losses/dpo": 6.164926890050992e-05, "losses/sft": 0.5556370615959167, "losses/total": 6.164926890050992e-05, "ref_logps/chosen": -190.2537078857422, "ref_logps/rejected": -200.41421508789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1114304065704346, "rewards/margins": 10.516722679138184, "rewards/rejected": -11.628152847290039, "step": 1950 }, { "epoch": 0.47, "learning_rate": 1.1818666666666666e-07, "logps/chosen": -193.67547607421875, "logps/rejected": -336.92193603515625, "loss": 0.0012, "losses/dpo": 1.5251860077114543e-06, "losses/sft": 0.5337570905685425, "losses/total": 1.5251860077114543e-06, "ref_logps/chosen": -186.42799377441406, "ref_logps/rejected": -214.28114318847656, "rewards/accuracies": 1.0, "rewards/chosen": -0.7247475981712341, "rewards/margins": 11.539334297180176, "rewards/rejected": -12.264081954956055, "step": 1951 }, { "epoch": 0.47, "learning_rate": 1.1813333333333333e-07, "logps/chosen": -301.83453369140625, "logps/rejected": -368.3714904785156, "loss": 0.0157, "losses/dpo": 1.273975613003131e-05, "losses/sft": 0.6312464475631714, "losses/total": 1.273975613003131e-05, "ref_logps/chosen": -288.74945068359375, "ref_logps/rejected": -234.54763793945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3085076808929443, "rewards/margins": 12.073877334594727, "rewards/rejected": -13.38238525390625, "step": 1952 }, { "epoch": 0.47, "learning_rate": 1.1808e-07, "logps/chosen": -236.83543395996094, "logps/rejected": -376.2091064453125, "loss": 0.0016, "losses/dpo": 4.963878382113762e-06, "losses/sft": 0.6854293942451477, "losses/total": 4.963878382113762e-06, "ref_logps/chosen": -229.8502197265625, "ref_logps/rejected": -242.28363037109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.6985199451446533, "rewards/margins": 12.6940279006958, "rewards/rejected": -13.392547607421875, "step": 1953 }, { "epoch": 0.47, "learning_rate": 1.1802666666666665e-07, "logps/chosen": -238.99652099609375, "logps/rejected": -356.45068359375, "loss": 0.0008, "losses/dpo": 1.7423988538212143e-05, "losses/sft": 1.2178219556808472, "losses/total": 1.7423988538212143e-05, "ref_logps/chosen": -229.43577575683594, "ref_logps/rejected": -232.54566955566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.956075131893158, "rewards/margins": 11.434429168701172, "rewards/rejected": -12.390504837036133, "step": 1954 }, { "epoch": 0.47, "learning_rate": 1.1797333333333332e-07, "logps/chosen": -234.13385009765625, "logps/rejected": -334.5266418457031, "loss": 0.0023, "losses/dpo": 9.370377256345819e-07, "losses/sft": 0.609146773815155, "losses/total": 9.370377256345819e-07, "ref_logps/chosen": -223.13314819335938, "ref_logps/rejected": -220.50978088378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.1000701189041138, "rewards/margins": 10.301615715026855, "rewards/rejected": -11.40168571472168, "step": 1955 }, { "epoch": 0.47, "learning_rate": 1.1792e-07, "logps/chosen": -217.32650756835938, "logps/rejected": -353.8182373046875, "loss": 0.0177, "losses/dpo": 5.719331230835678e-09, "losses/sft": 0.6534879803657532, "losses/total": 5.719331230835678e-09, "ref_logps/chosen": -208.33505249023438, "ref_logps/rejected": -223.99993896484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8991448283195496, "rewards/margins": 12.082687377929688, "rewards/rejected": -12.981831550598145, "step": 1956 }, { "epoch": 0.47, "learning_rate": 1.1786666666666666e-07, "logps/chosen": -276.40838623046875, "logps/rejected": -390.41058349609375, "loss": 0.0012, "losses/dpo": 3.068784906190558e-07, "losses/sft": 0.8600915670394897, "losses/total": 3.068784906190558e-07, "ref_logps/chosen": -262.14349365234375, "ref_logps/rejected": -241.82289123535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4264881610870361, "rewards/margins": 13.432284355163574, "rewards/rejected": -14.858772277832031, "step": 1957 }, { "epoch": 0.47, "learning_rate": 1.1781333333333331e-07, "logps/chosen": -269.23876953125, "logps/rejected": -398.1545715332031, "loss": 0.0048, "losses/dpo": 0.00021182638010941446, "losses/sft": 0.5420299172401428, "losses/total": 0.00021182638010941446, "ref_logps/chosen": -261.079345703125, "ref_logps/rejected": -258.15948486328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.8159447908401489, "rewards/margins": 13.183562278747559, "rewards/rejected": -13.999506950378418, "step": 1958 }, { "epoch": 0.47, "learning_rate": 1.1775999999999999e-07, "logps/chosen": -273.0066223144531, "logps/rejected": -381.9491882324219, "loss": 0.0005, "losses/dpo": 5.320924856277998e-07, "losses/sft": 0.6707040071487427, "losses/total": 5.320924856277998e-07, "ref_logps/chosen": -264.2277526855469, "ref_logps/rejected": -244.50357055664062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8778877258300781, "rewards/margins": 12.866673469543457, "rewards/rejected": -13.744560241699219, "step": 1959 }, { "epoch": 0.47, "learning_rate": 1.1770666666666667e-07, "logps/chosen": -262.2607421875, "logps/rejected": -392.6260986328125, "loss": 0.0105, "losses/dpo": 1.7311037936451612e-08, "losses/sft": 0.5091372728347778, "losses/total": 1.7311037936451612e-08, "ref_logps/chosen": -251.3433837890625, "ref_logps/rejected": -254.17535400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.091737151145935, "rewards/margins": 12.753336906433105, "rewards/rejected": -13.845074653625488, "step": 1960 }, { "epoch": 0.47, "learning_rate": 1.1765333333333334e-07, "logps/chosen": -195.65750122070312, "logps/rejected": -345.9481201171875, "loss": 0.0064, "losses/dpo": 2.338936428714078e-05, "losses/sft": 0.6307450532913208, "losses/total": 2.338936428714078e-05, "ref_logps/chosen": -186.641357421875, "ref_logps/rejected": -213.88543701171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9016149640083313, "rewards/margins": 12.304655075073242, "rewards/rejected": -13.206268310546875, "step": 1961 }, { "epoch": 0.47, "learning_rate": 1.176e-07, "logps/chosen": -262.8585205078125, "logps/rejected": -371.87158203125, "loss": 0.0002, "losses/dpo": 1.7159231902041938e-06, "losses/sft": 0.8990613222122192, "losses/total": 1.7159231902041938e-06, "ref_logps/chosen": -250.64161682128906, "ref_logps/rejected": -225.19830322265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2216898202896118, "rewards/margins": 13.445638656616211, "rewards/rejected": -14.667327880859375, "step": 1962 }, { "epoch": 0.47, "learning_rate": 1.1754666666666666e-07, "logps/chosen": -227.211181640625, "logps/rejected": -335.28021240234375, "loss": 0.0153, "losses/dpo": 9.638807796363835e-07, "losses/sft": 0.733934760093689, "losses/total": 9.638807796363835e-07, "ref_logps/chosen": -218.1566619873047, "ref_logps/rejected": -218.10321044921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9054506421089172, "rewards/margins": 10.812246322631836, "rewards/rejected": -11.717697143554688, "step": 1963 }, { "epoch": 0.47, "learning_rate": 1.1749333333333333e-07, "logps/chosen": -259.09088134765625, "logps/rejected": -343.5277099609375, "loss": 0.0023, "losses/dpo": 2.8537849061649467e-07, "losses/sft": 0.4953864812850952, "losses/total": 2.8537849061649467e-07, "ref_logps/chosen": -249.7630615234375, "ref_logps/rejected": -219.74356079101562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9327863454818726, "rewards/margins": 11.445630073547363, "rewards/rejected": -12.378416061401367, "step": 1964 }, { "epoch": 0.47, "learning_rate": 1.1744000000000001e-07, "logps/chosen": -229.99081420898438, "logps/rejected": -340.37811279296875, "loss": 0.0057, "losses/dpo": 3.482147440081462e-05, "losses/sft": 0.5053510069847107, "losses/total": 3.482147440081462e-05, "ref_logps/chosen": -221.59222412109375, "ref_logps/rejected": -218.08743286132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8398576378822327, "rewards/margins": 11.38921070098877, "rewards/rejected": -12.229068756103516, "step": 1965 }, { "epoch": 0.47, "learning_rate": 1.1738666666666666e-07, "logps/chosen": -248.76608276367188, "logps/rejected": -391.036865234375, "loss": 0.0029, "losses/dpo": 3.520281310898099e-08, "losses/sft": 0.43866512179374695, "losses/total": 3.520281310898099e-08, "ref_logps/chosen": -240.7248992919922, "ref_logps/rejected": -255.7657928466797, "rewards/accuracies": 1.0, "rewards/chosen": -0.8041190505027771, "rewards/margins": 12.722989082336426, "rewards/rejected": -13.527108192443848, "step": 1966 }, { "epoch": 0.47, "learning_rate": 1.1733333333333333e-07, "logps/chosen": -220.52615356445312, "logps/rejected": -375.795166015625, "loss": 0.0021, "losses/dpo": 3.118736913165776e-08, "losses/sft": 0.8557870984077454, "losses/total": 3.118736913165776e-08, "ref_logps/chosen": -211.42156982421875, "ref_logps/rejected": -241.68853759765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9104574918746948, "rewards/margins": 12.500205993652344, "rewards/rejected": -13.410663604736328, "step": 1967 }, { "epoch": 0.47, "learning_rate": 1.1728e-07, "logps/chosen": -246.7315216064453, "logps/rejected": -348.88262939453125, "loss": 0.0039, "losses/dpo": 9.060573402663863e-10, "losses/sft": 0.5723937749862671, "losses/total": 9.060573402663863e-10, "ref_logps/chosen": -237.8825225830078, "ref_logps/rejected": -219.05484008789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8848994374275208, "rewards/margins": 12.097880363464355, "rewards/rejected": -12.982779502868652, "step": 1968 }, { "epoch": 0.47, "learning_rate": 1.1722666666666665e-07, "logps/chosen": -212.81488037109375, "logps/rejected": -326.3699645996094, "loss": 0.0248, "losses/dpo": 2.2918912634395383e-07, "losses/sft": 0.6286334991455078, "losses/total": 2.2918912634395383e-07, "ref_logps/chosen": -202.14036560058594, "ref_logps/rejected": -202.45242309570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0674487352371216, "rewards/margins": 11.324307441711426, "rewards/rejected": -12.391756057739258, "step": 1969 }, { "epoch": 0.47, "learning_rate": 1.1717333333333333e-07, "logps/chosen": -260.341552734375, "logps/rejected": -359.0500793457031, "loss": 0.0027, "losses/dpo": 2.3536406388302566e-06, "losses/sft": 0.5892493724822998, "losses/total": 2.3536406388302566e-06, "ref_logps/chosen": -250.25894165039062, "ref_logps/rejected": -228.4866485595703, "rewards/accuracies": 1.0, "rewards/chosen": -1.0082621574401855, "rewards/margins": 12.048081398010254, "rewards/rejected": -13.056343078613281, "step": 1970 }, { "epoch": 0.47, "learning_rate": 1.1711999999999999e-07, "logps/chosen": -224.12857055664062, "logps/rejected": -343.33062744140625, "loss": 0.0151, "losses/dpo": 6.651813322378075e-08, "losses/sft": 0.9491991400718689, "losses/total": 6.651813322378075e-08, "ref_logps/chosen": -215.00372314453125, "ref_logps/rejected": -219.7325439453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9124868512153625, "rewards/margins": 11.447320938110352, "rewards/rejected": -12.359807968139648, "step": 1971 }, { "epoch": 0.47, "learning_rate": 1.1706666666666667e-07, "logps/chosen": -224.39413452148438, "logps/rejected": -328.21209716796875, "loss": 0.0043, "losses/dpo": 1.2798261650459608e-07, "losses/sft": 0.9436051249504089, "losses/total": 1.2798261650459608e-07, "ref_logps/chosen": -216.26910400390625, "ref_logps/rejected": -220.16928100585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8125022649765015, "rewards/margins": 9.991781234741211, "rewards/rejected": -10.804283142089844, "step": 1972 }, { "epoch": 0.47, "learning_rate": 1.1701333333333332e-07, "logps/chosen": -232.8295135498047, "logps/rejected": -348.6219787597656, "loss": 0.0019, "losses/dpo": 9.606734465705813e-07, "losses/sft": 0.766133189201355, "losses/total": 9.606734465705813e-07, "ref_logps/chosen": -222.56227111816406, "ref_logps/rejected": -219.2181396484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.0267242193222046, "rewards/margins": 11.913658142089844, "rewards/rejected": -12.94038200378418, "step": 1973 }, { "epoch": 0.47, "learning_rate": 1.1696e-07, "logps/chosen": -199.94876098632812, "logps/rejected": -337.12677001953125, "loss": 0.0029, "losses/dpo": 4.8675548924848044e-08, "losses/sft": 0.5420015454292297, "losses/total": 4.8675548924848044e-08, "ref_logps/chosen": -192.03562927246094, "ref_logps/rejected": -209.3898468017578, "rewards/accuracies": 1.0, "rewards/chosen": -0.7913130521774292, "rewards/margins": 11.982379913330078, "rewards/rejected": -12.773693084716797, "step": 1974 }, { "epoch": 0.47, "learning_rate": 1.1690666666666666e-07, "logps/chosen": -229.82107543945312, "logps/rejected": -347.22393798828125, "loss": 0.0021, "losses/dpo": 7.211842234511323e-09, "losses/sft": 0.9933286905288696, "losses/total": 7.211842234511323e-09, "ref_logps/chosen": -220.55581665039062, "ref_logps/rejected": -224.8704833984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9265238642692566, "rewards/margins": 11.308820724487305, "rewards/rejected": -12.235343933105469, "step": 1975 }, { "epoch": 0.47, "learning_rate": 1.1685333333333334e-07, "logps/chosen": -275.7049865722656, "logps/rejected": -351.51806640625, "loss": 0.0014, "losses/dpo": 1.8034229753993714e-07, "losses/sft": 0.5076038241386414, "losses/total": 1.8034229753993714e-07, "ref_logps/chosen": -264.61279296875, "ref_logps/rejected": -227.71920776367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1092190742492676, "rewards/margins": 11.27066707611084, "rewards/rejected": -12.37988567352295, "step": 1976 }, { "epoch": 0.47, "learning_rate": 1.1679999999999999e-07, "logps/chosen": -218.4063720703125, "logps/rejected": -352.62591552734375, "loss": 0.0018, "losses/dpo": 5.793942818854703e-07, "losses/sft": 0.6731001138687134, "losses/total": 5.793942818854703e-07, "ref_logps/chosen": -206.5705108642578, "ref_logps/rejected": -220.07492065429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.183585286140442, "rewards/margins": 12.071512222290039, "rewards/rejected": -13.255096435546875, "step": 1977 }, { "epoch": 0.47, "learning_rate": 1.1674666666666666e-07, "logps/chosen": -231.6058349609375, "logps/rejected": -366.37158203125, "loss": 0.0018, "losses/dpo": 2.0479446902754717e-06, "losses/sft": 0.7143850922584534, "losses/total": 2.0479446902754717e-06, "ref_logps/chosen": -222.9923858642578, "ref_logps/rejected": -236.46444702148438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8613448143005371, "rewards/margins": 12.12936782836914, "rewards/rejected": -12.99071216583252, "step": 1978 }, { "epoch": 0.47, "learning_rate": 1.1669333333333333e-07, "logps/chosen": -199.36993408203125, "logps/rejected": -336.09375, "loss": 0.0046, "losses/dpo": 7.435256179633143e-07, "losses/sft": 0.572842538356781, "losses/total": 7.435256179633143e-07, "ref_logps/chosen": -186.08865356445312, "ref_logps/rejected": -209.93548583984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3281280994415283, "rewards/margins": 11.287698745727539, "rewards/rejected": -12.615825653076172, "step": 1979 }, { "epoch": 0.48, "learning_rate": 1.1664e-07, "logps/chosen": -234.05209350585938, "logps/rejected": -366.08221435546875, "loss": 0.0007, "losses/dpo": 4.235872097524407e-07, "losses/sft": 0.6367220878601074, "losses/total": 4.235872097524407e-07, "ref_logps/chosen": -222.802978515625, "ref_logps/rejected": -223.09481811523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1249098777770996, "rewards/margins": 13.173826217651367, "rewards/rejected": -14.298736572265625, "step": 1980 }, { "epoch": 0.48, "learning_rate": 1.1658666666666665e-07, "logps/chosen": -237.65219116210938, "logps/rejected": -355.236328125, "loss": 0.0012, "losses/dpo": 3.7774852899019606e-06, "losses/sft": 0.6349276900291443, "losses/total": 3.7774852899019606e-06, "ref_logps/chosen": -228.85760498046875, "ref_logps/rejected": -225.24488830566406, "rewards/accuracies": 1.0, "rewards/chosen": -0.8794575929641724, "rewards/margins": 12.11968994140625, "rewards/rejected": -12.999147415161133, "step": 1981 }, { "epoch": 0.48, "learning_rate": 1.1653333333333333e-07, "logps/chosen": -217.60340881347656, "logps/rejected": -356.15838623046875, "loss": 0.0005, "losses/dpo": 3.2986550650093704e-05, "losses/sft": 0.570548415184021, "losses/total": 3.2986550650093704e-05, "ref_logps/chosen": -208.67494201660156, "ref_logps/rejected": -224.59637451171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8928475379943848, "rewards/margins": 12.263355255126953, "rewards/rejected": -13.15620231628418, "step": 1982 }, { "epoch": 0.48, "learning_rate": 1.1648e-07, "logps/chosen": -279.15655517578125, "logps/rejected": -404.6698913574219, "loss": 0.0003, "losses/dpo": 7.745065886410885e-06, "losses/sft": 0.7685832977294922, "losses/total": 7.745065886410885e-06, "ref_logps/chosen": -267.7484130859375, "ref_logps/rejected": -263.5034484863281, "rewards/accuracies": 1.0, "rewards/chosen": -1.140815258026123, "rewards/margins": 12.975830078125, "rewards/rejected": -14.116644859313965, "step": 1983 }, { "epoch": 0.48, "learning_rate": 1.1642666666666665e-07, "logps/chosen": -248.74363708496094, "logps/rejected": -371.6142578125, "loss": 0.0032, "losses/dpo": 1.5486148186028004e-07, "losses/sft": 0.5124748349189758, "losses/total": 1.5486148186028004e-07, "ref_logps/chosen": -241.53396606445312, "ref_logps/rejected": -236.18524169921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7209647297859192, "rewards/margins": 12.821937561035156, "rewards/rejected": -13.542902946472168, "step": 1984 }, { "epoch": 0.48, "learning_rate": 1.1637333333333332e-07, "logps/chosen": -215.98431396484375, "logps/rejected": -361.9854736328125, "loss": 0.0002, "losses/dpo": 2.6253002943121828e-05, "losses/sft": 0.8880274891853333, "losses/total": 2.6253002943121828e-05, "ref_logps/chosen": -207.10020446777344, "ref_logps/rejected": -228.02255249023438, "rewards/accuracies": 1.0, "rewards/chosen": -0.8884105682373047, "rewards/margins": 12.507881164550781, "rewards/rejected": -13.396291732788086, "step": 1985 }, { "epoch": 0.48, "learning_rate": 1.1632e-07, "logps/chosen": -223.6947479248047, "logps/rejected": -316.359375, "loss": 0.0058, "losses/dpo": 1.1438968613219913e-05, "losses/sft": 0.6323815584182739, "losses/total": 1.1438968613219913e-05, "ref_logps/chosen": -216.2336883544922, "ref_logps/rejected": -201.54922485351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.7461042404174805, "rewards/margins": 10.734912872314453, "rewards/rejected": -11.48101806640625, "step": 1986 }, { "epoch": 0.48, "learning_rate": 1.1626666666666666e-07, "logps/chosen": -223.80865478515625, "logps/rejected": -309.836181640625, "loss": 0.0027, "losses/dpo": 2.934007534349803e-05, "losses/sft": 0.7181106805801392, "losses/total": 2.934007534349803e-05, "ref_logps/chosen": -214.187255859375, "ref_logps/rejected": -190.07485961914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.9621407985687256, "rewards/margins": 11.013992309570312, "rewards/rejected": -11.976133346557617, "step": 1987 }, { "epoch": 0.48, "learning_rate": 1.1621333333333331e-07, "logps/chosen": -186.458740234375, "logps/rejected": -322.608642578125, "loss": 0.0042, "losses/dpo": 5.319382125890115e-07, "losses/sft": 0.3771284520626068, "losses/total": 5.319382125890115e-07, "ref_logps/chosen": -179.03053283691406, "ref_logps/rejected": -197.9542236328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7428210973739624, "rewards/margins": 11.722622871398926, "rewards/rejected": -12.46544361114502, "step": 1988 }, { "epoch": 0.48, "learning_rate": 1.1615999999999999e-07, "logps/chosen": -236.2841339111328, "logps/rejected": -357.6334228515625, "loss": 0.02, "losses/dpo": 8.765046466407966e-09, "losses/sft": 0.5787633657455444, "losses/total": 8.765046466407966e-09, "ref_logps/chosen": -226.73878479003906, "ref_logps/rejected": -223.38796997070312, "rewards/accuracies": 1.0, "rewards/chosen": -0.9545353651046753, "rewards/margins": 12.470010757446289, "rewards/rejected": -13.424546241760254, "step": 1989 }, { "epoch": 0.48, "learning_rate": 1.1610666666666667e-07, "logps/chosen": -250.773681640625, "logps/rejected": -400.6612854003906, "loss": 0.0002, "losses/dpo": 6.865060981908755e-07, "losses/sft": 0.4627334773540497, "losses/total": 6.865060981908755e-07, "ref_logps/chosen": -234.69471740722656, "ref_logps/rejected": -239.57687377929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6078948974609375, "rewards/margins": 14.500544548034668, "rewards/rejected": -16.108440399169922, "step": 1990 }, { "epoch": 0.48, "learning_rate": 1.1605333333333334e-07, "logps/chosen": -240.75970458984375, "logps/rejected": -339.35003662109375, "loss": 0.0125, "losses/dpo": 1.8449977545742513e-08, "losses/sft": 0.7374297976493835, "losses/total": 1.8449977545742513e-08, "ref_logps/chosen": -228.93716430664062, "ref_logps/rejected": -212.887451171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1822559833526611, "rewards/margins": 11.46400260925293, "rewards/rejected": -12.646258354187012, "step": 1991 }, { "epoch": 0.48, "learning_rate": 1.1599999999999999e-07, "logps/chosen": -194.30177307128906, "logps/rejected": -327.361328125, "loss": 0.0043, "losses/dpo": 4.448208468232906e-09, "losses/sft": 0.6837669014930725, "losses/total": 4.448208468232906e-09, "ref_logps/chosen": -185.17420959472656, "ref_logps/rejected": -206.7189483642578, "rewards/accuracies": 1.0, "rewards/chosen": -0.9127573370933533, "rewards/margins": 11.151483535766602, "rewards/rejected": -12.064240455627441, "step": 1992 }, { "epoch": 0.48, "learning_rate": 1.1594666666666666e-07, "logps/chosen": -233.15818786621094, "logps/rejected": -350.55413818359375, "loss": 0.0043, "losses/dpo": 5.021250331083138e-07, "losses/sft": 0.7560766935348511, "losses/total": 5.021250331083138e-07, "ref_logps/chosen": -223.98355102539062, "ref_logps/rejected": -218.53988647460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.9174633622169495, "rewards/margins": 12.28396224975586, "rewards/rejected": -13.201424598693848, "step": 1993 }, { "epoch": 0.48, "learning_rate": 1.1589333333333333e-07, "logps/chosen": -249.27284240722656, "logps/rejected": -330.3370361328125, "loss": 0.0013, "losses/dpo": 0.0003625854442361742, "losses/sft": 1.3123841285705566, "losses/total": 0.0003625854442361742, "ref_logps/chosen": -237.40684509277344, "ref_logps/rejected": -209.39256286621094, "rewards/accuracies": 1.0, "rewards/chosen": -1.1866004467010498, "rewards/margins": 10.90784740447998, "rewards/rejected": -12.09444808959961, "step": 1994 }, { "epoch": 0.48, "learning_rate": 1.1584000000000001e-07, "logps/chosen": -232.3402099609375, "logps/rejected": -338.595703125, "loss": 0.0051, "losses/dpo": 1.6345213225577027e-05, "losses/sft": 0.9093114137649536, "losses/total": 1.6345213225577027e-05, "ref_logps/chosen": -223.22743225097656, "ref_logps/rejected": -213.95535278320312, "rewards/accuracies": 1.0, "rewards/chosen": -0.9112784266471863, "rewards/margins": 11.552753448486328, "rewards/rejected": -12.464032173156738, "step": 1995 }, { "epoch": 0.48, "learning_rate": 1.1578666666666666e-07, "logps/chosen": -250.63943481445312, "logps/rejected": -400.0820007324219, "loss": 0.0011, "losses/dpo": 2.0407533440902625e-07, "losses/sft": 0.550631046295166, "losses/total": 2.0407533440902625e-07, "ref_logps/chosen": -238.40562438964844, "ref_logps/rejected": -254.4786376953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.223381757736206, "rewards/margins": 13.336954116821289, "rewards/rejected": -14.560335159301758, "step": 1996 }, { "epoch": 0.48, "learning_rate": 1.1573333333333332e-07, "logps/chosen": -270.33929443359375, "logps/rejected": -394.6061706542969, "loss": 0.002, "losses/dpo": 4.8756110260228525e-08, "losses/sft": 0.47444209456443787, "losses/total": 4.8756110260228525e-08, "ref_logps/chosen": -257.44805908203125, "ref_logps/rejected": -249.8392791748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.289125919342041, "rewards/margins": 13.187566757202148, "rewards/rejected": -14.476692199707031, "step": 1997 }, { "epoch": 0.48, "learning_rate": 1.1568e-07, "logps/chosen": -235.6310577392578, "logps/rejected": -355.5665283203125, "loss": 0.0027, "losses/dpo": 0.021657807752490044, "losses/sft": 0.647240400314331, "losses/total": 0.021657807752490044, "ref_logps/chosen": -225.2942657470703, "ref_logps/rejected": -224.55352783203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.033677577972412, "rewards/margins": 12.067623138427734, "rewards/rejected": -13.101300239562988, "step": 1998 }, { "epoch": 0.48, "learning_rate": 1.1562666666666668e-07, "logps/chosen": -230.15797424316406, "logps/rejected": -351.23822021484375, "loss": 0.0456, "losses/dpo": 3.66092317563016e-05, "losses/sft": 0.8277803063392639, "losses/total": 3.66092317563016e-05, "ref_logps/chosen": -220.8091583251953, "ref_logps/rejected": -222.275390625, "rewards/accuracies": 0.96875, "rewards/chosen": -0.9348816871643066, "rewards/margins": 11.96140193939209, "rewards/rejected": -12.896284103393555, "step": 1999 }, { "epoch": 0.48, "learning_rate": 1.1557333333333333e-07, "logps/chosen": -245.41966247558594, "logps/rejected": -343.82647705078125, "loss": 0.0021, "losses/dpo": 0.0004693206865340471, "losses/sft": 0.6123818755149841, "losses/total": 0.0004693206865340471, "ref_logps/chosen": -232.89089965820312, "ref_logps/rejected": -213.9482879638672, "rewards/accuracies": 1.0, "rewards/chosen": -1.2528765201568604, "rewards/margins": 11.734941482543945, "rewards/rejected": -12.987817764282227, "step": 2000 }, { "epoch": 0.48, "learning_rate": 1.1551999999999999e-07, "logps/chosen": -231.0897216796875, "logps/rejected": -368.524658203125, "loss": 0.0005, "losses/dpo": 1.2111154035210348e-07, "losses/sft": 0.7769525647163391, "losses/total": 1.2111154035210348e-07, "ref_logps/chosen": -219.19358825683594, "ref_logps/rejected": -225.28909301757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.1896133422851562, "rewards/margins": 13.133943557739258, "rewards/rejected": -14.323555946350098, "step": 2001 }, { "epoch": 0.48, "learning_rate": 1.1546666666666667e-07, "logps/chosen": -258.9920654296875, "logps/rejected": -381.7270202636719, "loss": 0.0016, "losses/dpo": 7.895194357843138e-06, "losses/sft": 0.4796134829521179, "losses/total": 7.895194357843138e-06, "ref_logps/chosen": -247.43603515625, "ref_logps/rejected": -246.0797119140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1556017398834229, "rewards/margins": 12.40912914276123, "rewards/rejected": -13.56473159790039, "step": 2002 }, { "epoch": 0.48, "learning_rate": 1.1541333333333332e-07, "logps/chosen": -204.15109252929688, "logps/rejected": -384.735107421875, "loss": 0.0063, "losses/dpo": 8.24141963562397e-08, "losses/sft": 0.6771034002304077, "losses/total": 8.24141963562397e-08, "ref_logps/chosen": -193.1319580078125, "ref_logps/rejected": -243.99420166015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1019102334976196, "rewards/margins": 12.97217845916748, "rewards/rejected": -14.074090003967285, "step": 2003 }, { "epoch": 0.48, "learning_rate": 1.1536e-07, "logps/chosen": -239.56536865234375, "logps/rejected": -366.04547119140625, "loss": 0.0018, "losses/dpo": 6.066389687475748e-06, "losses/sft": 0.6219429969787598, "losses/total": 6.066389687475748e-06, "ref_logps/chosen": -227.46875, "ref_logps/rejected": -224.92929077148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.209663987159729, "rewards/margins": 12.901956558227539, "rewards/rejected": -14.11161994934082, "step": 2004 }, { "epoch": 0.48, "learning_rate": 1.1530666666666666e-07, "logps/chosen": -194.9879150390625, "logps/rejected": -347.4219665527344, "loss": 0.0004, "losses/dpo": 2.8054472522853757e-07, "losses/sft": 0.4727000594139099, "losses/total": 2.8054472522853757e-07, "ref_logps/chosen": -186.0640869140625, "ref_logps/rejected": -208.3332061767578, "rewards/accuracies": 1.0, "rewards/chosen": -0.8923837542533875, "rewards/margins": 13.016494750976562, "rewards/rejected": -13.908879280090332, "step": 2005 }, { "epoch": 0.48, "learning_rate": 1.1525333333333334e-07, "logps/chosen": -184.4335479736328, "logps/rejected": -316.92950439453125, "loss": 0.0123, "losses/dpo": 2.9659108502499976e-09, "losses/sft": 0.9339368343353271, "losses/total": 2.9659108502499976e-09, "ref_logps/chosen": -173.7779998779297, "ref_logps/rejected": -195.5989990234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.0655550956726074, "rewards/margins": 11.067493438720703, "rewards/rejected": -12.133049011230469, "step": 2006 }, { "epoch": 0.48, "learning_rate": 1.1519999999999999e-07, "logps/chosen": -244.86097717285156, "logps/rejected": -380.6214599609375, "loss": 0.0011, "losses/dpo": 1.4982479115133174e-05, "losses/sft": 0.6520379781723022, "losses/total": 1.4982479115133174e-05, "ref_logps/chosen": -233.36892700195312, "ref_logps/rejected": -237.63470458984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.149206280708313, "rewards/margins": 13.149467468261719, "rewards/rejected": -14.298674583435059, "step": 2007 }, { "epoch": 0.48, "learning_rate": 1.1514666666666666e-07, "logps/chosen": -233.16036987304688, "logps/rejected": -368.6076965332031, "loss": 0.0026, "losses/dpo": 5.81108183794754e-09, "losses/sft": 0.6156381964683533, "losses/total": 5.81108183794754e-09, "ref_logps/chosen": -223.5404510498047, "ref_logps/rejected": -241.06153869628906, "rewards/accuracies": 1.0, "rewards/chosen": -0.9619916081428528, "rewards/margins": 11.792623519897461, "rewards/rejected": -12.754615783691406, "step": 2008 }, { "epoch": 0.48, "learning_rate": 1.1509333333333333e-07, "logps/chosen": -224.43115234375, "logps/rejected": -312.814208984375, "loss": 0.0012, "losses/dpo": 0.0006771465996280313, "losses/sft": 0.6178885698318481, "losses/total": 0.0006771465996280313, "ref_logps/chosen": -213.16513061523438, "ref_logps/rejected": -192.6103515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1266026496887207, "rewards/margins": 10.893783569335938, "rewards/rejected": -12.0203857421875, "step": 2009 }, { "epoch": 0.48, "learning_rate": 1.1504e-07, "logps/chosen": -235.3091278076172, "logps/rejected": -367.6214599609375, "loss": 0.0055, "losses/dpo": 1.0930764346994692e-06, "losses/sft": 0.8167702555656433, "losses/total": 1.0930764346994692e-06, "ref_logps/chosen": -221.8123016357422, "ref_logps/rejected": -233.611083984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.349682331085205, "rewards/margins": 12.051352500915527, "rewards/rejected": -13.40103530883789, "step": 2010 }, { "epoch": 0.48, "learning_rate": 1.1498666666666665e-07, "logps/chosen": -226.38037109375, "logps/rejected": -339.946044921875, "loss": 0.0123, "losses/dpo": 4.107756979010446e-07, "losses/sft": 0.7725157141685486, "losses/total": 4.107756979010446e-07, "ref_logps/chosen": -215.63864135742188, "ref_logps/rejected": -220.07012939453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0741757154464722, "rewards/margins": 10.913414001464844, "rewards/rejected": -11.987590789794922, "step": 2011 }, { "epoch": 0.48, "learning_rate": 1.1493333333333333e-07, "logps/chosen": -226.5063934326172, "logps/rejected": -369.90692138671875, "loss": 0.0005, "losses/dpo": 1.179000832962629e-06, "losses/sft": 0.5979008674621582, "losses/total": 1.179000832962629e-06, "ref_logps/chosen": -214.98062133789062, "ref_logps/rejected": -221.87060546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1525760889053345, "rewards/margins": 13.651054382324219, "rewards/rejected": -14.803630828857422, "step": 2012 }, { "epoch": 0.48, "learning_rate": 1.1488e-07, "logps/chosen": -253.06031799316406, "logps/rejected": -345.4504089355469, "loss": 0.0009, "losses/dpo": 8.489915437337459e-09, "losses/sft": 0.6258065700531006, "losses/total": 8.489915437337459e-09, "ref_logps/chosen": -243.5778350830078, "ref_logps/rejected": -216.18858337402344, "rewards/accuracies": 1.0, "rewards/chosen": -0.9482487440109253, "rewards/margins": 11.977935791015625, "rewards/rejected": -12.926183700561523, "step": 2013 }, { "epoch": 0.48, "learning_rate": 1.1482666666666667e-07, "logps/chosen": -227.416259765625, "logps/rejected": -360.5399475097656, "loss": 0.0027, "losses/dpo": 1.1968720459165638e-09, "losses/sft": 0.6497198939323425, "losses/total": 1.1968720459165638e-09, "ref_logps/chosen": -214.9120330810547, "ref_logps/rejected": -215.70794677734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2504221200942993, "rewards/margins": 13.232778549194336, "rewards/rejected": -14.483200073242188, "step": 2014 }, { "epoch": 0.48, "learning_rate": 1.1477333333333332e-07, "logps/chosen": -227.36624145507812, "logps/rejected": -378.91259765625, "loss": 0.0174, "losses/dpo": 9.624321251067158e-08, "losses/sft": 0.7741101384162903, "losses/total": 9.624321251067158e-08, "ref_logps/chosen": -215.8039093017578, "ref_logps/rejected": -242.417724609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1562355756759644, "rewards/margins": 12.493250846862793, "rewards/rejected": -13.649486541748047, "step": 2015 }, { "epoch": 0.48, "learning_rate": 1.1472e-07, "logps/chosen": -236.8392333984375, "logps/rejected": -327.156982421875, "loss": 0.0018, "losses/dpo": 7.373631660811952e-07, "losses/sft": 0.8426986336708069, "losses/total": 7.373631660811952e-07, "ref_logps/chosen": -227.38632202148438, "ref_logps/rejected": -204.63772583007812, "rewards/accuracies": 1.0, "rewards/chosen": -0.9452921152114868, "rewards/margins": 11.306636810302734, "rewards/rejected": -12.251928329467773, "step": 2016 }, { "epoch": 0.48, "learning_rate": 1.1466666666666666e-07, "logps/chosen": -244.0333709716797, "logps/rejected": -323.521240234375, "loss": 0.016, "losses/dpo": 4.231019534728375e-09, "losses/sft": 0.5497071743011475, "losses/total": 4.231019534728375e-09, "ref_logps/chosen": -234.01023864746094, "ref_logps/rejected": -208.068603515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0023136138916016, "rewards/margins": 10.542948722839355, "rewards/rejected": -11.545262336730957, "step": 2017 }, { "epoch": 0.48, "learning_rate": 1.1461333333333331e-07, "logps/chosen": -209.40505981445312, "logps/rejected": -354.0510559082031, "loss": 0.0154, "losses/dpo": 2.713552760269522e-07, "losses/sft": 0.5011381506919861, "losses/total": 2.713552760269522e-07, "ref_logps/chosen": -203.51451110839844, "ref_logps/rejected": -225.01344299316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.5890536308288574, "rewards/margins": 12.314708709716797, "rewards/rejected": -12.903762817382812, "step": 2018 }, { "epoch": 0.48, "learning_rate": 1.1455999999999999e-07, "logps/chosen": -270.363037109375, "logps/rejected": -384.68975830078125, "loss": 0.0093, "losses/dpo": 1.3166726603230927e-05, "losses/sft": 0.5184860825538635, "losses/total": 1.3166726603230927e-05, "ref_logps/chosen": -258.0472717285156, "ref_logps/rejected": -247.9525909423828, "rewards/accuracies": 1.0, "rewards/chosen": -1.2315747737884521, "rewards/margins": 12.442142486572266, "rewards/rejected": -13.673717498779297, "step": 2019 }, { "epoch": 0.48, "learning_rate": 1.1450666666666666e-07, "logps/chosen": -252.24163818359375, "logps/rejected": -357.84527587890625, "loss": 0.0003, "losses/dpo": 1.3639046301250346e-05, "losses/sft": 0.7159526348114014, "losses/total": 1.3639046301250346e-05, "ref_logps/chosen": -241.92910766601562, "ref_logps/rejected": -227.09803771972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.0312559604644775, "rewards/margins": 12.043468475341797, "rewards/rejected": -13.074724197387695, "step": 2020 }, { "epoch": 0.49, "learning_rate": 1.1445333333333333e-07, "logps/chosen": -229.1363983154297, "logps/rejected": -342.9774169921875, "loss": 0.0112, "losses/dpo": 2.4147991553036263e-06, "losses/sft": 0.45190390944480896, "losses/total": 2.4147991553036263e-06, "ref_logps/chosen": -219.51296997070312, "ref_logps/rejected": -217.5749969482422, "rewards/accuracies": 1.0, "rewards/chosen": -0.9623417258262634, "rewards/margins": 11.577898025512695, "rewards/rejected": -12.540239334106445, "step": 2021 }, { "epoch": 0.49, "learning_rate": 1.1439999999999998e-07, "logps/chosen": -267.314697265625, "logps/rejected": -392.6621398925781, "loss": 0.0009, "losses/dpo": 3.0425717056914436e-08, "losses/sft": 0.60237717628479, "losses/total": 3.0425717056914436e-08, "ref_logps/chosen": -254.3263702392578, "ref_logps/rejected": -247.24417114257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.2988338470458984, "rewards/margins": 13.242962837219238, "rewards/rejected": -14.541796684265137, "step": 2022 }, { "epoch": 0.49, "learning_rate": 1.1434666666666666e-07, "logps/chosen": -249.9442596435547, "logps/rejected": -369.8154296875, "loss": 0.0207, "losses/dpo": 6.070592917239992e-07, "losses/sft": 0.64066481590271, "losses/total": 6.070592917239992e-07, "ref_logps/chosen": -239.29425048828125, "ref_logps/rejected": -227.40625, "rewards/accuracies": 1.0, "rewards/chosen": -1.065000295639038, "rewards/margins": 13.175919532775879, "rewards/rejected": -14.24091911315918, "step": 2023 }, { "epoch": 0.49, "learning_rate": 1.1429333333333333e-07, "logps/chosen": -203.429443359375, "logps/rejected": -334.80328369140625, "loss": 0.0053, "losses/dpo": 1.6885164768609684e-06, "losses/sft": 0.4636613726615906, "losses/total": 1.6885164768609684e-06, "ref_logps/chosen": -193.90835571289062, "ref_logps/rejected": -214.2938690185547, "rewards/accuracies": 1.0, "rewards/chosen": -0.952106773853302, "rewards/margins": 11.098832130432129, "rewards/rejected": -12.050939559936523, "step": 2024 }, { "epoch": 0.49, "learning_rate": 1.1424000000000001e-07, "logps/chosen": -205.28829956054688, "logps/rejected": -356.1986083984375, "loss": 0.0015, "losses/dpo": 6.451390532902224e-08, "losses/sft": 0.49217942357063293, "losses/total": 6.451390532902224e-08, "ref_logps/chosen": -198.40090942382812, "ref_logps/rejected": -225.4044189453125, "rewards/accuracies": 1.0, "rewards/chosen": -0.6887401938438416, "rewards/margins": 12.390680313110352, "rewards/rejected": -13.07942008972168, "step": 2025 }, { "epoch": 0.49, "learning_rate": 1.1418666666666666e-07, "logps/chosen": -248.2218780517578, "logps/rejected": -351.4197998046875, "loss": 0.0007, "losses/dpo": 4.49154686066322e-05, "losses/sft": 0.9954765439033508, "losses/total": 4.49154686066322e-05, "ref_logps/chosen": -238.78680419921875, "ref_logps/rejected": -226.5031280517578, "rewards/accuracies": 1.0, "rewards/chosen": -0.943510115146637, "rewards/margins": 11.548160552978516, "rewards/rejected": -12.491670608520508, "step": 2026 }, { "epoch": 0.49, "learning_rate": 1.1413333333333332e-07, "logps/chosen": -229.7841339111328, "logps/rejected": -392.0687561035156, "loss": 0.003, "losses/dpo": 4.5435953666128626e-07, "losses/sft": 0.5053315162658691, "losses/total": 4.5435953666128626e-07, "ref_logps/chosen": -220.86593627929688, "ref_logps/rejected": -239.40057373046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.8918200731277466, "rewards/margins": 14.374996185302734, "rewards/rejected": -15.266815185546875, "step": 2027 }, { "epoch": 0.49, "learning_rate": 1.1408e-07, "logps/chosen": -289.8407897949219, "logps/rejected": -371.5794677734375, "loss": 0.0016, "losses/dpo": 5.956135282758623e-07, "losses/sft": 1.4136333465576172, "losses/total": 5.956135282758623e-07, "ref_logps/chosen": -277.94036865234375, "ref_logps/rejected": -237.7080841064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.1900440454483032, "rewards/margins": 12.197093963623047, "rewards/rejected": -13.387138366699219, "step": 2028 }, { "epoch": 0.49, "learning_rate": 1.1402666666666668e-07, "logps/chosen": -212.61099243164062, "logps/rejected": -348.7510986328125, "loss": 0.0019, "losses/dpo": 3.935473458227534e-08, "losses/sft": 0.5768654942512512, "losses/total": 3.935473458227534e-08, "ref_logps/chosen": -204.6175079345703, "ref_logps/rejected": -218.662353515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7993495464324951, "rewards/margins": 12.209521293640137, "rewards/rejected": -13.008871078491211, "step": 2029 }, { "epoch": 0.49, "learning_rate": 1.1397333333333333e-07, "logps/chosen": -230.65179443359375, "logps/rejected": -395.9365539550781, "loss": 0.0018, "losses/dpo": 6.517482688650489e-06, "losses/sft": 0.577485203742981, "losses/total": 6.517482688650489e-06, "ref_logps/chosen": -219.5103759765625, "ref_logps/rejected": -256.75360107421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1141430139541626, "rewards/margins": 12.804153442382812, "rewards/rejected": -13.918296813964844, "step": 2030 }, { "epoch": 0.49, "learning_rate": 1.1391999999999999e-07, "logps/chosen": -243.089111328125, "logps/rejected": -347.85003662109375, "loss": 0.0102, "losses/dpo": 7.4522897193674e-05, "losses/sft": 0.5823318958282471, "losses/total": 7.4522897193674e-05, "ref_logps/chosen": -232.28659057617188, "ref_logps/rejected": -225.5199737548828, "rewards/accuracies": 1.0, "rewards/chosen": -1.0802528858184814, "rewards/margins": 11.152754783630371, "rewards/rejected": -12.233007431030273, "step": 2031 }, { "epoch": 0.49, "learning_rate": 1.1386666666666667e-07, "logps/chosen": -188.14520263671875, "logps/rejected": -310.314208984375, "loss": 0.0066, "losses/dpo": 5.319046977092512e-06, "losses/sft": 0.6259080767631531, "losses/total": 5.319046977092512e-06, "ref_logps/chosen": -177.99758911132812, "ref_logps/rejected": -190.14186096191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.0147606134414673, "rewards/margins": 11.002473831176758, "rewards/rejected": -12.017233848571777, "step": 2032 }, { "epoch": 0.49, "learning_rate": 1.1381333333333334e-07, "logps/chosen": -216.85922241210938, "logps/rejected": -350.935791015625, "loss": 0.007, "losses/dpo": 9.2587024482782e-06, "losses/sft": 0.8208397030830383, "losses/total": 9.2587024482782e-06, "ref_logps/chosen": -205.4189453125, "ref_logps/rejected": -221.25991821289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1440279483795166, "rewards/margins": 11.82356071472168, "rewards/rejected": -12.96759033203125, "step": 2033 }, { "epoch": 0.49, "learning_rate": 1.1376e-07, "logps/chosen": -248.61219787597656, "logps/rejected": -358.0152587890625, "loss": 0.0025, "losses/dpo": 2.2046093306471448e-07, "losses/sft": 1.2402833700180054, "losses/total": 2.2046093306471448e-07, "ref_logps/chosen": -240.08123779296875, "ref_logps/rejected": -227.6080780029297, "rewards/accuracies": 1.0, "rewards/chosen": -0.8530964851379395, "rewards/margins": 12.187623023986816, "rewards/rejected": -13.040719985961914, "step": 2034 }, { "epoch": 0.49, "learning_rate": 1.1370666666666666e-07, "logps/chosen": -228.15170288085938, "logps/rejected": -362.69537353515625, "loss": 0.0041, "losses/dpo": 2.0044419457576623e-08, "losses/sft": 0.5367599725723267, "losses/total": 2.0044419457576623e-08, "ref_logps/chosen": -216.6943817138672, "ref_logps/rejected": -224.6429443359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.145730972290039, "rewards/margins": 12.659509658813477, "rewards/rejected": -13.805241584777832, "step": 2035 }, { "epoch": 0.49, "learning_rate": 1.1365333333333333e-07, "logps/chosen": -217.47784423828125, "logps/rejected": -363.60382080078125, "loss": 0.0003, "losses/dpo": 2.675052020173485e-11, "losses/sft": 0.6421078443527222, "losses/total": 2.675052020173485e-11, "ref_logps/chosen": -207.02745056152344, "ref_logps/rejected": -215.14508056640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0450395345687866, "rewards/margins": 13.800836563110352, "rewards/rejected": -14.845874786376953, "step": 2036 }, { "epoch": 0.49, "learning_rate": 1.1359999999999998e-07, "logps/chosen": -193.51461791992188, "logps/rejected": -338.08819580078125, "loss": 0.0036, "losses/dpo": 1.3587218461452721e-07, "losses/sft": 0.6463648080825806, "losses/total": 1.3587218461452721e-07, "ref_logps/chosen": -185.17477416992188, "ref_logps/rejected": -209.8997802734375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8339850902557373, "rewards/margins": 11.984857559204102, "rewards/rejected": -12.818843841552734, "step": 2037 }, { "epoch": 0.49, "learning_rate": 1.1354666666666666e-07, "logps/chosen": -204.18893432617188, "logps/rejected": -381.9055480957031, "loss": 0.0024, "losses/dpo": 1.650142777975816e-08, "losses/sft": 0.4891948103904724, "losses/total": 1.650142777975816e-08, "ref_logps/chosen": -194.31607055664062, "ref_logps/rejected": -240.12632751464844, "rewards/accuracies": 1.0, "rewards/chosen": -0.9872871041297913, "rewards/margins": 13.19063663482666, "rewards/rejected": -14.177923202514648, "step": 2038 }, { "epoch": 0.49, "learning_rate": 1.1349333333333333e-07, "logps/chosen": -232.46102905273438, "logps/rejected": -364.21002197265625, "loss": 0.0165, "losses/dpo": 1.2877825916746133e-08, "losses/sft": 0.739315927028656, "losses/total": 1.2877825916746133e-08, "ref_logps/chosen": -219.22723388671875, "ref_logps/rejected": -226.85092163085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3233805894851685, "rewards/margins": 12.412532806396484, "rewards/rejected": -13.735912322998047, "step": 2039 }, { "epoch": 0.49, "learning_rate": 1.1344e-07, "logps/chosen": -244.77130126953125, "logps/rejected": -359.921142578125, "loss": 0.0034, "losses/dpo": 5.124604740558425e-06, "losses/sft": 0.6928724050521851, "losses/total": 5.124604740558425e-06, "ref_logps/chosen": -233.5909423828125, "ref_logps/rejected": -224.92422485351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1180391311645508, "rewards/margins": 12.38165283203125, "rewards/rejected": -13.499692916870117, "step": 2040 }, { "epoch": 0.49, "learning_rate": 1.1338666666666665e-07, "logps/chosen": -204.5904998779297, "logps/rejected": -348.626220703125, "loss": 0.0027, "losses/dpo": 1.1609778693966177e-09, "losses/sft": 0.46263211965560913, "losses/total": 1.1609778693966177e-09, "ref_logps/chosen": -192.28253173828125, "ref_logps/rejected": -216.45590209960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.230797290802002, "rewards/margins": 11.986233711242676, "rewards/rejected": -13.21703052520752, "step": 2041 }, { "epoch": 0.49, "learning_rate": 1.1333333333333333e-07, "logps/chosen": -233.10562133789062, "logps/rejected": -349.3951416015625, "loss": 0.001, "losses/dpo": 1.2949216698920907e-10, "losses/sft": 0.6999226808547974, "losses/total": 1.2949216698920907e-10, "ref_logps/chosen": -223.75424194335938, "ref_logps/rejected": -217.970703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9351367354393005, "rewards/margins": 12.207308769226074, "rewards/rejected": -13.14244556427002, "step": 2042 }, { "epoch": 0.49, "learning_rate": 1.1327999999999999e-07, "logps/chosen": -233.35284423828125, "logps/rejected": -345.9075927734375, "loss": 0.002, "losses/dpo": 6.739268201272353e-07, "losses/sft": 0.8533174395561218, "losses/total": 6.739268201272353e-07, "ref_logps/chosen": -225.74322509765625, "ref_logps/rejected": -221.80621337890625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7609612941741943, "rewards/margins": 11.649174690246582, "rewards/rejected": -12.410135269165039, "step": 2043 }, { "epoch": 0.49, "learning_rate": 1.1322666666666667e-07, "logps/chosen": -221.45281982421875, "logps/rejected": -338.7384338378906, "loss": 0.0002, "losses/dpo": 5.124960580360494e-07, "losses/sft": 0.6268787980079651, "losses/total": 5.124960580360494e-07, "ref_logps/chosen": -214.0204315185547, "ref_logps/rejected": -209.70809936523438, "rewards/accuracies": 1.0, "rewards/chosen": -0.7432385683059692, "rewards/margins": 12.159797668457031, "rewards/rejected": -12.903035163879395, "step": 2044 }, { "epoch": 0.49, "learning_rate": 1.1317333333333332e-07, "logps/chosen": -231.98159790039062, "logps/rejected": -326.95904541015625, "loss": 0.0017, "losses/dpo": 3.7352938520029966e-09, "losses/sft": 0.9898804426193237, "losses/total": 3.7352938520029966e-09, "ref_logps/chosen": -224.97036743164062, "ref_logps/rejected": -211.2667236328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.7011208534240723, "rewards/margins": 10.86811351776123, "rewards/rejected": -11.569234848022461, "step": 2045 }, { "epoch": 0.49, "learning_rate": 1.1312e-07, "logps/chosen": -256.79058837890625, "logps/rejected": -344.150390625, "loss": 0.0005, "losses/dpo": 1.598762196408643e-06, "losses/sft": 0.648891031742096, "losses/total": 1.598762196408643e-06, "ref_logps/chosen": -246.04183959960938, "ref_logps/rejected": -217.06170654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0748741626739502, "rewards/margins": 11.63399600982666, "rewards/rejected": -12.708869934082031, "step": 2046 }, { "epoch": 0.49, "learning_rate": 1.1306666666666666e-07, "logps/chosen": -199.23031616210938, "logps/rejected": -326.7972412109375, "loss": 0.0027, "losses/dpo": 0.0011135695967823267, "losses/sft": 0.4126698970794678, "losses/total": 0.0011135695967823267, "ref_logps/chosen": -191.71116638183594, "ref_logps/rejected": -212.74673461914062, "rewards/accuracies": 1.0, "rewards/chosen": -0.751915693283081, "rewards/margins": 10.6531343460083, "rewards/rejected": -11.405050277709961, "step": 2047 }, { "epoch": 0.49, "learning_rate": 1.1301333333333334e-07, "logps/chosen": -287.4508056640625, "logps/rejected": -371.4892272949219, "loss": 0.0009, "losses/dpo": 0.001885637640953064, "losses/sft": 0.7267425060272217, "losses/total": 0.001885637640953064, "ref_logps/chosen": -275.8563537597656, "ref_logps/rejected": -246.9751739501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.1594436168670654, "rewards/margins": 11.291963577270508, "rewards/rejected": -12.45140552520752, "step": 2048 }, { "epoch": 0.49, "learning_rate": 1.1295999999999999e-07, "logps/chosen": -250.69924926757812, "logps/rejected": -397.88677978515625, "loss": 0.0031, "losses/dpo": 2.7947396574745653e-06, "losses/sft": 0.38970428705215454, "losses/total": 2.7947396574745653e-06, "ref_logps/chosen": -241.62388610839844, "ref_logps/rejected": -249.16058349609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9075349569320679, "rewards/margins": 13.965082168579102, "rewards/rejected": -14.872617721557617, "step": 2049 }, { "epoch": 0.49, "learning_rate": 1.1290666666666666e-07, "logps/chosen": -266.17742919921875, "logps/rejected": -389.5911865234375, "loss": 0.0023, "losses/dpo": 6.893125004125977e-08, "losses/sft": 0.622403621673584, "losses/total": 6.893125004125977e-08, "ref_logps/chosen": -252.08621215820312, "ref_logps/rejected": -242.20408630371094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4091228246688843, "rewards/margins": 13.32958984375, "rewards/rejected": -14.738712310791016, "step": 2050 }, { "epoch": 0.49, "learning_rate": 1.1285333333333333e-07, "logps/chosen": -234.31492614746094, "logps/rejected": -360.6156005859375, "loss": 0.0066, "losses/dpo": 9.981250201351877e-12, "losses/sft": 0.7360346913337708, "losses/total": 9.981250201351877e-12, "ref_logps/chosen": -224.3890380859375, "ref_logps/rejected": -229.60740661621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.9925867915153503, "rewards/margins": 12.108233451843262, "rewards/rejected": -13.100820541381836, "step": 2051 }, { "epoch": 0.49, "learning_rate": 1.1279999999999998e-07, "logps/chosen": -288.39215087890625, "logps/rejected": -378.10418701171875, "loss": 0.0022, "losses/dpo": 1.0707792985620301e-10, "losses/sft": 0.7451257705688477, "losses/total": 1.0707792985620301e-10, "ref_logps/chosen": -279.38079833984375, "ref_logps/rejected": -236.330078125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9011379480361938, "rewards/margins": 13.276272773742676, "rewards/rejected": -14.177411079406738, "step": 2052 }, { "epoch": 0.49, "learning_rate": 1.1274666666666665e-07, "logps/chosen": -213.30267333984375, "logps/rejected": -320.31451416015625, "loss": 0.0095, "losses/dpo": 6.07737365498906e-06, "losses/sft": 0.7551980018615723, "losses/total": 6.07737365498906e-06, "ref_logps/chosen": -204.24169921875, "ref_logps/rejected": -204.06443786621094, "rewards/accuracies": 1.0, "rewards/chosen": -0.9060983061790466, "rewards/margins": 10.718910217285156, "rewards/rejected": -11.625007629394531, "step": 2053 }, { "epoch": 0.49, "learning_rate": 1.1269333333333333e-07, "logps/chosen": -247.16268920898438, "logps/rejected": -356.15618896484375, "loss": 0.0032, "losses/dpo": 1.4317771412919456e-09, "losses/sft": 0.7189550399780273, "losses/total": 1.4317771412919456e-09, "ref_logps/chosen": -232.01150512695312, "ref_logps/rejected": -222.25506591796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5151180028915405, "rewards/margins": 11.874992370605469, "rewards/rejected": -13.39011001586914, "step": 2054 }, { "epoch": 0.49, "learning_rate": 1.1264000000000001e-07, "logps/chosen": -258.1912841796875, "logps/rejected": -365.3824157714844, "loss": 0.002, "losses/dpo": 1.0792480225063628e-06, "losses/sft": 0.5797995924949646, "losses/total": 1.0792480225063628e-06, "ref_logps/chosen": -245.86029052734375, "ref_logps/rejected": -236.87037658691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2330985069274902, "rewards/margins": 11.618104934692383, "rewards/rejected": -12.851203918457031, "step": 2055 }, { "epoch": 0.49, "learning_rate": 1.1258666666666666e-07, "logps/chosen": -204.49612426757812, "logps/rejected": -314.4345703125, "loss": 0.0042, "losses/dpo": 1.4045699572307058e-05, "losses/sft": 0.5462822914123535, "losses/total": 1.4045699572307058e-05, "ref_logps/chosen": -196.2516632080078, "ref_logps/rejected": -189.82286071777344, "rewards/accuracies": 1.0, "rewards/chosen": -0.8244476914405823, "rewards/margins": 11.636722564697266, "rewards/rejected": -12.461170196533203, "step": 2056 }, { "epoch": 0.49, "learning_rate": 1.1253333333333332e-07, "logps/chosen": -254.771728515625, "logps/rejected": -341.8126525878906, "loss": 0.0026, "losses/dpo": 1.6014600987546146e-05, "losses/sft": 0.5351620316505432, "losses/total": 1.6014600987546146e-05, "ref_logps/chosen": -241.99949645996094, "ref_logps/rejected": -221.59640502929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2772226333618164, "rewards/margins": 10.744403839111328, "rewards/rejected": -12.021627426147461, "step": 2057 }, { "epoch": 0.49, "learning_rate": 1.1248e-07, "logps/chosen": -250.221435546875, "logps/rejected": -366.1605224609375, "loss": 0.0007, "losses/dpo": 3.471916798503116e-08, "losses/sft": 0.6667816638946533, "losses/total": 3.471916798503116e-08, "ref_logps/chosen": -237.0811004638672, "ref_logps/rejected": -234.16297912597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.314033031463623, "rewards/margins": 11.885721206665039, "rewards/rejected": -13.19975471496582, "step": 2058 }, { "epoch": 0.49, "learning_rate": 1.1242666666666668e-07, "logps/chosen": -188.05328369140625, "logps/rejected": -309.58123779296875, "loss": 0.0018, "losses/dpo": 9.494901576090342e-08, "losses/sft": 0.6585124731063843, "losses/total": 9.494901576090342e-08, "ref_logps/chosen": -180.9917449951172, "ref_logps/rejected": -185.54803466796875, "rewards/accuracies": 1.0, "rewards/chosen": -0.7061529159545898, "rewards/margins": 11.697164535522461, "rewards/rejected": -12.403318405151367, "step": 2059 }, { "epoch": 0.49, "learning_rate": 1.1237333333333333e-07, "logps/chosen": -224.5369110107422, "logps/rejected": -363.67230224609375, "loss": 0.0002, "losses/dpo": 9.033370815814123e-07, "losses/sft": 0.9228923916816711, "losses/total": 9.033370815814123e-07, "ref_logps/chosen": -215.89715576171875, "ref_logps/rejected": -224.677978515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8639771938323975, "rewards/margins": 13.035455703735352, "rewards/rejected": -13.899434089660645, "step": 2060 }, { "epoch": 0.49, "learning_rate": 1.1231999999999999e-07, "logps/chosen": -245.357177734375, "logps/rejected": -367.69268798828125, "loss": 0.0007, "losses/dpo": 2.4601240511401556e-05, "losses/sft": 0.6687793135643005, "losses/total": 2.4601240511401556e-05, "ref_logps/chosen": -234.83197021484375, "ref_logps/rejected": -232.71939086914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0525211095809937, "rewards/margins": 12.444808959960938, "rewards/rejected": -13.497329711914062, "step": 2061 }, { "epoch": 0.49, "learning_rate": 1.1226666666666667e-07, "logps/chosen": -231.86737060546875, "logps/rejected": -372.7794494628906, "loss": 0.0048, "losses/dpo": 2.0369705922007597e-08, "losses/sft": 0.7840253710746765, "losses/total": 2.0369705922007597e-08, "ref_logps/chosen": -220.46560668945312, "ref_logps/rejected": -239.41555786132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.1401777267456055, "rewards/margins": 12.196210861206055, "rewards/rejected": -13.33638858795166, "step": 2062 }, { "epoch": 0.5, "learning_rate": 1.1221333333333334e-07, "logps/chosen": -259.4906005859375, "logps/rejected": -344.5231018066406, "loss": 0.005, "losses/dpo": 7.232122101186178e-08, "losses/sft": 0.8846602439880371, "losses/total": 7.232122101186178e-08, "ref_logps/chosen": -249.9154815673828, "ref_logps/rejected": -216.06536865234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9575124382972717, "rewards/margins": 11.888261795043945, "rewards/rejected": -12.845773696899414, "step": 2063 }, { "epoch": 0.5, "learning_rate": 1.1215999999999999e-07, "logps/chosen": -221.8822021484375, "logps/rejected": -312.8011779785156, "loss": 0.026, "losses/dpo": 0.001410387922078371, "losses/sft": 0.49840211868286133, "losses/total": 0.001410387922078371, "ref_logps/chosen": -212.68624877929688, "ref_logps/rejected": -192.74688720703125, "rewards/accuracies": 1.0, "rewards/chosen": -0.919596791267395, "rewards/margins": 11.085829734802246, "rewards/rejected": -12.005427360534668, "step": 2064 }, { "epoch": 0.5, "learning_rate": 1.1210666666666666e-07, "logps/chosen": -219.69769287109375, "logps/rejected": -324.0005798339844, "loss": 0.0109, "losses/dpo": 4.847922537010163e-05, "losses/sft": 0.67073655128479, "losses/total": 4.847922537010163e-05, "ref_logps/chosen": -210.88307189941406, "ref_logps/rejected": -204.51974487304688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8814613223075867, "rewards/margins": 11.066619873046875, "rewards/rejected": -11.948081016540527, "step": 2065 }, { "epoch": 0.5, "learning_rate": 1.1205333333333333e-07, "logps/chosen": -214.62716674804688, "logps/rejected": -345.78436279296875, "loss": 0.0007, "losses/dpo": 1.6992428300000029e-06, "losses/sft": 0.45441803336143494, "losses/total": 1.6992428300000029e-06, "ref_logps/chosen": -204.74844360351562, "ref_logps/rejected": -218.03244018554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.9878745675086975, "rewards/margins": 11.787317276000977, "rewards/rejected": -12.775192260742188, "step": 2066 }, { "epoch": 0.5, "learning_rate": 1.1200000000000001e-07, "logps/chosen": -198.02503967285156, "logps/rejected": -367.458251953125, "loss": 0.0112, "losses/dpo": 0.00019551963487174362, "losses/sft": 0.6449425220489502, "losses/total": 0.00019551963487174362, "ref_logps/chosen": -188.6378173828125, "ref_logps/rejected": -227.39694213867188, "rewards/accuracies": 1.0, "rewards/chosen": -0.9387208223342896, "rewards/margins": 13.067408561706543, "rewards/rejected": -14.006128311157227, "step": 2067 }, { "epoch": 0.5, "learning_rate": 1.1194666666666666e-07, "logps/chosen": -202.89122009277344, "logps/rejected": -346.568603515625, "loss": 0.0045, "losses/dpo": 2.091417263727635e-05, "losses/sft": 0.5155127644538879, "losses/total": 2.091417263727635e-05, "ref_logps/chosen": -193.34451293945312, "ref_logps/rejected": -218.68923950195312, "rewards/accuracies": 1.0, "rewards/chosen": -0.9546700716018677, "rewards/margins": 11.833267211914062, "rewards/rejected": -12.787938117980957, "step": 2068 }, { "epoch": 0.5, "learning_rate": 1.1189333333333332e-07, "logps/chosen": -268.096923828125, "logps/rejected": -371.508544921875, "loss": 0.0011, "losses/dpo": 5.851879425478046e-09, "losses/sft": 0.769932746887207, "losses/total": 5.851879425478046e-09, "ref_logps/chosen": -256.11083984375, "ref_logps/rejected": -232.9085693359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1986079216003418, "rewards/margins": 12.661392211914062, "rewards/rejected": -13.85999870300293, "step": 2069 }, { "epoch": 0.5, "learning_rate": 1.1184e-07, "logps/chosen": -251.5079345703125, "logps/rejected": -314.36737060546875, "loss": 0.013, "losses/dpo": 2.769741911379242e-07, "losses/sft": 0.5626503825187683, "losses/total": 2.769741911379242e-07, "ref_logps/chosen": -240.21347045898438, "ref_logps/rejected": -197.2768096923828, "rewards/accuracies": 1.0, "rewards/chosen": -1.1294474601745605, "rewards/margins": 10.579610824584961, "rewards/rejected": -11.709057807922363, "step": 2070 }, { "epoch": 0.5, "learning_rate": 1.1178666666666665e-07, "logps/chosen": -235.0897216796875, "logps/rejected": -325.8359375, "loss": 0.0008, "losses/dpo": 1.365187586088723e-06, "losses/sft": 0.6542505025863647, "losses/total": 1.365187586088723e-06, "ref_logps/chosen": -225.21319580078125, "ref_logps/rejected": -200.696044921875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9876524806022644, "rewards/margins": 11.526335716247559, "rewards/rejected": -12.51398754119873, "step": 2071 }, { "epoch": 0.5, "learning_rate": 1.1173333333333333e-07, "logps/chosen": -244.16635131835938, "logps/rejected": -356.3099060058594, "loss": 0.0011, "losses/dpo": 4.998842673131776e-09, "losses/sft": 0.722436785697937, "losses/total": 4.998842673131776e-09, "ref_logps/chosen": -232.89389038085938, "ref_logps/rejected": -220.15383911132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.1272451877593994, "rewards/margins": 12.488359451293945, "rewards/rejected": -13.615604400634766, "step": 2072 }, { "epoch": 0.5, "learning_rate": 1.1167999999999999e-07, "logps/chosen": -207.55760192871094, "logps/rejected": -330.37713623046875, "loss": 0.0079, "losses/dpo": 5.19516277108778e-07, "losses/sft": 0.5536274313926697, "losses/total": 5.19516277108778e-07, "ref_logps/chosen": -199.50967407226562, "ref_logps/rejected": -203.89077758789062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8047930002212524, "rewards/margins": 11.843841552734375, "rewards/rejected": -12.648634910583496, "step": 2073 }, { "epoch": 0.5, "learning_rate": 1.1162666666666667e-07, "logps/chosen": -247.82180786132812, "logps/rejected": -343.9360046386719, "loss": 0.0053, "losses/dpo": 0.00014123201253823936, "losses/sft": 0.7398308515548706, "losses/total": 0.00014123201253823936, "ref_logps/chosen": -232.12451171875, "ref_logps/rejected": -206.98504638671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5697300434112549, "rewards/margins": 12.1253662109375, "rewards/rejected": -13.695096969604492, "step": 2074 }, { "epoch": 0.5, "learning_rate": 1.1157333333333332e-07, "logps/chosen": -240.19570922851562, "logps/rejected": -326.93701171875, "loss": 0.0009, "losses/dpo": 2.2866164783863496e-07, "losses/sft": 0.7390762567520142, "losses/total": 2.2866164783863496e-07, "ref_logps/chosen": -230.8216094970703, "ref_logps/rejected": -205.30587768554688, "rewards/accuracies": 1.0, "rewards/chosen": -0.9374107122421265, "rewards/margins": 11.225700378417969, "rewards/rejected": -12.163111686706543, "step": 2075 }, { "epoch": 0.5, "learning_rate": 1.1152e-07, "logps/chosen": -247.1044464111328, "logps/rejected": -338.54718017578125, "loss": 0.0055, "losses/dpo": 6.350300602205039e-13, "losses/sft": 0.43918031454086304, "losses/total": 6.350300602205039e-13, "ref_logps/chosen": -237.66604614257812, "ref_logps/rejected": -207.0749053955078, "rewards/accuracies": 1.0, "rewards/chosen": -0.9438410997390747, "rewards/margins": 12.203388214111328, "rewards/rejected": -13.14723014831543, "step": 2076 }, { "epoch": 0.5, "learning_rate": 1.1146666666666666e-07, "logps/chosen": -256.96240234375, "logps/rejected": -361.84185791015625, "loss": 0.0006, "losses/dpo": 8.269456088783045e-07, "losses/sft": 0.48284247517585754, "losses/total": 8.269456088783045e-07, "ref_logps/chosen": -244.2844696044922, "ref_logps/rejected": -219.43724060058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.267794132232666, "rewards/margins": 12.97266674041748, "rewards/rejected": -14.240460395812988, "step": 2077 }, { "epoch": 0.5, "learning_rate": 1.1141333333333334e-07, "logps/chosen": -282.7630615234375, "logps/rejected": -364.12298583984375, "loss": 0.0003, "losses/dpo": 5.739338121202309e-06, "losses/sft": 1.046386480331421, "losses/total": 5.739338121202309e-06, "ref_logps/chosen": -268.775146484375, "ref_logps/rejected": -227.99258422851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3987910747528076, "rewards/margins": 12.214249610900879, "rewards/rejected": -13.613040924072266, "step": 2078 }, { "epoch": 0.5, "learning_rate": 1.1135999999999999e-07, "logps/chosen": -211.52102661132812, "logps/rejected": -341.2198486328125, "loss": 0.0058, "losses/dpo": 3.9805272535886616e-06, "losses/sft": 0.6351823806762695, "losses/total": 3.9805272535886616e-06, "ref_logps/chosen": -204.71376037597656, "ref_logps/rejected": -210.09835815429688, "rewards/accuracies": 1.0, "rewards/chosen": -0.6807280778884888, "rewards/margins": 12.43142032623291, "rewards/rejected": -13.11214828491211, "step": 2079 }, { "epoch": 0.5, "learning_rate": 1.1130666666666666e-07, "logps/chosen": -239.35960388183594, "logps/rejected": -374.2118835449219, "loss": 0.0009, "losses/dpo": 1.683398309637596e-08, "losses/sft": 1.1323310136795044, "losses/total": 1.683398309637596e-08, "ref_logps/chosen": -229.54795837402344, "ref_logps/rejected": -234.25035095214844, "rewards/accuracies": 1.0, "rewards/chosen": -0.9811644554138184, "rewards/margins": 13.014989852905273, "rewards/rejected": -13.99615478515625, "step": 2080 }, { "epoch": 0.5, "learning_rate": 1.1125333333333333e-07, "logps/chosen": -248.92640686035156, "logps/rejected": -361.4105224609375, "loss": 0.0007, "losses/dpo": 7.984178318487523e-10, "losses/sft": 0.6356711983680725, "losses/total": 7.984178318487523e-10, "ref_logps/chosen": -235.6893310546875, "ref_logps/rejected": -222.5495147705078, "rewards/accuracies": 1.0, "rewards/chosen": -1.3237087726593018, "rewards/margins": 12.562393188476562, "rewards/rejected": -13.886102676391602, "step": 2081 }, { "epoch": 0.5, "learning_rate": 1.112e-07, "logps/chosen": -258.23211669921875, "logps/rejected": -360.22723388671875, "loss": 0.0014, "losses/dpo": 2.9122697924321983e-06, "losses/sft": 0.5879802703857422, "losses/total": 2.9122697924321983e-06, "ref_logps/chosen": -246.85128784179688, "ref_logps/rejected": -214.43545532226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1380810737609863, "rewards/margins": 13.44110107421875, "rewards/rejected": -14.579182624816895, "step": 2082 }, { "epoch": 0.5, "learning_rate": 1.1114666666666665e-07, "logps/chosen": -210.6206512451172, "logps/rejected": -325.6660461425781, "loss": 0.0118, "losses/dpo": 5.6589883890012516e-09, "losses/sft": 0.5794728994369507, "losses/total": 5.6589883890012516e-09, "ref_logps/chosen": -199.06895446777344, "ref_logps/rejected": -207.8944091796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.15516996383667, "rewards/margins": 10.621994018554688, "rewards/rejected": -11.777164459228516, "step": 2083 }, { "epoch": 0.5, "learning_rate": 1.1109333333333333e-07, "logps/chosen": -260.5669250488281, "logps/rejected": -360.58428955078125, "loss": 0.0072, "losses/dpo": 3.2643754366290523e-09, "losses/sft": 0.48542487621307373, "losses/total": 3.2643754366290523e-09, "ref_logps/chosen": -249.69058227539062, "ref_logps/rejected": -226.033203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0876336097717285, "rewards/margins": 12.367477416992188, "rewards/rejected": -13.455110549926758, "step": 2084 }, { "epoch": 0.5, "learning_rate": 1.1104e-07, "logps/chosen": -220.97708129882812, "logps/rejected": -362.20452880859375, "loss": 0.0045, "losses/dpo": 0.002733373548835516, "losses/sft": 0.5047588348388672, "losses/total": 0.002733373548835516, "ref_logps/chosen": -210.89285278320312, "ref_logps/rejected": -219.6212921142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.008420467376709, "rewards/margins": 13.249905586242676, "rewards/rejected": -14.258325576782227, "step": 2085 }, { "epoch": 0.5, "learning_rate": 1.1098666666666666e-07, "logps/chosen": -229.9752655029297, "logps/rejected": -391.86883544921875, "loss": 0.0026, "losses/dpo": 3.271730406595452e-07, "losses/sft": 0.7080875635147095, "losses/total": 3.271730406595452e-07, "ref_logps/chosen": -219.08010864257812, "ref_logps/rejected": -242.91236877441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.0895154476165771, "rewards/margins": 13.806130409240723, "rewards/rejected": -14.895645141601562, "step": 2086 }, { "epoch": 0.5, "learning_rate": 1.1093333333333332e-07, "logps/chosen": -217.36245727539062, "logps/rejected": -337.94268798828125, "loss": 0.0007, "losses/dpo": 2.1379017312028736e-07, "losses/sft": 0.9916567802429199, "losses/total": 2.1379017312028736e-07, "ref_logps/chosen": -209.34036254882812, "ref_logps/rejected": -211.6041717529297, "rewards/accuracies": 1.0, "rewards/chosen": -0.8022107481956482, "rewards/margins": 11.831642150878906, "rewards/rejected": -12.633853912353516, "step": 2087 }, { "epoch": 0.5, "learning_rate": 1.1088e-07, "logps/chosen": -246.80001831054688, "logps/rejected": -340.9638671875, "loss": 0.0043, "losses/dpo": 0.00029987990274094045, "losses/sft": 0.7373225092887878, "losses/total": 0.00029987990274094045, "ref_logps/chosen": -233.22760009765625, "ref_logps/rejected": -209.1811981201172, "rewards/accuracies": 1.0, "rewards/chosen": -1.3572416305541992, "rewards/margins": 11.821026802062988, "rewards/rejected": -13.178268432617188, "step": 2088 }, { "epoch": 0.5, "learning_rate": 1.1082666666666667e-07, "logps/chosen": -187.9193878173828, "logps/rejected": -342.46734619140625, "loss": 0.0008, "losses/dpo": 7.93797880760394e-06, "losses/sft": 0.6384949088096619, "losses/total": 7.93797880760394e-06, "ref_logps/chosen": -179.12843322753906, "ref_logps/rejected": -211.46131896972656, "rewards/accuracies": 1.0, "rewards/chosen": -0.8790954947471619, "rewards/margins": 12.221510887145996, "rewards/rejected": -13.100605964660645, "step": 2089 }, { "epoch": 0.5, "learning_rate": 1.1077333333333332e-07, "logps/chosen": -218.08172607421875, "logps/rejected": -333.56640625, "loss": 0.0045, "losses/dpo": 0.006800027098506689, "losses/sft": 0.8179154992103577, "losses/total": 0.006800027098506689, "ref_logps/chosen": -207.52590942382812, "ref_logps/rejected": -216.92315673828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.055580735206604, "rewards/margins": 10.608743667602539, "rewards/rejected": -11.664323806762695, "step": 2090 }, { "epoch": 0.5, "learning_rate": 1.1071999999999999e-07, "logps/chosen": -300.90625, "logps/rejected": -404.28997802734375, "loss": 0.0026, "losses/dpo": 3.3568130497485527e-09, "losses/sft": 0.5632089376449585, "losses/total": 3.3568130497485527e-09, "ref_logps/chosen": -283.4631652832031, "ref_logps/rejected": -242.86802673339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.744309663772583, "rewards/margins": 14.397883415222168, "rewards/rejected": -16.142192840576172, "step": 2091 }, { "epoch": 0.5, "learning_rate": 1.1066666666666667e-07, "logps/chosen": -243.297119140625, "logps/rejected": -346.9532470703125, "loss": 0.002, "losses/dpo": 1.0902311942118104e-07, "losses/sft": 0.5959112644195557, "losses/total": 1.0902311942118104e-07, "ref_logps/chosen": -234.7961883544922, "ref_logps/rejected": -219.54782104492188, "rewards/accuracies": 1.0, "rewards/chosen": -0.8500951528549194, "rewards/margins": 11.890449523925781, "rewards/rejected": -12.740544319152832, "step": 2092 }, { "epoch": 0.5, "learning_rate": 1.1061333333333334e-07, "logps/chosen": -217.2580108642578, "logps/rejected": -356.7647399902344, "loss": 0.0019, "losses/dpo": 0.00015390815678983927, "losses/sft": 0.5962818264961243, "losses/total": 0.00015390815678983927, "ref_logps/chosen": -206.48870849609375, "ref_logps/rejected": -227.64999389648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0769299268722534, "rewards/margins": 11.834548950195312, "rewards/rejected": -12.911478042602539, "step": 2093 }, { "epoch": 0.5, "learning_rate": 1.1055999999999999e-07, "logps/chosen": -261.3921203613281, "logps/rejected": -338.6520080566406, "loss": 0.0102, "losses/dpo": 1.874409605306937e-08, "losses/sft": 0.8387840986251831, "losses/total": 1.874409605306937e-08, "ref_logps/chosen": -249.3370819091797, "ref_logps/rejected": -212.95367431640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2055037021636963, "rewards/margins": 11.36433219909668, "rewards/rejected": -12.569835662841797, "step": 2094 }, { "epoch": 0.5, "learning_rate": 1.1050666666666666e-07, "logps/chosen": -230.969482421875, "logps/rejected": -370.7266845703125, "loss": 0.0233, "losses/dpo": 0.017713112756609917, "losses/sft": 0.4301733374595642, "losses/total": 0.017713112756609917, "ref_logps/chosen": -219.23538208007812, "ref_logps/rejected": -234.28604125976562, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1734097003936768, "rewards/margins": 12.47065544128418, "rewards/rejected": -13.644063949584961, "step": 2095 }, { "epoch": 0.5, "learning_rate": 1.1045333333333333e-07, "logps/chosen": -231.78152465820312, "logps/rejected": -334.0379638671875, "loss": 0.0037, "losses/dpo": 3.0474545837932965e-06, "losses/sft": 0.803244948387146, "losses/total": 3.0474545837932965e-06, "ref_logps/chosen": -221.6141815185547, "ref_logps/rejected": -214.0542755126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.0167334079742432, "rewards/margins": 10.981637954711914, "rewards/rejected": -11.998371124267578, "step": 2096 }, { "epoch": 0.5, "learning_rate": 1.1040000000000001e-07, "logps/chosen": -207.18458557128906, "logps/rejected": -362.0598449707031, "loss": 0.0008, "losses/dpo": 2.791980659822002e-05, "losses/sft": 0.6666867733001709, "losses/total": 2.791980659822002e-05, "ref_logps/chosen": -197.39419555664062, "ref_logps/rejected": -225.77024841308594, "rewards/accuracies": 1.0, "rewards/chosen": -0.9790397882461548, "rewards/margins": 12.649921417236328, "rewards/rejected": -13.628961563110352, "step": 2097 }, { "epoch": 0.5, "learning_rate": 1.1034666666666666e-07, "logps/chosen": -192.4821319580078, "logps/rejected": -303.35150146484375, "loss": 0.0034, "losses/dpo": 4.3499234769761586e-10, "losses/sft": 0.6836902499198914, "losses/total": 4.3499234769761586e-10, "ref_logps/chosen": -183.3007354736328, "ref_logps/rejected": -185.4788055419922, "rewards/accuracies": 1.0, "rewards/chosen": -0.9181402325630188, "rewards/margins": 10.86913013458252, "rewards/rejected": -11.787270545959473, "step": 2098 }, { "epoch": 0.5, "learning_rate": 1.1029333333333332e-07, "logps/chosen": -242.32565307617188, "logps/rejected": -361.8774719238281, "loss": 0.0006, "losses/dpo": 2.1211938872056635e-09, "losses/sft": 0.4376697540283203, "losses/total": 2.1211938872056635e-09, "ref_logps/chosen": -232.7251434326172, "ref_logps/rejected": -223.00892639160156, "rewards/accuracies": 1.0, "rewards/chosen": -0.960051417350769, "rewards/margins": 12.926804542541504, "rewards/rejected": -13.886856079101562, "step": 2099 }, { "epoch": 0.5, "learning_rate": 1.1024e-07, "logps/chosen": -235.19215393066406, "logps/rejected": -378.5767822265625, "loss": 0.0064, "losses/dpo": 4.4492662709672e-06, "losses/sft": 0.8360536694526672, "losses/total": 4.4492662709672e-06, "ref_logps/chosen": -222.8677978515625, "ref_logps/rejected": -229.62696838378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.2324334383010864, "rewards/margins": 13.662548065185547, "rewards/rejected": -14.894981384277344, "step": 2100 }, { "epoch": 0.5, "learning_rate": 1.1018666666666668e-07, "logps/chosen": -248.16921997070312, "logps/rejected": -356.483154296875, "loss": 0.007, "losses/dpo": 1.069183963586795e-09, "losses/sft": 0.5665043592453003, "losses/total": 1.069183963586795e-09, "ref_logps/chosen": -235.61123657226562, "ref_logps/rejected": -219.4776611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.255797028541565, "rewards/margins": 12.444753646850586, "rewards/rejected": -13.70055103302002, "step": 2101 }, { "epoch": 0.5, "learning_rate": 1.1013333333333333e-07, "logps/chosen": -246.00466918945312, "logps/rejected": -363.9350280761719, "loss": 0.001, "losses/dpo": 9.06768036657013e-07, "losses/sft": 0.6461347937583923, "losses/total": 9.06768036657013e-07, "ref_logps/chosen": -232.9836883544922, "ref_logps/rejected": -227.20362854003906, "rewards/accuracies": 1.0, "rewards/chosen": -1.3020975589752197, "rewards/margins": 12.371042251586914, "rewards/rejected": -13.673139572143555, "step": 2102 }, { "epoch": 0.5, "learning_rate": 1.1007999999999999e-07, "logps/chosen": -216.90469360351562, "logps/rejected": -338.30633544921875, "loss": 0.0017, "losses/dpo": 2.431974621686095e-09, "losses/sft": 0.9889472126960754, "losses/total": 2.431974621686095e-09, "ref_logps/chosen": -207.04486083984375, "ref_logps/rejected": -208.73129272460938, "rewards/accuracies": 1.0, "rewards/chosen": -0.985984206199646, "rewards/margins": 11.971519470214844, "rewards/rejected": -12.957503318786621, "step": 2103 }, { "epoch": 0.5, "learning_rate": 1.1002666666666667e-07, "logps/chosen": -214.45791625976562, "logps/rejected": -377.886962890625, "loss": 0.0051, "losses/dpo": 4.588673903072049e-07, "losses/sft": 0.5022394061088562, "losses/total": 4.588673903072049e-07, "ref_logps/chosen": -203.5762481689453, "ref_logps/rejected": -249.0338592529297, "rewards/accuracies": 1.0, "rewards/chosen": -1.0881681442260742, "rewards/margins": 11.797139167785645, "rewards/rejected": -12.885307312011719, "step": 2104 }, { "epoch": 0.51, "learning_rate": 1.0997333333333332e-07, "logps/chosen": -220.92462158203125, "logps/rejected": -370.3595886230469, "loss": 0.0003, "losses/dpo": 6.137995445243405e-09, "losses/sft": 0.6136762499809265, "losses/total": 6.137995445243405e-09, "ref_logps/chosen": -211.00205993652344, "ref_logps/rejected": -231.369873046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.992257833480835, "rewards/margins": 12.906713485717773, "rewards/rejected": -13.898971557617188, "step": 2105 }, { "epoch": 0.51, "learning_rate": 1.0992e-07, "logps/chosen": -228.2327423095703, "logps/rejected": -346.86279296875, "loss": 0.0009, "losses/dpo": 1.8916912480904102e-09, "losses/sft": 0.48860517144203186, "losses/total": 1.8916912480904102e-09, "ref_logps/chosen": -214.9811248779297, "ref_logps/rejected": -212.60882568359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3251621723175049, "rewards/margins": 12.100236892700195, "rewards/rejected": -13.425398826599121, "step": 2106 }, { "epoch": 0.51, "learning_rate": 1.0986666666666666e-07, "logps/chosen": -235.2152557373047, "logps/rejected": -336.462646484375, "loss": 0.0041, "losses/dpo": 2.213784933147167e-09, "losses/sft": 0.5961042046546936, "losses/total": 2.213784933147167e-09, "ref_logps/chosen": -226.49319458007812, "ref_logps/rejected": -213.32249450683594, "rewards/accuracies": 1.0, "rewards/chosen": -0.8722073435783386, "rewards/margins": 11.441808700561523, "rewards/rejected": -12.314016342163086, "step": 2107 }, { "epoch": 0.51, "learning_rate": 1.0981333333333333e-07, "logps/chosen": -210.784423828125, "logps/rejected": -364.7796325683594, "loss": 0.0002, "losses/dpo": 3.347969652622851e-07, "losses/sft": 0.7270264029502869, "losses/total": 3.347969652622851e-07, "ref_logps/chosen": -201.97442626953125, "ref_logps/rejected": -225.27923583984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8810000419616699, "rewards/margins": 13.069038391113281, "rewards/rejected": -13.95003890991211, "step": 2108 }, { "epoch": 0.51, "learning_rate": 1.0975999999999998e-07, "logps/chosen": -214.17230224609375, "logps/rejected": -325.01531982421875, "loss": 0.0128, "losses/dpo": 8.48283088998869e-05, "losses/sft": 0.5538270473480225, "losses/total": 8.48283088998869e-05, "ref_logps/chosen": -205.33599853515625, "ref_logps/rejected": -202.89712524414062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8836297988891602, "rewards/margins": 11.328190803527832, "rewards/rejected": -12.211820602416992, "step": 2109 }, { "epoch": 0.51, "learning_rate": 1.0970666666666666e-07, "logps/chosen": -284.85772705078125, "logps/rejected": -418.0841064453125, "loss": 0.0001, "losses/dpo": 1.4423470013832929e-12, "losses/sft": 0.6324346661567688, "losses/total": 1.4423470013832929e-12, "ref_logps/chosen": -270.30035400390625, "ref_logps/rejected": -263.33935546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4557347297668457, "rewards/margins": 14.018739700317383, "rewards/rejected": -15.474475860595703, "step": 2110 }, { "epoch": 0.51, "learning_rate": 1.0965333333333333e-07, "logps/chosen": -219.27806091308594, "logps/rejected": -336.0762939453125, "loss": 0.0013, "losses/dpo": 4.626286624898057e-07, "losses/sft": 0.9244856238365173, "losses/total": 4.626286624898057e-07, "ref_logps/chosen": -209.39805603027344, "ref_logps/rejected": -211.64212036132812, "rewards/accuracies": 1.0, "rewards/chosen": -0.9880009889602661, "rewards/margins": 11.45541763305664, "rewards/rejected": -12.443418502807617, "step": 2111 }, { "epoch": 0.51, "learning_rate": 1.096e-07, "logps/chosen": -240.1217498779297, "logps/rejected": -354.62371826171875, "loss": 0.001, "losses/dpo": 1.1844542013861314e-09, "losses/sft": 0.6079273223876953, "losses/total": 1.1844542013861314e-09, "ref_logps/chosen": -229.56900024414062, "ref_logps/rejected": -213.62445068359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.0552740097045898, "rewards/margins": 13.04465103149414, "rewards/rejected": -14.09992504119873, "step": 2112 }, { "epoch": 0.51, "learning_rate": 1.0954666666666665e-07, "logps/chosen": -231.36485290527344, "logps/rejected": -343.9089050292969, "loss": 0.0125, "losses/dpo": 1.4103605963100563e-06, "losses/sft": 0.6004858613014221, "losses/total": 1.4103605963100563e-06, "ref_logps/chosen": -220.10824584960938, "ref_logps/rejected": -226.3368682861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.1256617307662964, "rewards/margins": 10.631546020507812, "rewards/rejected": -11.757207870483398, "step": 2113 }, { "epoch": 0.51, "learning_rate": 1.0949333333333333e-07, "logps/chosen": -244.8137969970703, "logps/rejected": -346.8601379394531, "loss": 0.0004, "losses/dpo": 5.4227302825893275e-06, "losses/sft": 0.7122886776924133, "losses/total": 5.4227302825893275e-06, "ref_logps/chosen": -236.53753662109375, "ref_logps/rejected": -214.21615600585938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8276258707046509, "rewards/margins": 12.436773300170898, "rewards/rejected": -13.264398574829102, "step": 2114 }, { "epoch": 0.51, "learning_rate": 1.0943999999999999e-07, "logps/chosen": -247.57012939453125, "logps/rejected": -363.55499267578125, "loss": 0.0043, "losses/dpo": 9.976207593354047e-08, "losses/sft": 0.6672707796096802, "losses/total": 9.976207593354047e-08, "ref_logps/chosen": -235.52975463867188, "ref_logps/rejected": -232.71902465820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.204038143157959, "rewards/margins": 11.879557609558105, "rewards/rejected": -13.083597183227539, "step": 2115 }, { "epoch": 0.51, "learning_rate": 1.0938666666666667e-07, "logps/chosen": -246.6216583251953, "logps/rejected": -373.98016357421875, "loss": 0.0005, "losses/dpo": 8.143882155309257e-07, "losses/sft": 0.7317637801170349, "losses/total": 8.143882155309257e-07, "ref_logps/chosen": -233.1188507080078, "ref_logps/rejected": -228.57638549804688, "rewards/accuracies": 1.0, "rewards/chosen": -1.3502790927886963, "rewards/margins": 13.19009780883789, "rewards/rejected": -14.540376663208008, "step": 2116 }, { "epoch": 0.51, "learning_rate": 1.0933333333333332e-07, "logps/chosen": -219.82106018066406, "logps/rejected": -339.66485595703125, "loss": 0.0049, "losses/dpo": 3.384663784800068e-07, "losses/sft": 0.7889578938484192, "losses/total": 3.384663784800068e-07, "ref_logps/chosen": -207.8489990234375, "ref_logps/rejected": -214.73634338378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.197206974029541, "rewards/margins": 11.295642852783203, "rewards/rejected": -12.492850303649902, "step": 2117 }, { "epoch": 0.51, "learning_rate": 1.0928e-07, "logps/chosen": -230.5235595703125, "logps/rejected": -344.03302001953125, "loss": 0.0057, "losses/dpo": 1.4412147208986426e-07, "losses/sft": 0.6900151968002319, "losses/total": 1.4412147208986426e-07, "ref_logps/chosen": -220.91990661621094, "ref_logps/rejected": -218.5802001953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9603638648986816, "rewards/margins": 11.584918975830078, "rewards/rejected": -12.545281410217285, "step": 2118 }, { "epoch": 0.51, "learning_rate": 1.0922666666666667e-07, "logps/chosen": -265.6645812988281, "logps/rejected": -389.0328674316406, "loss": 0.0012, "losses/dpo": 1.5345352721851668e-07, "losses/sft": 0.4340934455394745, "losses/total": 1.5345352721851668e-07, "ref_logps/chosen": -251.7026824951172, "ref_logps/rejected": -236.70654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3961917161941528, "rewards/margins": 13.836442947387695, "rewards/rejected": -15.232634544372559, "step": 2119 }, { "epoch": 0.51, "learning_rate": 1.0917333333333332e-07, "logps/chosen": -302.9531555175781, "logps/rejected": -370.1087646484375, "loss": 0.0025, "losses/dpo": 2.119804594258312e-05, "losses/sft": 0.5505417585372925, "losses/total": 2.119804594258312e-05, "ref_logps/chosen": -289.2169189453125, "ref_logps/rejected": -236.78594970703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3736228942871094, "rewards/margins": 11.958658218383789, "rewards/rejected": -13.332281112670898, "step": 2120 }, { "epoch": 0.51, "learning_rate": 1.0911999999999999e-07, "logps/chosen": -237.75746154785156, "logps/rejected": -387.26092529296875, "loss": 0.0018, "losses/dpo": 6.352691883648731e-08, "losses/sft": 0.5936670303344727, "losses/total": 6.352691883648731e-08, "ref_logps/chosen": -227.91656494140625, "ref_logps/rejected": -248.54095458984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9840874075889587, "rewards/margins": 12.887907028198242, "rewards/rejected": -13.871994018554688, "step": 2121 }, { "epoch": 0.51, "learning_rate": 1.0906666666666666e-07, "logps/chosen": -259.20111083984375, "logps/rejected": -365.5111389160156, "loss": 0.0053, "losses/dpo": 6.4334603848692495e-06, "losses/sft": 1.0019474029541016, "losses/total": 6.4334603848692495e-06, "ref_logps/chosen": -246.64732360839844, "ref_logps/rejected": -224.909912109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2553801536560059, "rewards/margins": 12.8047456741333, "rewards/rejected": -14.060125350952148, "step": 2122 }, { "epoch": 0.51, "learning_rate": 1.0901333333333334e-07, "logps/chosen": -197.17724609375, "logps/rejected": -342.68768310546875, "loss": 0.0092, "losses/dpo": 2.6222288784794046e-09, "losses/sft": 0.48169898986816406, "losses/total": 2.6222288784794046e-09, "ref_logps/chosen": -188.2342071533203, "ref_logps/rejected": -222.376708984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.8943025469779968, "rewards/margins": 11.136795997619629, "rewards/rejected": -12.031098365783691, "step": 2123 }, { "epoch": 0.51, "learning_rate": 1.0895999999999999e-07, "logps/chosen": -266.8008728027344, "logps/rejected": -326.09417724609375, "loss": 0.0033, "losses/dpo": 0.0003691060410346836, "losses/sft": 1.207223653793335, "losses/total": 0.0003691060410346836, "ref_logps/chosen": -256.414794921875, "ref_logps/rejected": -209.9593963623047, "rewards/accuracies": 1.0, "rewards/chosen": -1.0386111736297607, "rewards/margins": 10.574870109558105, "rewards/rejected": -11.613481521606445, "step": 2124 }, { "epoch": 0.51, "learning_rate": 1.0890666666666665e-07, "logps/chosen": -272.647216796875, "logps/rejected": -397.8846435546875, "loss": 0.0014, "losses/dpo": 3.013572325016867e-08, "losses/sft": 0.7532072067260742, "losses/total": 3.013572325016867e-08, "ref_logps/chosen": -262.14276123046875, "ref_logps/rejected": -255.32644653320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0504432916641235, "rewards/margins": 13.205376625061035, "rewards/rejected": -14.255821228027344, "step": 2125 }, { "epoch": 0.51, "learning_rate": 1.0885333333333333e-07, "logps/chosen": -224.4060516357422, "logps/rejected": -391.65789794921875, "loss": 0.0009, "losses/dpo": 0.005450142081826925, "losses/sft": 0.7703885436058044, "losses/total": 0.005450142081826925, "ref_logps/chosen": -214.0260467529297, "ref_logps/rejected": -238.3894500732422, "rewards/accuracies": 1.0, "rewards/chosen": -1.038001298904419, "rewards/margins": 14.28884506225586, "rewards/rejected": -15.326847076416016, "step": 2126 }, { "epoch": 0.51, "learning_rate": 1.0880000000000001e-07, "logps/chosen": -203.93606567382812, "logps/rejected": -363.6734619140625, "loss": 0.0017, "losses/dpo": 4.5914347879261186e-08, "losses/sft": 0.6142034530639648, "losses/total": 4.5914347879261186e-08, "ref_logps/chosen": -194.55673217773438, "ref_logps/rejected": -225.556884765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9379346370697021, "rewards/margins": 12.873725891113281, "rewards/rejected": -13.811660766601562, "step": 2127 }, { "epoch": 0.51, "learning_rate": 1.0874666666666666e-07, "logps/chosen": -213.53762817382812, "logps/rejected": -344.02215576171875, "loss": 0.0047, "losses/dpo": 8.678074370216393e-10, "losses/sft": 0.6509223580360413, "losses/total": 8.678074370216393e-10, "ref_logps/chosen": -199.23953247070312, "ref_logps/rejected": -209.43572998046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4298094511032104, "rewards/margins": 12.028833389282227, "rewards/rejected": -13.458642959594727, "step": 2128 }, { "epoch": 0.51, "learning_rate": 1.0869333333333332e-07, "logps/chosen": -261.4638977050781, "logps/rejected": -395.31817626953125, "loss": 0.0001, "losses/dpo": 3.0095789043116383e-06, "losses/sft": 0.5692923069000244, "losses/total": 3.0095789043116383e-06, "ref_logps/chosen": -247.52230834960938, "ref_logps/rejected": -242.08755493164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3941594362258911, "rewards/margins": 13.928905487060547, "rewards/rejected": -15.323064804077148, "step": 2129 }, { "epoch": 0.51, "learning_rate": 1.0864e-07, "logps/chosen": -239.89297485351562, "logps/rejected": -377.5560302734375, "loss": 0.0067, "losses/dpo": 6.130483676258791e-09, "losses/sft": 0.6138044595718384, "losses/total": 6.130483676258791e-09, "ref_logps/chosen": -228.05101013183594, "ref_logps/rejected": -238.70481872558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.1841981410980225, "rewards/margins": 12.700923919677734, "rewards/rejected": -13.88512134552002, "step": 2130 }, { "epoch": 0.51, "learning_rate": 1.0858666666666668e-07, "logps/chosen": -201.90365600585938, "logps/rejected": -332.7867736816406, "loss": 0.0132, "losses/dpo": 1.0025142938019371e-09, "losses/sft": 0.6829292178153992, "losses/total": 1.0025142938019371e-09, "ref_logps/chosen": -191.22232055664062, "ref_logps/rejected": -196.68927001953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.068134069442749, "rewards/margins": 12.541618347167969, "rewards/rejected": -13.60975170135498, "step": 2131 }, { "epoch": 0.51, "learning_rate": 1.0853333333333333e-07, "logps/chosen": -199.0277099609375, "logps/rejected": -354.6925964355469, "loss": 0.007, "losses/dpo": 2.6198517844022717e-07, "losses/sft": 0.6227763891220093, "losses/total": 2.6198517844022717e-07, "ref_logps/chosen": -188.9949951171875, "ref_logps/rejected": -228.87367248535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.003272294998169, "rewards/margins": 11.578622817993164, "rewards/rejected": -12.581894874572754, "step": 2132 }, { "epoch": 0.51, "learning_rate": 1.0847999999999999e-07, "logps/chosen": -216.11505126953125, "logps/rejected": -334.21514892578125, "loss": 0.0034, "losses/dpo": 1.211399762723886e-06, "losses/sft": 0.5345863103866577, "losses/total": 1.211399762723886e-06, "ref_logps/chosen": -205.08973693847656, "ref_logps/rejected": -198.61135864257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.1025307178497314, "rewards/margins": 12.457846641540527, "rewards/rejected": -13.560378074645996, "step": 2133 }, { "epoch": 0.51, "learning_rate": 1.0842666666666667e-07, "logps/chosen": -265.427490234375, "logps/rejected": -373.1553955078125, "loss": 0.0024, "losses/dpo": 0.00040519816684536636, "losses/sft": 0.5227636098861694, "losses/total": 0.00040519816684536636, "ref_logps/chosen": -254.74801635742188, "ref_logps/rejected": -235.90933227539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0679497718811035, "rewards/margins": 12.656656265258789, "rewards/rejected": -13.724605560302734, "step": 2134 }, { "epoch": 0.51, "learning_rate": 1.0837333333333334e-07, "logps/chosen": -285.73406982421875, "logps/rejected": -382.6495056152344, "loss": 0.0144, "losses/dpo": 4.872354111284949e-07, "losses/sft": 0.6538007259368896, "losses/total": 4.872354111284949e-07, "ref_logps/chosen": -274.72674560546875, "ref_logps/rejected": -239.2974395751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.1007373332977295, "rewards/margins": 13.234466552734375, "rewards/rejected": -14.335203170776367, "step": 2135 }, { "epoch": 0.51, "learning_rate": 1.0831999999999999e-07, "logps/chosen": -262.99359130859375, "logps/rejected": -357.4908447265625, "loss": 0.0019, "losses/dpo": 2.271016001031967e-06, "losses/sft": 1.3333123922348022, "losses/total": 2.271016001031967e-06, "ref_logps/chosen": -250.90762329101562, "ref_logps/rejected": -212.938232421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2085974216461182, "rewards/margins": 13.246665000915527, "rewards/rejected": -14.455263137817383, "step": 2136 }, { "epoch": 0.51, "learning_rate": 1.0826666666666666e-07, "logps/chosen": -245.19601440429688, "logps/rejected": -352.10205078125, "loss": 0.0015, "losses/dpo": 5.017711401933411e-08, "losses/sft": 1.0718510150909424, "losses/total": 5.017711401933411e-08, "ref_logps/chosen": -234.34156799316406, "ref_logps/rejected": -219.3681640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0854452848434448, "rewards/margins": 12.187948226928711, "rewards/rejected": -13.273393630981445, "step": 2137 }, { "epoch": 0.51, "learning_rate": 1.0821333333333333e-07, "logps/chosen": -228.48028564453125, "logps/rejected": -348.4167785644531, "loss": 0.0017, "losses/dpo": 3.393697625142522e-06, "losses/sft": 0.5662974715232849, "losses/total": 3.393697625142522e-06, "ref_logps/chosen": -218.23826599121094, "ref_logps/rejected": -218.19837951660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0242048501968384, "rewards/margins": 11.997634887695312, "rewards/rejected": -13.02184009552002, "step": 2138 }, { "epoch": 0.51, "learning_rate": 1.0815999999999998e-07, "logps/chosen": -180.1239013671875, "logps/rejected": -342.7176208496094, "loss": 0.0025, "losses/dpo": 7.202472716016928e-06, "losses/sft": 0.6561222076416016, "losses/total": 7.202472716016928e-06, "ref_logps/chosen": -174.92666625976562, "ref_logps/rejected": -212.8515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.519725501537323, "rewards/margins": 12.466880798339844, "rewards/rejected": -12.98660659790039, "step": 2139 }, { "epoch": 0.51, "learning_rate": 1.0810666666666666e-07, "logps/chosen": -262.0485534667969, "logps/rejected": -412.1190185546875, "loss": 0.0008, "losses/dpo": 5.425688982541033e-07, "losses/sft": 0.5934001207351685, "losses/total": 5.425688982541033e-07, "ref_logps/chosen": -249.61346435546875, "ref_logps/rejected": -255.80487060546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2435076236724854, "rewards/margins": 14.38790512084961, "rewards/rejected": -15.631413459777832, "step": 2140 }, { "epoch": 0.51, "learning_rate": 1.0805333333333332e-07, "logps/chosen": -302.9710998535156, "logps/rejected": -386.7115478515625, "loss": 0.0008, "losses/dpo": 5.081154941244392e-10, "losses/sft": 0.5501779317855835, "losses/total": 5.081154941244392e-10, "ref_logps/chosen": -286.0113220214844, "ref_logps/rejected": -242.78045654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6959761381149292, "rewards/margins": 12.697134017944336, "rewards/rejected": -14.393110275268555, "step": 2141 }, { "epoch": 0.51, "learning_rate": 1.08e-07, "logps/chosen": -232.86546325683594, "logps/rejected": -376.721923828125, "loss": 0.0006, "losses/dpo": 3.9295298392971745e-07, "losses/sft": 0.7404453158378601, "losses/total": 3.9295298392971745e-07, "ref_logps/chosen": -222.80325317382812, "ref_logps/rejected": -236.83154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0062216520309448, "rewards/margins": 12.982815742492676, "rewards/rejected": -13.989038467407227, "step": 2142 }, { "epoch": 0.51, "learning_rate": 1.0794666666666665e-07, "logps/chosen": -240.18975830078125, "logps/rejected": -383.70538330078125, "loss": 0.0005, "losses/dpo": 4.215034437038412e-07, "losses/sft": 0.6221063137054443, "losses/total": 4.215034437038412e-07, "ref_logps/chosen": -226.7073974609375, "ref_logps/rejected": -243.1293182373047, "rewards/accuracies": 1.0, "rewards/chosen": -1.34823477268219, "rewards/margins": 12.709373474121094, "rewards/rejected": -14.057607650756836, "step": 2143 }, { "epoch": 0.51, "learning_rate": 1.0789333333333333e-07, "logps/chosen": -241.67906188964844, "logps/rejected": -342.9893798828125, "loss": 0.0057, "losses/dpo": 6.397532725799238e-09, "losses/sft": 0.6350870728492737, "losses/total": 6.397532725799238e-09, "ref_logps/chosen": -231.27536010742188, "ref_logps/rejected": -208.11386108398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0403692722320557, "rewards/margins": 12.447183609008789, "rewards/rejected": -13.48755168914795, "step": 2144 }, { "epoch": 0.51, "learning_rate": 1.0783999999999999e-07, "logps/chosen": -216.0218048095703, "logps/rejected": -334.63336181640625, "loss": 0.0132, "losses/dpo": 1.0536392380799953e-07, "losses/sft": 0.6649707555770874, "losses/total": 1.0536392380799953e-07, "ref_logps/chosen": -204.3212890625, "ref_logps/rejected": -209.23196411132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.170052170753479, "rewards/margins": 11.370089530944824, "rewards/rejected": -12.540141105651855, "step": 2145 }, { "epoch": 0.51, "learning_rate": 1.0778666666666667e-07, "logps/chosen": -228.75579833984375, "logps/rejected": -348.8477783203125, "loss": 0.0016, "losses/dpo": 1.779147662528402e-10, "losses/sft": 0.7579144239425659, "losses/total": 1.779147662528402e-10, "ref_logps/chosen": -219.56906127929688, "ref_logps/rejected": -207.30487060546875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9186729788780212, "rewards/margins": 13.235617637634277, "rewards/rejected": -14.154290199279785, "step": 2146 }, { "epoch": 0.52, "learning_rate": 1.0773333333333332e-07, "logps/chosen": -231.7540283203125, "logps/rejected": -362.11932373046875, "loss": 0.0032, "losses/dpo": 4.503564454694242e-08, "losses/sft": 0.5331072807312012, "losses/total": 4.503564454694242e-08, "ref_logps/chosen": -221.76760864257812, "ref_logps/rejected": -220.89964294433594, "rewards/accuracies": 1.0, "rewards/chosen": -0.998640775680542, "rewards/margins": 13.123329162597656, "rewards/rejected": -14.121969223022461, "step": 2147 }, { "epoch": 0.52, "learning_rate": 1.0768e-07, "logps/chosen": -235.57408142089844, "logps/rejected": -344.218017578125, "loss": 0.0058, "losses/dpo": 2.762197803818367e-09, "losses/sft": 0.48511460423469543, "losses/total": 2.762197803818367e-09, "ref_logps/chosen": -224.79466247558594, "ref_logps/rejected": -211.26638793945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0779435634613037, "rewards/margins": 12.217220306396484, "rewards/rejected": -13.295164108276367, "step": 2148 }, { "epoch": 0.52, "learning_rate": 1.0762666666666667e-07, "logps/chosen": -236.6437530517578, "logps/rejected": -386.98004150390625, "loss": 0.0015, "losses/dpo": 3.232257085983292e-06, "losses/sft": 0.7492256760597229, "losses/total": 3.232257085983292e-06, "ref_logps/chosen": -224.85137939453125, "ref_logps/rejected": -244.5159912109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1792372465133667, "rewards/margins": 13.067168235778809, "rewards/rejected": -14.246404647827148, "step": 2149 }, { "epoch": 0.52, "learning_rate": 1.0757333333333334e-07, "logps/chosen": -252.46612548828125, "logps/rejected": -325.97857666015625, "loss": 0.0005, "losses/dpo": 7.972870662342757e-05, "losses/sft": 0.8575652837753296, "losses/total": 7.972870662342757e-05, "ref_logps/chosen": -241.23416137695312, "ref_logps/rejected": -199.07232666015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1231962442398071, "rewards/margins": 11.567429542541504, "rewards/rejected": -12.690625190734863, "step": 2150 }, { "epoch": 0.52, "learning_rate": 1.0751999999999999e-07, "logps/chosen": -240.98236083984375, "logps/rejected": -373.1484375, "loss": 0.0043, "losses/dpo": 5.810038601339329e-07, "losses/sft": 0.5285105109214783, "losses/total": 5.810038601339329e-07, "ref_logps/chosen": -231.6060791015625, "ref_logps/rejected": -226.99945068359375, "rewards/accuracies": 1.0, "rewards/chosen": -0.937627911567688, "rewards/margins": 13.677270889282227, "rewards/rejected": -14.614898681640625, "step": 2151 }, { "epoch": 0.52, "learning_rate": 1.0746666666666666e-07, "logps/chosen": -241.2849884033203, "logps/rejected": -343.4864501953125, "loss": 0.008, "losses/dpo": 4.1826797314570285e-07, "losses/sft": 0.4389912188053131, "losses/total": 4.1826797314570285e-07, "ref_logps/chosen": -226.1522674560547, "ref_logps/rejected": -213.67185974121094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5132718086242676, "rewards/margins": 11.468189239501953, "rewards/rejected": -12.981460571289062, "step": 2152 }, { "epoch": 0.52, "learning_rate": 1.0741333333333334e-07, "logps/chosen": -273.427734375, "logps/rejected": -365.29278564453125, "loss": 0.0005, "losses/dpo": 1.2766385282247938e-07, "losses/sft": 0.8054946660995483, "losses/total": 1.2766385282247938e-07, "ref_logps/chosen": -260.14459228515625, "ref_logps/rejected": -216.80035400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3283166885375977, "rewards/margins": 13.520927429199219, "rewards/rejected": -14.849244117736816, "step": 2153 }, { "epoch": 0.52, "learning_rate": 1.0736e-07, "logps/chosen": -248.64407348632812, "logps/rejected": -382.46868896484375, "loss": 0.0008, "losses/dpo": 2.784552634693682e-06, "losses/sft": 0.44520893692970276, "losses/total": 2.784552634693682e-06, "ref_logps/chosen": -238.65829467773438, "ref_logps/rejected": -242.7902374267578, "rewards/accuracies": 1.0, "rewards/chosen": -0.9985753893852234, "rewards/margins": 12.969269752502441, "rewards/rejected": -13.96784496307373, "step": 2154 }, { "epoch": 0.52, "learning_rate": 1.0730666666666665e-07, "logps/chosen": -218.9915771484375, "logps/rejected": -359.4085998535156, "loss": 0.0004, "losses/dpo": 2.683307513962063e-07, "losses/sft": 0.49029797315597534, "losses/total": 2.683307513962063e-07, "ref_logps/chosen": -208.8597412109375, "ref_logps/rejected": -212.79441833496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.0131855010986328, "rewards/margins": 13.648233413696289, "rewards/rejected": -14.661418914794922, "step": 2155 }, { "epoch": 0.52, "learning_rate": 1.0725333333333333e-07, "logps/chosen": -233.0203399658203, "logps/rejected": -364.35882568359375, "loss": 0.0155, "losses/dpo": 6.269676759984577e-07, "losses/sft": 0.4444056451320648, "losses/total": 6.269676759984577e-07, "ref_logps/chosen": -221.83200073242188, "ref_logps/rejected": -230.89682006835938, "rewards/accuracies": 1.0, "rewards/chosen": -1.118834376335144, "rewards/margins": 12.227365493774414, "rewards/rejected": -13.346200942993164, "step": 2156 }, { "epoch": 0.52, "learning_rate": 1.0720000000000001e-07, "logps/chosen": -221.33200073242188, "logps/rejected": -343.4700927734375, "loss": 0.0024, "losses/dpo": 2.383530384782867e-13, "losses/sft": 0.7176083326339722, "losses/total": 2.383530384782867e-13, "ref_logps/chosen": -212.24014282226562, "ref_logps/rejected": -209.05792236328125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9091853499412537, "rewards/margins": 12.532032012939453, "rewards/rejected": -13.441216468811035, "step": 2157 }, { "epoch": 0.52, "learning_rate": 1.0714666666666666e-07, "logps/chosen": -260.3622131347656, "logps/rejected": -404.8682556152344, "loss": 0.0007, "losses/dpo": 7.337449687838671e-07, "losses/sft": 0.7425881028175354, "losses/total": 7.337449687838671e-07, "ref_logps/chosen": -247.2365264892578, "ref_logps/rejected": -258.2190856933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.312570333480835, "rewards/margins": 13.352344512939453, "rewards/rejected": -14.664916038513184, "step": 2158 }, { "epoch": 0.52, "learning_rate": 1.0709333333333332e-07, "logps/chosen": -248.37220764160156, "logps/rejected": -416.0750732421875, "loss": 0.0, "losses/dpo": 2.795839861846616e-07, "losses/sft": 0.6072022318840027, "losses/total": 2.795839861846616e-07, "ref_logps/chosen": -236.3296661376953, "ref_logps/rejected": -250.43984985351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.2042531967163086, "rewards/margins": 15.359267234802246, "rewards/rejected": -16.563520431518555, "step": 2159 }, { "epoch": 0.52, "learning_rate": 1.0704e-07, "logps/chosen": -207.8306884765625, "logps/rejected": -368.8492431640625, "loss": 0.0039, "losses/dpo": 0.0007637097151018679, "losses/sft": 0.8243607878684998, "losses/total": 0.0007637097151018679, "ref_logps/chosen": -195.73025512695312, "ref_logps/rejected": -229.23483276367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2100434303283691, "rewards/margins": 12.751397132873535, "rewards/rejected": -13.961440086364746, "step": 2160 }, { "epoch": 0.52, "learning_rate": 1.0698666666666667e-07, "logps/chosen": -240.27557373046875, "logps/rejected": -347.4072265625, "loss": 0.0055, "losses/dpo": 6.474881502072094e-07, "losses/sft": 0.6372520327568054, "losses/total": 6.474881502072094e-07, "ref_logps/chosen": -227.63992309570312, "ref_logps/rejected": -211.4010009765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2635647058486938, "rewards/margins": 12.337058067321777, "rewards/rejected": -13.600622177124023, "step": 2161 }, { "epoch": 0.52, "learning_rate": 1.0693333333333332e-07, "logps/chosen": -275.8612060546875, "logps/rejected": -408.882568359375, "loss": 0.0002, "losses/dpo": 7.630309717399086e-09, "losses/sft": 0.6131377220153809, "losses/total": 7.630309717399086e-09, "ref_logps/chosen": -259.8686218261719, "ref_logps/rejected": -253.95068359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5992600917816162, "rewards/margins": 13.893927574157715, "rewards/rejected": -15.49318790435791, "step": 2162 }, { "epoch": 0.52, "learning_rate": 1.0687999999999999e-07, "logps/chosen": -272.7470703125, "logps/rejected": -374.18865966796875, "loss": 0.0057, "losses/dpo": 3.865354425425238e-10, "losses/sft": 0.5461377501487732, "losses/total": 3.865354425425238e-10, "ref_logps/chosen": -261.6590576171875, "ref_logps/rejected": -240.74664306640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.108799695968628, "rewards/margins": 12.23540210723877, "rewards/rejected": -13.344202041625977, "step": 2163 }, { "epoch": 0.52, "learning_rate": 1.0682666666666667e-07, "logps/chosen": -208.0133056640625, "logps/rejected": -321.3787536621094, "loss": 0.0049, "losses/dpo": 3.919868504453916e-06, "losses/sft": 0.66886305809021, "losses/total": 3.919868504453916e-06, "ref_logps/chosen": -198.66195678710938, "ref_logps/rejected": -190.95306396484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9351360201835632, "rewards/margins": 12.107433319091797, "rewards/rejected": -13.042569160461426, "step": 2164 }, { "epoch": 0.52, "learning_rate": 1.0677333333333334e-07, "logps/chosen": -272.39892578125, "logps/rejected": -401.42535400390625, "loss": 0.0011, "losses/dpo": 4.948520313519111e-07, "losses/sft": 0.6089315414428711, "losses/total": 4.948520313519111e-07, "ref_logps/chosen": -260.5885009765625, "ref_logps/rejected": -247.52769470214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.1810461282730103, "rewards/margins": 14.208721160888672, "rewards/rejected": -15.38976764678955, "step": 2165 }, { "epoch": 0.52, "learning_rate": 1.0671999999999999e-07, "logps/chosen": -233.33975219726562, "logps/rejected": -342.762451171875, "loss": 0.0028, "losses/dpo": 4.842853741138242e-05, "losses/sft": 0.6388112902641296, "losses/total": 4.842853741138242e-05, "ref_logps/chosen": -223.1460418701172, "ref_logps/rejected": -206.32818603515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.01936936378479, "rewards/margins": 12.624059677124023, "rewards/rejected": -13.643428802490234, "step": 2166 }, { "epoch": 0.52, "learning_rate": 1.0666666666666666e-07, "logps/chosen": -221.74346923828125, "logps/rejected": -308.79266357421875, "loss": 0.0051, "losses/dpo": 6.879321290398366e-08, "losses/sft": 1.0680928230285645, "losses/total": 6.879321290398366e-08, "ref_logps/chosen": -211.157470703125, "ref_logps/rejected": -186.5308837890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0585980415344238, "rewards/margins": 11.167579650878906, "rewards/rejected": -12.226179122924805, "step": 2167 }, { "epoch": 0.52, "learning_rate": 1.0661333333333333e-07, "logps/chosen": -260.5224914550781, "logps/rejected": -396.62890625, "loss": 0.0006, "losses/dpo": 3.5837248724135407e-09, "losses/sft": 0.4581810534000397, "losses/total": 3.5837248724135407e-09, "ref_logps/chosen": -246.0850830078125, "ref_logps/rejected": -244.3771514892578, "rewards/accuracies": 1.0, "rewards/chosen": -1.443739652633667, "rewards/margins": 13.7814359664917, "rewards/rejected": -15.225176811218262, "step": 2168 }, { "epoch": 0.52, "learning_rate": 1.0656000000000001e-07, "logps/chosen": -241.64305114746094, "logps/rejected": -361.40582275390625, "loss": 0.0006, "losses/dpo": 1.191975158576497e-07, "losses/sft": 0.619644284248352, "losses/total": 1.191975158576497e-07, "ref_logps/chosen": -228.28758239746094, "ref_logps/rejected": -222.46963500976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3355457782745361, "rewards/margins": 12.558073043823242, "rewards/rejected": -13.893619537353516, "step": 2169 }, { "epoch": 0.52, "learning_rate": 1.0650666666666666e-07, "logps/chosen": -228.9297637939453, "logps/rejected": -377.93389892578125, "loss": 0.0003, "losses/dpo": 4.608026245023211e-07, "losses/sft": 0.6765795946121216, "losses/total": 4.608026245023211e-07, "ref_logps/chosen": -216.7216339111328, "ref_logps/rejected": -233.7933349609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2208129167556763, "rewards/margins": 13.193243026733398, "rewards/rejected": -14.414056777954102, "step": 2170 }, { "epoch": 0.52, "learning_rate": 1.0645333333333332e-07, "logps/chosen": -259.8901672363281, "logps/rejected": -402.1546936035156, "loss": 0.0001, "losses/dpo": 7.066264060995309e-06, "losses/sft": 1.0873112678527832, "losses/total": 7.066264060995309e-06, "ref_logps/chosen": -248.00038146972656, "ref_logps/rejected": -254.97589111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1889784336090088, "rewards/margins": 13.528900146484375, "rewards/rejected": -14.717878341674805, "step": 2171 }, { "epoch": 0.52, "learning_rate": 1.064e-07, "logps/chosen": -258.4532470703125, "logps/rejected": -389.90118408203125, "loss": 0.0008, "losses/dpo": 5.5342701671179384e-05, "losses/sft": 0.825528621673584, "losses/total": 5.5342701671179384e-05, "ref_logps/chosen": -240.671142578125, "ref_logps/rejected": -239.28067016601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7782127857208252, "rewards/margins": 13.283839225769043, "rewards/rejected": -15.062051773071289, "step": 2172 }, { "epoch": 0.52, "learning_rate": 1.0634666666666665e-07, "logps/chosen": -211.1497344970703, "logps/rejected": -367.877685546875, "loss": 0.0055, "losses/dpo": 1.5754174000903731e-06, "losses/sft": 0.45225390791893005, "losses/total": 1.5754174000903731e-06, "ref_logps/chosen": -201.76031494140625, "ref_logps/rejected": -236.0447998046875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9389399290084839, "rewards/margins": 12.24435043334961, "rewards/rejected": -13.183289527893066, "step": 2173 }, { "epoch": 0.52, "learning_rate": 1.0629333333333333e-07, "logps/chosen": -237.45401000976562, "logps/rejected": -387.00390625, "loss": 0.0003, "losses/dpo": 3.943889126389877e-08, "losses/sft": 0.8390631079673767, "losses/total": 3.943889126389877e-08, "ref_logps/chosen": -225.78070068359375, "ref_logps/rejected": -241.0432586669922, "rewards/accuracies": 1.0, "rewards/chosen": -1.1673305034637451, "rewards/margins": 13.42873764038086, "rewards/rejected": -14.596068382263184, "step": 2174 }, { "epoch": 0.52, "learning_rate": 1.0623999999999999e-07, "logps/chosen": -281.2649841308594, "logps/rejected": -389.6224060058594, "loss": 0.0091, "losses/dpo": 9.017270485855988e-09, "losses/sft": 0.6882092952728271, "losses/total": 9.017270485855988e-09, "ref_logps/chosen": -269.50604248046875, "ref_logps/rejected": -250.5533905029297, "rewards/accuracies": 1.0, "rewards/chosen": -1.1758959293365479, "rewards/margins": 12.73100757598877, "rewards/rejected": -13.906904220581055, "step": 2175 }, { "epoch": 0.52, "learning_rate": 1.0618666666666667e-07, "logps/chosen": -246.1739044189453, "logps/rejected": -360.626220703125, "loss": 0.0067, "losses/dpo": 2.230753182175249e-08, "losses/sft": 1.2020212411880493, "losses/total": 2.230753182175249e-08, "ref_logps/chosen": -230.59548950195312, "ref_logps/rejected": -220.25909423828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5578410625457764, "rewards/margins": 12.478874206542969, "rewards/rejected": -14.036714553833008, "step": 2176 }, { "epoch": 0.52, "learning_rate": 1.0613333333333332e-07, "logps/chosen": -220.750244140625, "logps/rejected": -368.7654113769531, "loss": 0.0107, "losses/dpo": 6.516239103859789e-09, "losses/sft": 0.5260184407234192, "losses/total": 6.516239103859789e-09, "ref_logps/chosen": -210.44960021972656, "ref_logps/rejected": -223.1666259765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0300657749176025, "rewards/margins": 13.529813766479492, "rewards/rejected": -14.559879302978516, "step": 2177 }, { "epoch": 0.52, "learning_rate": 1.0608e-07, "logps/chosen": -226.36962890625, "logps/rejected": -364.53765869140625, "loss": 0.0015, "losses/dpo": 4.6169937317763754e-10, "losses/sft": 0.40517693758010864, "losses/total": 4.6169937317763754e-10, "ref_logps/chosen": -214.9299774169922, "ref_logps/rejected": -234.20791625976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1439638137817383, "rewards/margins": 11.889009475708008, "rewards/rejected": -13.032973289489746, "step": 2178 }, { "epoch": 0.52, "learning_rate": 1.0602666666666666e-07, "logps/chosen": -223.64773559570312, "logps/rejected": -369.3163757324219, "loss": 0.0019, "losses/dpo": 1.5793558816312725e-07, "losses/sft": 0.6491923928260803, "losses/total": 1.5793558816312725e-07, "ref_logps/chosen": -212.0263214111328, "ref_logps/rejected": -222.28717041015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.162142038345337, "rewards/margins": 13.540779113769531, "rewards/rejected": -14.702919960021973, "step": 2179 }, { "epoch": 0.52, "learning_rate": 1.0597333333333333e-07, "logps/chosen": -255.59034729003906, "logps/rejected": -365.9007568359375, "loss": 0.0021, "losses/dpo": 7.204998553334008e-08, "losses/sft": 0.5166524648666382, "losses/total": 7.204998553334008e-08, "ref_logps/chosen": -243.43841552734375, "ref_logps/rejected": -225.64517211914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2151930332183838, "rewards/margins": 12.810367584228516, "rewards/rejected": -14.02556037902832, "step": 2180 }, { "epoch": 0.52, "learning_rate": 1.0591999999999998e-07, "logps/chosen": -266.0367126464844, "logps/rejected": -398.954345703125, "loss": 0.0112, "losses/dpo": 4.300283151792428e-08, "losses/sft": 0.8039466738700867, "losses/total": 4.300283151792428e-08, "ref_logps/chosen": -253.10565185546875, "ref_logps/rejected": -248.43356323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.293105959892273, "rewards/margins": 13.75897216796875, "rewards/rejected": -15.052079200744629, "step": 2181 }, { "epoch": 0.52, "learning_rate": 1.0586666666666666e-07, "logps/chosen": -273.91632080078125, "logps/rejected": -373.09930419921875, "loss": 0.0016, "losses/dpo": 3.7591963319982824e-08, "losses/sft": 0.5918643474578857, "losses/total": 3.7591963319982824e-08, "ref_logps/chosen": -257.802978515625, "ref_logps/rejected": -220.34103393554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6113338470458984, "rewards/margins": 13.664493560791016, "rewards/rejected": -15.275826454162598, "step": 2182 }, { "epoch": 0.52, "learning_rate": 1.0581333333333334e-07, "logps/chosen": -235.6079559326172, "logps/rejected": -375.42791748046875, "loss": 0.002, "losses/dpo": 2.5986441869463306e-06, "losses/sft": 0.6612971425056458, "losses/total": 2.5986441869463306e-06, "ref_logps/chosen": -225.2994384765625, "ref_logps/rejected": -236.79159545898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0308506488800049, "rewards/margins": 12.832780838012695, "rewards/rejected": -13.863632202148438, "step": 2183 }, { "epoch": 0.52, "learning_rate": 1.0576e-07, "logps/chosen": -230.35659790039062, "logps/rejected": -361.78814697265625, "loss": 0.0018, "losses/dpo": 6.157144980534213e-06, "losses/sft": 0.6065793037414551, "losses/total": 6.157144980534213e-06, "ref_logps/chosen": -216.8054656982422, "ref_logps/rejected": -221.7941131591797, "rewards/accuracies": 1.0, "rewards/chosen": -1.3551113605499268, "rewards/margins": 12.644294738769531, "rewards/rejected": -13.999404907226562, "step": 2184 }, { "epoch": 0.52, "learning_rate": 1.0570666666666665e-07, "logps/chosen": -202.73593139648438, "logps/rejected": -347.3314208984375, "loss": 0.0121, "losses/dpo": 4.8102020144824564e-09, "losses/sft": 0.5265151262283325, "losses/total": 4.8102020144824564e-09, "ref_logps/chosen": -195.6044921875, "ref_logps/rejected": -217.00592041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7131432294845581, "rewards/margins": 12.319408416748047, "rewards/rejected": -13.032551765441895, "step": 2185 }, { "epoch": 0.52, "learning_rate": 1.0565333333333333e-07, "logps/chosen": -228.25140380859375, "logps/rejected": -374.70489501953125, "loss": 0.0016, "losses/dpo": 7.991299207787961e-05, "losses/sft": 0.39093583822250366, "losses/total": 7.991299207787961e-05, "ref_logps/chosen": -215.3932342529297, "ref_logps/rejected": -238.72421264648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.2858152389526367, "rewards/margins": 12.312253952026367, "rewards/rejected": -13.598068237304688, "step": 2186 }, { "epoch": 0.52, "learning_rate": 1.056e-07, "logps/chosen": -234.72415161132812, "logps/rejected": -342.26922607421875, "loss": 0.0028, "losses/dpo": 3.955969077651389e-05, "losses/sft": 0.4492213726043701, "losses/total": 3.955969077651389e-05, "ref_logps/chosen": -223.66183471679688, "ref_logps/rejected": -212.8959503173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.1062318086624146, "rewards/margins": 11.831095695495605, "rewards/rejected": -12.93732738494873, "step": 2187 }, { "epoch": 0.53, "learning_rate": 1.0554666666666667e-07, "logps/chosen": -198.5205078125, "logps/rejected": -324.8563537597656, "loss": 0.0027, "losses/dpo": 1.0800646350617171e-06, "losses/sft": 0.6610167622566223, "losses/total": 1.0800646350617171e-06, "ref_logps/chosen": -188.51773071289062, "ref_logps/rejected": -203.90704345703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.000276803970337, "rewards/margins": 11.094654083251953, "rewards/rejected": -12.094931602478027, "step": 2188 }, { "epoch": 0.53, "learning_rate": 1.0549333333333332e-07, "logps/chosen": -245.09771728515625, "logps/rejected": -378.9266357421875, "loss": 0.0013, "losses/dpo": 1.7158184846266522e-06, "losses/sft": 0.44783517718315125, "losses/total": 1.7158184846266522e-06, "ref_logps/chosen": -231.50631713867188, "ref_logps/rejected": -230.03517150878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.3591387271881104, "rewards/margins": 13.530008316040039, "rewards/rejected": -14.889147758483887, "step": 2189 }, { "epoch": 0.53, "learning_rate": 1.0544e-07, "logps/chosen": -239.1737060546875, "logps/rejected": -358.912109375, "loss": 0.0008, "losses/dpo": 8.74883637758117e-11, "losses/sft": 0.5179108381271362, "losses/total": 8.74883637758117e-11, "ref_logps/chosen": -227.29322814941406, "ref_logps/rejected": -216.89987182617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1880501508712769, "rewards/margins": 13.013175010681152, "rewards/rejected": -14.201225280761719, "step": 2190 }, { "epoch": 0.53, "learning_rate": 1.0538666666666667e-07, "logps/chosen": -251.0421142578125, "logps/rejected": -384.77825927734375, "loss": 0.0052, "losses/dpo": 6.337661488942103e-07, "losses/sft": 0.5461911559104919, "losses/total": 6.337661488942103e-07, "ref_logps/chosen": -237.31362915039062, "ref_logps/rejected": -242.08209228515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3728480339050293, "rewards/margins": 12.896768569946289, "rewards/rejected": -14.269617080688477, "step": 2191 }, { "epoch": 0.53, "learning_rate": 1.0533333333333332e-07, "logps/chosen": -178.44622802734375, "logps/rejected": -350.27288818359375, "loss": 0.0035, "losses/dpo": 1.4003209116708604e-06, "losses/sft": 0.7244463562965393, "losses/total": 1.4003209116708604e-06, "ref_logps/chosen": -169.99176025390625, "ref_logps/rejected": -220.73526000976562, "rewards/accuracies": 1.0, "rewards/chosen": -0.8454469442367554, "rewards/margins": 12.108316421508789, "rewards/rejected": -12.953763961791992, "step": 2192 }, { "epoch": 0.53, "learning_rate": 1.0527999999999999e-07, "logps/chosen": -245.62889099121094, "logps/rejected": -334.42523193359375, "loss": 0.0037, "losses/dpo": 2.0521931674011284e-06, "losses/sft": 0.5049276351928711, "losses/total": 2.0521931674011284e-06, "ref_logps/chosen": -233.7575225830078, "ref_logps/rejected": -207.97084045410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.1871362924575806, "rewards/margins": 11.45830249786377, "rewards/rejected": -12.645439147949219, "step": 2193 }, { "epoch": 0.53, "learning_rate": 1.0522666666666666e-07, "logps/chosen": -251.8914031982422, "logps/rejected": -371.9679870605469, "loss": 0.0006, "losses/dpo": 5.3291322466009206e-09, "losses/sft": 0.6895569562911987, "losses/total": 5.3291322466009206e-09, "ref_logps/chosen": -244.90084838867188, "ref_logps/rejected": -235.49476623535156, "rewards/accuracies": 1.0, "rewards/chosen": -0.6990559101104736, "rewards/margins": 12.948267936706543, "rewards/rejected": -13.647323608398438, "step": 2194 }, { "epoch": 0.53, "learning_rate": 1.0517333333333334e-07, "logps/chosen": -222.4048309326172, "logps/rejected": -347.08770751953125, "loss": 0.012, "losses/dpo": 3.193956032987444e-08, "losses/sft": 0.5745490789413452, "losses/total": 3.193956032987444e-08, "ref_logps/chosen": -209.32666015625, "ref_logps/rejected": -215.16790771484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3078172206878662, "rewards/margins": 11.884162902832031, "rewards/rejected": -13.191980361938477, "step": 2195 }, { "epoch": 0.53, "learning_rate": 1.0511999999999999e-07, "logps/chosen": -213.16842651367188, "logps/rejected": -333.7148742675781, "loss": 0.0032, "losses/dpo": 4.716441681651418e-10, "losses/sft": 0.7510417103767395, "losses/total": 4.716441681651418e-10, "ref_logps/chosen": -204.02381896972656, "ref_logps/rejected": -205.98785400390625, "rewards/accuracies": 1.0, "rewards/chosen": -0.9144612550735474, "rewards/margins": 11.858241081237793, "rewards/rejected": -12.77270221710205, "step": 2196 }, { "epoch": 0.53, "learning_rate": 1.0506666666666665e-07, "logps/chosen": -218.14413452148438, "logps/rejected": -360.19061279296875, "loss": 0.0012, "losses/dpo": 8.987458000042636e-10, "losses/sft": 0.4867306649684906, "losses/total": 8.987458000042636e-10, "ref_logps/chosen": -207.27496337890625, "ref_logps/rejected": -220.50743103027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.086920142173767, "rewards/margins": 12.881397247314453, "rewards/rejected": -13.968317031860352, "step": 2197 }, { "epoch": 0.53, "learning_rate": 1.0501333333333333e-07, "logps/chosen": -239.18563842773438, "logps/rejected": -399.89801025390625, "loss": 0.0002, "losses/dpo": 7.832775736460462e-06, "losses/sft": 0.5148656368255615, "losses/total": 7.832775736460462e-06, "ref_logps/chosen": -226.90719604492188, "ref_logps/rejected": -255.95938110351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.227844476699829, "rewards/margins": 13.166017532348633, "rewards/rejected": -14.393861770629883, "step": 2198 }, { "epoch": 0.53, "learning_rate": 1.0496000000000001e-07, "logps/chosen": -243.79312133789062, "logps/rejected": -371.2572021484375, "loss": 0.0075, "losses/dpo": 7.285870395890015e-08, "losses/sft": 0.5298469066619873, "losses/total": 7.285870395890015e-08, "ref_logps/chosen": -232.42062377929688, "ref_logps/rejected": -228.27481079101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1372485160827637, "rewards/margins": 13.160989761352539, "rewards/rejected": -14.298238754272461, "step": 2199 }, { "epoch": 0.53, "learning_rate": 1.0490666666666666e-07, "logps/chosen": -259.11846923828125, "logps/rejected": -387.2939758300781, "loss": 0.0003, "losses/dpo": 8.455305078314268e-07, "losses/sft": 0.48338842391967773, "losses/total": 8.455305078314268e-07, "ref_logps/chosen": -247.13006591796875, "ref_logps/rejected": -236.97772216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1988399028778076, "rewards/margins": 13.832786560058594, "rewards/rejected": -15.03162670135498, "step": 2200 }, { "epoch": 0.53, "learning_rate": 1.0485333333333332e-07, "logps/chosen": -243.6526336669922, "logps/rejected": -347.6053466796875, "loss": 0.0035, "losses/dpo": 8.077840334408393e-07, "losses/sft": 0.703052818775177, "losses/total": 8.077840334408393e-07, "ref_logps/chosen": -235.42764282226562, "ref_logps/rejected": -218.76223754882812, "rewards/accuracies": 1.0, "rewards/chosen": -0.8224998712539673, "rewards/margins": 12.061813354492188, "rewards/rejected": -12.884312629699707, "step": 2201 }, { "epoch": 0.53, "learning_rate": 1.048e-07, "logps/chosen": -214.9185791015625, "logps/rejected": -349.63409423828125, "loss": 0.0096, "losses/dpo": 1.0606864719875375e-07, "losses/sft": 0.6762797236442566, "losses/total": 1.0606864719875375e-07, "ref_logps/chosen": -203.32601928710938, "ref_logps/rejected": -212.58920288085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.1592564582824707, "rewards/margins": 12.5452299118042, "rewards/rejected": -13.704486846923828, "step": 2202 }, { "epoch": 0.53, "learning_rate": 1.0474666666666668e-07, "logps/chosen": -228.73733520507812, "logps/rejected": -400.0914306640625, "loss": 0.002, "losses/dpo": 3.6292757688016763e-09, "losses/sft": 0.6984031796455383, "losses/total": 3.6292757688016763e-09, "ref_logps/chosen": -218.20077514648438, "ref_logps/rejected": -251.71786499023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0536580085754395, "rewards/margins": 13.78370189666748, "rewards/rejected": -14.837359428405762, "step": 2203 }, { "epoch": 0.53, "learning_rate": 1.0469333333333333e-07, "logps/chosen": -266.80621337890625, "logps/rejected": -392.9569396972656, "loss": 0.0033, "losses/dpo": 3.1958268920107e-08, "losses/sft": 0.5307180881500244, "losses/total": 3.1958268920107e-08, "ref_logps/chosen": -252.2843017578125, "ref_logps/rejected": -234.85049438476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4521913528442383, "rewards/margins": 14.358453750610352, "rewards/rejected": -15.81064510345459, "step": 2204 }, { "epoch": 0.53, "learning_rate": 1.0463999999999999e-07, "logps/chosen": -245.52386474609375, "logps/rejected": -379.28021240234375, "loss": 0.0024, "losses/dpo": 3.486004374053664e-08, "losses/sft": 0.6271759271621704, "losses/total": 3.486004374053664e-08, "ref_logps/chosen": -230.49398803710938, "ref_logps/rejected": -233.29505920410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.5029866695404053, "rewards/margins": 13.09553050994873, "rewards/rejected": -14.598516464233398, "step": 2205 }, { "epoch": 0.53, "learning_rate": 1.0458666666666667e-07, "logps/chosen": -239.97775268554688, "logps/rejected": -363.9801025390625, "loss": 0.0051, "losses/dpo": 2.2128169803181663e-06, "losses/sft": 0.4608069658279419, "losses/total": 2.2128169803181663e-06, "ref_logps/chosen": -227.2266845703125, "ref_logps/rejected": -222.88914489746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.2751070261001587, "rewards/margins": 12.833985328674316, "rewards/rejected": -14.109090805053711, "step": 2206 }, { "epoch": 0.53, "learning_rate": 1.0453333333333332e-07, "logps/chosen": -224.36196899414062, "logps/rejected": -387.8063659667969, "loss": 0.0113, "losses/dpo": 7.577623506449527e-08, "losses/sft": 0.576657235622406, "losses/total": 7.577623506449527e-08, "ref_logps/chosen": -209.866455078125, "ref_logps/rejected": -240.06622314453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4495530128479004, "rewards/margins": 13.324460983276367, "rewards/rejected": -14.774014472961426, "step": 2207 }, { "epoch": 0.53, "learning_rate": 1.0447999999999999e-07, "logps/chosen": -196.76600646972656, "logps/rejected": -332.1065979003906, "loss": 0.008, "losses/dpo": 1.579443278387771e-07, "losses/sft": 0.6098587512969971, "losses/total": 1.579443278387771e-07, "ref_logps/chosen": -185.66305541992188, "ref_logps/rejected": -200.6627197265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1102964878082275, "rewards/margins": 12.034090995788574, "rewards/rejected": -13.144387245178223, "step": 2208 }, { "epoch": 0.53, "learning_rate": 1.0442666666666666e-07, "logps/chosen": -194.10826110839844, "logps/rejected": -325.2867736816406, "loss": 0.0167, "losses/dpo": 8.188396094510608e-08, "losses/sft": 0.8680070042610168, "losses/total": 8.188396094510608e-08, "ref_logps/chosen": -183.80015563964844, "ref_logps/rejected": -196.83026123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0308127403259277, "rewards/margins": 11.814836502075195, "rewards/rejected": -12.845649719238281, "step": 2209 }, { "epoch": 0.53, "learning_rate": 1.0437333333333333e-07, "logps/chosen": -235.99118041992188, "logps/rejected": -364.0050964355469, "loss": 0.0022, "losses/dpo": 1.7113110928335118e-08, "losses/sft": 0.7183634042739868, "losses/total": 1.7113110928335118e-08, "ref_logps/chosen": -222.11819458007812, "ref_logps/rejected": -213.40687561035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.387298345565796, "rewards/margins": 13.672526359558105, "rewards/rejected": -15.059823989868164, "step": 2210 }, { "epoch": 0.53, "learning_rate": 1.0431999999999998e-07, "logps/chosen": -245.72268676757812, "logps/rejected": -340.17919921875, "loss": 0.0113, "losses/dpo": 2.0450461306609213e-05, "losses/sft": 1.016215443611145, "losses/total": 2.0450461306609213e-05, "ref_logps/chosen": -232.9132080078125, "ref_logps/rejected": -207.4942626953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2809480428695679, "rewards/margins": 11.987543106079102, "rewards/rejected": -13.268491744995117, "step": 2211 }, { "epoch": 0.53, "learning_rate": 1.0426666666666666e-07, "logps/chosen": -261.1461181640625, "logps/rejected": -411.1600341796875, "loss": 0.0033, "losses/dpo": 7.203020402357652e-08, "losses/sft": 0.8087544441223145, "losses/total": 7.203020402357652e-08, "ref_logps/chosen": -246.44459533691406, "ref_logps/rejected": -246.44815063476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4701520204544067, "rewards/margins": 15.001038551330566, "rewards/rejected": -16.471189498901367, "step": 2212 }, { "epoch": 0.53, "learning_rate": 1.0421333333333334e-07, "logps/chosen": -263.4608154296875, "logps/rejected": -382.19793701171875, "loss": 0.0024, "losses/dpo": 2.1523028692627122e-08, "losses/sft": 0.8900230526924133, "losses/total": 2.1523028692627122e-08, "ref_logps/chosen": -253.63258361816406, "ref_logps/rejected": -230.44313049316406, "rewards/accuracies": 1.0, "rewards/chosen": -0.9828235507011414, "rewards/margins": 14.192659378051758, "rewards/rejected": -15.175483703613281, "step": 2213 }, { "epoch": 0.53, "learning_rate": 1.0416e-07, "logps/chosen": -234.6139678955078, "logps/rejected": -360.3170471191406, "loss": 0.0082, "losses/dpo": 3.901245122506225e-07, "losses/sft": 0.6987119913101196, "losses/total": 3.901245122506225e-07, "ref_logps/chosen": -225.63702392578125, "ref_logps/rejected": -219.60340881347656, "rewards/accuracies": 1.0, "rewards/chosen": -0.8976951837539673, "rewards/margins": 13.17366886138916, "rewards/rejected": -14.071364402770996, "step": 2214 }, { "epoch": 0.53, "learning_rate": 1.0410666666666665e-07, "logps/chosen": -277.6482849121094, "logps/rejected": -366.86541748046875, "loss": 0.0081, "losses/dpo": 1.4402742998242957e-08, "losses/sft": 0.5245699286460876, "losses/total": 1.4402742998242957e-08, "ref_logps/chosen": -267.15869140625, "ref_logps/rejected": -236.71937561035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0489592552185059, "rewards/margins": 11.965646743774414, "rewards/rejected": -13.014605522155762, "step": 2215 }, { "epoch": 0.53, "learning_rate": 1.0405333333333333e-07, "logps/chosen": -253.78733825683594, "logps/rejected": -381.20562744140625, "loss": 0.001, "losses/dpo": 9.395226356900821e-07, "losses/sft": 0.9037811756134033, "losses/total": 9.395226356900821e-07, "ref_logps/chosen": -242.00173950195312, "ref_logps/rejected": -245.3052978515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.178558111190796, "rewards/margins": 12.411474227905273, "rewards/rejected": -13.590033531188965, "step": 2216 }, { "epoch": 0.53, "learning_rate": 1.04e-07, "logps/chosen": -227.9742431640625, "logps/rejected": -364.8431396484375, "loss": 0.0049, "losses/dpo": 1.0221828716794334e-07, "losses/sft": 0.5820485353469849, "losses/total": 1.0221828716794334e-07, "ref_logps/chosen": -215.23907470703125, "ref_logps/rejected": -218.51446533203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2735180854797363, "rewards/margins": 13.35934829711914, "rewards/rejected": -14.632867813110352, "step": 2217 }, { "epoch": 0.53, "learning_rate": 1.0394666666666667e-07, "logps/chosen": -228.71844482421875, "logps/rejected": -370.4100036621094, "loss": 0.0007, "losses/dpo": 4.4957513978260977e-07, "losses/sft": 0.7314879298210144, "losses/total": 4.4957513978260977e-07, "ref_logps/chosen": -213.91958618164062, "ref_logps/rejected": -226.9839630126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.4798860549926758, "rewards/margins": 12.86271858215332, "rewards/rejected": -14.34260368347168, "step": 2218 }, { "epoch": 0.53, "learning_rate": 1.0389333333333332e-07, "logps/chosen": -275.10614013671875, "logps/rejected": -395.774658203125, "loss": 0.0004, "losses/dpo": 3.3136449140158675e-09, "losses/sft": 0.5565266013145447, "losses/total": 3.3136449140158675e-09, "ref_logps/chosen": -263.3826599121094, "ref_logps/rejected": -251.7402801513672, "rewards/accuracies": 1.0, "rewards/chosen": -1.172346830368042, "rewards/margins": 13.23109245300293, "rewards/rejected": -14.403438568115234, "step": 2219 }, { "epoch": 0.53, "learning_rate": 1.0384e-07, "logps/chosen": -271.5354919433594, "logps/rejected": -389.4083557128906, "loss": 0.0043, "losses/dpo": 8.791148866293952e-05, "losses/sft": 0.8767366409301758, "losses/total": 8.791148866293952e-05, "ref_logps/chosen": -259.3375244140625, "ref_logps/rejected": -244.37542724609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2197998762130737, "rewards/margins": 13.283493041992188, "rewards/rejected": -14.503293991088867, "step": 2220 }, { "epoch": 0.53, "learning_rate": 1.0378666666666667e-07, "logps/chosen": -290.90966796875, "logps/rejected": -385.9620361328125, "loss": 0.0149, "losses/dpo": 3.184103718467668e-07, "losses/sft": 0.5594736337661743, "losses/total": 3.184103718467668e-07, "ref_logps/chosen": -279.49371337890625, "ref_logps/rejected": -245.02044677734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1415932178497314, "rewards/margins": 12.952567100524902, "rewards/rejected": -14.094160079956055, "step": 2221 }, { "epoch": 0.53, "learning_rate": 1.0373333333333334e-07, "logps/chosen": -254.00994873046875, "logps/rejected": -358.155517578125, "loss": 0.0003, "losses/dpo": 0.0006418981356546283, "losses/sft": 0.9735234975814819, "losses/total": 0.0006418981356546283, "ref_logps/chosen": -236.49356079101562, "ref_logps/rejected": -215.10751342773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7516382932662964, "rewards/margins": 12.55316162109375, "rewards/rejected": -14.304800033569336, "step": 2222 }, { "epoch": 0.53, "learning_rate": 1.0367999999999999e-07, "logps/chosen": -269.4134521484375, "logps/rejected": -398.93328857421875, "loss": 0.0012, "losses/dpo": 2.5045981601579115e-05, "losses/sft": 0.6064628958702087, "losses/total": 2.5045981601579115e-05, "ref_logps/chosen": -254.587890625, "ref_logps/rejected": -240.90713500976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.482553243637085, "rewards/margins": 14.320058822631836, "rewards/rejected": -15.802610397338867, "step": 2223 }, { "epoch": 0.53, "learning_rate": 1.0362666666666666e-07, "logps/chosen": -231.72047424316406, "logps/rejected": -370.7962646484375, "loss": 0.0005, "losses/dpo": 4.009525014225801e-07, "losses/sft": 0.5255658030509949, "losses/total": 4.009525014225801e-07, "ref_logps/chosen": -219.17208862304688, "ref_logps/rejected": -232.1670684814453, "rewards/accuracies": 1.0, "rewards/chosen": -1.2548398971557617, "rewards/margins": 12.608083724975586, "rewards/rejected": -13.862922668457031, "step": 2224 }, { "epoch": 0.53, "learning_rate": 1.0357333333333334e-07, "logps/chosen": -311.75762939453125, "logps/rejected": -386.11236572265625, "loss": 0.0115, "losses/dpo": 1.818057349112223e-08, "losses/sft": 0.935254693031311, "losses/total": 1.818057349112223e-08, "ref_logps/chosen": -298.19781494140625, "ref_logps/rejected": -245.09994506835938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3559794425964355, "rewards/margins": 12.745264053344727, "rewards/rejected": -14.10124397277832, "step": 2225 }, { "epoch": 0.53, "learning_rate": 1.0351999999999999e-07, "logps/chosen": -201.53195190429688, "logps/rejected": -355.2507629394531, "loss": 0.0033, "losses/dpo": 6.530082599870771e-14, "losses/sft": 0.6093721389770508, "losses/total": 6.530082599870771e-14, "ref_logps/chosen": -189.67091369628906, "ref_logps/rejected": -203.48544311523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1861045360565186, "rewards/margins": 13.99042797088623, "rewards/rejected": -15.176532745361328, "step": 2226 }, { "epoch": 0.53, "learning_rate": 1.0346666666666665e-07, "logps/chosen": -251.16259765625, "logps/rejected": -368.12371826171875, "loss": 0.0033, "losses/dpo": 2.091403757731314e-06, "losses/sft": 1.0748447179794312, "losses/total": 2.091403757731314e-06, "ref_logps/chosen": -239.9604034423828, "ref_logps/rejected": -229.57003784179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1202208995819092, "rewards/margins": 12.735147476196289, "rewards/rejected": -13.855367660522461, "step": 2227 }, { "epoch": 0.53, "learning_rate": 1.0341333333333333e-07, "logps/chosen": -213.8839874267578, "logps/rejected": -341.157470703125, "loss": 0.0067, "losses/dpo": 0.0001240828278241679, "losses/sft": 0.47075727581977844, "losses/total": 0.0001240828278241679, "ref_logps/chosen": -202.27828979492188, "ref_logps/rejected": -204.89865112304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1605702638626099, "rewards/margins": 12.465311050415039, "rewards/rejected": -13.62588119506836, "step": 2228 }, { "epoch": 0.53, "learning_rate": 1.0336000000000001e-07, "logps/chosen": -239.86184692382812, "logps/rejected": -352.4043273925781, "loss": 0.0015, "losses/dpo": 0.00029773361166007817, "losses/sft": 0.42737188935279846, "losses/total": 0.00029773361166007817, "ref_logps/chosen": -227.59097290039062, "ref_logps/rejected": -212.4687042236328, "rewards/accuracies": 1.0, "rewards/chosen": -1.227089524269104, "rewards/margins": 12.766472816467285, "rewards/rejected": -13.993563652038574, "step": 2229 }, { "epoch": 0.54, "learning_rate": 1.0330666666666666e-07, "logps/chosen": -253.89276123046875, "logps/rejected": -357.69561767578125, "loss": 0.001, "losses/dpo": 1.512716607976472e-06, "losses/sft": 0.6532007455825806, "losses/total": 1.512716607976472e-06, "ref_logps/chosen": -241.0522003173828, "ref_logps/rejected": -218.85049438476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.284057378768921, "rewards/margins": 12.600454330444336, "rewards/rejected": -13.884511947631836, "step": 2230 }, { "epoch": 0.54, "learning_rate": 1.0325333333333332e-07, "logps/chosen": -279.84405517578125, "logps/rejected": -409.1238708496094, "loss": 0.0003, "losses/dpo": 1.5261464314519912e-09, "losses/sft": 0.45660221576690674, "losses/total": 1.5261464314519912e-09, "ref_logps/chosen": -267.0473937988281, "ref_logps/rejected": -254.2466583251953, "rewards/accuracies": 1.0, "rewards/chosen": -1.279664158821106, "rewards/margins": 14.208053588867188, "rewards/rejected": -15.48771858215332, "step": 2231 }, { "epoch": 0.54, "learning_rate": 1.032e-07, "logps/chosen": -230.87875366210938, "logps/rejected": -400.9428405761719, "loss": 0.0016, "losses/dpo": 3.07259542751126e-05, "losses/sft": 0.4615100026130676, "losses/total": 3.07259542751126e-05, "ref_logps/chosen": -218.24270629882812, "ref_logps/rejected": -254.4744415283203, "rewards/accuracies": 1.0, "rewards/chosen": -1.2636048793792725, "rewards/margins": 13.383234024047852, "rewards/rejected": -14.646839141845703, "step": 2232 }, { "epoch": 0.54, "learning_rate": 1.0314666666666667e-07, "logps/chosen": -250.14231872558594, "logps/rejected": -388.9402160644531, "loss": 0.0007, "losses/dpo": 1.413694644725183e-05, "losses/sft": 0.7595769762992859, "losses/total": 1.413694644725183e-05, "ref_logps/chosen": -237.28819274902344, "ref_logps/rejected": -234.98342895507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.2854132652282715, "rewards/margins": 14.110265731811523, "rewards/rejected": -15.395679473876953, "step": 2233 }, { "epoch": 0.54, "learning_rate": 1.0309333333333332e-07, "logps/chosen": -220.22361755371094, "logps/rejected": -391.8375549316406, "loss": 0.0029, "losses/dpo": 6.6261303111048164e-09, "losses/sft": 0.5125657916069031, "losses/total": 6.6261303111048164e-09, "ref_logps/chosen": -207.30355834960938, "ref_logps/rejected": -237.85357666015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2920078039169312, "rewards/margins": 14.106389999389648, "rewards/rejected": -15.398397445678711, "step": 2234 }, { "epoch": 0.54, "learning_rate": 1.0303999999999999e-07, "logps/chosen": -253.90679931640625, "logps/rejected": -372.0347900390625, "loss": 0.0017, "losses/dpo": 1.901036375784315e-05, "losses/sft": 0.5406685471534729, "losses/total": 1.901036375784315e-05, "ref_logps/chosen": -243.51148986816406, "ref_logps/rejected": -236.03335571289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.039530873298645, "rewards/margins": 12.560613632202148, "rewards/rejected": -13.60014533996582, "step": 2235 }, { "epoch": 0.54, "learning_rate": 1.0298666666666667e-07, "logps/chosen": -244.35238647460938, "logps/rejected": -331.65179443359375, "loss": 0.0006, "losses/dpo": 1.0434641808387823e-05, "losses/sft": 0.3478941321372986, "losses/total": 1.0434641808387823e-05, "ref_logps/chosen": -233.08963012695312, "ref_logps/rejected": -206.95724487304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.126274585723877, "rewards/margins": 11.343184471130371, "rewards/rejected": -12.46945858001709, "step": 2236 }, { "epoch": 0.54, "learning_rate": 1.0293333333333334e-07, "logps/chosen": -258.23797607421875, "logps/rejected": -363.74554443359375, "loss": 0.0022, "losses/dpo": 8.080619409156498e-06, "losses/sft": 0.6585609912872314, "losses/total": 8.080619409156498e-06, "ref_logps/chosen": -242.07620239257812, "ref_logps/rejected": -223.008544921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6161776781082153, "rewards/margins": 12.45751953125, "rewards/rejected": -14.073698043823242, "step": 2237 }, { "epoch": 0.54, "learning_rate": 1.0287999999999999e-07, "logps/chosen": -195.2997589111328, "logps/rejected": -371.2225036621094, "loss": 0.0049, "losses/dpo": 3.2082783718578867e-07, "losses/sft": 0.6039567589759827, "losses/total": 3.2082783718578867e-07, "ref_logps/chosen": -183.86871337890625, "ref_logps/rejected": -224.79415893554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.143103003501892, "rewards/margins": 13.499732971191406, "rewards/rejected": -14.64283561706543, "step": 2238 }, { "epoch": 0.54, "learning_rate": 1.0282666666666666e-07, "logps/chosen": -243.44798278808594, "logps/rejected": -327.826171875, "loss": 0.0042, "losses/dpo": 6.902560016897041e-07, "losses/sft": 0.4828353822231293, "losses/total": 6.902560016897041e-07, "ref_logps/chosen": -230.08184814453125, "ref_logps/rejected": -195.5921630859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3366130590438843, "rewards/margins": 11.886789321899414, "rewards/rejected": -13.22340202331543, "step": 2239 }, { "epoch": 0.54, "learning_rate": 1.0277333333333333e-07, "logps/chosen": -212.93533325195312, "logps/rejected": -375.6979675292969, "loss": 0.0002, "losses/dpo": 1.0729888799687615e-07, "losses/sft": 0.5587608814239502, "losses/total": 1.0729888799687615e-07, "ref_logps/chosen": -202.64312744140625, "ref_logps/rejected": -223.01007080078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0292205810546875, "rewards/margins": 14.239568710327148, "rewards/rejected": -15.268789291381836, "step": 2240 }, { "epoch": 0.54, "learning_rate": 1.0271999999999998e-07, "logps/chosen": -220.36093139648438, "logps/rejected": -384.49652099609375, "loss": 0.0035, "losses/dpo": 1.3167650649847928e-06, "losses/sft": 0.7063045501708984, "losses/total": 1.3167650649847928e-06, "ref_logps/chosen": -204.49411010742188, "ref_logps/rejected": -231.4609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5866825580596924, "rewards/margins": 13.716875076293945, "rewards/rejected": -15.303556442260742, "step": 2241 }, { "epoch": 0.54, "learning_rate": 1.0266666666666666e-07, "logps/chosen": -246.63888549804688, "logps/rejected": -388.9382629394531, "loss": 0.029, "losses/dpo": 3.2811422467915463e-09, "losses/sft": 0.4968888461589813, "losses/total": 3.2811422467915463e-09, "ref_logps/chosen": -231.48025512695312, "ref_logps/rejected": -236.45077514648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5158642530441284, "rewards/margins": 13.732884407043457, "rewards/rejected": -15.248746871948242, "step": 2242 }, { "epoch": 0.54, "learning_rate": 1.0261333333333332e-07, "logps/chosen": -198.21453857421875, "logps/rejected": -323.417236328125, "loss": 0.0021, "losses/dpo": 6.825928977605145e-08, "losses/sft": 1.3666280508041382, "losses/total": 6.825928977605145e-08, "ref_logps/chosen": -187.82887268066406, "ref_logps/rejected": -194.60841369628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.0385665893554688, "rewards/margins": 11.842313766479492, "rewards/rejected": -12.880880355834961, "step": 2243 }, { "epoch": 0.54, "learning_rate": 1.0256e-07, "logps/chosen": -231.8062744140625, "logps/rejected": -391.0614318847656, "loss": 0.007, "losses/dpo": 2.79935647995444e-07, "losses/sft": 0.8030248284339905, "losses/total": 2.79935647995444e-07, "ref_logps/chosen": -218.53565979003906, "ref_logps/rejected": -233.564697265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3270628452301025, "rewards/margins": 14.422611236572266, "rewards/rejected": -15.749675750732422, "step": 2244 }, { "epoch": 0.54, "learning_rate": 1.0250666666666665e-07, "logps/chosen": -246.16395568847656, "logps/rejected": -379.0797119140625, "loss": 0.0037, "losses/dpo": 3.14069659168581e-09, "losses/sft": 0.6801179051399231, "losses/total": 3.14069659168581e-09, "ref_logps/chosen": -231.06378173828125, "ref_logps/rejected": -228.9001922607422, "rewards/accuracies": 1.0, "rewards/chosen": -1.5100189447402954, "rewards/margins": 13.5079345703125, "rewards/rejected": -15.017952919006348, "step": 2245 }, { "epoch": 0.54, "learning_rate": 1.0245333333333333e-07, "logps/chosen": -185.5333709716797, "logps/rejected": -350.81683349609375, "loss": 0.002, "losses/dpo": 3.263180406065658e-05, "losses/sft": 0.4944082796573639, "losses/total": 3.263180406065658e-05, "ref_logps/chosen": -176.1705780029297, "ref_logps/rejected": -223.1981964111328, "rewards/accuracies": 1.0, "rewards/chosen": -0.9362809062004089, "rewards/margins": 11.825583457946777, "rewards/rejected": -12.76186466217041, "step": 2246 }, { "epoch": 0.54, "learning_rate": 1.024e-07, "logps/chosen": -272.1966552734375, "logps/rejected": -363.4329528808594, "loss": 0.0032, "losses/dpo": 9.72052202996565e-06, "losses/sft": 0.571595311164856, "losses/total": 9.72052202996565e-06, "ref_logps/chosen": -256.2750244140625, "ref_logps/rejected": -227.13742065429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.592163324356079, "rewards/margins": 12.037391662597656, "rewards/rejected": -13.629554748535156, "step": 2247 }, { "epoch": 0.54, "learning_rate": 1.0234666666666667e-07, "logps/chosen": -260.01776123046875, "logps/rejected": -415.44195556640625, "loss": 0.0003, "losses/dpo": 2.8485862912930315e-06, "losses/sft": 0.30409273505210876, "losses/total": 2.8485862912930315e-06, "ref_logps/chosen": -249.3123779296875, "ref_logps/rejected": -266.08526611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.07053804397583, "rewards/margins": 13.865133285522461, "rewards/rejected": -14.935670852661133, "step": 2248 }, { "epoch": 0.54, "learning_rate": 1.0229333333333332e-07, "logps/chosen": -207.3450927734375, "logps/rejected": -361.39788818359375, "loss": 0.0009, "losses/dpo": 7.424621344398474e-07, "losses/sft": 0.7887377142906189, "losses/total": 7.424621344398474e-07, "ref_logps/chosen": -197.36346435546875, "ref_logps/rejected": -225.4863739013672, "rewards/accuracies": 1.0, "rewards/chosen": -0.9981626868247986, "rewards/margins": 12.592985153198242, "rewards/rejected": -13.591148376464844, "step": 2249 }, { "epoch": 0.54, "learning_rate": 1.0224e-07, "logps/chosen": -248.67059326171875, "logps/rejected": -399.650146484375, "loss": 0.0009, "losses/dpo": 3.745636856233858e-10, "losses/sft": 0.5564340949058533, "losses/total": 3.745636856233858e-10, "ref_logps/chosen": -235.06011962890625, "ref_logps/rejected": -237.87255859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3610479831695557, "rewards/margins": 14.816710472106934, "rewards/rejected": -16.177757263183594, "step": 2250 }, { "epoch": 0.54, "learning_rate": 1.0218666666666667e-07, "logps/chosen": -209.12319946289062, "logps/rejected": -346.71771240234375, "loss": 0.004, "losses/dpo": 2.077397221000865e-05, "losses/sft": 0.6684311628341675, "losses/total": 2.077397221000865e-05, "ref_logps/chosen": -197.9901580810547, "ref_logps/rejected": -209.78477478027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.113303542137146, "rewards/margins": 12.57999038696289, "rewards/rejected": -13.693292617797852, "step": 2251 }, { "epoch": 0.54, "learning_rate": 1.0213333333333333e-07, "logps/chosen": -243.6831817626953, "logps/rejected": -358.9230041503906, "loss": 0.0007, "losses/dpo": 9.339623119331009e-08, "losses/sft": 0.6108477115631104, "losses/total": 9.339623119331009e-08, "ref_logps/chosen": -231.28366088867188, "ref_logps/rejected": -212.50067138671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2399530410766602, "rewards/margins": 13.402278900146484, "rewards/rejected": -14.642232894897461, "step": 2252 }, { "epoch": 0.54, "learning_rate": 1.0207999999999998e-07, "logps/chosen": -248.69728088378906, "logps/rejected": -369.7451477050781, "loss": 0.0034, "losses/dpo": 1.4618841248648096e-08, "losses/sft": 0.5911837220191956, "losses/total": 1.4618841248648096e-08, "ref_logps/chosen": -241.50869750976562, "ref_logps/rejected": -229.15045166015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.7188589572906494, "rewards/margins": 13.34061050415039, "rewards/rejected": -14.059469223022461, "step": 2253 }, { "epoch": 0.54, "learning_rate": 1.0202666666666666e-07, "logps/chosen": -228.24468994140625, "logps/rejected": -392.0095520019531, "loss": 0.0023, "losses/dpo": 1.8225004794203414e-07, "losses/sft": 0.81803959608078, "losses/total": 1.8225004794203414e-07, "ref_logps/chosen": -216.908935546875, "ref_logps/rejected": -233.64663696289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1335771083831787, "rewards/margins": 14.702714920043945, "rewards/rejected": -15.836292266845703, "step": 2254 }, { "epoch": 0.54, "learning_rate": 1.0197333333333334e-07, "logps/chosen": -220.26119995117188, "logps/rejected": -368.4559326171875, "loss": 0.0019, "losses/dpo": 1.1975649613304995e-05, "losses/sft": 0.7767534255981445, "losses/total": 1.1975649613304995e-05, "ref_logps/chosen": -207.85418701171875, "ref_logps/rejected": -220.67660522460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.2407002449035645, "rewards/margins": 13.5372314453125, "rewards/rejected": -14.777931213378906, "step": 2255 }, { "epoch": 0.54, "learning_rate": 1.0192e-07, "logps/chosen": -189.4661407470703, "logps/rejected": -342.6330261230469, "loss": 0.0007, "losses/dpo": 6.303930294437876e-11, "losses/sft": 0.466545432806015, "losses/total": 6.303930294437876e-11, "ref_logps/chosen": -177.363037109375, "ref_logps/rejected": -210.2976837158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.2103097438812256, "rewards/margins": 12.023223876953125, "rewards/rejected": -13.233532905578613, "step": 2256 }, { "epoch": 0.54, "learning_rate": 1.0186666666666665e-07, "logps/chosen": -244.0213623046875, "logps/rejected": -382.2223815917969, "loss": 0.0, "losses/dpo": 1.6362694532645605e-10, "losses/sft": 0.8452383875846863, "losses/total": 1.6362694532645605e-10, "ref_logps/chosen": -232.09527587890625, "ref_logps/rejected": -226.55355834960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.1926075220108032, "rewards/margins": 14.374275207519531, "rewards/rejected": -15.566883087158203, "step": 2257 }, { "epoch": 0.54, "learning_rate": 1.0181333333333333e-07, "logps/chosen": -278.6519775390625, "logps/rejected": -413.42852783203125, "loss": 0.0163, "losses/dpo": 4.922313223687524e-07, "losses/sft": 0.7577846050262451, "losses/total": 4.922313223687524e-07, "ref_logps/chosen": -259.79644775390625, "ref_logps/rejected": -251.5358123779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.8855528831481934, "rewards/margins": 14.303720474243164, "rewards/rejected": -16.189271926879883, "step": 2258 }, { "epoch": 0.54, "learning_rate": 1.0176e-07, "logps/chosen": -221.96124267578125, "logps/rejected": -382.71783447265625, "loss": 0.0006, "losses/dpo": 3.4064594700566886e-08, "losses/sft": 0.4010457694530487, "losses/total": 3.4064594700566886e-08, "ref_logps/chosen": -208.16653442382812, "ref_logps/rejected": -230.91246032714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.3794701099395752, "rewards/margins": 13.801067352294922, "rewards/rejected": -15.180537223815918, "step": 2259 }, { "epoch": 0.54, "learning_rate": 1.0170666666666666e-07, "logps/chosen": -228.4162139892578, "logps/rejected": -410.22119140625, "loss": 0.0009, "losses/dpo": 1.2317805158090778e-05, "losses/sft": 0.7459191679954529, "losses/total": 1.2317805158090778e-05, "ref_logps/chosen": -214.31793212890625, "ref_logps/rejected": -253.97894287109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4098281860351562, "rewards/margins": 14.214396476745605, "rewards/rejected": -15.624223709106445, "step": 2260 }, { "epoch": 0.54, "learning_rate": 1.0165333333333332e-07, "logps/chosen": -200.88784790039062, "logps/rejected": -331.6506042480469, "loss": 0.0018, "losses/dpo": 7.154017680477409e-07, "losses/sft": 0.5486665368080139, "losses/total": 7.154017680477409e-07, "ref_logps/chosen": -189.29714965820312, "ref_logps/rejected": -192.82354736328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1590688228607178, "rewards/margins": 12.723634719848633, "rewards/rejected": -13.88270378112793, "step": 2261 }, { "epoch": 0.54, "learning_rate": 1.016e-07, "logps/chosen": -227.32431030273438, "logps/rejected": -355.743896484375, "loss": 0.0013, "losses/dpo": 8.854337707475679e-09, "losses/sft": 0.5502545237541199, "losses/total": 8.854337707475679e-09, "ref_logps/chosen": -213.11099243164062, "ref_logps/rejected": -211.27835083007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4213342666625977, "rewards/margins": 13.025219917297363, "rewards/rejected": -14.446554183959961, "step": 2262 }, { "epoch": 0.54, "learning_rate": 1.0154666666666667e-07, "logps/chosen": -244.53211975097656, "logps/rejected": -403.72283935546875, "loss": 0.0013, "losses/dpo": 6.357480515362113e-07, "losses/sft": 0.7452567219734192, "losses/total": 6.357480515362113e-07, "ref_logps/chosen": -231.7538604736328, "ref_logps/rejected": -248.872802734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.277826189994812, "rewards/margins": 14.207176208496094, "rewards/rejected": -15.485002517700195, "step": 2263 }, { "epoch": 0.54, "learning_rate": 1.0149333333333332e-07, "logps/chosen": -215.62460327148438, "logps/rejected": -340.4762268066406, "loss": 0.0036, "losses/dpo": 5.8786019508261234e-05, "losses/sft": 0.7358983159065247, "losses/total": 5.8786019508261234e-05, "ref_logps/chosen": -204.28451538085938, "ref_logps/rejected": -206.53770446777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1340093612670898, "rewards/margins": 12.259841918945312, "rewards/rejected": -13.393850326538086, "step": 2264 }, { "epoch": 0.54, "learning_rate": 1.0143999999999999e-07, "logps/chosen": -247.04904174804688, "logps/rejected": -362.2589111328125, "loss": 0.0006, "losses/dpo": 2.735529278652393e-07, "losses/sft": 0.4474600553512573, "losses/total": 2.735529278652393e-07, "ref_logps/chosen": -232.8944091796875, "ref_logps/rejected": -222.3131866455078, "rewards/accuracies": 1.0, "rewards/chosen": -1.415463924407959, "rewards/margins": 12.579111099243164, "rewards/rejected": -13.994575500488281, "step": 2265 }, { "epoch": 0.54, "learning_rate": 1.0138666666666666e-07, "logps/chosen": -260.03900146484375, "logps/rejected": -412.4768371582031, "loss": 0.0004, "losses/dpo": 2.972769364006922e-09, "losses/sft": 0.6893179416656494, "losses/total": 2.972769364006922e-09, "ref_logps/chosen": -243.61965942382812, "ref_logps/rejected": -252.86526489257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6419360637664795, "rewards/margins": 14.319221496582031, "rewards/rejected": -15.96115779876709, "step": 2266 }, { "epoch": 0.54, "learning_rate": 1.0133333333333334e-07, "logps/chosen": -208.49472045898438, "logps/rejected": -347.4613037109375, "loss": 0.0029, "losses/dpo": 1.49966837170723e-07, "losses/sft": 0.6131212115287781, "losses/total": 1.49966837170723e-07, "ref_logps/chosen": -193.6129150390625, "ref_logps/rejected": -209.75759887695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4881830215454102, "rewards/margins": 12.28218936920166, "rewards/rejected": -13.77037239074707, "step": 2267 }, { "epoch": 0.54, "learning_rate": 1.0127999999999999e-07, "logps/chosen": -241.36404418945312, "logps/rejected": -387.0416259765625, "loss": 0.0026, "losses/dpo": 2.161167458325508e-06, "losses/sft": 0.6290886402130127, "losses/total": 2.161167458325508e-06, "ref_logps/chosen": -226.5894317626953, "ref_logps/rejected": -226.220703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4774630069732666, "rewards/margins": 14.604629516601562, "rewards/rejected": -16.08209228515625, "step": 2268 }, { "epoch": 0.54, "learning_rate": 1.0122666666666665e-07, "logps/chosen": -237.1027374267578, "logps/rejected": -360.13018798828125, "loss": 0.004, "losses/dpo": 1.7366389437611929e-09, "losses/sft": 0.6032610535621643, "losses/total": 1.7366389437611929e-09, "ref_logps/chosen": -222.93289184570312, "ref_logps/rejected": -215.43228149414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4169853925704956, "rewards/margins": 13.052801132202148, "rewards/rejected": -14.46978759765625, "step": 2269 }, { "epoch": 0.54, "learning_rate": 1.0117333333333333e-07, "logps/chosen": -259.4752197265625, "logps/rejected": -389.21484375, "loss": 0.0004, "losses/dpo": 6.544713215816955e-09, "losses/sft": 0.5621964931488037, "losses/total": 6.544713215816955e-09, "ref_logps/chosen": -244.90826416015625, "ref_logps/rejected": -241.5550537109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4566972255706787, "rewards/margins": 13.309286117553711, "rewards/rejected": -14.765981674194336, "step": 2270 }, { "epoch": 0.54, "learning_rate": 1.0112000000000001e-07, "logps/chosen": -225.65438842773438, "logps/rejected": -356.01239013671875, "loss": 0.0032, "losses/dpo": 6.700977792206686e-06, "losses/sft": 0.6374830603599548, "losses/total": 6.700977792206686e-06, "ref_logps/chosen": -215.03475952148438, "ref_logps/rejected": -213.39950561523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0619630813598633, "rewards/margins": 13.199325561523438, "rewards/rejected": -14.261287689208984, "step": 2271 }, { "epoch": 0.55, "learning_rate": 1.0106666666666666e-07, "logps/chosen": -269.4522705078125, "logps/rejected": -403.0786437988281, "loss": 0.0002, "losses/dpo": 1.8337524920752912e-07, "losses/sft": 0.5374347567558289, "losses/total": 1.8337524920752912e-07, "ref_logps/chosen": -254.53521728515625, "ref_logps/rejected": -242.67929077148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4917058944702148, "rewards/margins": 14.548229217529297, "rewards/rejected": -16.039936065673828, "step": 2272 }, { "epoch": 0.55, "learning_rate": 1.0101333333333332e-07, "logps/chosen": -244.73912048339844, "logps/rejected": -335.9186096191406, "loss": 0.0087, "losses/dpo": 4.3872404376088525e-08, "losses/sft": 0.9146193861961365, "losses/total": 4.3872404376088525e-08, "ref_logps/chosen": -232.82351684570312, "ref_logps/rejected": -215.5567626953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1915596723556519, "rewards/margins": 10.844625473022461, "rewards/rejected": -12.036186218261719, "step": 2273 }, { "epoch": 0.55, "learning_rate": 1.0096e-07, "logps/chosen": -219.59657287597656, "logps/rejected": -378.87890625, "loss": 0.0009, "losses/dpo": 3.8109953948151087e-06, "losses/sft": 0.6505774259567261, "losses/total": 3.8109953948151087e-06, "ref_logps/chosen": -207.99925231933594, "ref_logps/rejected": -241.2062225341797, "rewards/accuracies": 1.0, "rewards/chosen": -1.159731149673462, "rewards/margins": 12.607538223266602, "rewards/rejected": -13.767269134521484, "step": 2274 }, { "epoch": 0.55, "learning_rate": 1.0090666666666665e-07, "logps/chosen": -235.77774047851562, "logps/rejected": -372.3817138671875, "loss": 0.0017, "losses/dpo": 1.7974765569306328e-06, "losses/sft": 0.5171051025390625, "losses/total": 1.7974765569306328e-06, "ref_logps/chosen": -224.35971069335938, "ref_logps/rejected": -220.2779541015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1418020725250244, "rewards/margins": 14.068571090698242, "rewards/rejected": -15.210372924804688, "step": 2275 }, { "epoch": 0.55, "learning_rate": 1.0085333333333333e-07, "logps/chosen": -214.5868682861328, "logps/rejected": -384.58868408203125, "loss": 0.0002, "losses/dpo": 5.497257916431408e-06, "losses/sft": 0.6055460572242737, "losses/total": 5.497257916431408e-06, "ref_logps/chosen": -204.38079833984375, "ref_logps/rejected": -227.41925048828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0206053256988525, "rewards/margins": 14.696338653564453, "rewards/rejected": -15.716943740844727, "step": 2276 }, { "epoch": 0.55, "learning_rate": 1.008e-07, "logps/chosen": -196.3695526123047, "logps/rejected": -371.64019775390625, "loss": 0.0045, "losses/dpo": 1.4695937124997727e-06, "losses/sft": 0.7812553644180298, "losses/total": 1.4695937124997727e-06, "ref_logps/chosen": -184.1379852294922, "ref_logps/rejected": -217.17135620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2231576442718506, "rewards/margins": 14.223725318908691, "rewards/rejected": -15.446882247924805, "step": 2277 }, { "epoch": 0.55, "learning_rate": 1.0074666666666667e-07, "logps/chosen": -243.66195678710938, "logps/rejected": -376.55230712890625, "loss": 0.007, "losses/dpo": 7.89920306942804e-07, "losses/sft": 0.5648667812347412, "losses/total": 7.89920306942804e-07, "ref_logps/chosen": -227.2706298828125, "ref_logps/rejected": -231.51651000976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6391336917877197, "rewards/margins": 12.864448547363281, "rewards/rejected": -14.503582000732422, "step": 2278 }, { "epoch": 0.55, "learning_rate": 1.0069333333333332e-07, "logps/chosen": -226.19537353515625, "logps/rejected": -361.16009521484375, "loss": 0.0016, "losses/dpo": 4.8704619985073805e-05, "losses/sft": 0.534330427646637, "losses/total": 4.8704619985073805e-05, "ref_logps/chosen": -214.08306884765625, "ref_logps/rejected": -212.47470092773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.2112321853637695, "rewards/margins": 13.657308578491211, "rewards/rejected": -14.868539810180664, "step": 2279 }, { "epoch": 0.55, "learning_rate": 1.0063999999999999e-07, "logps/chosen": -232.1951141357422, "logps/rejected": -398.62701416015625, "loss": 0.0031, "losses/dpo": 1.6542462617508136e-06, "losses/sft": 0.6022157073020935, "losses/total": 1.6542462617508136e-06, "ref_logps/chosen": -221.98345947265625, "ref_logps/rejected": -237.47979736328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.021166443824768, "rewards/margins": 15.093555450439453, "rewards/rejected": -16.114723205566406, "step": 2280 }, { "epoch": 0.55, "learning_rate": 1.0058666666666667e-07, "logps/chosen": -260.92266845703125, "logps/rejected": -385.7501220703125, "loss": 0.0037, "losses/dpo": 9.664703759515447e-11, "losses/sft": 0.9427118301391602, "losses/total": 9.664703759515447e-11, "ref_logps/chosen": -247.65118408203125, "ref_logps/rejected": -234.30081176757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3271466493606567, "rewards/margins": 13.817781448364258, "rewards/rejected": -15.144927978515625, "step": 2281 }, { "epoch": 0.55, "learning_rate": 1.0053333333333333e-07, "logps/chosen": -256.4772644042969, "logps/rejected": -390.5093994140625, "loss": 0.0006, "losses/dpo": 1.276914268766305e-10, "losses/sft": 0.9291847348213196, "losses/total": 1.276914268766305e-10, "ref_logps/chosen": -244.22885131835938, "ref_logps/rejected": -226.40896606445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2248408794403076, "rewards/margins": 15.185201644897461, "rewards/rejected": -16.41004180908203, "step": 2282 }, { "epoch": 0.55, "learning_rate": 1.0047999999999998e-07, "logps/chosen": -211.20343017578125, "logps/rejected": -316.5641784667969, "loss": 0.004, "losses/dpo": 1.5402960684696154e-07, "losses/sft": 0.6043627262115479, "losses/total": 1.5402960684696154e-07, "ref_logps/chosen": -197.77047729492188, "ref_logps/rejected": -192.65890502929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.3432948589324951, "rewards/margins": 11.047232627868652, "rewards/rejected": -12.39052677154541, "step": 2283 }, { "epoch": 0.55, "learning_rate": 1.0042666666666666e-07, "logps/chosen": -226.9905548095703, "logps/rejected": -337.42913818359375, "loss": 0.0014, "losses/dpo": 2.4825547839668616e-08, "losses/sft": 0.5537871718406677, "losses/total": 2.4825547839668616e-08, "ref_logps/chosen": -215.63702392578125, "ref_logps/rejected": -203.27606201171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1353511810302734, "rewards/margins": 12.279956817626953, "rewards/rejected": -13.415307998657227, "step": 2284 }, { "epoch": 0.55, "learning_rate": 1.0037333333333334e-07, "logps/chosen": -239.75579833984375, "logps/rejected": -361.6396484375, "loss": 0.0018, "losses/dpo": 2.4658676878175356e-08, "losses/sft": 0.5222917199134827, "losses/total": 2.4658676878175356e-08, "ref_logps/chosen": -230.41416931152344, "ref_logps/rejected": -226.12396240234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9341647624969482, "rewards/margins": 12.617399215698242, "rewards/rejected": -13.55156421661377, "step": 2285 }, { "epoch": 0.55, "learning_rate": 1.0032e-07, "logps/chosen": -241.89129638671875, "logps/rejected": -349.87646484375, "loss": 0.0046, "losses/dpo": 6.210854053279036e-10, "losses/sft": 0.8497408032417297, "losses/total": 6.210854053279036e-10, "ref_logps/chosen": -227.6259765625, "ref_logps/rejected": -206.92745971679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4265308380126953, "rewards/margins": 12.868369102478027, "rewards/rejected": -14.294899940490723, "step": 2286 }, { "epoch": 0.55, "learning_rate": 1.0026666666666665e-07, "logps/chosen": -238.85723876953125, "logps/rejected": -360.93310546875, "loss": 0.0024, "losses/dpo": 1.6750744862292777e-06, "losses/sft": 0.5890098810195923, "losses/total": 1.6750744862292777e-06, "ref_logps/chosen": -224.27334594726562, "ref_logps/rejected": -217.77879333496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4583896398544312, "rewards/margins": 12.85704231262207, "rewards/rejected": -14.315431594848633, "step": 2287 }, { "epoch": 0.55, "learning_rate": 1.0021333333333333e-07, "logps/chosen": -269.92828369140625, "logps/rejected": -382.79974365234375, "loss": 0.0007, "losses/dpo": 8.058230527296928e-09, "losses/sft": 0.7414445877075195, "losses/total": 8.058230527296928e-09, "ref_logps/chosen": -254.88504028320312, "ref_logps/rejected": -226.8816680908203, "rewards/accuracies": 1.0, "rewards/chosen": -1.5043253898620605, "rewards/margins": 14.087482452392578, "rewards/rejected": -15.591808319091797, "step": 2288 }, { "epoch": 0.55, "learning_rate": 1.0016e-07, "logps/chosen": -218.22439575195312, "logps/rejected": -356.2713623046875, "loss": 0.0011, "losses/dpo": 5.29761337020318e-07, "losses/sft": 0.6000159978866577, "losses/total": 5.29761337020318e-07, "ref_logps/chosen": -204.61314392089844, "ref_logps/rejected": -214.8399200439453, "rewards/accuracies": 1.0, "rewards/chosen": -1.361124873161316, "rewards/margins": 12.782018661499023, "rewards/rejected": -14.143144607543945, "step": 2289 }, { "epoch": 0.55, "learning_rate": 1.0010666666666667e-07, "logps/chosen": -208.45297241210938, "logps/rejected": -340.662353515625, "loss": 0.0094, "losses/dpo": 9.03211159197781e-08, "losses/sft": 0.340877890586853, "losses/total": 9.03211159197781e-08, "ref_logps/chosen": -195.00811767578125, "ref_logps/rejected": -216.24258422851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3444833755493164, "rewards/margins": 11.097495079040527, "rewards/rejected": -12.44197940826416, "step": 2290 }, { "epoch": 0.55, "learning_rate": 1.0005333333333332e-07, "logps/chosen": -226.359375, "logps/rejected": -384.52532958984375, "loss": 0.0038, "losses/dpo": 6.864139479034748e-09, "losses/sft": 0.5060995221138, "losses/total": 6.864139479034748e-09, "ref_logps/chosen": -214.97885131835938, "ref_logps/rejected": -234.45916748046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1380529403686523, "rewards/margins": 13.868562698364258, "rewards/rejected": -15.00661563873291, "step": 2291 }, { "epoch": 0.55, "learning_rate": 1e-07, "logps/chosen": -246.26446533203125, "logps/rejected": -415.175537109375, "loss": 0.0013, "losses/dpo": 2.7722660433937563e-06, "losses/sft": 0.5481411218643188, "losses/total": 2.7722660433937563e-06, "ref_logps/chosen": -233.7669677734375, "ref_logps/rejected": -259.398193359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2497516870498657, "rewards/margins": 14.327981948852539, "rewards/rejected": -15.577733993530273, "step": 2292 }, { "epoch": 0.55, "learning_rate": 9.994666666666666e-08, "logps/chosen": -246.72792053222656, "logps/rejected": -368.7378845214844, "loss": 0.0018, "losses/dpo": 3.5139564147357305e-07, "losses/sft": 0.7389965057373047, "losses/total": 3.5139564147357305e-07, "ref_logps/chosen": -235.4989471435547, "ref_logps/rejected": -213.30514526367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1228985786437988, "rewards/margins": 14.420376777648926, "rewards/rejected": -15.543275833129883, "step": 2293 }, { "epoch": 0.55, "learning_rate": 9.989333333333334e-08, "logps/chosen": -246.78179931640625, "logps/rejected": -395.0770568847656, "loss": 0.0088, "losses/dpo": 2.257278765682713e-06, "losses/sft": 0.7279987931251526, "losses/total": 2.257278765682713e-06, "ref_logps/chosen": -234.1453399658203, "ref_logps/rejected": -246.8951416015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2636456489562988, "rewards/margins": 13.554542541503906, "rewards/rejected": -14.818188667297363, "step": 2294 }, { "epoch": 0.55, "learning_rate": 9.983999999999999e-08, "logps/chosen": -180.28604125976562, "logps/rejected": -364.0815734863281, "loss": 0.0032, "losses/dpo": 1.5502374139941821e-07, "losses/sft": 0.6671282649040222, "losses/total": 1.5502374139941821e-07, "ref_logps/chosen": -169.82615661621094, "ref_logps/rejected": -220.7838592529297, "rewards/accuracies": 1.0, "rewards/chosen": -1.0459905862808228, "rewards/margins": 13.283782958984375, "rewards/rejected": -14.32977294921875, "step": 2295 }, { "epoch": 0.55, "learning_rate": 9.978666666666666e-08, "logps/chosen": -207.260498046875, "logps/rejected": -359.3201904296875, "loss": 0.0004, "losses/dpo": 2.572670609879424e-07, "losses/sft": 1.0911939144134521, "losses/total": 2.572670609879424e-07, "ref_logps/chosen": -196.58755493164062, "ref_logps/rejected": -216.7667236328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0672924518585205, "rewards/margins": 13.188055038452148, "rewards/rejected": -14.25534725189209, "step": 2296 }, { "epoch": 0.55, "learning_rate": 9.973333333333333e-08, "logps/chosen": -240.05642700195312, "logps/rejected": -353.6397399902344, "loss": 0.0084, "losses/dpo": 6.141793846836663e-07, "losses/sft": 0.5393868684768677, "losses/total": 6.141793846836663e-07, "ref_logps/chosen": -226.88333129882812, "ref_logps/rejected": -205.9941864013672, "rewards/accuracies": 1.0, "rewards/chosen": -1.317310094833374, "rewards/margins": 13.447246551513672, "rewards/rejected": -14.764556884765625, "step": 2297 }, { "epoch": 0.55, "learning_rate": 9.968e-08, "logps/chosen": -253.77484130859375, "logps/rejected": -413.7645263671875, "loss": 0.0003, "losses/dpo": 0.0004802227776963264, "losses/sft": 0.4444243907928467, "losses/total": 0.0004802227776963264, "ref_logps/chosen": -240.11517333984375, "ref_logps/rejected": -251.06236267089844, "rewards/accuracies": 1.0, "rewards/chosen": -1.3659676313400269, "rewards/margins": 14.90424919128418, "rewards/rejected": -16.270217895507812, "step": 2298 }, { "epoch": 0.55, "learning_rate": 9.962666666666665e-08, "logps/chosen": -222.8824462890625, "logps/rejected": -388.2259826660156, "loss": 0.0005, "losses/dpo": 2.3379465030615165e-09, "losses/sft": 0.534748911857605, "losses/total": 2.3379465030615165e-09, "ref_logps/chosen": -210.79873657226562, "ref_logps/rejected": -236.796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.208372950553894, "rewards/margins": 13.934536933898926, "rewards/rejected": -15.14291000366211, "step": 2299 }, { "epoch": 0.55, "learning_rate": 9.957333333333333e-08, "logps/chosen": -285.4659118652344, "logps/rejected": -380.808349609375, "loss": 0.0008, "losses/dpo": 1.1662022814107331e-07, "losses/sft": 0.9013906717300415, "losses/total": 1.1662022814107331e-07, "ref_logps/chosen": -267.859619140625, "ref_logps/rejected": -237.52484130859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7606302499771118, "rewards/margins": 12.56772232055664, "rewards/rejected": -14.328351974487305, "step": 2300 }, { "epoch": 0.55, "learning_rate": 9.952e-08, "logps/chosen": -254.86248779296875, "logps/rejected": -376.3894958496094, "loss": 0.0006, "losses/dpo": 1.1644119695120025e-06, "losses/sft": 1.012023687362671, "losses/total": 1.1644119695120025e-06, "ref_logps/chosen": -241.25782775878906, "ref_logps/rejected": -226.75515747070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3604649305343628, "rewards/margins": 13.6029691696167, "rewards/rejected": -14.963434219360352, "step": 2301 }, { "epoch": 0.55, "learning_rate": 9.946666666666667e-08, "logps/chosen": -230.24102783203125, "logps/rejected": -399.3916015625, "loss": 0.0001, "losses/dpo": 1.7343475267050934e-10, "losses/sft": 0.8151984810829163, "losses/total": 1.7343475267050934e-10, "ref_logps/chosen": -216.6962890625, "ref_logps/rejected": -235.06692504882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3544718027114868, "rewards/margins": 15.07800006866455, "rewards/rejected": -16.432472229003906, "step": 2302 }, { "epoch": 0.55, "learning_rate": 9.941333333333332e-08, "logps/chosen": -218.72982788085938, "logps/rejected": -337.40911865234375, "loss": 0.0074, "losses/dpo": 0.00011755619198083878, "losses/sft": 0.5463643670082092, "losses/total": 0.00011755619198083878, "ref_logps/chosen": -206.33726501464844, "ref_logps/rejected": -209.26864624023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.2392573356628418, "rewards/margins": 11.574790000915527, "rewards/rejected": -12.814047813415527, "step": 2303 }, { "epoch": 0.55, "learning_rate": 9.936e-08, "logps/chosen": -250.42356872558594, "logps/rejected": -350.34075927734375, "loss": 0.002, "losses/dpo": 2.7670787403621944e-06, "losses/sft": 0.5805696845054626, "losses/total": 2.7670787403621944e-06, "ref_logps/chosen": -236.65687561035156, "ref_logps/rejected": -217.7165069580078, "rewards/accuracies": 1.0, "rewards/chosen": -1.376670002937317, "rewards/margins": 11.885754585266113, "rewards/rejected": -13.26242446899414, "step": 2304 }, { "epoch": 0.55, "learning_rate": 9.930666666666666e-08, "logps/chosen": -290.62689208984375, "logps/rejected": -368.7726135253906, "loss": 0.0017, "losses/dpo": 1.4808813375566388e-07, "losses/sft": 0.6149272322654724, "losses/total": 1.4808813375566388e-07, "ref_logps/chosen": -276.6713562011719, "ref_logps/rejected": -226.09210205078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3955559730529785, "rewards/margins": 12.872495651245117, "rewards/rejected": -14.268050193786621, "step": 2305 }, { "epoch": 0.55, "learning_rate": 9.925333333333334e-08, "logps/chosen": -231.81480407714844, "logps/rejected": -362.0771789550781, "loss": 0.0009, "losses/dpo": 3.4028366826532874e-06, "losses/sft": 0.4569995105266571, "losses/total": 3.4028366826532874e-06, "ref_logps/chosen": -218.34115600585938, "ref_logps/rejected": -223.486572265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3473660945892334, "rewards/margins": 12.511693954467773, "rewards/rejected": -13.85905933380127, "step": 2306 }, { "epoch": 0.55, "learning_rate": 9.919999999999999e-08, "logps/chosen": -255.63003540039062, "logps/rejected": -348.7801818847656, "loss": 0.0021, "losses/dpo": 1.7277053121489416e-09, "losses/sft": 0.6613872647285461, "losses/total": 1.7277053121489416e-09, "ref_logps/chosen": -237.82406616210938, "ref_logps/rejected": -212.30474853515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7805989980697632, "rewards/margins": 11.866945266723633, "rewards/rejected": -13.647544860839844, "step": 2307 }, { "epoch": 0.55, "learning_rate": 9.914666666666667e-08, "logps/chosen": -220.12738037109375, "logps/rejected": -382.2796630859375, "loss": 0.0008, "losses/dpo": 9.079421658952924e-08, "losses/sft": 0.4719870984554291, "losses/total": 9.079421658952924e-08, "ref_logps/chosen": -209.8867950439453, "ref_logps/rejected": -227.1222381591797, "rewards/accuracies": 1.0, "rewards/chosen": -1.0240590572357178, "rewards/margins": 14.491681098937988, "rewards/rejected": -15.515739440917969, "step": 2308 }, { "epoch": 0.55, "learning_rate": 9.909333333333333e-08, "logps/chosen": -215.0113067626953, "logps/rejected": -324.0934753417969, "loss": 0.0004, "losses/dpo": 4.4086934103404474e-10, "losses/sft": 0.5904011130332947, "losses/total": 4.4086934103404474e-10, "ref_logps/chosen": -203.181884765625, "ref_logps/rejected": -194.0023193359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.182941198348999, "rewards/margins": 11.826170921325684, "rewards/rejected": -13.009111404418945, "step": 2309 }, { "epoch": 0.55, "learning_rate": 9.903999999999999e-08, "logps/chosen": -286.33892822265625, "logps/rejected": -415.44732666015625, "loss": 0.0003, "losses/dpo": 0.003119313158094883, "losses/sft": 0.7363908886909485, "losses/total": 0.003119313158094883, "ref_logps/chosen": -270.0692138671875, "ref_logps/rejected": -246.90475463867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6269710063934326, "rewards/margins": 15.22728443145752, "rewards/rejected": -16.85425567626953, "step": 2310 }, { "epoch": 0.55, "learning_rate": 9.898666666666667e-08, "logps/chosen": -217.58111572265625, "logps/rejected": -377.22991943359375, "loss": 0.007, "losses/dpo": 4.4115243014175576e-08, "losses/sft": 0.6850181818008423, "losses/total": 4.4115243014175576e-08, "ref_logps/chosen": -203.11122131347656, "ref_logps/rejected": -226.80125427246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4469914436340332, "rewards/margins": 13.595876693725586, "rewards/rejected": -15.042867660522461, "step": 2311 }, { "epoch": 0.55, "learning_rate": 9.893333333333332e-08, "logps/chosen": -263.2269287109375, "logps/rejected": -355.06854248046875, "loss": 0.0081, "losses/dpo": 9.0870044004987e-06, "losses/sft": 0.6224904656410217, "losses/total": 9.0870044004987e-06, "ref_logps/chosen": -250.96731567382812, "ref_logps/rejected": -225.86624145507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.225961685180664, "rewards/margins": 11.694270133972168, "rewards/rejected": -12.920230865478516, "step": 2312 }, { "epoch": 0.56, "learning_rate": 9.888e-08, "logps/chosen": -222.24685668945312, "logps/rejected": -369.5613098144531, "loss": 0.0023, "losses/dpo": 2.127989517930473e-07, "losses/sft": 1.0634734630584717, "losses/total": 2.127989517930473e-07, "ref_logps/chosen": -212.66175842285156, "ref_logps/rejected": -230.36517333984375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9585094451904297, "rewards/margins": 12.961105346679688, "rewards/rejected": -13.919614791870117, "step": 2313 }, { "epoch": 0.56, "learning_rate": 9.882666666666666e-08, "logps/chosen": -246.85272216796875, "logps/rejected": -382.0030517578125, "loss": 0.0085, "losses/dpo": 8.154186303954702e-08, "losses/sft": 0.545440673828125, "losses/total": 8.154186303954702e-08, "ref_logps/chosen": -233.3988494873047, "ref_logps/rejected": -236.24081420898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.3453876972198486, "rewards/margins": 13.230840682983398, "rewards/rejected": -14.576228141784668, "step": 2314 }, { "epoch": 0.56, "learning_rate": 9.877333333333334e-08, "logps/chosen": -285.45916748046875, "logps/rejected": -385.9154968261719, "loss": 0.0007, "losses/dpo": 8.917705258681963e-07, "losses/sft": 0.7015578150749207, "losses/total": 8.917705258681963e-07, "ref_logps/chosen": -267.35736083984375, "ref_logps/rejected": -237.34335327148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.810180902481079, "rewards/margins": 13.047033309936523, "rewards/rejected": -14.85721492767334, "step": 2315 }, { "epoch": 0.56, "learning_rate": 9.871999999999999e-08, "logps/chosen": -215.31883239746094, "logps/rejected": -340.46856689453125, "loss": 0.0044, "losses/dpo": 5.583975507761352e-06, "losses/sft": 0.5658040642738342, "losses/total": 5.583975507761352e-06, "ref_logps/chosen": -204.04376220703125, "ref_logps/rejected": -206.07760620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1275091171264648, "rewards/margins": 12.3115873336792, "rewards/rejected": -13.439096450805664, "step": 2316 }, { "epoch": 0.56, "learning_rate": 9.866666666666666e-08, "logps/chosen": -224.798828125, "logps/rejected": -356.041015625, "loss": 0.0037, "losses/dpo": 3.2107266179082217e-06, "losses/sft": 1.207785725593567, "losses/total": 3.2107266179082217e-06, "ref_logps/chosen": -213.17474365234375, "ref_logps/rejected": -217.2407684326172, "rewards/accuracies": 1.0, "rewards/chosen": -1.1624091863632202, "rewards/margins": 12.717613220214844, "rewards/rejected": -13.880023956298828, "step": 2317 }, { "epoch": 0.56, "learning_rate": 9.861333333333333e-08, "logps/chosen": -259.6193542480469, "logps/rejected": -341.9388732910156, "loss": 0.0009, "losses/dpo": 1.855471865042091e-08, "losses/sft": 0.564508318901062, "losses/total": 1.855471865042091e-08, "ref_logps/chosen": -243.79103088378906, "ref_logps/rejected": -209.25209045410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.582833170890808, "rewards/margins": 11.685843467712402, "rewards/rejected": -13.2686767578125, "step": 2318 }, { "epoch": 0.56, "learning_rate": 9.856e-08, "logps/chosen": -276.8015441894531, "logps/rejected": -376.67364501953125, "loss": 0.0031, "losses/dpo": 6.427377229556441e-06, "losses/sft": 0.7000522613525391, "losses/total": 6.427377229556441e-06, "ref_logps/chosen": -263.1982421875, "ref_logps/rejected": -231.45306396484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3603308200836182, "rewards/margins": 13.161725997924805, "rewards/rejected": -14.522056579589844, "step": 2319 }, { "epoch": 0.56, "learning_rate": 9.850666666666665e-08, "logps/chosen": -252.00387573242188, "logps/rejected": -363.4907531738281, "loss": 0.0016, "losses/dpo": 3.028980017916183e-07, "losses/sft": 0.5577481389045715, "losses/total": 3.028980017916183e-07, "ref_logps/chosen": -236.30577087402344, "ref_logps/rejected": -213.35818481445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5698108673095703, "rewards/margins": 13.44344425201416, "rewards/rejected": -15.013256072998047, "step": 2320 }, { "epoch": 0.56, "learning_rate": 9.845333333333333e-08, "logps/chosen": -268.22711181640625, "logps/rejected": -375.2305908203125, "loss": 0.0131, "losses/dpo": 6.807609054249042e-08, "losses/sft": 0.7062000036239624, "losses/total": 6.807609054249042e-08, "ref_logps/chosen": -252.83291625976562, "ref_logps/rejected": -219.03598022460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5394209623336792, "rewards/margins": 14.080041885375977, "rewards/rejected": -15.619462966918945, "step": 2321 }, { "epoch": 0.56, "learning_rate": 9.84e-08, "logps/chosen": -255.89112854003906, "logps/rejected": -385.4278259277344, "loss": 0.0007, "losses/dpo": 6.423648528652848e-07, "losses/sft": 0.7274383902549744, "losses/total": 6.423648528652848e-07, "ref_logps/chosen": -241.28152465820312, "ref_logps/rejected": -238.17005920410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4609594345092773, "rewards/margins": 13.26481819152832, "rewards/rejected": -14.725776672363281, "step": 2322 }, { "epoch": 0.56, "learning_rate": 9.834666666666667e-08, "logps/chosen": -219.35684204101562, "logps/rejected": -389.6991882324219, "loss": 0.0002, "losses/dpo": 5.968400728306733e-05, "losses/sft": 0.3691384196281433, "losses/total": 5.968400728306733e-05, "ref_logps/chosen": -202.76937866210938, "ref_logps/rejected": -230.3101806640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6587477922439575, "rewards/margins": 14.280153274536133, "rewards/rejected": -15.9389009475708, "step": 2323 }, { "epoch": 0.56, "learning_rate": 9.829333333333333e-08, "logps/chosen": -232.29638671875, "logps/rejected": -334.0762634277344, "loss": 0.0018, "losses/dpo": 9.488944488111883e-05, "losses/sft": 0.7162287831306458, "losses/total": 9.488944488111883e-05, "ref_logps/chosen": -218.77516174316406, "ref_logps/rejected": -202.98898315429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.3521229028701782, "rewards/margins": 11.756604194641113, "rewards/rejected": -13.108728408813477, "step": 2324 }, { "epoch": 0.56, "learning_rate": 9.824e-08, "logps/chosen": -231.9025115966797, "logps/rejected": -392.14654541015625, "loss": 0.0017, "losses/dpo": 1.5020356158856885e-06, "losses/sft": 0.776736855506897, "losses/total": 1.5020356158856885e-06, "ref_logps/chosen": -218.51156616210938, "ref_logps/rejected": -246.14207458496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3390932083129883, "rewards/margins": 13.261351585388184, "rewards/rejected": -14.600444793701172, "step": 2325 }, { "epoch": 0.56, "learning_rate": 9.818666666666666e-08, "logps/chosen": -226.5547332763672, "logps/rejected": -361.4154052734375, "loss": 0.0007, "losses/dpo": 6.039969662197109e-07, "losses/sft": 0.6459108591079712, "losses/total": 6.039969662197109e-07, "ref_logps/chosen": -211.0767364501953, "ref_logps/rejected": -221.95159912109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5478001832962036, "rewards/margins": 12.398578643798828, "rewards/rejected": -13.946378707885742, "step": 2326 }, { "epoch": 0.56, "learning_rate": 9.813333333333333e-08, "logps/chosen": -270.68865966796875, "logps/rejected": -357.7596740722656, "loss": 0.0041, "losses/dpo": 2.5778460894798627e-06, "losses/sft": 0.7051135301589966, "losses/total": 2.5778460894798627e-06, "ref_logps/chosen": -258.6165771484375, "ref_logps/rejected": -217.5584259033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.207207202911377, "rewards/margins": 12.812918663024902, "rewards/rejected": -14.020126342773438, "step": 2327 }, { "epoch": 0.56, "learning_rate": 9.808e-08, "logps/chosen": -231.9167938232422, "logps/rejected": -365.02203369140625, "loss": 0.0084, "losses/dpo": 1.5474951098326528e-08, "losses/sft": 0.9146334528923035, "losses/total": 1.5474951098326528e-08, "ref_logps/chosen": -217.8717498779297, "ref_logps/rejected": -217.7427215576172, "rewards/accuracies": 1.0, "rewards/chosen": -1.4045051336288452, "rewards/margins": 13.323427200317383, "rewards/rejected": -14.72793197631836, "step": 2328 }, { "epoch": 0.56, "learning_rate": 9.802666666666665e-08, "logps/chosen": -235.8797607421875, "logps/rejected": -372.5382080078125, "loss": 0.0054, "losses/dpo": 3.909735823981464e-08, "losses/sft": 0.6534193754196167, "losses/total": 3.909735823981464e-08, "ref_logps/chosen": -222.79043579101562, "ref_logps/rejected": -229.7126007080078, "rewards/accuracies": 1.0, "rewards/chosen": -1.3089323043823242, "rewards/margins": 12.973633766174316, "rewards/rejected": -14.28256607055664, "step": 2329 }, { "epoch": 0.56, "learning_rate": 9.797333333333333e-08, "logps/chosen": -213.7708282470703, "logps/rejected": -369.822265625, "loss": 0.0019, "losses/dpo": 2.2185888542969323e-11, "losses/sft": 0.6478893160820007, "losses/total": 2.2185888542969323e-11, "ref_logps/chosen": -202.71688842773438, "ref_logps/rejected": -225.91465759277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.105393409729004, "rewards/margins": 13.285367012023926, "rewards/rejected": -14.39076042175293, "step": 2330 }, { "epoch": 0.56, "learning_rate": 9.791999999999999e-08, "logps/chosen": -213.22483825683594, "logps/rejected": -355.4648742675781, "loss": 0.0017, "losses/dpo": 5.2886417023501053e-08, "losses/sft": 0.3930279612541199, "losses/total": 5.2886417023501053e-08, "ref_logps/chosen": -200.98861694335938, "ref_logps/rejected": -210.2305450439453, "rewards/accuracies": 1.0, "rewards/chosen": -1.2236213684082031, "rewards/margins": 13.299810409545898, "rewards/rejected": -14.523431777954102, "step": 2331 }, { "epoch": 0.56, "learning_rate": 9.786666666666667e-08, "logps/chosen": -228.36691284179688, "logps/rejected": -350.0528564453125, "loss": 0.0088, "losses/dpo": 1.0328823918825947e-05, "losses/sft": 0.6773537397384644, "losses/total": 1.0328823918825947e-05, "ref_logps/chosen": -215.3859100341797, "ref_logps/rejected": -213.72329711914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2981020212173462, "rewards/margins": 12.334854125976562, "rewards/rejected": -13.632955551147461, "step": 2332 }, { "epoch": 0.56, "learning_rate": 9.781333333333332e-08, "logps/chosen": -300.4783935546875, "logps/rejected": -393.74981689453125, "loss": 0.0178, "losses/dpo": 3.029962725165092e-09, "losses/sft": 0.5166178345680237, "losses/total": 3.029962725165092e-09, "ref_logps/chosen": -287.55938720703125, "ref_logps/rejected": -251.75006103515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2919020652770996, "rewards/margins": 12.908075332641602, "rewards/rejected": -14.19997787475586, "step": 2333 }, { "epoch": 0.56, "learning_rate": 9.776e-08, "logps/chosen": -247.22877502441406, "logps/rejected": -381.38092041015625, "loss": 0.0013, "losses/dpo": 1.1485686854939559e-06, "losses/sft": 0.5469247102737427, "losses/total": 1.1485686854939559e-06, "ref_logps/chosen": -233.284912109375, "ref_logps/rejected": -233.8999786376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3943854570388794, "rewards/margins": 13.353708267211914, "rewards/rejected": -14.748093605041504, "step": 2334 }, { "epoch": 0.56, "learning_rate": 9.770666666666666e-08, "logps/chosen": -224.47686767578125, "logps/rejected": -335.6162109375, "loss": 0.0014, "losses/dpo": 2.9046725558146136e-06, "losses/sft": 1.0817550420761108, "losses/total": 2.9046725558146136e-06, "ref_logps/chosen": -213.5269775390625, "ref_logps/rejected": -199.8716278076172, "rewards/accuracies": 1.0, "rewards/chosen": -1.0949891805648804, "rewards/margins": 12.479469299316406, "rewards/rejected": -13.574459075927734, "step": 2335 }, { "epoch": 0.56, "learning_rate": 9.765333333333334e-08, "logps/chosen": -255.509521484375, "logps/rejected": -330.8234558105469, "loss": 0.0073, "losses/dpo": 3.911906787834596e-06, "losses/sft": 0.6703676581382751, "losses/total": 3.911906787834596e-06, "ref_logps/chosen": -243.173583984375, "ref_logps/rejected": -200.2502899169922, "rewards/accuracies": 1.0, "rewards/chosen": -1.233593225479126, "rewards/margins": 11.823724746704102, "rewards/rejected": -13.057317733764648, "step": 2336 }, { "epoch": 0.56, "learning_rate": 9.759999999999999e-08, "logps/chosen": -250.01158142089844, "logps/rejected": -356.08966064453125, "loss": 0.0008, "losses/dpo": 1.5028849276177425e-08, "losses/sft": 0.5793498158454895, "losses/total": 1.5028849276177425e-08, "ref_logps/chosen": -239.25875854492188, "ref_logps/rejected": -212.39169311523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.075282335281372, "rewards/margins": 13.294515609741211, "rewards/rejected": -14.369795799255371, "step": 2337 }, { "epoch": 0.56, "learning_rate": 9.754666666666666e-08, "logps/chosen": -219.67311096191406, "logps/rejected": -356.1156005859375, "loss": 0.008, "losses/dpo": 1.0750805046200185e-07, "losses/sft": 0.5348184108734131, "losses/total": 1.0750805046200185e-07, "ref_logps/chosen": -208.7452850341797, "ref_logps/rejected": -219.3004913330078, "rewards/accuracies": 1.0, "rewards/chosen": -1.0927836894989014, "rewards/margins": 12.588726043701172, "rewards/rejected": -13.681510925292969, "step": 2338 }, { "epoch": 0.56, "learning_rate": 9.749333333333333e-08, "logps/chosen": -225.3559112548828, "logps/rejected": -382.67376708984375, "loss": 0.0006, "losses/dpo": 4.264993247460325e-09, "losses/sft": 0.6211250424385071, "losses/total": 4.264993247460325e-09, "ref_logps/chosen": -215.4073028564453, "ref_logps/rejected": -231.037109375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9948605298995972, "rewards/margins": 14.168807029724121, "rewards/rejected": -15.163667678833008, "step": 2339 }, { "epoch": 0.56, "learning_rate": 9.744e-08, "logps/chosen": -283.969970703125, "logps/rejected": -381.9249267578125, "loss": 0.0039, "losses/dpo": 2.2437683711018508e-08, "losses/sft": 0.750261664390564, "losses/total": 2.2437683711018508e-08, "ref_logps/chosen": -267.94049072265625, "ref_logps/rejected": -228.64907836914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6029460430145264, "rewards/margins": 13.724638938903809, "rewards/rejected": -15.32758617401123, "step": 2340 }, { "epoch": 0.56, "learning_rate": 9.738666666666667e-08, "logps/chosen": -235.134033203125, "logps/rejected": -367.8486328125, "loss": 0.0005, "losses/dpo": 5.0616159569472075e-05, "losses/sft": 0.524658739566803, "losses/total": 5.0616159569472075e-05, "ref_logps/chosen": -222.55459594726562, "ref_logps/rejected": -220.08900451660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.257944107055664, "rewards/margins": 13.518019676208496, "rewards/rejected": -14.77596378326416, "step": 2341 }, { "epoch": 0.56, "learning_rate": 9.733333333333333e-08, "logps/chosen": -288.9599914550781, "logps/rejected": -383.4059753417969, "loss": 0.0022, "losses/dpo": 4.396988995125639e-09, "losses/sft": 0.7153480052947998, "losses/total": 4.396988995125639e-09, "ref_logps/chosen": -274.1710510253906, "ref_logps/rejected": -227.71615600585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4788941144943237, "rewards/margins": 14.090089797973633, "rewards/rejected": -15.56898307800293, "step": 2342 }, { "epoch": 0.56, "learning_rate": 9.728e-08, "logps/chosen": -221.29922485351562, "logps/rejected": -334.30291748046875, "loss": 0.0018, "losses/dpo": 4.619929313776083e-05, "losses/sft": 1.203070878982544, "losses/total": 4.619929313776083e-05, "ref_logps/chosen": -212.78753662109375, "ref_logps/rejected": -204.89915466308594, "rewards/accuracies": 1.0, "rewards/chosen": -0.8511708378791809, "rewards/margins": 12.08920669555664, "rewards/rejected": -12.940377235412598, "step": 2343 }, { "epoch": 0.56, "learning_rate": 9.722666666666666e-08, "logps/chosen": -225.93711853027344, "logps/rejected": -347.2160339355469, "loss": 0.001, "losses/dpo": 2.809799859448958e-08, "losses/sft": 0.5957876443862915, "losses/total": 2.809799859448958e-08, "ref_logps/chosen": -211.3533172607422, "ref_logps/rejected": -200.9291534423828, "rewards/accuracies": 1.0, "rewards/chosen": -1.4583795070648193, "rewards/margins": 13.170309066772461, "rewards/rejected": -14.62868881225586, "step": 2344 }, { "epoch": 0.56, "learning_rate": 9.717333333333334e-08, "logps/chosen": -285.28387451171875, "logps/rejected": -417.51824951171875, "loss": 0.0018, "losses/dpo": 4.6526923824785626e-07, "losses/sft": 0.4967104494571686, "losses/total": 4.6526923824785626e-07, "ref_logps/chosen": -271.92193603515625, "ref_logps/rejected": -255.5054931640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.336195468902588, "rewards/margins": 14.865077018737793, "rewards/rejected": -16.20127296447754, "step": 2345 }, { "epoch": 0.56, "learning_rate": 9.711999999999999e-08, "logps/chosen": -226.64854431152344, "logps/rejected": -350.4185485839844, "loss": 0.0004, "losses/dpo": 5.593029511752334e-12, "losses/sft": 0.8010587096214294, "losses/total": 5.593029511752334e-12, "ref_logps/chosen": -213.17092895507812, "ref_logps/rejected": -203.43548583984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3477613925933838, "rewards/margins": 13.350543022155762, "rewards/rejected": -14.698305130004883, "step": 2346 }, { "epoch": 0.56, "learning_rate": 9.706666666666666e-08, "logps/chosen": -221.99795532226562, "logps/rejected": -339.8409423828125, "loss": 0.0031, "losses/dpo": 2.819975634338334e-06, "losses/sft": 0.5481133460998535, "losses/total": 2.819975634338334e-06, "ref_logps/chosen": -208.5880126953125, "ref_logps/rejected": -206.1347198486328, "rewards/accuracies": 1.0, "rewards/chosen": -1.3409913778305054, "rewards/margins": 12.029631614685059, "rewards/rejected": -13.370623588562012, "step": 2347 }, { "epoch": 0.56, "learning_rate": 9.701333333333333e-08, "logps/chosen": -229.57066345214844, "logps/rejected": -352.0600280761719, "loss": 0.0015, "losses/dpo": 2.4775581586311546e-09, "losses/sft": 0.5351441502571106, "losses/total": 2.4775581586311546e-09, "ref_logps/chosen": -217.0517578125, "ref_logps/rejected": -215.7378692626953, "rewards/accuracies": 1.0, "rewards/chosen": -1.251889705657959, "rewards/margins": 12.380324363708496, "rewards/rejected": -13.632214546203613, "step": 2348 }, { "epoch": 0.56, "learning_rate": 9.696e-08, "logps/chosen": -229.69619750976562, "logps/rejected": -353.3499450683594, "loss": 0.002, "losses/dpo": 3.4479595001357666e-07, "losses/sft": 0.5354375839233398, "losses/total": 3.4479595001357666e-07, "ref_logps/chosen": -218.16796875, "ref_logps/rejected": -205.19261169433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.1528222560882568, "rewards/margins": 13.662912368774414, "rewards/rejected": -14.815733909606934, "step": 2349 }, { "epoch": 0.56, "learning_rate": 9.690666666666665e-08, "logps/chosen": -240.755126953125, "logps/rejected": -384.898681640625, "loss": 0.0002, "losses/dpo": 1.8339741245654295e-06, "losses/sft": 1.082271933555603, "losses/total": 1.8339741245654295e-06, "ref_logps/chosen": -229.52581787109375, "ref_logps/rejected": -239.3604736328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1229302883148193, "rewards/margins": 13.430889129638672, "rewards/rejected": -14.55381965637207, "step": 2350 }, { "epoch": 0.56, "learning_rate": 9.685333333333333e-08, "logps/chosen": -204.99960327148438, "logps/rejected": -310.987060546875, "loss": 0.0043, "losses/dpo": 2.684948776732199e-05, "losses/sft": 0.6782426834106445, "losses/total": 2.684948776732199e-05, "ref_logps/chosen": -193.96839904785156, "ref_logps/rejected": -192.30270385742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.103121280670166, "rewards/margins": 10.765315055847168, "rewards/rejected": -11.868436813354492, "step": 2351 }, { "epoch": 0.56, "learning_rate": 9.679999999999999e-08, "logps/chosen": -230.40243530273438, "logps/rejected": -347.57098388671875, "loss": 0.0016, "losses/dpo": 0.00012196087482152507, "losses/sft": 1.187470555305481, "losses/total": 0.00012196087482152507, "ref_logps/chosen": -217.58114624023438, "ref_logps/rejected": -203.71409606933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2821288108825684, "rewards/margins": 13.103557586669922, "rewards/rejected": -14.385684967041016, "step": 2352 }, { "epoch": 0.56, "learning_rate": 9.674666666666667e-08, "logps/chosen": -242.88702392578125, "logps/rejected": -359.9779052734375, "loss": 0.0007, "losses/dpo": 9.65384824667126e-05, "losses/sft": 0.6111667156219482, "losses/total": 9.65384824667126e-05, "ref_logps/chosen": -231.05308532714844, "ref_logps/rejected": -209.40196228027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1833924055099487, "rewards/margins": 13.874201774597168, "rewards/rejected": -15.057594299316406, "step": 2353 }, { "epoch": 0.56, "learning_rate": 9.669333333333332e-08, "logps/chosen": -245.21873474121094, "logps/rejected": -388.93341064453125, "loss": 0.0042, "losses/dpo": 5.197111185850645e-09, "losses/sft": 0.6218698620796204, "losses/total": 5.197111185850645e-09, "ref_logps/chosen": -230.00631713867188, "ref_logps/rejected": -236.22286987304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.521242380142212, "rewards/margins": 13.749811172485352, "rewards/rejected": -15.2710542678833, "step": 2354 }, { "epoch": 0.57, "learning_rate": 9.664e-08, "logps/chosen": -246.2388458251953, "logps/rejected": -390.44696044921875, "loss": 0.0004, "losses/dpo": 2.570407787061413e-06, "losses/sft": 0.6824327111244202, "losses/total": 2.570407787061413e-06, "ref_logps/chosen": -229.90089416503906, "ref_logps/rejected": -229.63868713378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.633794903755188, "rewards/margins": 14.447035789489746, "rewards/rejected": -16.08083152770996, "step": 2355 }, { "epoch": 0.57, "learning_rate": 9.658666666666666e-08, "logps/chosen": -234.8967742919922, "logps/rejected": -353.4610290527344, "loss": 0.0014, "losses/dpo": 2.430425860566743e-09, "losses/sft": 1.1428682804107666, "losses/total": 2.430425860566743e-09, "ref_logps/chosen": -219.70204162597656, "ref_logps/rejected": -207.5643310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5194745063781738, "rewards/margins": 13.070194244384766, "rewards/rejected": -14.589668273925781, "step": 2356 }, { "epoch": 0.57, "learning_rate": 9.653333333333334e-08, "logps/chosen": -231.42388916015625, "logps/rejected": -371.42059326171875, "loss": 0.0011, "losses/dpo": 3.980394012614852e-06, "losses/sft": 0.6671555638313293, "losses/total": 3.980394012614852e-06, "ref_logps/chosen": -216.5523681640625, "ref_logps/rejected": -221.33229064941406, "rewards/accuracies": 1.0, "rewards/chosen": -1.4871515035629272, "rewards/margins": 13.521679878234863, "rewards/rejected": -15.008829116821289, "step": 2357 }, { "epoch": 0.57, "learning_rate": 9.648e-08, "logps/chosen": -261.0036926269531, "logps/rejected": -414.5191650390625, "loss": 0.0003, "losses/dpo": 0.00042040261905640364, "losses/sft": 0.6148545145988464, "losses/total": 0.00042040261905640364, "ref_logps/chosen": -244.06788635253906, "ref_logps/rejected": -264.291015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6935793161392212, "rewards/margins": 13.32923698425293, "rewards/rejected": -15.02281665802002, "step": 2358 }, { "epoch": 0.57, "learning_rate": 9.642666666666666e-08, "logps/chosen": -271.63250732421875, "logps/rejected": -400.9352111816406, "loss": 0.0079, "losses/dpo": 1.0999477950690562e-07, "losses/sft": 0.6737672090530396, "losses/total": 1.0999477950690562e-07, "ref_logps/chosen": -257.5027770996094, "ref_logps/rejected": -239.67108154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4129716157913208, "rewards/margins": 14.713440895080566, "rewards/rejected": -16.12641143798828, "step": 2359 }, { "epoch": 0.57, "learning_rate": 9.637333333333333e-08, "logps/chosen": -265.9471435546875, "logps/rejected": -379.7845764160156, "loss": 0.003, "losses/dpo": 3.5015847970498726e-05, "losses/sft": 0.5250881314277649, "losses/total": 3.5015847970498726e-05, "ref_logps/chosen": -252.4022979736328, "ref_logps/rejected": -228.7338409423828, "rewards/accuracies": 1.0, "rewards/chosen": -1.3544844388961792, "rewards/margins": 13.750590324401855, "rewards/rejected": -15.105074882507324, "step": 2360 }, { "epoch": 0.57, "learning_rate": 9.631999999999999e-08, "logps/chosen": -232.47821044921875, "logps/rejected": -358.5286865234375, "loss": 0.0032, "losses/dpo": 6.388464157680573e-08, "losses/sft": 0.4668290317058563, "losses/total": 6.388464157680573e-08, "ref_logps/chosen": -219.49472045898438, "ref_logps/rejected": -210.58953857421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2983503341674805, "rewards/margins": 13.495563507080078, "rewards/rejected": -14.793913841247559, "step": 2361 }, { "epoch": 0.57, "learning_rate": 9.626666666666667e-08, "logps/chosen": -260.14666748046875, "logps/rejected": -351.0531921386719, "loss": 0.0112, "losses/dpo": 3.4211918773507932e-06, "losses/sft": 0.6832187175750732, "losses/total": 3.4211918773507932e-06, "ref_logps/chosen": -244.59669494628906, "ref_logps/rejected": -207.1787872314453, "rewards/accuracies": 1.0, "rewards/chosen": -1.5549967288970947, "rewards/margins": 12.832443237304688, "rewards/rejected": -14.387441635131836, "step": 2362 }, { "epoch": 0.57, "learning_rate": 9.621333333333332e-08, "logps/chosen": -212.73223876953125, "logps/rejected": -317.3275451660156, "loss": 0.0005, "losses/dpo": 5.856644747836981e-06, "losses/sft": 0.6111592650413513, "losses/total": 5.856644747836981e-06, "ref_logps/chosen": -199.08975219726562, "ref_logps/rejected": -183.90872192382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3642475605010986, "rewards/margins": 11.97763442993164, "rewards/rejected": -13.341882705688477, "step": 2363 }, { "epoch": 0.57, "learning_rate": 9.616e-08, "logps/chosen": -236.72174072265625, "logps/rejected": -327.465087890625, "loss": 0.0029, "losses/dpo": 2.5550327791279415e-06, "losses/sft": 0.42313018441200256, "losses/total": 2.5550327791279415e-06, "ref_logps/chosen": -223.9970245361328, "ref_logps/rejected": -196.47369384765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2724722623825073, "rewards/margins": 11.826663970947266, "rewards/rejected": -13.099136352539062, "step": 2364 }, { "epoch": 0.57, "learning_rate": 9.610666666666666e-08, "logps/chosen": -272.3007507324219, "logps/rejected": -369.5225830078125, "loss": 0.0042, "losses/dpo": 4.7170043870892187e-08, "losses/sft": 1.2167435884475708, "losses/total": 4.7170043870892187e-08, "ref_logps/chosen": -259.4740905761719, "ref_logps/rejected": -214.45498657226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.2826672792434692, "rewards/margins": 14.224090576171875, "rewards/rejected": -15.506757736206055, "step": 2365 }, { "epoch": 0.57, "learning_rate": 9.605333333333334e-08, "logps/chosen": -241.57638549804688, "logps/rejected": -389.0330810546875, "loss": 0.0002, "losses/dpo": 6.324100922938669e-07, "losses/sft": 0.9011669158935547, "losses/total": 6.324100922938669e-07, "ref_logps/chosen": -228.07931518554688, "ref_logps/rejected": -237.3096160888672, "rewards/accuracies": 1.0, "rewards/chosen": -1.3497058153152466, "rewards/margins": 13.822643280029297, "rewards/rejected": -15.172348976135254, "step": 2366 }, { "epoch": 0.57, "learning_rate": 9.599999999999999e-08, "logps/chosen": -212.70663452148438, "logps/rejected": -336.8011474609375, "loss": 0.0046, "losses/dpo": 6.790077122786897e-08, "losses/sft": 0.7451272010803223, "losses/total": 6.790077122786897e-08, "ref_logps/chosen": -201.12744140625, "ref_logps/rejected": -208.29727172851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1579186916351318, "rewards/margins": 11.692469596862793, "rewards/rejected": -12.85038948059082, "step": 2367 }, { "epoch": 0.57, "learning_rate": 9.594666666666666e-08, "logps/chosen": -259.65106201171875, "logps/rejected": -417.4386291503906, "loss": 0.0002, "losses/dpo": 1.0041185660725205e-08, "losses/sft": 0.587721049785614, "losses/total": 1.0041185660725205e-08, "ref_logps/chosen": -244.6583709716797, "ref_logps/rejected": -261.58251953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4992687702178955, "rewards/margins": 14.086341857910156, "rewards/rejected": -15.585609436035156, "step": 2368 }, { "epoch": 0.57, "learning_rate": 9.589333333333333e-08, "logps/chosen": -187.731689453125, "logps/rejected": -367.8544616699219, "loss": 0.0002, "losses/dpo": 5.350359478378808e-12, "losses/sft": 0.8930656909942627, "losses/total": 5.350359478378808e-12, "ref_logps/chosen": -175.4026336669922, "ref_logps/rejected": -215.61256408691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2329044342041016, "rewards/margins": 13.991288185119629, "rewards/rejected": -15.224191665649414, "step": 2369 }, { "epoch": 0.57, "learning_rate": 9.584e-08, "logps/chosen": -272.2569885253906, "logps/rejected": -386.48028564453125, "loss": 0.0001, "losses/dpo": 0.00014738149184267968, "losses/sft": 0.5274540781974792, "losses/total": 0.00014738149184267968, "ref_logps/chosen": -252.18487548828125, "ref_logps/rejected": -224.13165283203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0072100162506104, "rewards/margins": 14.227655410766602, "rewards/rejected": -16.234865188598633, "step": 2370 }, { "epoch": 0.57, "learning_rate": 9.578666666666665e-08, "logps/chosen": -237.01531982421875, "logps/rejected": -324.3885498046875, "loss": 0.0119, "losses/dpo": 0.0002494559739716351, "losses/sft": 0.32382625341415405, "losses/total": 0.0002494559739716351, "ref_logps/chosen": -222.52090454101562, "ref_logps/rejected": -196.8593292236328, "rewards/accuracies": 1.0, "rewards/chosen": -1.4494431018829346, "rewards/margins": 11.303478240966797, "rewards/rejected": -12.752922058105469, "step": 2371 }, { "epoch": 0.57, "learning_rate": 9.573333333333333e-08, "logps/chosen": -222.02938842773438, "logps/rejected": -364.45855712890625, "loss": 0.0009, "losses/dpo": 3.313477236588369e-06, "losses/sft": 0.44155213236808777, "losses/total": 3.313477236588369e-06, "ref_logps/chosen": -211.1607666015625, "ref_logps/rejected": -222.6811065673828, "rewards/accuracies": 1.0, "rewards/chosen": -1.0868613719940186, "rewards/margins": 13.090885162353516, "rewards/rejected": -14.17774486541748, "step": 2372 }, { "epoch": 0.57, "learning_rate": 9.568e-08, "logps/chosen": -203.38162231445312, "logps/rejected": -366.4833068847656, "loss": 0.0022, "losses/dpo": 0.0006281216046772897, "losses/sft": 0.575425922870636, "losses/total": 0.0006281216046772897, "ref_logps/chosen": -191.89781188964844, "ref_logps/rejected": -214.46990966796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1483807563781738, "rewards/margins": 14.052961349487305, "rewards/rejected": -15.20134162902832, "step": 2373 }, { "epoch": 0.57, "learning_rate": 9.562666666666667e-08, "logps/chosen": -249.58157348632812, "logps/rejected": -374.22052001953125, "loss": 0.0279, "losses/dpo": 9.160076253067473e-09, "losses/sft": 0.43842706084251404, "losses/total": 9.160076253067473e-09, "ref_logps/chosen": -235.109130859375, "ref_logps/rejected": -225.04832458496094, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4472434520721436, "rewards/margins": 13.469976425170898, "rewards/rejected": -14.917218208312988, "step": 2374 }, { "epoch": 0.57, "learning_rate": 9.557333333333333e-08, "logps/chosen": -236.9725341796875, "logps/rejected": -375.73284912109375, "loss": 0.0011, "losses/dpo": 7.961004371281888e-07, "losses/sft": 0.7509629726409912, "losses/total": 7.961004371281888e-07, "ref_logps/chosen": -223.31076049804688, "ref_logps/rejected": -226.2705078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3661773204803467, "rewards/margins": 13.58005428314209, "rewards/rejected": -14.946231842041016, "step": 2375 }, { "epoch": 0.57, "learning_rate": 9.552e-08, "logps/chosen": -304.0240173339844, "logps/rejected": -435.8815002441406, "loss": 0.0001, "losses/dpo": 1.315394456469221e-06, "losses/sft": 0.5425429344177246, "losses/total": 1.315394456469221e-06, "ref_logps/chosen": -287.7506103515625, "ref_logps/rejected": -262.7991027832031, "rewards/accuracies": 1.0, "rewards/chosen": -1.6273434162139893, "rewards/margins": 15.68089771270752, "rewards/rejected": -17.308242797851562, "step": 2376 }, { "epoch": 0.57, "learning_rate": 9.546666666666666e-08, "logps/chosen": -267.7506103515625, "logps/rejected": -369.47821044921875, "loss": 0.0008, "losses/dpo": 7.36360916686607e-10, "losses/sft": 0.7464603185653687, "losses/total": 7.36360916686607e-10, "ref_logps/chosen": -252.34214782714844, "ref_logps/rejected": -225.03330993652344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5408457517623901, "rewards/margins": 12.903642654418945, "rewards/rejected": -14.444489479064941, "step": 2377 }, { "epoch": 0.57, "learning_rate": 9.541333333333334e-08, "logps/chosen": -235.61790466308594, "logps/rejected": -381.10882568359375, "loss": 0.012, "losses/dpo": 8.620088531863246e-10, "losses/sft": 0.6093048453330994, "losses/total": 8.620088531863246e-10, "ref_logps/chosen": -222.301513671875, "ref_logps/rejected": -239.53424072265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3316376209259033, "rewards/margins": 12.825820922851562, "rewards/rejected": -14.157458305358887, "step": 2378 }, { "epoch": 0.57, "learning_rate": 9.536e-08, "logps/chosen": -215.29519653320312, "logps/rejected": -330.103515625, "loss": 0.0045, "losses/dpo": 1.9420678398773816e-08, "losses/sft": 0.7815389633178711, "losses/total": 1.9420678398773816e-08, "ref_logps/chosen": -203.82736206054688, "ref_logps/rejected": -198.43148803710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.1467828750610352, "rewards/margins": 12.020421981811523, "rewards/rejected": -13.167203903198242, "step": 2379 }, { "epoch": 0.57, "learning_rate": 9.530666666666665e-08, "logps/chosen": -209.45132446289062, "logps/rejected": -351.9725036621094, "loss": 0.0039, "losses/dpo": 0.00032030040165409446, "losses/sft": 0.6564932465553284, "losses/total": 0.00032030040165409446, "ref_logps/chosen": -198.66978454589844, "ref_logps/rejected": -219.48095703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.078153133392334, "rewards/margins": 12.171001434326172, "rewards/rejected": -13.24915599822998, "step": 2380 }, { "epoch": 0.57, "learning_rate": 9.525333333333333e-08, "logps/chosen": -240.73211669921875, "logps/rejected": -364.8769836425781, "loss": 0.0027, "losses/dpo": 1.3264236258692108e-05, "losses/sft": 0.44354936480522156, "losses/total": 1.3264236258692108e-05, "ref_logps/chosen": -226.00558471679688, "ref_logps/rejected": -210.44354248046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.472651481628418, "rewards/margins": 13.970691680908203, "rewards/rejected": -15.443344116210938, "step": 2381 }, { "epoch": 0.57, "learning_rate": 9.519999999999999e-08, "logps/chosen": -197.5181884765625, "logps/rejected": -340.8446044921875, "loss": 0.004, "losses/dpo": 1.6966475868684938e-07, "losses/sft": 0.6647657155990601, "losses/total": 1.6966475868684938e-07, "ref_logps/chosen": -185.57713317871094, "ref_logps/rejected": -203.42474365234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1941052675247192, "rewards/margins": 12.547883033752441, "rewards/rejected": -13.741989135742188, "step": 2382 }, { "epoch": 0.57, "learning_rate": 9.514666666666667e-08, "logps/chosen": -203.3867645263672, "logps/rejected": -373.4170227050781, "loss": 0.0009, "losses/dpo": 2.9261194868013263e-05, "losses/sft": 0.6858322620391846, "losses/total": 2.9261194868013263e-05, "ref_logps/chosen": -193.22726440429688, "ref_logps/rejected": -222.21847534179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.01595139503479, "rewards/margins": 14.103903770446777, "rewards/rejected": -15.119854927062988, "step": 2383 }, { "epoch": 0.57, "learning_rate": 9.509333333333332e-08, "logps/chosen": -242.53781127929688, "logps/rejected": -380.04693603515625, "loss": 0.0005, "losses/dpo": 3.6357321278046584e-06, "losses/sft": 0.5117185711860657, "losses/total": 3.6357321278046584e-06, "ref_logps/chosen": -229.20242309570312, "ref_logps/rejected": -234.5165557861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.3335397243499756, "rewards/margins": 13.219494819641113, "rewards/rejected": -14.553034782409668, "step": 2384 }, { "epoch": 0.57, "learning_rate": 9.504e-08, "logps/chosen": -226.9461669921875, "logps/rejected": -370.5865478515625, "loss": 0.0003, "losses/dpo": 1.5945688858209905e-07, "losses/sft": 0.47697198390960693, "losses/total": 1.5945688858209905e-07, "ref_logps/chosen": -211.77688598632812, "ref_logps/rejected": -223.65695190429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.5169295072555542, "rewards/margins": 13.176031112670898, "rewards/rejected": -14.692960739135742, "step": 2385 }, { "epoch": 0.57, "learning_rate": 9.498666666666666e-08, "logps/chosen": -221.97711181640625, "logps/rejected": -402.453857421875, "loss": 0.0045, "losses/dpo": 4.1848380760711734e-07, "losses/sft": 0.5944380164146423, "losses/total": 4.1848380760711734e-07, "ref_logps/chosen": -208.29087829589844, "ref_logps/rejected": -246.814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3686237335205078, "rewards/margins": 14.195318222045898, "rewards/rejected": -15.563941955566406, "step": 2386 }, { "epoch": 0.57, "learning_rate": 9.493333333333334e-08, "logps/chosen": -232.43548583984375, "logps/rejected": -379.05670166015625, "loss": 0.0006, "losses/dpo": 3.000885726578417e-07, "losses/sft": 0.3958128094673157, "losses/total": 3.000885726578417e-07, "ref_logps/chosen": -220.35195922851562, "ref_logps/rejected": -230.90426635742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2083535194396973, "rewards/margins": 13.606888771057129, "rewards/rejected": -14.815242767333984, "step": 2387 }, { "epoch": 0.57, "learning_rate": 9.488e-08, "logps/chosen": -210.54270935058594, "logps/rejected": -369.9805908203125, "loss": 0.035, "losses/dpo": 6.69300987965471e-08, "losses/sft": 0.501765787601471, "losses/total": 6.69300987965471e-08, "ref_logps/chosen": -196.70486450195312, "ref_logps/rejected": -222.1620635986328, "rewards/accuracies": 0.96875, "rewards/chosen": -1.3837859630584717, "rewards/margins": 13.398069381713867, "rewards/rejected": -14.781855583190918, "step": 2388 }, { "epoch": 0.57, "learning_rate": 9.482666666666666e-08, "logps/chosen": -235.78875732421875, "logps/rejected": -364.9989929199219, "loss": 0.0308, "losses/dpo": 1.186358655758113e-08, "losses/sft": 0.9997819662094116, "losses/total": 1.186358655758113e-08, "ref_logps/chosen": -223.7675018310547, "ref_logps/rejected": -229.62413024902344, "rewards/accuracies": 0.96875, "rewards/chosen": -1.2021257877349854, "rewards/margins": 12.335359573364258, "rewards/rejected": -13.53748607635498, "step": 2389 }, { "epoch": 0.57, "learning_rate": 9.477333333333333e-08, "logps/chosen": -239.61062622070312, "logps/rejected": -345.0777282714844, "loss": 0.0058, "losses/dpo": 8.376478035643231e-06, "losses/sft": 0.49608203768730164, "losses/total": 8.376478035643231e-06, "ref_logps/chosen": -225.90516662597656, "ref_logps/rejected": -208.3430633544922, "rewards/accuracies": 1.0, "rewards/chosen": -1.3705475330352783, "rewards/margins": 12.3029203414917, "rewards/rejected": -13.673466682434082, "step": 2390 }, { "epoch": 0.57, "learning_rate": 9.472e-08, "logps/chosen": -253.4150390625, "logps/rejected": -445.1685791015625, "loss": 0.0002, "losses/dpo": 5.160514238156111e-09, "losses/sft": 0.6019606590270996, "losses/total": 5.160514238156111e-09, "ref_logps/chosen": -239.0758819580078, "ref_logps/rejected": -262.62518310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.433915376663208, "rewards/margins": 16.820425033569336, "rewards/rejected": -18.25434112548828, "step": 2391 }, { "epoch": 0.57, "learning_rate": 9.466666666666667e-08, "logps/chosen": -217.99676513671875, "logps/rejected": -373.430908203125, "loss": 0.0029, "losses/dpo": 1.7018052176354104e-06, "losses/sft": 0.5058597326278687, "losses/total": 1.7018052176354104e-06, "ref_logps/chosen": -202.51507568359375, "ref_logps/rejected": -216.84906005859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5481688976287842, "rewards/margins": 14.110016822814941, "rewards/rejected": -15.658185958862305, "step": 2392 }, { "epoch": 0.57, "learning_rate": 9.461333333333333e-08, "logps/chosen": -202.31570434570312, "logps/rejected": -354.80853271484375, "loss": 0.0171, "losses/dpo": 0.4819238781929016, "losses/sft": 0.7950204014778137, "losses/total": 0.4819238781929016, "ref_logps/chosen": -190.00025939941406, "ref_logps/rejected": -210.4180145263672, "rewards/accuracies": 1.0, "rewards/chosen": -1.231544852256775, "rewards/margins": 13.20750904083252, "rewards/rejected": -14.439054489135742, "step": 2393 }, { "epoch": 0.57, "learning_rate": 9.456e-08, "logps/chosen": -244.0879669189453, "logps/rejected": -399.2035827636719, "loss": 0.0002, "losses/dpo": 9.06462264538277e-06, "losses/sft": 0.5694831609725952, "losses/total": 9.06462264538277e-06, "ref_logps/chosen": -227.44384765625, "ref_logps/rejected": -243.85252380371094, "rewards/accuracies": 1.0, "rewards/chosen": -1.664412260055542, "rewards/margins": 13.870694160461426, "rewards/rejected": -15.535106658935547, "step": 2394 }, { "epoch": 0.57, "learning_rate": 9.450666666666667e-08, "logps/chosen": -296.29241943359375, "logps/rejected": -403.15301513671875, "loss": 0.0012, "losses/dpo": 1.3398578175838338e-06, "losses/sft": 0.5539043545722961, "losses/total": 1.3398578175838338e-06, "ref_logps/chosen": -280.11572265625, "ref_logps/rejected": -244.53707885742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6176707744598389, "rewards/margins": 14.243921279907227, "rewards/rejected": -15.861592292785645, "step": 2395 }, { "epoch": 0.57, "learning_rate": 9.445333333333333e-08, "logps/chosen": -211.109375, "logps/rejected": -338.09454345703125, "loss": 0.0049, "losses/dpo": 5.2810562920058146e-05, "losses/sft": 0.9321690797805786, "losses/total": 5.2810562920058146e-05, "ref_logps/chosen": -199.587890625, "ref_logps/rejected": -198.90618896484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1521474123001099, "rewards/margins": 12.766688346862793, "rewards/rejected": -13.91883659362793, "step": 2396 }, { "epoch": 0.58, "learning_rate": 9.439999999999999e-08, "logps/chosen": -255.3080291748047, "logps/rejected": -376.877685546875, "loss": 0.0011, "losses/dpo": 1.3785822261525027e-07, "losses/sft": 1.2404413223266602, "losses/total": 1.3785822261525027e-07, "ref_logps/chosen": -242.03961181640625, "ref_logps/rejected": -239.4363250732422, "rewards/accuracies": 1.0, "rewards/chosen": -1.326840877532959, "rewards/margins": 12.417299270629883, "rewards/rejected": -13.744139671325684, "step": 2397 }, { "epoch": 0.58, "learning_rate": 9.434666666666666e-08, "logps/chosen": -188.3662872314453, "logps/rejected": -305.102783203125, "loss": 0.0103, "losses/dpo": 2.075769998555188e-06, "losses/sft": 0.529917299747467, "losses/total": 2.075769998555188e-06, "ref_logps/chosen": -176.760986328125, "ref_logps/rejected": -187.34683227539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1605291366577148, "rewards/margins": 10.615068435668945, "rewards/rejected": -11.775596618652344, "step": 2398 }, { "epoch": 0.58, "learning_rate": 9.429333333333333e-08, "logps/chosen": -215.20726013183594, "logps/rejected": -362.6169128417969, "loss": 0.0011, "losses/dpo": 1.1412445433123253e-09, "losses/sft": 0.48735636472702026, "losses/total": 1.1412445433123253e-09, "ref_logps/chosen": -202.89013671875, "ref_logps/rejected": -217.72235107421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2317125797271729, "rewards/margins": 13.257743835449219, "rewards/rejected": -14.489456176757812, "step": 2399 }, { "epoch": 0.58, "learning_rate": 9.424e-08, "logps/chosen": -193.17141723632812, "logps/rejected": -372.15435791015625, "loss": 0.002, "losses/dpo": 7.540008795103859e-08, "losses/sft": 0.671513020992279, "losses/total": 7.540008795103859e-08, "ref_logps/chosen": -181.68333435058594, "ref_logps/rejected": -225.9144287109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1488089561462402, "rewards/margins": 13.475183486938477, "rewards/rejected": -14.623992919921875, "step": 2400 }, { "epoch": 0.58, "learning_rate": 9.418666666666665e-08, "logps/chosen": -219.62905883789062, "logps/rejected": -319.63116455078125, "loss": 0.0023, "losses/dpo": 1.1243005104688564e-07, "losses/sft": 0.7449487447738647, "losses/total": 1.1243005104688564e-07, "ref_logps/chosen": -206.21559143066406, "ref_logps/rejected": -196.62582397460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.341346263885498, "rewards/margins": 10.959188461303711, "rewards/rejected": -12.300533294677734, "step": 2401 }, { "epoch": 0.58, "learning_rate": 9.413333333333333e-08, "logps/chosen": -278.67413330078125, "logps/rejected": -405.8638916015625, "loss": 0.0121, "losses/dpo": 1.6489408153574914e-05, "losses/sft": 0.4939248263835907, "losses/total": 1.6489408153574914e-05, "ref_logps/chosen": -263.3116149902344, "ref_logps/rejected": -248.01455688476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.5362555980682373, "rewards/margins": 14.248682022094727, "rewards/rejected": -15.784936904907227, "step": 2402 }, { "epoch": 0.58, "learning_rate": 9.407999999999999e-08, "logps/chosen": -250.34458923339844, "logps/rejected": -355.20806884765625, "loss": 0.004, "losses/dpo": 1.2744750144122463e-08, "losses/sft": 0.6950132846832275, "losses/total": 1.2744750144122463e-08, "ref_logps/chosen": -236.73507690429688, "ref_logps/rejected": -206.51022338867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3609516620635986, "rewards/margins": 13.508834838867188, "rewards/rejected": -14.869786262512207, "step": 2403 }, { "epoch": 0.58, "learning_rate": 9.402666666666667e-08, "logps/chosen": -224.988525390625, "logps/rejected": -331.6658630371094, "loss": 0.0004, "losses/dpo": 6.175848739076173e-07, "losses/sft": 0.6064274907112122, "losses/total": 6.175848739076173e-07, "ref_logps/chosen": -212.8025665283203, "ref_logps/rejected": -194.13414001464844, "rewards/accuracies": 1.0, "rewards/chosen": -1.218592882156372, "rewards/margins": 12.53458023071289, "rewards/rejected": -13.753173828125, "step": 2404 }, { "epoch": 0.58, "learning_rate": 9.397333333333333e-08, "logps/chosen": -265.3001403808594, "logps/rejected": -381.444580078125, "loss": 0.0193, "losses/dpo": 1.9866213278874056e-06, "losses/sft": 0.624530553817749, "losses/total": 1.9866213278874056e-06, "ref_logps/chosen": -250.28787231445312, "ref_logps/rejected": -226.8870849609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5012251138687134, "rewards/margins": 13.954524993896484, "rewards/rejected": -15.455750465393066, "step": 2405 }, { "epoch": 0.58, "learning_rate": 9.392e-08, "logps/chosen": -228.76513671875, "logps/rejected": -335.4944763183594, "loss": 0.0036, "losses/dpo": 1.068290657713078e-05, "losses/sft": 0.5805911421775818, "losses/total": 1.068290657713078e-05, "ref_logps/chosen": -216.63827514648438, "ref_logps/rejected": -197.56204223632812, "rewards/accuracies": 1.0, "rewards/chosen": -1.2126867771148682, "rewards/margins": 12.58055591583252, "rewards/rejected": -13.793242454528809, "step": 2406 }, { "epoch": 0.58, "learning_rate": 9.386666666666666e-08, "logps/chosen": -274.39239501953125, "logps/rejected": -386.27703857421875, "loss": 0.0023, "losses/dpo": 2.35661334196946e-11, "losses/sft": 0.575849711894989, "losses/total": 2.35661334196946e-11, "ref_logps/chosen": -262.0640869140625, "ref_logps/rejected": -232.66818237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2328307628631592, "rewards/margins": 14.128056526184082, "rewards/rejected": -15.36088752746582, "step": 2407 }, { "epoch": 0.58, "learning_rate": 9.381333333333334e-08, "logps/chosen": -213.59405517578125, "logps/rejected": -325.7967529296875, "loss": 0.0019, "losses/dpo": 2.498376306903083e-05, "losses/sft": 0.8333203792572021, "losses/total": 2.498376306903083e-05, "ref_logps/chosen": -202.48678588867188, "ref_logps/rejected": -193.5862274169922, "rewards/accuracies": 1.0, "rewards/chosen": -1.1107265949249268, "rewards/margins": 12.110326766967773, "rewards/rejected": -13.221052169799805, "step": 2408 }, { "epoch": 0.58, "learning_rate": 9.376e-08, "logps/chosen": -242.14480590820312, "logps/rejected": -364.71221923828125, "loss": 0.0001, "losses/dpo": 5.731827513955068e-06, "losses/sft": 0.49232426285743713, "losses/total": 5.731827513955068e-06, "ref_logps/chosen": -230.23355102539062, "ref_logps/rejected": -217.476318359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.191123604774475, "rewards/margins": 13.532465934753418, "rewards/rejected": -14.723589897155762, "step": 2409 }, { "epoch": 0.58, "learning_rate": 9.370666666666666e-08, "logps/chosen": -243.31857299804688, "logps/rejected": -366.5641784667969, "loss": 0.0007, "losses/dpo": 6.848341058685037e-07, "losses/sft": 0.5298066735267639, "losses/total": 6.848341058685037e-07, "ref_logps/chosen": -228.20245361328125, "ref_logps/rejected": -225.77078247070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5116119384765625, "rewards/margins": 12.567728042602539, "rewards/rejected": -14.079339981079102, "step": 2410 }, { "epoch": 0.58, "learning_rate": 9.365333333333333e-08, "logps/chosen": -249.70741271972656, "logps/rejected": -384.44390869140625, "loss": 0.002, "losses/dpo": 3.068108753723209e-07, "losses/sft": 0.5473427772521973, "losses/total": 3.068108753723209e-07, "ref_logps/chosen": -237.5439910888672, "ref_logps/rejected": -236.85748291015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2163419723510742, "rewards/margins": 13.542304992675781, "rewards/rejected": -14.758646011352539, "step": 2411 }, { "epoch": 0.58, "learning_rate": 9.36e-08, "logps/chosen": -224.0386505126953, "logps/rejected": -329.7762451171875, "loss": 0.006, "losses/dpo": 1.5801393526970742e-09, "losses/sft": 0.5060412287712097, "losses/total": 1.5801393526970742e-09, "ref_logps/chosen": -209.39425659179688, "ref_logps/rejected": -197.3450927734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4644407033920288, "rewards/margins": 11.778674125671387, "rewards/rejected": -13.243114471435547, "step": 2412 }, { "epoch": 0.58, "learning_rate": 9.354666666666667e-08, "logps/chosen": -270.93499755859375, "logps/rejected": -416.07208251953125, "loss": 0.0085, "losses/dpo": 9.531222922021243e-09, "losses/sft": 0.6109756231307983, "losses/total": 9.531222922021243e-09, "ref_logps/chosen": -254.20944213867188, "ref_logps/rejected": -257.080322265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6725571155548096, "rewards/margins": 14.226616859436035, "rewards/rejected": -15.899173736572266, "step": 2413 }, { "epoch": 0.58, "learning_rate": 9.349333333333332e-08, "logps/chosen": -253.94082641601562, "logps/rejected": -370.62493896484375, "loss": 0.0045, "losses/dpo": 1.2792240156045409e-08, "losses/sft": 0.48960447311401367, "losses/total": 1.2792240156045409e-08, "ref_logps/chosen": -239.56356811523438, "ref_logps/rejected": -227.11929321289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4377291202545166, "rewards/margins": 12.912837982177734, "rewards/rejected": -14.350566864013672, "step": 2414 }, { "epoch": 0.58, "learning_rate": 9.344e-08, "logps/chosen": -204.07318115234375, "logps/rejected": -368.9344787597656, "loss": 0.0012, "losses/dpo": 1.2612190805616486e-11, "losses/sft": 0.5787556171417236, "losses/total": 1.2612190805616486e-11, "ref_logps/chosen": -194.84567260742188, "ref_logps/rejected": -221.04531860351562, "rewards/accuracies": 1.0, "rewards/chosen": -0.9227513074874878, "rewards/margins": 13.86616325378418, "rewards/rejected": -14.788914680480957, "step": 2415 }, { "epoch": 0.58, "learning_rate": 9.338666666666666e-08, "logps/chosen": -270.90869140625, "logps/rejected": -369.41796875, "loss": 0.0029, "losses/dpo": 1.3240543772496949e-08, "losses/sft": 0.5727013349533081, "losses/total": 1.3240543772496949e-08, "ref_logps/chosen": -258.4667053222656, "ref_logps/rejected": -228.79995727539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2441985607147217, "rewards/margins": 12.81760311126709, "rewards/rejected": -14.061800956726074, "step": 2416 }, { "epoch": 0.58, "learning_rate": 9.333333333333334e-08, "logps/chosen": -225.22671508789062, "logps/rejected": -350.64898681640625, "loss": 0.0011, "losses/dpo": 1.0201786437846749e-07, "losses/sft": 0.5845805406570435, "losses/total": 1.0201786437846749e-07, "ref_logps/chosen": -215.19973754882812, "ref_logps/rejected": -213.5001220703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0026984214782715, "rewards/margins": 12.712188720703125, "rewards/rejected": -13.714887619018555, "step": 2417 }, { "epoch": 0.58, "learning_rate": 9.327999999999999e-08, "logps/chosen": -248.64022827148438, "logps/rejected": -366.5948791503906, "loss": 0.001, "losses/dpo": 8.07892952536804e-09, "losses/sft": 0.5593826174736023, "losses/total": 8.07892952536804e-09, "ref_logps/chosen": -237.8527069091797, "ref_logps/rejected": -227.06271362304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.0787510871887207, "rewards/margins": 12.874464988708496, "rewards/rejected": -13.953216552734375, "step": 2418 }, { "epoch": 0.58, "learning_rate": 9.322666666666666e-08, "logps/chosen": -188.63177490234375, "logps/rejected": -341.379150390625, "loss": 0.0113, "losses/dpo": 3.6751231533927245e-14, "losses/sft": 0.6673872470855713, "losses/total": 3.6751231533927245e-14, "ref_logps/chosen": -178.9276123046875, "ref_logps/rejected": -203.40106201171875, "rewards/accuracies": 1.0, "rewards/chosen": -0.9704139232635498, "rewards/margins": 12.827393531799316, "rewards/rejected": -13.797807693481445, "step": 2419 }, { "epoch": 0.58, "learning_rate": 9.317333333333333e-08, "logps/chosen": -235.201904296875, "logps/rejected": -342.8934020996094, "loss": 0.0025, "losses/dpo": 6.8428978039491994e-09, "losses/sft": 1.1826021671295166, "losses/total": 6.8428978039491994e-09, "ref_logps/chosen": -227.77621459960938, "ref_logps/rejected": -213.32774353027344, "rewards/accuracies": 1.0, "rewards/chosen": -0.7425692081451416, "rewards/margins": 12.213998794555664, "rewards/rejected": -12.956567764282227, "step": 2420 }, { "epoch": 0.58, "learning_rate": 9.312e-08, "logps/chosen": -262.6710510253906, "logps/rejected": -365.89630126953125, "loss": 0.0031, "losses/dpo": 8.160089782904834e-05, "losses/sft": 0.8879114985466003, "losses/total": 8.160089782904834e-05, "ref_logps/chosen": -250.9339599609375, "ref_logps/rejected": -219.18624877929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1737077236175537, "rewards/margins": 13.497297286987305, "rewards/rejected": -14.671005249023438, "step": 2421 }, { "epoch": 0.58, "learning_rate": 9.306666666666667e-08, "logps/chosen": -212.37960815429688, "logps/rejected": -357.61346435546875, "loss": 0.0004, "losses/dpo": 2.425746743028867e-06, "losses/sft": 0.5301551818847656, "losses/total": 2.425746743028867e-06, "ref_logps/chosen": -203.70001220703125, "ref_logps/rejected": -217.40115356445312, "rewards/accuracies": 1.0, "rewards/chosen": -0.8679601550102234, "rewards/margins": 13.15327262878418, "rewards/rejected": -14.021232604980469, "step": 2422 }, { "epoch": 0.58, "learning_rate": 9.301333333333333e-08, "logps/chosen": -226.8284912109375, "logps/rejected": -369.6021423339844, "loss": 0.0046, "losses/dpo": 2.2731764914851738e-08, "losses/sft": 0.5398015379905701, "losses/total": 2.2731764914851738e-08, "ref_logps/chosen": -219.58949279785156, "ref_logps/rejected": -232.46859741210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.7238985896110535, "rewards/margins": 12.989456176757812, "rewards/rejected": -13.71335506439209, "step": 2423 }, { "epoch": 0.58, "learning_rate": 9.295999999999999e-08, "logps/chosen": -245.51339721679688, "logps/rejected": -386.8436279296875, "loss": 0.0005, "losses/dpo": 1.5039737233379924e-09, "losses/sft": 0.5666100382804871, "losses/total": 1.5039737233379924e-09, "ref_logps/chosen": -230.9549560546875, "ref_logps/rejected": -235.81341552734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4558441638946533, "rewards/margins": 13.647176742553711, "rewards/rejected": -15.103019714355469, "step": 2424 }, { "epoch": 0.58, "learning_rate": 9.290666666666667e-08, "logps/chosen": -279.13641357421875, "logps/rejected": -404.80517578125, "loss": 0.0005, "losses/dpo": 5.886103338070825e-08, "losses/sft": 0.45661261677742004, "losses/total": 5.886103338070825e-08, "ref_logps/chosen": -265.193115234375, "ref_logps/rejected": -248.77859497070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3943290710449219, "rewards/margins": 14.208330154418945, "rewards/rejected": -15.602660179138184, "step": 2425 }, { "epoch": 0.58, "learning_rate": 9.285333333333333e-08, "logps/chosen": -223.8531951904297, "logps/rejected": -357.872802734375, "loss": 0.0005, "losses/dpo": 2.501322740044998e-07, "losses/sft": 0.53520667552948, "losses/total": 2.501322740044998e-07, "ref_logps/chosen": -213.51678466796875, "ref_logps/rejected": -217.50018310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0336421728134155, "rewards/margins": 13.003618240356445, "rewards/rejected": -14.037260055541992, "step": 2426 }, { "epoch": 0.58, "learning_rate": 9.28e-08, "logps/chosen": -270.15313720703125, "logps/rejected": -405.62127685546875, "loss": 0.002, "losses/dpo": 3.7745763847851777e-07, "losses/sft": 0.7194092273712158, "losses/total": 3.7745763847851777e-07, "ref_logps/chosen": -256.32000732421875, "ref_logps/rejected": -257.24835205078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3833143711090088, "rewards/margins": 13.4539794921875, "rewards/rejected": -14.837294578552246, "step": 2427 }, { "epoch": 0.58, "learning_rate": 9.274666666666666e-08, "logps/chosen": -234.06761169433594, "logps/rejected": -405.53021240234375, "loss": 0.0004, "losses/dpo": 1.8074417518221253e-09, "losses/sft": 0.4397773742675781, "losses/total": 1.8074417518221253e-09, "ref_logps/chosen": -220.96990966796875, "ref_logps/rejected": -244.77398681640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.309769868850708, "rewards/margins": 14.765854835510254, "rewards/rejected": -16.075624465942383, "step": 2428 }, { "epoch": 0.58, "learning_rate": 9.269333333333334e-08, "logps/chosen": -252.4920196533203, "logps/rejected": -378.0422668457031, "loss": 0.0002, "losses/dpo": 2.8308890250627883e-05, "losses/sft": 0.6972821354866028, "losses/total": 2.8308890250627883e-05, "ref_logps/chosen": -238.16915893554688, "ref_logps/rejected": -223.92660522460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4322842359542847, "rewards/margins": 13.97928237915039, "rewards/rejected": -15.411566734313965, "step": 2429 }, { "epoch": 0.58, "learning_rate": 9.264e-08, "logps/chosen": -229.76747131347656, "logps/rejected": -354.60186767578125, "loss": 0.0005, "losses/dpo": 1.6444201946796966e-06, "losses/sft": 0.42808201909065247, "losses/total": 1.6444201946796966e-06, "ref_logps/chosen": -217.62109375, "ref_logps/rejected": -219.3285675048828, "rewards/accuracies": 1.0, "rewards/chosen": -1.21463942527771, "rewards/margins": 12.312690734863281, "rewards/rejected": -13.52733039855957, "step": 2430 }, { "epoch": 0.58, "learning_rate": 9.258666666666665e-08, "logps/chosen": -247.1747283935547, "logps/rejected": -398.9894714355469, "loss": 0.0005, "losses/dpo": 4.963601196728007e-10, "losses/sft": 0.5059512257575989, "losses/total": 4.963601196728007e-10, "ref_logps/chosen": -232.95433044433594, "ref_logps/rejected": -242.98143005371094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4220391511917114, "rewards/margins": 14.178764343261719, "rewards/rejected": -15.60080337524414, "step": 2431 }, { "epoch": 0.58, "learning_rate": 9.253333333333333e-08, "logps/chosen": -251.57354736328125, "logps/rejected": -421.59991455078125, "loss": 0.0001, "losses/dpo": 6.667837948448607e-10, "losses/sft": 0.6095645427703857, "losses/total": 6.667837948448607e-10, "ref_logps/chosen": -238.55694580078125, "ref_logps/rejected": -246.61932373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3016586303710938, "rewards/margins": 16.19640350341797, "rewards/rejected": -17.498062133789062, "step": 2432 }, { "epoch": 0.58, "learning_rate": 9.247999999999999e-08, "logps/chosen": -216.47222900390625, "logps/rejected": -399.83349609375, "loss": 0.0917, "losses/dpo": 1.143518179452796e-11, "losses/sft": 0.7819676399230957, "losses/total": 1.143518179452796e-11, "ref_logps/chosen": -205.06768798828125, "ref_logps/rejected": -236.5101776123047, "rewards/accuracies": 0.96875, "rewards/chosen": -1.1404542922973633, "rewards/margins": 15.19188117980957, "rewards/rejected": -16.332334518432617, "step": 2433 }, { "epoch": 0.58, "learning_rate": 9.242666666666667e-08, "logps/chosen": -261.37066650390625, "logps/rejected": -368.94781494140625, "loss": 0.0077, "losses/dpo": 0.00021162086341064423, "losses/sft": 0.6467692255973816, "losses/total": 0.00021162086341064423, "ref_logps/chosen": -247.03819274902344, "ref_logps/rejected": -226.6444549560547, "rewards/accuracies": 1.0, "rewards/chosen": -1.43324875831604, "rewards/margins": 12.79708480834961, "rewards/rejected": -14.23033332824707, "step": 2434 }, { "epoch": 0.58, "learning_rate": 9.237333333333332e-08, "logps/chosen": -270.3473205566406, "logps/rejected": -402.36810302734375, "loss": 0.0008, "losses/dpo": 1.4786904145580593e-08, "losses/sft": 0.6670784950256348, "losses/total": 1.4786904145580593e-08, "ref_logps/chosen": -257.2138366699219, "ref_logps/rejected": -243.56797790527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3133479356765747, "rewards/margins": 14.566667556762695, "rewards/rejected": -15.880016326904297, "step": 2435 }, { "epoch": 0.58, "learning_rate": 9.232e-08, "logps/chosen": -238.06216430664062, "logps/rejected": -367.88494873046875, "loss": 0.0001, "losses/dpo": 6.413317876763358e-09, "losses/sft": 0.5874857902526855, "losses/total": 6.413317876763358e-09, "ref_logps/chosen": -224.58010864257812, "ref_logps/rejected": -215.31063842773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.348205327987671, "rewards/margins": 13.909225463867188, "rewards/rejected": -15.257430076599121, "step": 2436 }, { "epoch": 0.58, "learning_rate": 9.226666666666666e-08, "logps/chosen": -233.30776977539062, "logps/rejected": -399.0670471191406, "loss": 0.0104, "losses/dpo": 1.1646195119396907e-09, "losses/sft": 0.6656816005706787, "losses/total": 1.1646195119396907e-09, "ref_logps/chosen": -221.36288452148438, "ref_logps/rejected": -251.12071228027344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1944878101348877, "rewards/margins": 13.600144386291504, "rewards/rejected": -14.794631958007812, "step": 2437 }, { "epoch": 0.59, "learning_rate": 9.221333333333334e-08, "logps/chosen": -209.63702392578125, "logps/rejected": -360.04290771484375, "loss": 0.0013, "losses/dpo": 0.0024539208970963955, "losses/sft": 0.8024528622627258, "losses/total": 0.0024539208970963955, "ref_logps/chosen": -193.63003540039062, "ref_logps/rejected": -216.92730712890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.600700855255127, "rewards/margins": 12.710860252380371, "rewards/rejected": -14.311561584472656, "step": 2438 }, { "epoch": 0.59, "learning_rate": 9.216e-08, "logps/chosen": -237.41375732421875, "logps/rejected": -373.3050537109375, "loss": 0.0022, "losses/dpo": 6.834874284322723e-07, "losses/sft": 0.4337369203567505, "losses/total": 6.834874284322723e-07, "ref_logps/chosen": -223.4346160888672, "ref_logps/rejected": -220.0019989013672, "rewards/accuracies": 1.0, "rewards/chosen": -1.3979136943817139, "rewards/margins": 13.932390213012695, "rewards/rejected": -15.330304145812988, "step": 2439 }, { "epoch": 0.59, "learning_rate": 9.210666666666666e-08, "logps/chosen": -237.51885986328125, "logps/rejected": -375.94818115234375, "loss": 0.0017, "losses/dpo": 1.3664900677667902e-07, "losses/sft": 0.5042849183082581, "losses/total": 1.3664900677667902e-07, "ref_logps/chosen": -226.70120239257812, "ref_logps/rejected": -233.6064453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.0817666053771973, "rewards/margins": 13.152409553527832, "rewards/rejected": -14.234176635742188, "step": 2440 }, { "epoch": 0.59, "learning_rate": 9.205333333333333e-08, "logps/chosen": -233.8514862060547, "logps/rejected": -366.716064453125, "loss": 0.0007, "losses/dpo": 1.421362725295694e-07, "losses/sft": 0.5800890922546387, "losses/total": 1.421362725295694e-07, "ref_logps/chosen": -222.38511657714844, "ref_logps/rejected": -216.626220703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1466385126113892, "rewards/margins": 13.862344741821289, "rewards/rejected": -15.00898265838623, "step": 2441 }, { "epoch": 0.59, "learning_rate": 9.2e-08, "logps/chosen": -203.02914428710938, "logps/rejected": -369.92816162109375, "loss": 0.0009, "losses/dpo": 1.0556976126618878e-10, "losses/sft": 0.507379949092865, "losses/total": 1.0556976126618878e-10, "ref_logps/chosen": -192.14816284179688, "ref_logps/rejected": -218.22903442382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.0880961418151855, "rewards/margins": 14.081811904907227, "rewards/rejected": -15.16990852355957, "step": 2442 }, { "epoch": 0.59, "learning_rate": 9.194666666666667e-08, "logps/chosen": -241.28564453125, "logps/rejected": -387.1750793457031, "loss": 0.0028, "losses/dpo": 2.068331894644615e-10, "losses/sft": 0.5526043772697449, "losses/total": 2.068331894644615e-10, "ref_logps/chosen": -227.4056854248047, "ref_logps/rejected": -227.4044952392578, "rewards/accuracies": 1.0, "rewards/chosen": -1.3879973888397217, "rewards/margins": 14.589058876037598, "rewards/rejected": -15.977055549621582, "step": 2443 }, { "epoch": 0.59, "learning_rate": 9.189333333333333e-08, "logps/chosen": -269.20794677734375, "logps/rejected": -375.2760925292969, "loss": 0.0033, "losses/dpo": 3.83346909771376e-09, "losses/sft": 0.5141986608505249, "losses/total": 3.83346909771376e-09, "ref_logps/chosen": -254.52261352539062, "ref_logps/rejected": -222.53485107421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4685338735580444, "rewards/margins": 13.80558967590332, "rewards/rejected": -15.274124145507812, "step": 2444 }, { "epoch": 0.59, "learning_rate": 9.184e-08, "logps/chosen": -239.94656372070312, "logps/rejected": -362.6321105957031, "loss": 0.0008, "losses/dpo": 1.0415263318463985e-08, "losses/sft": 0.6243847608566284, "losses/total": 1.0415263318463985e-08, "ref_logps/chosen": -227.26296997070312, "ref_logps/rejected": -221.85552978515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.268359899520874, "rewards/margins": 12.809297561645508, "rewards/rejected": -14.077655792236328, "step": 2445 }, { "epoch": 0.59, "learning_rate": 9.178666666666667e-08, "logps/chosen": -234.8390350341797, "logps/rejected": -381.2801513671875, "loss": 0.0007, "losses/dpo": 1.0347030787727363e-08, "losses/sft": 0.7337959408760071, "losses/total": 1.0347030787727363e-08, "ref_logps/chosen": -224.44686889648438, "ref_logps/rejected": -226.1156768798828, "rewards/accuracies": 1.0, "rewards/chosen": -1.0392177104949951, "rewards/margins": 14.477228164672852, "rewards/rejected": -15.51644515991211, "step": 2446 }, { "epoch": 0.59, "learning_rate": 9.173333333333333e-08, "logps/chosen": -252.5439910888672, "logps/rejected": -383.2585144042969, "loss": 0.0052, "losses/dpo": 4.38490133092273e-06, "losses/sft": 0.49234649538993835, "losses/total": 4.38490133092273e-06, "ref_logps/chosen": -237.38282775878906, "ref_logps/rejected": -225.9767303466797, "rewards/accuracies": 1.0, "rewards/chosen": -1.5161173343658447, "rewards/margins": 14.212059020996094, "rewards/rejected": -15.72817611694336, "step": 2447 }, { "epoch": 0.59, "learning_rate": 9.167999999999998e-08, "logps/chosen": -217.32666015625, "logps/rejected": -339.09271240234375, "loss": 0.0036, "losses/dpo": 1.7967738585866755e-06, "losses/sft": 0.6397937536239624, "losses/total": 1.7967738585866755e-06, "ref_logps/chosen": -209.65869140625, "ref_logps/rejected": -215.29251098632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.7667957544326782, "rewards/margins": 11.613224029541016, "rewards/rejected": -12.380020141601562, "step": 2448 }, { "epoch": 0.59, "learning_rate": 9.162666666666666e-08, "logps/chosen": -240.7946014404297, "logps/rejected": -388.3515930175781, "loss": 0.0017, "losses/dpo": 4.938384563502041e-07, "losses/sft": 1.0448724031448364, "losses/total": 4.938384563502041e-07, "ref_logps/chosen": -225.86740112304688, "ref_logps/rejected": -234.3055419921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.492720365524292, "rewards/margins": 13.911885261535645, "rewards/rejected": -15.404605865478516, "step": 2449 }, { "epoch": 0.59, "learning_rate": 9.157333333333332e-08, "logps/chosen": -270.2645568847656, "logps/rejected": -376.94635009765625, "loss": 0.0002, "losses/dpo": 2.987823322087024e-08, "losses/sft": 0.6865801811218262, "losses/total": 2.987823322087024e-08, "ref_logps/chosen": -255.59780883789062, "ref_logps/rejected": -231.5263214111328, "rewards/accuracies": 1.0, "rewards/chosen": -1.4666743278503418, "rewards/margins": 13.07533073425293, "rewards/rejected": -14.542004585266113, "step": 2450 }, { "epoch": 0.59, "learning_rate": 9.152e-08, "logps/chosen": -255.27586364746094, "logps/rejected": -360.31207275390625, "loss": 0.0029, "losses/dpo": 7.292976533790352e-06, "losses/sft": 0.49751701951026917, "losses/total": 7.292976533790352e-06, "ref_logps/chosen": -244.45294189453125, "ref_logps/rejected": -217.43185424804688, "rewards/accuracies": 1.0, "rewards/chosen": -1.0822930335998535, "rewards/margins": 13.205730438232422, "rewards/rejected": -14.288022994995117, "step": 2451 }, { "epoch": 0.59, "learning_rate": 9.146666666666667e-08, "logps/chosen": -266.6341552734375, "logps/rejected": -376.25677490234375, "loss": 0.0002, "losses/dpo": 1.3794461233007649e-11, "losses/sft": 0.7341002821922302, "losses/total": 1.3794461233007649e-11, "ref_logps/chosen": -249.3612823486328, "ref_logps/rejected": -227.3785400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.727287769317627, "rewards/margins": 13.16053581237793, "rewards/rejected": -14.887824058532715, "step": 2452 }, { "epoch": 0.59, "learning_rate": 9.141333333333333e-08, "logps/chosen": -210.35943603515625, "logps/rejected": -336.5268859863281, "loss": 0.008, "losses/dpo": 6.783720607472787e-08, "losses/sft": 0.6793437600135803, "losses/total": 6.783720607472787e-08, "ref_logps/chosen": -200.99545288085938, "ref_logps/rejected": -204.5880126953125, "rewards/accuracies": 1.0, "rewards/chosen": -0.9363991022109985, "rewards/margins": 12.257488250732422, "rewards/rejected": -13.193887710571289, "step": 2453 }, { "epoch": 0.59, "learning_rate": 9.135999999999999e-08, "logps/chosen": -212.85194396972656, "logps/rejected": -387.39422607421875, "loss": 0.0009, "losses/dpo": 5.966890512354439e-06, "losses/sft": 0.5760471820831299, "losses/total": 5.966890512354439e-06, "ref_logps/chosen": -199.43826293945312, "ref_logps/rejected": -235.68649291992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3413667678833008, "rewards/margins": 13.829408645629883, "rewards/rejected": -15.170774459838867, "step": 2454 }, { "epoch": 0.59, "learning_rate": 9.130666666666667e-08, "logps/chosen": -192.83578491210938, "logps/rejected": -395.1077880859375, "loss": 0.0002, "losses/dpo": 1.7308610100741362e-10, "losses/sft": 0.539827823638916, "losses/total": 1.7308610100741362e-10, "ref_logps/chosen": -185.28677368164062, "ref_logps/rejected": -230.7004852294922, "rewards/accuracies": 1.0, "rewards/chosen": -0.7549014091491699, "rewards/margins": 15.685829162597656, "rewards/rejected": -16.440731048583984, "step": 2455 }, { "epoch": 0.59, "learning_rate": 9.125333333333333e-08, "logps/chosen": -268.8323059082031, "logps/rejected": -392.7913818359375, "loss": 0.0026, "losses/dpo": 1.1643780162273742e-08, "losses/sft": 0.7405993938446045, "losses/total": 1.1643780162273742e-08, "ref_logps/chosen": -252.00729370117188, "ref_logps/rejected": -236.37185668945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6825017929077148, "rewards/margins": 13.959447860717773, "rewards/rejected": -15.641948699951172, "step": 2456 }, { "epoch": 0.59, "learning_rate": 9.12e-08, "logps/chosen": -251.66876220703125, "logps/rejected": -416.06341552734375, "loss": 0.0011, "losses/dpo": 2.3646593363957358e-11, "losses/sft": 0.42662686109542847, "losses/total": 2.3646593363957358e-11, "ref_logps/chosen": -237.95872497558594, "ref_logps/rejected": -249.28933715820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3710029125213623, "rewards/margins": 15.306407928466797, "rewards/rejected": -16.677410125732422, "step": 2457 }, { "epoch": 0.59, "learning_rate": 9.114666666666666e-08, "logps/chosen": -262.81201171875, "logps/rejected": -375.0353698730469, "loss": 0.0066, "losses/dpo": 4.0966153846966336e-07, "losses/sft": 0.6220125555992126, "losses/total": 4.0966153846966336e-07, "ref_logps/chosen": -248.69009399414062, "ref_logps/rejected": -219.6327667236328, "rewards/accuracies": 1.0, "rewards/chosen": -1.4121906757354736, "rewards/margins": 14.128070831298828, "rewards/rejected": -15.540260314941406, "step": 2458 }, { "epoch": 0.59, "learning_rate": 9.109333333333334e-08, "logps/chosen": -236.8861083984375, "logps/rejected": -380.73419189453125, "loss": 0.0032, "losses/dpo": 5.917479211348109e-05, "losses/sft": 1.3732959032058716, "losses/total": 5.917479211348109e-05, "ref_logps/chosen": -226.5016326904297, "ref_logps/rejected": -237.47625732421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0384474992752075, "rewards/margins": 13.287345886230469, "rewards/rejected": -14.325793266296387, "step": 2459 }, { "epoch": 0.59, "learning_rate": 9.104e-08, "logps/chosen": -289.1329345703125, "logps/rejected": -379.687744140625, "loss": 0.0043, "losses/dpo": 0.00012390628398861736, "losses/sft": 0.8381714224815369, "losses/total": 0.00012390628398861736, "ref_logps/chosen": -276.0839538574219, "ref_logps/rejected": -238.74163818359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3048986196517944, "rewards/margins": 12.789710998535156, "rewards/rejected": -14.094610214233398, "step": 2460 }, { "epoch": 0.59, "learning_rate": 9.098666666666666e-08, "logps/chosen": -253.33338928222656, "logps/rejected": -353.3739929199219, "loss": 0.002, "losses/dpo": 9.200869510728182e-10, "losses/sft": 0.47250697016716003, "losses/total": 9.200869510728182e-10, "ref_logps/chosen": -239.56211853027344, "ref_logps/rejected": -221.29049682617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3771283626556396, "rewards/margins": 11.831220626831055, "rewards/rejected": -13.208349227905273, "step": 2461 }, { "epoch": 0.59, "learning_rate": 9.093333333333333e-08, "logps/chosen": -264.26556396484375, "logps/rejected": -387.1225280761719, "loss": 0.0006, "losses/dpo": 7.452406691754732e-08, "losses/sft": 0.545976996421814, "losses/total": 7.452406691754732e-08, "ref_logps/chosen": -252.0594024658203, "ref_logps/rejected": -235.59835815429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2206138372421265, "rewards/margins": 13.931803703308105, "rewards/rejected": -15.152416229248047, "step": 2462 }, { "epoch": 0.59, "learning_rate": 9.088e-08, "logps/chosen": -254.4632110595703, "logps/rejected": -385.16290283203125, "loss": 0.0018, "losses/dpo": 9.712355677038431e-05, "losses/sft": 0.5777527689933777, "losses/total": 9.712355677038431e-05, "ref_logps/chosen": -240.51129150390625, "ref_logps/rejected": -234.17660522460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3951911926269531, "rewards/margins": 13.703435897827148, "rewards/rejected": -15.098627090454102, "step": 2463 }, { "epoch": 0.59, "learning_rate": 9.082666666666667e-08, "logps/chosen": -269.3894958496094, "logps/rejected": -417.02276611328125, "loss": 0.0025, "losses/dpo": 1.3477393300768276e-13, "losses/sft": 0.7932831645011902, "losses/total": 1.3477393300768276e-13, "ref_logps/chosen": -256.67474365234375, "ref_logps/rejected": -251.8250732421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2714768648147583, "rewards/margins": 15.248292922973633, "rewards/rejected": -16.51976776123047, "step": 2464 }, { "epoch": 0.59, "learning_rate": 9.077333333333332e-08, "logps/chosen": -286.26708984375, "logps/rejected": -381.20989990234375, "loss": 0.0086, "losses/dpo": 5.655071078081164e-09, "losses/sft": 0.49869275093078613, "losses/total": 5.655071078081164e-09, "ref_logps/chosen": -267.86004638671875, "ref_logps/rejected": -233.696533203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8407061100006104, "rewards/margins": 12.91063117980957, "rewards/rejected": -14.751336097717285, "step": 2465 }, { "epoch": 0.59, "learning_rate": 9.072e-08, "logps/chosen": -288.8643798828125, "logps/rejected": -349.2588806152344, "loss": 0.0064, "losses/dpo": 3.904919498864956e-09, "losses/sft": 1.2298723459243774, "losses/total": 3.904919498864956e-09, "ref_logps/chosen": -273.9446105957031, "ref_logps/rejected": -210.2984161376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.4919764995574951, "rewards/margins": 12.404069900512695, "rewards/rejected": -13.896045684814453, "step": 2466 }, { "epoch": 0.59, "learning_rate": 9.066666666666666e-08, "logps/chosen": -231.85418701171875, "logps/rejected": -399.3700256347656, "loss": 0.0003, "losses/dpo": 2.3846527530402284e-10, "losses/sft": 0.5277243852615356, "losses/total": 2.3846527530402284e-10, "ref_logps/chosen": -216.60748291015625, "ref_logps/rejected": -239.00714111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5246696472167969, "rewards/margins": 14.511618614196777, "rewards/rejected": -16.03628921508789, "step": 2467 }, { "epoch": 0.59, "learning_rate": 9.061333333333333e-08, "logps/chosen": -215.33209228515625, "logps/rejected": -353.79461669921875, "loss": 0.0121, "losses/dpo": 2.1656603177433453e-09, "losses/sft": 0.5574486255645752, "losses/total": 2.1656603177433453e-09, "ref_logps/chosen": -203.581298828125, "ref_logps/rejected": -214.50289916992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1750799417495728, "rewards/margins": 12.75408935546875, "rewards/rejected": -13.929168701171875, "step": 2468 }, { "epoch": 0.59, "learning_rate": 9.056e-08, "logps/chosen": -244.93365478515625, "logps/rejected": -403.5912170410156, "loss": 0.0001, "losses/dpo": 1.6156563731328788e-07, "losses/sft": 0.5831326246261597, "losses/total": 1.6156563731328788e-07, "ref_logps/chosen": -232.19229125976562, "ref_logps/rejected": -243.77479553222656, "rewards/accuracies": 1.0, "rewards/chosen": -1.2741358280181885, "rewards/margins": 14.70750904083252, "rewards/rejected": -15.981643676757812, "step": 2469 }, { "epoch": 0.59, "learning_rate": 9.050666666666666e-08, "logps/chosen": -244.0511474609375, "logps/rejected": -387.31781005859375, "loss": 0.001, "losses/dpo": 1.8439262930769473e-05, "losses/sft": 0.3836188018321991, "losses/total": 1.8439262930769473e-05, "ref_logps/chosen": -231.6036376953125, "ref_logps/rejected": -229.67552185058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2447504997253418, "rewards/margins": 14.519475936889648, "rewards/rejected": -15.764226913452148, "step": 2470 }, { "epoch": 0.59, "learning_rate": 9.045333333333333e-08, "logps/chosen": -265.0152587890625, "logps/rejected": -374.35247802734375, "loss": 0.0011, "losses/dpo": 2.076332293654559e-06, "losses/sft": 0.7147977948188782, "losses/total": 2.076332293654559e-06, "ref_logps/chosen": -252.10780334472656, "ref_logps/rejected": -233.5786590576172, "rewards/accuracies": 1.0, "rewards/chosen": -1.2907459735870361, "rewards/margins": 12.786636352539062, "rewards/rejected": -14.077381134033203, "step": 2471 }, { "epoch": 0.59, "learning_rate": 9.04e-08, "logps/chosen": -245.62106323242188, "logps/rejected": -362.47625732421875, "loss": 0.0044, "losses/dpo": 1.3493906863004668e-06, "losses/sft": 0.9643588662147522, "losses/total": 1.3493906863004668e-06, "ref_logps/chosen": -229.53707885742188, "ref_logps/rejected": -222.3511962890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6083983182907104, "rewards/margins": 12.404106140136719, "rewards/rejected": -14.012504577636719, "step": 2472 }, { "epoch": 0.59, "learning_rate": 9.034666666666667e-08, "logps/chosen": -250.3345184326172, "logps/rejected": -416.00213623046875, "loss": 0.0008, "losses/dpo": 5.334587058314355e-06, "losses/sft": 0.6742861270904541, "losses/total": 5.334587058314355e-06, "ref_logps/chosen": -236.57012939453125, "ref_logps/rejected": -255.9684295654297, "rewards/accuracies": 1.0, "rewards/chosen": -1.3764393329620361, "rewards/margins": 14.626932144165039, "rewards/rejected": -16.003372192382812, "step": 2473 }, { "epoch": 0.59, "learning_rate": 9.029333333333333e-08, "logps/chosen": -297.8265380859375, "logps/rejected": -410.15771484375, "loss": 0.0002, "losses/dpo": 1.77070191398343e-07, "losses/sft": 0.559956967830658, "losses/total": 1.77070191398343e-07, "ref_logps/chosen": -284.564697265625, "ref_logps/rejected": -250.30892944335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3261840343475342, "rewards/margins": 14.658695220947266, "rewards/rejected": -15.984880447387695, "step": 2474 }, { "epoch": 0.59, "learning_rate": 9.023999999999999e-08, "logps/chosen": -260.0142822265625, "logps/rejected": -403.7760009765625, "loss": 0.0002, "losses/dpo": 5.7279665455212125e-09, "losses/sft": 0.5996457934379578, "losses/total": 5.7279665455212125e-09, "ref_logps/chosen": -243.47647094726562, "ref_logps/rejected": -242.30361938476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6537846326828003, "rewards/margins": 14.493453979492188, "rewards/rejected": -16.14723777770996, "step": 2475 }, { "epoch": 0.59, "learning_rate": 9.018666666666667e-08, "logps/chosen": -239.54888916015625, "logps/rejected": -401.434814453125, "loss": 0.0031, "losses/dpo": 3.7570313082824214e-08, "losses/sft": 0.45321914553642273, "losses/total": 3.7570313082824214e-08, "ref_logps/chosen": -226.34605407714844, "ref_logps/rejected": -248.7589111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3202834129333496, "rewards/margins": 13.947307586669922, "rewards/rejected": -15.26759147644043, "step": 2476 }, { "epoch": 0.59, "learning_rate": 9.013333333333333e-08, "logps/chosen": -266.37811279296875, "logps/rejected": -409.04241943359375, "loss": 0.0055, "losses/dpo": 1.1340843819596103e-08, "losses/sft": 0.7883473038673401, "losses/total": 1.1340843819596103e-08, "ref_logps/chosen": -249.06752014160156, "ref_logps/rejected": -245.37252807617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7310614585876465, "rewards/margins": 14.635931968688965, "rewards/rejected": -16.366992950439453, "step": 2477 }, { "epoch": 0.59, "learning_rate": 9.008e-08, "logps/chosen": -243.83544921875, "logps/rejected": -388.6443176269531, "loss": 0.001, "losses/dpo": 9.222184417012613e-06, "losses/sft": 0.590713620185852, "losses/total": 9.222184417012613e-06, "ref_logps/chosen": -228.4590301513672, "ref_logps/rejected": -234.68362426757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.5376390218734741, "rewards/margins": 13.85843276977539, "rewards/rejected": -15.39607048034668, "step": 2478 }, { "epoch": 0.59, "learning_rate": 9.002666666666666e-08, "logps/chosen": -242.38815307617188, "logps/rejected": -337.94256591796875, "loss": 0.0013, "losses/dpo": 2.388091679961235e-08, "losses/sft": 0.6646825075149536, "losses/total": 2.388091679961235e-08, "ref_logps/chosen": -226.61207580566406, "ref_logps/rejected": -198.59783935546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5776070356369019, "rewards/margins": 12.356864929199219, "rewards/rejected": -13.934473037719727, "step": 2479 }, { "epoch": 0.6, "learning_rate": 8.997333333333334e-08, "logps/chosen": -234.53738403320312, "logps/rejected": -352.4576416015625, "loss": 0.0028, "losses/dpo": 4.426786472322419e-05, "losses/sft": 0.8058562874794006, "losses/total": 4.426786472322419e-05, "ref_logps/chosen": -221.54061889648438, "ref_logps/rejected": -216.36927795410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2996761798858643, "rewards/margins": 12.309158325195312, "rewards/rejected": -13.608835220336914, "step": 2480 }, { "epoch": 0.6, "learning_rate": 8.992e-08, "logps/chosen": -206.59796142578125, "logps/rejected": -371.0624694824219, "loss": 0.003, "losses/dpo": 6.503425353798775e-09, "losses/sft": 0.5228036642074585, "losses/total": 6.503425353798775e-09, "ref_logps/chosen": -194.7088165283203, "ref_logps/rejected": -224.61061096191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.1889139413833618, "rewards/margins": 13.45627212524414, "rewards/rejected": -14.645185470581055, "step": 2481 }, { "epoch": 0.6, "learning_rate": 8.986666666666665e-08, "logps/chosen": -234.46575927734375, "logps/rejected": -389.51153564453125, "loss": 0.0035, "losses/dpo": 7.24007451213815e-11, "losses/sft": 1.0669519901275635, "losses/total": 7.24007451213815e-11, "ref_logps/chosen": -223.62741088867188, "ref_logps/rejected": -242.95457458496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.0838327407836914, "rewards/margins": 13.57186508178711, "rewards/rejected": -14.655696868896484, "step": 2482 }, { "epoch": 0.6, "learning_rate": 8.981333333333333e-08, "logps/chosen": -241.98941040039062, "logps/rejected": -380.90045166015625, "loss": 0.0013, "losses/dpo": 2.068545057465343e-10, "losses/sft": 0.6100910902023315, "losses/total": 2.068545057465343e-10, "ref_logps/chosen": -231.80157470703125, "ref_logps/rejected": -235.58914184570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.018784761428833, "rewards/margins": 13.51234245300293, "rewards/rejected": -14.531126022338867, "step": 2483 }, { "epoch": 0.6, "learning_rate": 8.975999999999999e-08, "logps/chosen": -236.175048828125, "logps/rejected": -375.763916015625, "loss": 0.0029, "losses/dpo": 2.1489657275886032e-10, "losses/sft": 0.6667454838752747, "losses/total": 2.1489657275886032e-10, "ref_logps/chosen": -223.82830810546875, "ref_logps/rejected": -220.92213439941406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2346738576889038, "rewards/margins": 14.249505996704102, "rewards/rejected": -15.484180450439453, "step": 2484 }, { "epoch": 0.6, "learning_rate": 8.970666666666667e-08, "logps/chosen": -243.69619750976562, "logps/rejected": -377.89453125, "loss": 0.0024, "losses/dpo": 6.668877255977534e-11, "losses/sft": 0.5112645626068115, "losses/total": 6.668877255977534e-11, "ref_logps/chosen": -227.32723999023438, "ref_logps/rejected": -233.55963134765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6368972063064575, "rewards/margins": 12.796595573425293, "rewards/rejected": -14.433492660522461, "step": 2485 }, { "epoch": 0.6, "learning_rate": 8.965333333333333e-08, "logps/chosen": -272.31475830078125, "logps/rejected": -404.38909912109375, "loss": 0.0002, "losses/dpo": 1.8083309214489418e-06, "losses/sft": 0.49893635511398315, "losses/total": 1.8083309214489418e-06, "ref_logps/chosen": -258.5810546875, "ref_logps/rejected": -250.4868927001953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3733696937561035, "rewards/margins": 14.016853332519531, "rewards/rejected": -15.39022445678711, "step": 2486 }, { "epoch": 0.6, "learning_rate": 8.96e-08, "logps/chosen": -266.78955078125, "logps/rejected": -376.1229553222656, "loss": 0.0077, "losses/dpo": 2.76610194305249e-07, "losses/sft": 0.7723490595817566, "losses/total": 2.76610194305249e-07, "ref_logps/chosen": -250.81732177734375, "ref_logps/rejected": -225.970947265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.597221851348877, "rewards/margins": 13.417977333068848, "rewards/rejected": -15.015199661254883, "step": 2487 }, { "epoch": 0.6, "learning_rate": 8.954666666666666e-08, "logps/chosen": -204.75587463378906, "logps/rejected": -383.461669921875, "loss": 0.0001, "losses/dpo": 2.9523835110012442e-05, "losses/sft": 0.9970360994338989, "losses/total": 2.9523835110012442e-05, "ref_logps/chosen": -192.20217895507812, "ref_logps/rejected": -228.50875854492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.255369782447815, "rewards/margins": 14.239925384521484, "rewards/rejected": -15.495294570922852, "step": 2488 }, { "epoch": 0.6, "learning_rate": 8.949333333333334e-08, "logps/chosen": -249.76658630371094, "logps/rejected": -359.9417724609375, "loss": 0.0156, "losses/dpo": 2.4478507043568243e-07, "losses/sft": 0.5703136324882507, "losses/total": 2.4478507043568243e-07, "ref_logps/chosen": -234.9095916748047, "ref_logps/rejected": -210.17832946777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.4856992959976196, "rewards/margins": 13.490644454956055, "rewards/rejected": -14.976343154907227, "step": 2489 }, { "epoch": 0.6, "learning_rate": 8.944e-08, "logps/chosen": -231.72439575195312, "logps/rejected": -392.9010314941406, "loss": 0.0099, "losses/dpo": 0.0002922933199442923, "losses/sft": 0.705238938331604, "losses/total": 0.0002922933199442923, "ref_logps/chosen": -215.6744842529297, "ref_logps/rejected": -231.65814208984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.604989767074585, "rewards/margins": 14.51930046081543, "rewards/rejected": -16.124290466308594, "step": 2490 }, { "epoch": 0.6, "learning_rate": 8.938666666666666e-08, "logps/chosen": -243.6435089111328, "logps/rejected": -400.2432861328125, "loss": 0.0028, "losses/dpo": 7.861084561078258e-10, "losses/sft": 0.6245794296264648, "losses/total": 7.861084561078258e-10, "ref_logps/chosen": -223.09893798828125, "ref_logps/rejected": -232.83558654785156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0544562339782715, "rewards/margins": 14.686312675476074, "rewards/rejected": -16.740768432617188, "step": 2491 }, { "epoch": 0.6, "learning_rate": 8.933333333333333e-08, "logps/chosen": -224.64517211914062, "logps/rejected": -379.52569580078125, "loss": 0.0052, "losses/dpo": 2.5903250655545662e-09, "losses/sft": 0.47378721833229065, "losses/total": 2.5903250655545662e-09, "ref_logps/chosen": -213.03378295898438, "ref_logps/rejected": -214.9244842529297, "rewards/accuracies": 1.0, "rewards/chosen": -1.161138892173767, "rewards/margins": 15.298978805541992, "rewards/rejected": -16.460119247436523, "step": 2492 }, { "epoch": 0.6, "learning_rate": 8.928e-08, "logps/chosen": -232.3568115234375, "logps/rejected": -380.09600830078125, "loss": 0.0001, "losses/dpo": 5.971937527826654e-11, "losses/sft": 0.7961365580558777, "losses/total": 5.971937527826654e-11, "ref_logps/chosen": -222.53744506835938, "ref_logps/rejected": -222.65223693847656, "rewards/accuracies": 1.0, "rewards/chosen": -0.981935977935791, "rewards/margins": 14.762442588806152, "rewards/rejected": -15.744378089904785, "step": 2493 }, { "epoch": 0.6, "learning_rate": 8.922666666666667e-08, "logps/chosen": -215.11380004882812, "logps/rejected": -364.15472412109375, "loss": 0.0015, "losses/dpo": 4.083293703160962e-08, "losses/sft": 0.8254658579826355, "losses/total": 4.083293703160962e-08, "ref_logps/chosen": -202.2381591796875, "ref_logps/rejected": -212.61251831054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2875640392303467, "rewards/margins": 13.866659164428711, "rewards/rejected": -15.15422248840332, "step": 2494 }, { "epoch": 0.6, "learning_rate": 8.917333333333333e-08, "logps/chosen": -266.63604736328125, "logps/rejected": -351.87249755859375, "loss": 0.0011, "losses/dpo": 2.9585445204816097e-08, "losses/sft": 0.5739029049873352, "losses/total": 2.9585445204816097e-08, "ref_logps/chosen": -252.41604614257812, "ref_logps/rejected": -213.96463012695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4219999313354492, "rewards/margins": 12.36878776550293, "rewards/rejected": -13.790788650512695, "step": 2495 }, { "epoch": 0.6, "learning_rate": 8.911999999999999e-08, "logps/chosen": -244.38368225097656, "logps/rejected": -384.8004150390625, "loss": 0.0192, "losses/dpo": 3.902047865267377e-06, "losses/sft": 0.6072816848754883, "losses/total": 3.902047865267377e-06, "ref_logps/chosen": -230.65142822265625, "ref_logps/rejected": -234.8310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3732247352600098, "rewards/margins": 13.623711585998535, "rewards/rejected": -14.996936798095703, "step": 2496 }, { "epoch": 0.6, "learning_rate": 8.906666666666667e-08, "logps/chosen": -263.83203125, "logps/rejected": -389.055419921875, "loss": 0.0001, "losses/dpo": 4.545491174212657e-05, "losses/sft": 0.7691364288330078, "losses/total": 4.545491174212657e-05, "ref_logps/chosen": -245.98138427734375, "ref_logps/rejected": -229.12574768066406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7850672006607056, "rewards/margins": 14.20789909362793, "rewards/rejected": -15.99296760559082, "step": 2497 }, { "epoch": 0.6, "learning_rate": 8.901333333333333e-08, "logps/chosen": -262.76904296875, "logps/rejected": -404.075439453125, "loss": 0.0016, "losses/dpo": 2.2923227334104013e-06, "losses/sft": 0.8909385204315186, "losses/total": 2.2923227334104013e-06, "ref_logps/chosen": -249.5767822265625, "ref_logps/rejected": -245.24270629882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3192269802093506, "rewards/margins": 14.564048767089844, "rewards/rejected": -15.88327407836914, "step": 2498 }, { "epoch": 0.6, "learning_rate": 8.896e-08, "logps/chosen": -265.1011657714844, "logps/rejected": -387.10504150390625, "loss": 0.0005, "losses/dpo": 5.358454586712469e-07, "losses/sft": 0.6834407448768616, "losses/total": 5.358454586712469e-07, "ref_logps/chosen": -248.7061004638672, "ref_logps/rejected": -234.23463439941406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6395063400268555, "rewards/margins": 13.647533416748047, "rewards/rejected": -15.287040710449219, "step": 2499 }, { "epoch": 0.6, "learning_rate": 8.890666666666666e-08, "logps/chosen": -232.51803588867188, "logps/rejected": -362.7051696777344, "loss": 0.0012, "losses/dpo": 4.496368717354926e-07, "losses/sft": 0.9163053631782532, "losses/total": 4.496368717354926e-07, "ref_logps/chosen": -220.1715087890625, "ref_logps/rejected": -230.33900451660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.234651803970337, "rewards/margins": 12.001968383789062, "rewards/rejected": -13.236620903015137, "step": 2500 }, { "epoch": 0.6, "learning_rate": 8.885333333333332e-08, "logps/chosen": -268.8929443359375, "logps/rejected": -396.8974609375, "loss": 0.0019, "losses/dpo": 8.015283015083696e-08, "losses/sft": 0.6289771795272827, "losses/total": 8.015283015083696e-08, "ref_logps/chosen": -256.11358642578125, "ref_logps/rejected": -247.05667114257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.27793550491333, "rewards/margins": 13.706143379211426, "rewards/rejected": -14.984078407287598, "step": 2501 }, { "epoch": 0.6, "learning_rate": 8.88e-08, "logps/chosen": -256.6397399902344, "logps/rejected": -396.8853454589844, "loss": 0.0004, "losses/dpo": 6.026164555805735e-06, "losses/sft": 0.6924143433570862, "losses/total": 6.026164555805735e-06, "ref_logps/chosen": -240.64227294921875, "ref_logps/rejected": -244.9884796142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.5997467041015625, "rewards/margins": 13.58993911743164, "rewards/rejected": -15.18968677520752, "step": 2502 }, { "epoch": 0.6, "learning_rate": 8.874666666666666e-08, "logps/chosen": -263.6067199707031, "logps/rejected": -378.30072021484375, "loss": 0.0004, "losses/dpo": 9.922191566147376e-07, "losses/sft": 0.5921437740325928, "losses/total": 9.922191566147376e-07, "ref_logps/chosen": -250.13223266601562, "ref_logps/rejected": -223.9872283935547, "rewards/accuracies": 1.0, "rewards/chosen": -1.3474482297897339, "rewards/margins": 14.083901405334473, "rewards/rejected": -15.431349754333496, "step": 2503 }, { "epoch": 0.6, "learning_rate": 8.869333333333333e-08, "logps/chosen": -191.89610290527344, "logps/rejected": -311.7547302246094, "loss": 0.0018, "losses/dpo": 0.005707341246306896, "losses/sft": 0.7880350947380066, "losses/total": 0.005707341246306896, "ref_logps/chosen": -178.88320922851562, "ref_logps/rejected": -185.39126586914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3012902736663818, "rewards/margins": 11.33505630493164, "rewards/rejected": -12.636346817016602, "step": 2504 }, { "epoch": 0.6, "learning_rate": 8.863999999999999e-08, "logps/chosen": -206.8318328857422, "logps/rejected": -385.8729248046875, "loss": 0.0004, "losses/dpo": 1.3826481071177454e-09, "losses/sft": 0.7980940937995911, "losses/total": 1.3826481071177454e-09, "ref_logps/chosen": -194.6039581298828, "ref_logps/rejected": -222.66598510742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2227890491485596, "rewards/margins": 15.097905158996582, "rewards/rejected": -16.320693969726562, "step": 2505 }, { "epoch": 0.6, "learning_rate": 8.858666666666667e-08, "logps/chosen": -252.03350830078125, "logps/rejected": -353.65838623046875, "loss": 0.0069, "losses/dpo": 3.5925213524024002e-06, "losses/sft": 0.6061254143714905, "losses/total": 3.5925213524024002e-06, "ref_logps/chosen": -240.53721618652344, "ref_logps/rejected": -217.30596923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1496294736862183, "rewards/margins": 12.485611915588379, "rewards/rejected": -13.635241508483887, "step": 2506 }, { "epoch": 0.6, "learning_rate": 8.853333333333333e-08, "logps/chosen": -247.9850311279297, "logps/rejected": -381.8781433105469, "loss": 0.0005, "losses/dpo": 3.462238551410479e-10, "losses/sft": 0.48050564527511597, "losses/total": 3.462238551410479e-10, "ref_logps/chosen": -237.83602905273438, "ref_logps/rejected": -230.67359924316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.0148981809616089, "rewards/margins": 14.105557441711426, "rewards/rejected": -15.120455741882324, "step": 2507 }, { "epoch": 0.6, "learning_rate": 8.848e-08, "logps/chosen": -212.86016845703125, "logps/rejected": -346.62982177734375, "loss": 0.004, "losses/dpo": 2.045205178546894e-07, "losses/sft": 0.7491785883903503, "losses/total": 2.045205178546894e-07, "ref_logps/chosen": -200.70785522460938, "ref_logps/rejected": -197.03773498535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2152296304702759, "rewards/margins": 13.743977546691895, "rewards/rejected": -14.959207534790039, "step": 2508 }, { "epoch": 0.6, "learning_rate": 8.842666666666666e-08, "logps/chosen": -253.44635009765625, "logps/rejected": -407.2440185546875, "loss": 0.0112, "losses/dpo": 1.2217472431075294e-05, "losses/sft": 0.6271594762802124, "losses/total": 1.2217472431075294e-05, "ref_logps/chosen": -240.32595825195312, "ref_logps/rejected": -249.02993774414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3120403289794922, "rewards/margins": 14.509366989135742, "rewards/rejected": -15.82140827178955, "step": 2509 }, { "epoch": 0.6, "learning_rate": 8.837333333333334e-08, "logps/chosen": -217.884521484375, "logps/rejected": -346.1189880371094, "loss": 0.0019, "losses/dpo": 5.332799496047744e-14, "losses/sft": 0.7641904354095459, "losses/total": 5.332799496047744e-14, "ref_logps/chosen": -206.3385009765625, "ref_logps/rejected": -204.01388549804688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1546014547348022, "rewards/margins": 13.055910110473633, "rewards/rejected": -14.210512161254883, "step": 2510 }, { "epoch": 0.6, "learning_rate": 8.832e-08, "logps/chosen": -325.56707763671875, "logps/rejected": -393.5451354980469, "loss": 0.0004, "losses/dpo": 2.41483988361324e-09, "losses/sft": 0.7219749093055725, "losses/total": 2.41483988361324e-09, "ref_logps/chosen": -309.759765625, "ref_logps/rejected": -244.69857788085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5807290077209473, "rewards/margins": 13.303926467895508, "rewards/rejected": -14.88465690612793, "step": 2511 }, { "epoch": 0.6, "learning_rate": 8.826666666666666e-08, "logps/chosen": -261.0830078125, "logps/rejected": -374.0074462890625, "loss": 0.0037, "losses/dpo": 3.4109461921616457e-06, "losses/sft": 0.775810182094574, "losses/total": 3.4109461921616457e-06, "ref_logps/chosen": -248.60052490234375, "ref_logps/rejected": -229.02371215820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2482476234436035, "rewards/margins": 13.250123977661133, "rewards/rejected": -14.498371124267578, "step": 2512 }, { "epoch": 0.6, "learning_rate": 8.821333333333333e-08, "logps/chosen": -257.5318603515625, "logps/rejected": -376.66729736328125, "loss": 0.0003, "losses/dpo": 9.494656114839017e-05, "losses/sft": 0.5992763042449951, "losses/total": 9.494656114839017e-05, "ref_logps/chosen": -246.64862060546875, "ref_logps/rejected": -235.46987915039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0883255004882812, "rewards/margins": 13.031413078308105, "rewards/rejected": -14.119738578796387, "step": 2513 }, { "epoch": 0.6, "learning_rate": 8.816e-08, "logps/chosen": -245.3785400390625, "logps/rejected": -369.5467529296875, "loss": 0.0003, "losses/dpo": 2.65376474999357e-05, "losses/sft": 0.6228293776512146, "losses/total": 2.65376474999357e-05, "ref_logps/chosen": -233.49447631835938, "ref_logps/rejected": -221.95912170410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.1884063482284546, "rewards/margins": 13.570357322692871, "rewards/rejected": -14.758763313293457, "step": 2514 }, { "epoch": 0.6, "learning_rate": 8.810666666666667e-08, "logps/chosen": -212.43197631835938, "logps/rejected": -359.8683166503906, "loss": 0.0002, "losses/dpo": 2.200250120987235e-10, "losses/sft": 0.7740987539291382, "losses/total": 2.200250120987235e-10, "ref_logps/chosen": -199.6964874267578, "ref_logps/rejected": -207.90435791015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.27354896068573, "rewards/margins": 13.922846794128418, "rewards/rejected": -15.196394920349121, "step": 2515 }, { "epoch": 0.6, "learning_rate": 8.805333333333333e-08, "logps/chosen": -185.75991821289062, "logps/rejected": -337.746337890625, "loss": 0.001, "losses/dpo": 0.0006053748656995595, "losses/sft": 0.8628821969032288, "losses/total": 0.0006053748656995595, "ref_logps/chosen": -177.00845336914062, "ref_logps/rejected": -207.28500366210938, "rewards/accuracies": 1.0, "rewards/chosen": -0.8751453161239624, "rewards/margins": 12.17098617553711, "rewards/rejected": -13.04613208770752, "step": 2516 }, { "epoch": 0.6, "learning_rate": 8.8e-08, "logps/chosen": -247.93203735351562, "logps/rejected": -360.5948181152344, "loss": 0.0063, "losses/dpo": 6.5954154706560075e-06, "losses/sft": 0.62842857837677, "losses/total": 6.5954154706560075e-06, "ref_logps/chosen": -235.8843536376953, "ref_logps/rejected": -220.14590454101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.2047703266143799, "rewards/margins": 12.840123176574707, "rewards/rejected": -14.044894218444824, "step": 2517 }, { "epoch": 0.6, "learning_rate": 8.794666666666666e-08, "logps/chosen": -253.36184692382812, "logps/rejected": -380.99365234375, "loss": 0.0012, "losses/dpo": 8.969545106651822e-10, "losses/sft": 0.7326924204826355, "losses/total": 8.969545106651822e-10, "ref_logps/chosen": -240.06057739257812, "ref_logps/rejected": -237.635986328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3301284313201904, "rewards/margins": 13.00564193725586, "rewards/rejected": -14.335768699645996, "step": 2518 }, { "epoch": 0.6, "learning_rate": 8.789333333333333e-08, "logps/chosen": -242.55960083007812, "logps/rejected": -411.0028076171875, "loss": 0.0009, "losses/dpo": 1.2920783554193349e-08, "losses/sft": 1.3049497604370117, "losses/total": 1.2920783554193349e-08, "ref_logps/chosen": -231.08514404296875, "ref_logps/rejected": -252.67288208007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.147443175315857, "rewards/margins": 14.685550689697266, "rewards/rejected": -15.83299446105957, "step": 2519 }, { "epoch": 0.6, "learning_rate": 8.784e-08, "logps/chosen": -200.48980712890625, "logps/rejected": -324.05224609375, "loss": 0.0209, "losses/dpo": 5.023458129471692e-07, "losses/sft": 0.6120752692222595, "losses/total": 5.023458129471692e-07, "ref_logps/chosen": -190.55442810058594, "ref_logps/rejected": -196.44741821289062, "rewards/accuracies": 1.0, "rewards/chosen": -0.9935393333435059, "rewards/margins": 11.766944885253906, "rewards/rejected": -12.760483741760254, "step": 2520 }, { "epoch": 0.6, "learning_rate": 8.778666666666666e-08, "logps/chosen": -259.9008483886719, "logps/rejected": -406.84686279296875, "loss": 0.0009, "losses/dpo": 1.6693459059524685e-08, "losses/sft": 0.6719644069671631, "losses/total": 1.6693459059524685e-08, "ref_logps/chosen": -247.69085693359375, "ref_logps/rejected": -243.1673126220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.2209982872009277, "rewards/margins": 15.146955490112305, "rewards/rejected": -16.367952346801758, "step": 2521 }, { "epoch": 0.61, "learning_rate": 8.773333333333332e-08, "logps/chosen": -241.44937133789062, "logps/rejected": -397.2083435058594, "loss": 0.002, "losses/dpo": 1.1464992439869093e-06, "losses/sft": 0.4530591070652008, "losses/total": 1.1464992439869093e-06, "ref_logps/chosen": -224.87423706054688, "ref_logps/rejected": -231.3896026611328, "rewards/accuracies": 1.0, "rewards/chosen": -1.6575134992599487, "rewards/margins": 14.924361228942871, "rewards/rejected": -16.58187484741211, "step": 2522 }, { "epoch": 0.61, "learning_rate": 8.768e-08, "logps/chosen": -201.77853393554688, "logps/rejected": -341.41375732421875, "loss": 0.0021, "losses/dpo": 2.535449084462016e-06, "losses/sft": 0.521567165851593, "losses/total": 2.535449084462016e-06, "ref_logps/chosen": -193.30581665039062, "ref_logps/rejected": -211.76486206054688, "rewards/accuracies": 1.0, "rewards/chosen": -0.8472729325294495, "rewards/margins": 12.117616653442383, "rewards/rejected": -12.964890480041504, "step": 2523 }, { "epoch": 0.61, "learning_rate": 8.762666666666667e-08, "logps/chosen": -251.13861083984375, "logps/rejected": -373.12054443359375, "loss": 0.0043, "losses/dpo": 2.830828861988266e-06, "losses/sft": 0.6674773693084717, "losses/total": 2.830828861988266e-06, "ref_logps/chosen": -238.45957946777344, "ref_logps/rejected": -221.4897918701172, "rewards/accuracies": 1.0, "rewards/chosen": -1.2679033279418945, "rewards/margins": 13.895172119140625, "rewards/rejected": -15.163078308105469, "step": 2524 }, { "epoch": 0.61, "learning_rate": 8.757333333333333e-08, "logps/chosen": -243.4310302734375, "logps/rejected": -383.32135009765625, "loss": 0.0007, "losses/dpo": 2.8626441235246602e-06, "losses/sft": 0.6406401991844177, "losses/total": 2.8626441235246602e-06, "ref_logps/chosen": -230.5595245361328, "ref_logps/rejected": -222.672119140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2871513366699219, "rewards/margins": 14.7777681350708, "rewards/rejected": -16.064918518066406, "step": 2525 }, { "epoch": 0.61, "learning_rate": 8.751999999999999e-08, "logps/chosen": -258.04901123046875, "logps/rejected": -376.99395751953125, "loss": 0.0008, "losses/dpo": 3.692508521169202e-09, "losses/sft": 0.8842417597770691, "losses/total": 3.692508521169202e-09, "ref_logps/chosen": -241.58056640625, "ref_logps/rejected": -218.74269104003906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6468442678451538, "rewards/margins": 14.17828369140625, "rewards/rejected": -15.825128555297852, "step": 2526 }, { "epoch": 0.61, "learning_rate": 8.746666666666667e-08, "logps/chosen": -288.90631103515625, "logps/rejected": -415.1029052734375, "loss": 0.0044, "losses/dpo": 2.8707090677926317e-06, "losses/sft": 0.47924378514289856, "losses/total": 2.8707090677926317e-06, "ref_logps/chosen": -274.40179443359375, "ref_logps/rejected": -259.3296813964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4504510164260864, "rewards/margins": 14.126871109008789, "rewards/rejected": -15.577322006225586, "step": 2527 }, { "epoch": 0.61, "learning_rate": 8.741333333333333e-08, "logps/chosen": -244.33892822265625, "logps/rejected": -389.01953125, "loss": 0.0002, "losses/dpo": 3.108717276656403e-12, "losses/sft": 0.6719077229499817, "losses/total": 3.108717276656403e-12, "ref_logps/chosen": -231.7705535888672, "ref_logps/rejected": -229.38629150390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.256836175918579, "rewards/margins": 14.706487655639648, "rewards/rejected": -15.963323593139648, "step": 2528 }, { "epoch": 0.61, "learning_rate": 8.736e-08, "logps/chosen": -199.41928100585938, "logps/rejected": -341.85638427734375, "loss": 0.0017, "losses/dpo": 7.47930251066009e-09, "losses/sft": 0.6245325207710266, "losses/total": 7.47930251066009e-09, "ref_logps/chosen": -188.51455688476562, "ref_logps/rejected": -201.18972778320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0904712677001953, "rewards/margins": 12.9761962890625, "rewards/rejected": -14.066667556762695, "step": 2529 }, { "epoch": 0.61, "learning_rate": 8.730666666666666e-08, "logps/chosen": -259.33343505859375, "logps/rejected": -382.9062805175781, "loss": 0.0002, "losses/dpo": 1.0542157369286542e-08, "losses/sft": 0.5868363976478577, "losses/total": 1.0542157369286542e-08, "ref_logps/chosen": -246.60879516601562, "ref_logps/rejected": -228.3144989013672, "rewards/accuracies": 1.0, "rewards/chosen": -1.272465467453003, "rewards/margins": 14.186712265014648, "rewards/rejected": -15.459178924560547, "step": 2530 }, { "epoch": 0.61, "learning_rate": 8.725333333333334e-08, "logps/chosen": -262.95269775390625, "logps/rejected": -361.9281921386719, "loss": 0.0013, "losses/dpo": 6.743915719198412e-07, "losses/sft": 0.6905671954154968, "losses/total": 6.743915719198412e-07, "ref_logps/chosen": -250.4002685546875, "ref_logps/rejected": -216.51824951171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2552448511123657, "rewards/margins": 13.285750389099121, "rewards/rejected": -14.540995597839355, "step": 2531 }, { "epoch": 0.61, "learning_rate": 8.72e-08, "logps/chosen": -256.14984130859375, "logps/rejected": -475.7572021484375, "loss": 0.0, "losses/dpo": 4.2368057506791956e-07, "losses/sft": 0.5990276336669922, "losses/total": 4.2368057506791956e-07, "ref_logps/chosen": -244.7005615234375, "ref_logps/rejected": -294.5400085449219, "rewards/accuracies": 1.0, "rewards/chosen": -1.1449251174926758, "rewards/margins": 16.97679328918457, "rewards/rejected": -18.12171745300293, "step": 2532 }, { "epoch": 0.61, "learning_rate": 8.714666666666666e-08, "logps/chosen": -230.3232421875, "logps/rejected": -385.61785888671875, "loss": 0.0127, "losses/dpo": 3.6144549085292965e-05, "losses/sft": 0.8768014311790466, "losses/total": 3.6144549085292965e-05, "ref_logps/chosen": -218.86880493164062, "ref_logps/rejected": -233.4636688232422, "rewards/accuracies": 1.0, "rewards/chosen": -1.1454432010650635, "rewards/margins": 14.069976806640625, "rewards/rejected": -15.21541976928711, "step": 2533 }, { "epoch": 0.61, "learning_rate": 8.709333333333333e-08, "logps/chosen": -229.84962463378906, "logps/rejected": -368.7025146484375, "loss": 0.0009, "losses/dpo": 3.4133666559910125e-08, "losses/sft": 0.5587666034698486, "losses/total": 3.4133666559910125e-08, "ref_logps/chosen": -218.12306213378906, "ref_logps/rejected": -215.88742065429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1726574897766113, "rewards/margins": 14.108851432800293, "rewards/rejected": -15.28150749206543, "step": 2534 }, { "epoch": 0.61, "learning_rate": 8.703999999999999e-08, "logps/chosen": -268.44683837890625, "logps/rejected": -424.48431396484375, "loss": 0.0007, "losses/dpo": 1.9052524294238538e-05, "losses/sft": 0.6142983436584473, "losses/total": 1.9052524294238538e-05, "ref_logps/chosen": -255.87110900878906, "ref_logps/rejected": -260.04730224609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2575747966766357, "rewards/margins": 15.186129570007324, "rewards/rejected": -16.44370460510254, "step": 2535 }, { "epoch": 0.61, "learning_rate": 8.698666666666667e-08, "logps/chosen": -267.139404296875, "logps/rejected": -406.57574462890625, "loss": 0.0036, "losses/dpo": 1.502972191147478e-10, "losses/sft": 0.474100798368454, "losses/total": 1.502972191147478e-10, "ref_logps/chosen": -255.84808349609375, "ref_logps/rejected": -249.35568237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1291322708129883, "rewards/margins": 14.592872619628906, "rewards/rejected": -15.722004890441895, "step": 2536 }, { "epoch": 0.61, "learning_rate": 8.693333333333333e-08, "logps/chosen": -226.97564697265625, "logps/rejected": -378.2881164550781, "loss": 0.0014, "losses/dpo": 1.6677768499562262e-08, "losses/sft": 0.5982934832572937, "losses/total": 1.6677768499562262e-08, "ref_logps/chosen": -215.29652404785156, "ref_logps/rejected": -237.4767608642578, "rewards/accuracies": 1.0, "rewards/chosen": -1.1679136753082275, "rewards/margins": 12.913222312927246, "rewards/rejected": -14.081136703491211, "step": 2537 }, { "epoch": 0.61, "learning_rate": 8.688e-08, "logps/chosen": -254.33114624023438, "logps/rejected": -375.5749816894531, "loss": 0.0011, "losses/dpo": 1.0376572934234218e-09, "losses/sft": 0.540198028087616, "losses/total": 1.0376572934234218e-09, "ref_logps/chosen": -241.93702697753906, "ref_logps/rejected": -238.33392333984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.239412784576416, "rewards/margins": 12.484695434570312, "rewards/rejected": -13.72410774230957, "step": 2538 }, { "epoch": 0.61, "learning_rate": 8.682666666666666e-08, "logps/chosen": -268.3055419921875, "logps/rejected": -351.90618896484375, "loss": 0.0015, "losses/dpo": 8.9707100414671e-07, "losses/sft": 0.6260699033737183, "losses/total": 8.9707100414671e-07, "ref_logps/chosen": -253.85397338867188, "ref_logps/rejected": -220.2314910888672, "rewards/accuracies": 1.0, "rewards/chosen": -1.4451576471328735, "rewards/margins": 11.722312927246094, "rewards/rejected": -13.167470932006836, "step": 2539 }, { "epoch": 0.61, "learning_rate": 8.677333333333333e-08, "logps/chosen": -255.50576782226562, "logps/rejected": -374.0894775390625, "loss": 0.002, "losses/dpo": 2.344461336178938e-07, "losses/sft": 0.8836807012557983, "losses/total": 2.344461336178938e-07, "ref_logps/chosen": -238.88442993164062, "ref_logps/rejected": -226.4001922607422, "rewards/accuracies": 1.0, "rewards/chosen": -1.6621320247650146, "rewards/margins": 13.106796264648438, "rewards/rejected": -14.768928527832031, "step": 2540 }, { "epoch": 0.61, "learning_rate": 8.672e-08, "logps/chosen": -286.4523620605469, "logps/rejected": -390.14166259765625, "loss": 0.002, "losses/dpo": 0.00010691722854971886, "losses/sft": 0.4729834198951721, "losses/total": 0.00010691722854971886, "ref_logps/chosen": -270.5812683105469, "ref_logps/rejected": -235.70321655273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5871098041534424, "rewards/margins": 13.856734275817871, "rewards/rejected": -15.44384479522705, "step": 2541 }, { "epoch": 0.61, "learning_rate": 8.666666666666666e-08, "logps/chosen": -240.40321350097656, "logps/rejected": -372.9634704589844, "loss": 0.0018, "losses/dpo": 2.447832912366721e-06, "losses/sft": 0.6991609334945679, "losses/total": 2.447832912366721e-06, "ref_logps/chosen": -228.57688903808594, "ref_logps/rejected": -222.52023315429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1826329231262207, "rewards/margins": 13.861690521240234, "rewards/rejected": -15.044323921203613, "step": 2542 }, { "epoch": 0.61, "learning_rate": 8.661333333333333e-08, "logps/chosen": -203.39279174804688, "logps/rejected": -314.34991455078125, "loss": 0.0159, "losses/dpo": 0.00020359730115160346, "losses/sft": 1.1294525861740112, "losses/total": 0.00020359730115160346, "ref_logps/chosen": -195.33346557617188, "ref_logps/rejected": -197.95230102539062, "rewards/accuracies": 1.0, "rewards/chosen": -0.8059327602386475, "rewards/margins": 10.833827018737793, "rewards/rejected": -11.639759063720703, "step": 2543 }, { "epoch": 0.61, "learning_rate": 8.656e-08, "logps/chosen": -206.66522216796875, "logps/rejected": -367.275146484375, "loss": 0.0026, "losses/dpo": 2.0594098870674316e-08, "losses/sft": 0.5681045651435852, "losses/total": 2.0594098870674316e-08, "ref_logps/chosen": -191.9818115234375, "ref_logps/rejected": -213.49459838867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4683417081832886, "rewards/margins": 13.909710884094238, "rewards/rejected": -15.378053665161133, "step": 2544 }, { "epoch": 0.61, "learning_rate": 8.650666666666667e-08, "logps/chosen": -253.00979614257812, "logps/rejected": -351.09454345703125, "loss": 0.0194, "losses/dpo": 0.0003648756246548146, "losses/sft": 0.747264564037323, "losses/total": 0.0003648756246548146, "ref_logps/chosen": -239.71286010742188, "ref_logps/rejected": -214.6776123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3296928405761719, "rewards/margins": 12.312002182006836, "rewards/rejected": -13.641695022583008, "step": 2545 }, { "epoch": 0.61, "learning_rate": 8.645333333333333e-08, "logps/chosen": -235.31158447265625, "logps/rejected": -396.23486328125, "loss": 0.0002, "losses/dpo": 1.206448985158204e-07, "losses/sft": 0.5562421679496765, "losses/total": 1.206448985158204e-07, "ref_logps/chosen": -222.2915802001953, "ref_logps/rejected": -235.38601684570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3020015954971313, "rewards/margins": 14.782881736755371, "rewards/rejected": -16.084884643554688, "step": 2546 }, { "epoch": 0.61, "learning_rate": 8.639999999999999e-08, "logps/chosen": -231.56475830078125, "logps/rejected": -349.660400390625, "loss": 0.0012, "losses/dpo": 1.9646324744826416e-06, "losses/sft": 1.245435118675232, "losses/total": 1.9646324744826416e-06, "ref_logps/chosen": -218.85060119628906, "ref_logps/rejected": -210.1741485595703, "rewards/accuracies": 1.0, "rewards/chosen": -1.271416187286377, "rewards/margins": 12.677207946777344, "rewards/rejected": -13.948623657226562, "step": 2547 }, { "epoch": 0.61, "learning_rate": 8.634666666666667e-08, "logps/chosen": -246.45321655273438, "logps/rejected": -403.21026611328125, "loss": 0.0002, "losses/dpo": 4.245514162448671e-09, "losses/sft": 0.6676571369171143, "losses/total": 4.245514162448671e-09, "ref_logps/chosen": -231.98342895507812, "ref_logps/rejected": -242.57046508789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4469784498214722, "rewards/margins": 14.616999626159668, "rewards/rejected": -16.063980102539062, "step": 2548 }, { "epoch": 0.61, "learning_rate": 8.629333333333333e-08, "logps/chosen": -229.0970458984375, "logps/rejected": -369.76123046875, "loss": 0.001, "losses/dpo": 1.0851846088399952e-09, "losses/sft": 0.7854852676391602, "losses/total": 1.0851846088399952e-09, "ref_logps/chosen": -214.91490173339844, "ref_logps/rejected": -218.4373321533203, "rewards/accuracies": 1.0, "rewards/chosen": -1.418216586112976, "rewards/margins": 13.71417236328125, "rewards/rejected": -15.132389068603516, "step": 2549 }, { "epoch": 0.61, "learning_rate": 8.624e-08, "logps/chosen": -270.8099060058594, "logps/rejected": -400.1732177734375, "loss": 0.0014, "losses/dpo": 1.4643796930613462e-07, "losses/sft": 0.5308851003646851, "losses/total": 1.4643796930613462e-07, "ref_logps/chosen": -258.18585205078125, "ref_logps/rejected": -244.44125366210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.262404203414917, "rewards/margins": 14.310792922973633, "rewards/rejected": -15.573196411132812, "step": 2550 }, { "epoch": 0.61, "learning_rate": 8.618666666666666e-08, "logps/chosen": -207.67442321777344, "logps/rejected": -357.7289123535156, "loss": 0.0026, "losses/dpo": 6.332984980872425e-07, "losses/sft": 0.7149640321731567, "losses/total": 6.332984980872425e-07, "ref_logps/chosen": -196.90086364746094, "ref_logps/rejected": -215.09783935546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0773580074310303, "rewards/margins": 13.185752868652344, "rewards/rejected": -14.26310920715332, "step": 2551 }, { "epoch": 0.61, "learning_rate": 8.613333333333332e-08, "logps/chosen": -228.65496826171875, "logps/rejected": -392.97503662109375, "loss": 0.066, "losses/dpo": 1.0350716950213723e-09, "losses/sft": 0.5035812854766846, "losses/total": 1.0350716950213723e-09, "ref_logps/chosen": -212.67047119140625, "ref_logps/rejected": -242.3483123779297, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5984500646591187, "rewards/margins": 13.464221954345703, "rewards/rejected": -15.06267261505127, "step": 2552 }, { "epoch": 0.61, "learning_rate": 8.608e-08, "logps/chosen": -244.46871948242188, "logps/rejected": -388.54815673828125, "loss": 0.0001, "losses/dpo": 1.0402279926324809e-08, "losses/sft": 0.6333569884300232, "losses/total": 1.0402279926324809e-08, "ref_logps/chosen": -231.30758666992188, "ref_logps/rejected": -226.54421997070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3161122798919678, "rewards/margins": 14.884283065795898, "rewards/rejected": -16.200395584106445, "step": 2553 }, { "epoch": 0.61, "learning_rate": 8.602666666666666e-08, "logps/chosen": -299.66937255859375, "logps/rejected": -414.6568298339844, "loss": 0.0003, "losses/dpo": 1.89703172281952e-07, "losses/sft": 0.5830169916152954, "losses/total": 1.89703172281952e-07, "ref_logps/chosen": -287.158935546875, "ref_logps/rejected": -250.76345825195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.251044750213623, "rewards/margins": 15.13829231262207, "rewards/rejected": -16.38933753967285, "step": 2554 }, { "epoch": 0.61, "learning_rate": 8.597333333333333e-08, "logps/chosen": -235.35569763183594, "logps/rejected": -359.841552734375, "loss": 0.0109, "losses/dpo": 0.01246276218444109, "losses/sft": 0.49688369035720825, "losses/total": 0.01246276218444109, "ref_logps/chosen": -223.6537322998047, "ref_logps/rejected": -220.6731719970703, "rewards/accuracies": 1.0, "rewards/chosen": -1.170196294784546, "rewards/margins": 12.746642112731934, "rewards/rejected": -13.916837692260742, "step": 2555 }, { "epoch": 0.61, "learning_rate": 8.591999999999999e-08, "logps/chosen": -235.25051879882812, "logps/rejected": -390.98046875, "loss": 0.0003, "losses/dpo": 1.4620451516123012e-09, "losses/sft": 0.5586678385734558, "losses/total": 1.4620451516123012e-09, "ref_logps/chosen": -219.851806640625, "ref_logps/rejected": -228.58155822753906, "rewards/accuracies": 1.0, "rewards/chosen": -1.5398706197738647, "rewards/margins": 14.700021743774414, "rewards/rejected": -16.239891052246094, "step": 2556 }, { "epoch": 0.61, "learning_rate": 8.586666666666667e-08, "logps/chosen": -208.29624938964844, "logps/rejected": -355.19451904296875, "loss": 0.0019, "losses/dpo": 1.0270181233229536e-10, "losses/sft": 0.45248547196388245, "losses/total": 1.0270181233229536e-10, "ref_logps/chosen": -194.23533630371094, "ref_logps/rejected": -201.55801391601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4060895442962646, "rewards/margins": 13.957563400268555, "rewards/rejected": -15.363652229309082, "step": 2557 }, { "epoch": 0.61, "learning_rate": 8.581333333333333e-08, "logps/chosen": -193.50088500976562, "logps/rejected": -328.3036193847656, "loss": 0.0088, "losses/dpo": 2.0857342519775557e-07, "losses/sft": 0.5239068865776062, "losses/total": 2.0857342519775557e-07, "ref_logps/chosen": -183.82916259765625, "ref_logps/rejected": -198.33547973632812, "rewards/accuracies": 1.0, "rewards/chosen": -0.9671725630760193, "rewards/margins": 12.029644012451172, "rewards/rejected": -12.996816635131836, "step": 2558 }, { "epoch": 0.61, "learning_rate": 8.576e-08, "logps/chosen": -233.80218505859375, "logps/rejected": -372.6739501953125, "loss": 0.001, "losses/dpo": 1.2871414583059959e-05, "losses/sft": 0.426101952791214, "losses/total": 1.2871414583059959e-05, "ref_logps/chosen": -221.49029541015625, "ref_logps/rejected": -230.54464721679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2311897277832031, "rewards/margins": 12.981742858886719, "rewards/rejected": -14.212931632995605, "step": 2559 }, { "epoch": 0.61, "learning_rate": 8.570666666666666e-08, "logps/chosen": -224.21482849121094, "logps/rejected": -355.6991882324219, "loss": 0.002, "losses/dpo": 5.690813395631267e-06, "losses/sft": 0.6958805918693542, "losses/total": 5.690813395631267e-06, "ref_logps/chosen": -215.9018096923828, "ref_logps/rejected": -213.27560424804688, "rewards/accuracies": 1.0, "rewards/chosen": -0.831302285194397, "rewards/margins": 13.411056518554688, "rewards/rejected": -14.242358207702637, "step": 2560 }, { "epoch": 0.61, "learning_rate": 8.565333333333334e-08, "logps/chosen": -240.5293426513672, "logps/rejected": -370.34674072265625, "loss": 0.0009, "losses/dpo": 3.571320849005133e-05, "losses/sft": 0.9702666401863098, "losses/total": 3.571320849005133e-05, "ref_logps/chosen": -226.5348663330078, "ref_logps/rejected": -214.57171630859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3994488716125488, "rewards/margins": 14.178054809570312, "rewards/rejected": -15.577503204345703, "step": 2561 }, { "epoch": 0.61, "learning_rate": 8.56e-08, "logps/chosen": -313.0767822265625, "logps/rejected": -372.48260498046875, "loss": 0.0016, "losses/dpo": 3.525196916598361e-06, "losses/sft": 0.7251612544059753, "losses/total": 3.525196916598361e-06, "ref_logps/chosen": -299.28131103515625, "ref_logps/rejected": -225.09890747070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3795483112335205, "rewards/margins": 13.358824729919434, "rewards/rejected": -14.738371849060059, "step": 2562 }, { "epoch": 0.62, "learning_rate": 8.554666666666666e-08, "logps/chosen": -216.65216064453125, "logps/rejected": -378.83721923828125, "loss": 0.0004, "losses/dpo": 9.46417788583176e-09, "losses/sft": 0.6111522316932678, "losses/total": 9.46417788583176e-09, "ref_logps/chosen": -206.62612915039062, "ref_logps/rejected": -224.97830200195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0026025772094727, "rewards/margins": 14.38328742980957, "rewards/rejected": -15.385889053344727, "step": 2563 }, { "epoch": 0.62, "learning_rate": 8.549333333333333e-08, "logps/chosen": -206.85829162597656, "logps/rejected": -355.137451171875, "loss": 0.0007, "losses/dpo": 2.0408515410963446e-06, "losses/sft": 0.6722481846809387, "losses/total": 2.0408515410963446e-06, "ref_logps/chosen": -194.27029418945312, "ref_logps/rejected": -212.43421936035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2588019371032715, "rewards/margins": 13.01152229309082, "rewards/rejected": -14.27032470703125, "step": 2564 }, { "epoch": 0.62, "learning_rate": 8.544e-08, "logps/chosen": -182.45101928710938, "logps/rejected": -343.6673278808594, "loss": 0.0119, "losses/dpo": 6.613820460188435e-06, "losses/sft": 0.6919952034950256, "losses/total": 6.613820460188435e-06, "ref_logps/chosen": -172.29783630371094, "ref_logps/rejected": -199.19105529785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0153175592422485, "rewards/margins": 13.432310104370117, "rewards/rejected": -14.447628021240234, "step": 2565 }, { "epoch": 0.62, "learning_rate": 8.538666666666667e-08, "logps/chosen": -258.2399597167969, "logps/rejected": -370.6399230957031, "loss": 0.001, "losses/dpo": 1.2727447984417495e-09, "losses/sft": 0.6834733486175537, "losses/total": 1.2727447984417495e-09, "ref_logps/chosen": -243.13818359375, "ref_logps/rejected": -217.9821319580078, "rewards/accuracies": 1.0, "rewards/chosen": -1.5101768970489502, "rewards/margins": 13.755602836608887, "rewards/rejected": -15.265779495239258, "step": 2566 }, { "epoch": 0.62, "learning_rate": 8.533333333333333e-08, "logps/chosen": -254.67543029785156, "logps/rejected": -368.7297668457031, "loss": 0.0102, "losses/dpo": 0.012741354294121265, "losses/sft": 0.5250460505485535, "losses/total": 0.012741354294121265, "ref_logps/chosen": -242.06890869140625, "ref_logps/rejected": -221.46554565429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2606521844863892, "rewards/margins": 13.465768814086914, "rewards/rejected": -14.726421356201172, "step": 2567 }, { "epoch": 0.62, "learning_rate": 8.527999999999999e-08, "logps/chosen": -233.89053344726562, "logps/rejected": -341.1056823730469, "loss": 0.014, "losses/dpo": 2.1998851351678894e-10, "losses/sft": 0.6797870397567749, "losses/total": 2.1998851351678894e-10, "ref_logps/chosen": -220.22494506835938, "ref_logps/rejected": -196.4180908203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3665573596954346, "rewards/margins": 13.102203369140625, "rewards/rejected": -14.468759536743164, "step": 2568 }, { "epoch": 0.62, "learning_rate": 8.522666666666666e-08, "logps/chosen": -316.1660461425781, "logps/rejected": -419.1802062988281, "loss": 0.0, "losses/dpo": 2.0544232029351406e-05, "losses/sft": 0.6083760857582092, "losses/total": 2.0544232029351406e-05, "ref_logps/chosen": -297.85595703125, "ref_logps/rejected": -250.3190460205078, "rewards/accuracies": 1.0, "rewards/chosen": -1.8310078382492065, "rewards/margins": 15.055108070373535, "rewards/rejected": -16.88611602783203, "step": 2569 }, { "epoch": 0.62, "learning_rate": 8.517333333333333e-08, "logps/chosen": -253.30551147460938, "logps/rejected": -376.480712890625, "loss": 0.0046, "losses/dpo": 5.279923342982329e-08, "losses/sft": 0.6278041005134583, "losses/total": 5.279923342982329e-08, "ref_logps/chosen": -237.19390869140625, "ref_logps/rejected": -228.8753204345703, "rewards/accuracies": 1.0, "rewards/chosen": -1.6111596822738647, "rewards/margins": 13.149381637573242, "rewards/rejected": -14.760540008544922, "step": 2570 }, { "epoch": 0.62, "learning_rate": 8.512e-08, "logps/chosen": -227.986572265625, "logps/rejected": -379.6289367675781, "loss": 0.0, "losses/dpo": 7.940478781165439e-07, "losses/sft": 0.6646056771278381, "losses/total": 7.940478781165439e-07, "ref_logps/chosen": -210.1570281982422, "ref_logps/rejected": -214.09381103515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7829557657241821, "rewards/margins": 14.770559310913086, "rewards/rejected": -16.55351448059082, "step": 2571 }, { "epoch": 0.62, "learning_rate": 8.506666666666666e-08, "logps/chosen": -209.5508270263672, "logps/rejected": -418.5159912109375, "loss": 0.0, "losses/dpo": 6.119157264228292e-11, "losses/sft": 0.4128497242927551, "losses/total": 6.119157264228292e-11, "ref_logps/chosen": -198.22015380859375, "ref_logps/rejected": -252.75039672851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.133066177368164, "rewards/margins": 15.443493843078613, "rewards/rejected": -16.576560974121094, "step": 2572 }, { "epoch": 0.62, "learning_rate": 8.501333333333332e-08, "logps/chosen": -221.4407196044922, "logps/rejected": -383.03961181640625, "loss": 0.0068, "losses/dpo": 8.553463999305677e-07, "losses/sft": 0.4665636122226715, "losses/total": 8.553463999305677e-07, "ref_logps/chosen": -209.6552276611328, "ref_logps/rejected": -227.2655029296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.178549885749817, "rewards/margins": 14.398859977722168, "rewards/rejected": -15.577409744262695, "step": 2573 }, { "epoch": 0.62, "learning_rate": 8.496e-08, "logps/chosen": -215.6713104248047, "logps/rejected": -347.9443359375, "loss": 0.0069, "losses/dpo": 1.0435431931909989e-06, "losses/sft": 0.726797878742218, "losses/total": 1.0435431931909989e-06, "ref_logps/chosen": -200.37608337402344, "ref_logps/rejected": -200.92861938476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.5295240879058838, "rewards/margins": 13.17204761505127, "rewards/rejected": -14.70157241821289, "step": 2574 }, { "epoch": 0.62, "learning_rate": 8.490666666666666e-08, "logps/chosen": -217.46974182128906, "logps/rejected": -376.16107177734375, "loss": 0.0012, "losses/dpo": 1.041182258632034e-06, "losses/sft": 0.45095160603523254, "losses/total": 1.041182258632034e-06, "ref_logps/chosen": -201.70050048828125, "ref_logps/rejected": -222.4837646484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5769240856170654, "rewards/margins": 13.790810585021973, "rewards/rejected": -15.367734909057617, "step": 2575 }, { "epoch": 0.62, "learning_rate": 8.485333333333333e-08, "logps/chosen": -260.37237548828125, "logps/rejected": -400.5390930175781, "loss": 0.0044, "losses/dpo": 9.028305925085078e-08, "losses/sft": 0.5473763942718506, "losses/total": 9.028305925085078e-08, "ref_logps/chosen": -243.6382293701172, "ref_logps/rejected": -242.41708374023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6734143495559692, "rewards/margins": 14.138786315917969, "rewards/rejected": -15.812201499938965, "step": 2576 }, { "epoch": 0.62, "learning_rate": 8.479999999999999e-08, "logps/chosen": -287.18402099609375, "logps/rejected": -411.02593994140625, "loss": 0.0005, "losses/dpo": 1.5781996864916437e-07, "losses/sft": 0.6236373782157898, "losses/total": 1.5781996864916437e-07, "ref_logps/chosen": -273.62420654296875, "ref_logps/rejected": -259.9328308105469, "rewards/accuracies": 1.0, "rewards/chosen": -1.3559834957122803, "rewards/margins": 13.753328323364258, "rewards/rejected": -15.1093111038208, "step": 2577 }, { "epoch": 0.62, "learning_rate": 8.474666666666667e-08, "logps/chosen": -213.97378540039062, "logps/rejected": -382.449462890625, "loss": 0.0017, "losses/dpo": 3.6752307863707756e-09, "losses/sft": 1.0063115358352661, "losses/total": 3.6752307863707756e-09, "ref_logps/chosen": -201.70510864257812, "ref_logps/rejected": -232.6201171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2268691062927246, "rewards/margins": 13.75606632232666, "rewards/rejected": -14.98293685913086, "step": 2578 }, { "epoch": 0.62, "learning_rate": 8.469333333333333e-08, "logps/chosen": -216.56422424316406, "logps/rejected": -389.373291015625, "loss": 0.0005, "losses/dpo": 3.266917986133322e-12, "losses/sft": 0.6576153039932251, "losses/total": 3.266917986133322e-12, "ref_logps/chosen": -204.2621307373047, "ref_logps/rejected": -222.71694946289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.230210304260254, "rewards/margins": 15.4354248046875, "rewards/rejected": -16.665634155273438, "step": 2579 }, { "epoch": 0.62, "learning_rate": 8.464e-08, "logps/chosen": -261.9508056640625, "logps/rejected": -421.34942626953125, "loss": 0.0002, "losses/dpo": 3.755426547513707e-08, "losses/sft": 0.6774671077728271, "losses/total": 3.755426547513707e-08, "ref_logps/chosen": -247.5092315673828, "ref_logps/rejected": -247.14895629882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4441567659378052, "rewards/margins": 15.975890159606934, "rewards/rejected": -17.420047760009766, "step": 2580 }, { "epoch": 0.62, "learning_rate": 8.458666666666666e-08, "logps/chosen": -212.80029296875, "logps/rejected": -329.87994384765625, "loss": 0.0036, "losses/dpo": 7.354690865213342e-07, "losses/sft": 0.4910261631011963, "losses/total": 7.354690865213342e-07, "ref_logps/chosen": -203.587646484375, "ref_logps/rejected": -196.93499755859375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9212659597396851, "rewards/margins": 12.373229026794434, "rewards/rejected": -13.294495582580566, "step": 2581 }, { "epoch": 0.62, "learning_rate": 8.453333333333334e-08, "logps/chosen": -249.640380859375, "logps/rejected": -344.3807067871094, "loss": 0.0032, "losses/dpo": 1.8563352568889968e-05, "losses/sft": 0.8013087511062622, "losses/total": 1.8563352568889968e-05, "ref_logps/chosen": -241.19776916503906, "ref_logps/rejected": -216.342041015625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8442612290382385, "rewards/margins": 11.95960521697998, "rewards/rejected": -12.803866386413574, "step": 2582 }, { "epoch": 0.62, "learning_rate": 8.448e-08, "logps/chosen": -249.22250366210938, "logps/rejected": -407.59161376953125, "loss": 0.0001, "losses/dpo": 6.826680554183895e-09, "losses/sft": 0.4952363073825836, "losses/total": 6.826680554183895e-09, "ref_logps/chosen": -234.9183349609375, "ref_logps/rejected": -241.655517578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.430417776107788, "rewards/margins": 15.163188934326172, "rewards/rejected": -16.593608856201172, "step": 2583 }, { "epoch": 0.62, "learning_rate": 8.442666666666666e-08, "logps/chosen": -273.3056640625, "logps/rejected": -375.9482421875, "loss": 0.0022, "losses/dpo": 3.873025988809786e-08, "losses/sft": 1.1387662887573242, "losses/total": 3.873025988809786e-08, "ref_logps/chosen": -258.9555358886719, "ref_logps/rejected": -221.2054901123047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4350130558013916, "rewards/margins": 14.039263725280762, "rewards/rejected": -15.474275588989258, "step": 2584 }, { "epoch": 0.62, "learning_rate": 8.437333333333333e-08, "logps/chosen": -215.89102172851562, "logps/rejected": -337.7173156738281, "loss": 0.0014, "losses/dpo": 6.414348519001578e-08, "losses/sft": 0.6980515122413635, "losses/total": 6.414348519001578e-08, "ref_logps/chosen": -202.63540649414062, "ref_logps/rejected": -202.73883056640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3255630731582642, "rewards/margins": 12.172286987304688, "rewards/rejected": -13.49785041809082, "step": 2585 }, { "epoch": 0.62, "learning_rate": 8.431999999999999e-08, "logps/chosen": -218.32781982421875, "logps/rejected": -349.014404296875, "loss": 0.0023, "losses/dpo": 1.3854891626335908e-12, "losses/sft": 0.6776223182678223, "losses/total": 1.3854891626335908e-12, "ref_logps/chosen": -206.6015625, "ref_logps/rejected": -204.8570556640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.17262601852417, "rewards/margins": 13.243110656738281, "rewards/rejected": -14.415736198425293, "step": 2586 }, { "epoch": 0.62, "learning_rate": 8.426666666666667e-08, "logps/chosen": -264.403076171875, "logps/rejected": -372.9270324707031, "loss": 0.0005, "losses/dpo": 3.9247742833481425e-09, "losses/sft": 0.5180347561836243, "losses/total": 3.9247742833481425e-09, "ref_logps/chosen": -252.51483154296875, "ref_logps/rejected": -227.02484130859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.188823938369751, "rewards/margins": 13.401396751403809, "rewards/rejected": -14.590221405029297, "step": 2587 }, { "epoch": 0.62, "learning_rate": 8.421333333333333e-08, "logps/chosen": -213.79505920410156, "logps/rejected": -356.3222351074219, "loss": 0.005, "losses/dpo": 9.878140971864013e-09, "losses/sft": 0.8215731978416443, "losses/total": 9.878140971864013e-09, "ref_logps/chosen": -202.52720642089844, "ref_logps/rejected": -213.18972778320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1267861127853394, "rewards/margins": 13.186468124389648, "rewards/rejected": -14.313253402709961, "step": 2588 }, { "epoch": 0.62, "learning_rate": 8.416e-08, "logps/chosen": -228.71575927734375, "logps/rejected": -368.60595703125, "loss": 0.0048, "losses/dpo": 8.216682800821218e-08, "losses/sft": 0.4696879982948303, "losses/total": 8.216682800821218e-08, "ref_logps/chosen": -217.0951690673828, "ref_logps/rejected": -222.68692016601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1620593070983887, "rewards/margins": 13.429847717285156, "rewards/rejected": -14.591907501220703, "step": 2589 }, { "epoch": 0.62, "learning_rate": 8.410666666666666e-08, "logps/chosen": -254.59857177734375, "logps/rejected": -360.58673095703125, "loss": 0.0062, "losses/dpo": 3.522195513028237e-08, "losses/sft": 0.7807767987251282, "losses/total": 3.522195513028237e-08, "ref_logps/chosen": -234.6222381591797, "ref_logps/rejected": -213.67343139648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9976332187652588, "rewards/margins": 12.693696022033691, "rewards/rejected": -14.691329002380371, "step": 2590 }, { "epoch": 0.62, "learning_rate": 8.405333333333333e-08, "logps/chosen": -258.0845947265625, "logps/rejected": -396.863525390625, "loss": 0.0029, "losses/dpo": 2.0545427105389535e-05, "losses/sft": 0.6453127264976501, "losses/total": 2.0545427105389535e-05, "ref_logps/chosen": -241.9594268798828, "ref_logps/rejected": -240.0741424560547, "rewards/accuracies": 1.0, "rewards/chosen": -1.612515926361084, "rewards/margins": 14.066424369812012, "rewards/rejected": -15.678939819335938, "step": 2591 }, { "epoch": 0.62, "learning_rate": 8.4e-08, "logps/chosen": -267.2237548828125, "logps/rejected": -388.8734130859375, "loss": 0.0038, "losses/dpo": 2.589203802472184e-07, "losses/sft": 0.6531758904457092, "losses/total": 2.589203802472184e-07, "ref_logps/chosen": -254.24501037597656, "ref_logps/rejected": -233.70602416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2978730201721191, "rewards/margins": 14.21886920928955, "rewards/rejected": -15.516743659973145, "step": 2592 }, { "epoch": 0.62, "learning_rate": 8.394666666666666e-08, "logps/chosen": -199.96990966796875, "logps/rejected": -300.5130615234375, "loss": 0.0042, "losses/dpo": 1.2932963144862697e-08, "losses/sft": 0.778632402420044, "losses/total": 1.2932963144862697e-08, "ref_logps/chosen": -188.26864624023438, "ref_logps/rejected": -181.2875518798828, "rewards/accuracies": 1.0, "rewards/chosen": -1.1701250076293945, "rewards/margins": 10.75242805480957, "rewards/rejected": -11.922552108764648, "step": 2593 }, { "epoch": 0.62, "learning_rate": 8.389333333333332e-08, "logps/chosen": -262.8861083984375, "logps/rejected": -410.55517578125, "loss": 0.0001, "losses/dpo": 9.192504535349144e-07, "losses/sft": 0.6471185088157654, "losses/total": 9.192504535349144e-07, "ref_logps/chosen": -252.30235290527344, "ref_logps/rejected": -250.23768615722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.0583784580230713, "rewards/margins": 14.973371505737305, "rewards/rejected": -16.031749725341797, "step": 2594 }, { "epoch": 0.62, "learning_rate": 8.384e-08, "logps/chosen": -251.41055297851562, "logps/rejected": -357.693359375, "loss": 0.0015, "losses/dpo": 2.1165462385397404e-05, "losses/sft": 1.0440821647644043, "losses/total": 2.1165462385397404e-05, "ref_logps/chosen": -233.06878662109375, "ref_logps/rejected": -222.58038330078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8341782093048096, "rewards/margins": 11.677117347717285, "rewards/rejected": -13.511295318603516, "step": 2595 }, { "epoch": 0.62, "learning_rate": 8.378666666666667e-08, "logps/chosen": -235.2710418701172, "logps/rejected": -392.083984375, "loss": 0.0037, "losses/dpo": 6.9477286146479855e-09, "losses/sft": 1.0903071165084839, "losses/total": 6.9477286146479855e-09, "ref_logps/chosen": -225.4890594482422, "ref_logps/rejected": -235.708740234375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9781986474990845, "rewards/margins": 14.659326553344727, "rewards/rejected": -15.63752555847168, "step": 2596 }, { "epoch": 0.62, "learning_rate": 8.373333333333333e-08, "logps/chosen": -238.81655883789062, "logps/rejected": -371.18017578125, "loss": 0.0013, "losses/dpo": 1.9318140402901918e-06, "losses/sft": 0.5166619420051575, "losses/total": 1.9318140402901918e-06, "ref_logps/chosen": -221.2509765625, "ref_logps/rejected": -215.08815002441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.756555438041687, "rewards/margins": 13.852649688720703, "rewards/rejected": -15.60920524597168, "step": 2597 }, { "epoch": 0.62, "learning_rate": 8.367999999999999e-08, "logps/chosen": -223.5009765625, "logps/rejected": -359.7727966308594, "loss": 0.0041, "losses/dpo": 6.459031283156946e-07, "losses/sft": 0.6876288652420044, "losses/total": 6.459031283156946e-07, "ref_logps/chosen": -208.58786010742188, "ref_logps/rejected": -204.3538360595703, "rewards/accuracies": 1.0, "rewards/chosen": -1.4913090467453003, "rewards/margins": 14.050588607788086, "rewards/rejected": -15.54189682006836, "step": 2598 }, { "epoch": 0.62, "learning_rate": 8.362666666666667e-08, "logps/chosen": -252.6735076904297, "logps/rejected": -409.2117004394531, "loss": 0.0024, "losses/dpo": 5.866578056679117e-12, "losses/sft": 0.561637282371521, "losses/total": 5.866578056679117e-12, "ref_logps/chosen": -239.3270263671875, "ref_logps/rejected": -257.013916015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.334647536277771, "rewards/margins": 13.885135650634766, "rewards/rejected": -15.219783782958984, "step": 2599 }, { "epoch": 0.62, "learning_rate": 8.357333333333333e-08, "logps/chosen": -280.45440673828125, "logps/rejected": -389.078369140625, "loss": 0.001, "losses/dpo": 6.736458058043127e-10, "losses/sft": 0.5103203058242798, "losses/total": 6.736458058043127e-10, "ref_logps/chosen": -261.4869079589844, "ref_logps/rejected": -228.47837829589844, "rewards/accuracies": 1.0, "rewards/chosen": -1.8967503309249878, "rewards/margins": 14.163249015808105, "rewards/rejected": -16.059999465942383, "step": 2600 }, { "epoch": 0.62, "learning_rate": 8.352e-08, "logps/chosen": -279.02685546875, "logps/rejected": -399.13958740234375, "loss": 0.0002, "losses/dpo": 5.842366146424638e-09, "losses/sft": 0.6165714263916016, "losses/total": 5.842366146424638e-09, "ref_logps/chosen": -263.27099609375, "ref_logps/rejected": -237.45809936523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5755852460861206, "rewards/margins": 14.592560768127441, "rewards/rejected": -16.168148040771484, "step": 2601 }, { "epoch": 0.62, "learning_rate": 8.346666666666666e-08, "logps/chosen": -253.3692626953125, "logps/rejected": -380.3880310058594, "loss": 0.0039, "losses/dpo": 6.694644127946958e-08, "losses/sft": 0.5941912531852722, "losses/total": 6.694644127946958e-08, "ref_logps/chosen": -241.46243286132812, "ref_logps/rejected": -228.7844696044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.190684199333191, "rewards/margins": 13.969669342041016, "rewards/rejected": -15.160354614257812, "step": 2602 }, { "epoch": 0.62, "learning_rate": 8.341333333333332e-08, "logps/chosen": -230.57159423828125, "logps/rejected": -400.44012451171875, "loss": 0.0004, "losses/dpo": 1.506733759981671e-08, "losses/sft": 0.91917884349823, "losses/total": 1.506733759981671e-08, "ref_logps/chosen": -216.10076904296875, "ref_logps/rejected": -229.3763427734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4470818042755127, "rewards/margins": 15.659294128417969, "rewards/rejected": -17.10637664794922, "step": 2603 }, { "epoch": 0.62, "learning_rate": 8.336e-08, "logps/chosen": -289.0740661621094, "logps/rejected": -418.27581787109375, "loss": 0.0005, "losses/dpo": 1.1734991645084847e-08, "losses/sft": 0.5521331429481506, "losses/total": 1.1734991645084847e-08, "ref_logps/chosen": -271.3929138183594, "ref_logps/rejected": -252.8068084716797, "rewards/accuracies": 1.0, "rewards/chosen": -1.7681173086166382, "rewards/margins": 14.77878475189209, "rewards/rejected": -16.54690170288086, "step": 2604 }, { "epoch": 0.63, "learning_rate": 8.330666666666666e-08, "logps/chosen": -217.6448974609375, "logps/rejected": -362.4423828125, "loss": 0.0018, "losses/dpo": 3.0966580766289553e-07, "losses/sft": 0.7206001281738281, "losses/total": 3.0966580766289553e-07, "ref_logps/chosen": -205.8201904296875, "ref_logps/rejected": -224.44789123535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.182471513748169, "rewards/margins": 12.616978645324707, "rewards/rejected": -13.799450874328613, "step": 2605 }, { "epoch": 0.63, "learning_rate": 8.325333333333333e-08, "logps/chosen": -214.36607360839844, "logps/rejected": -355.35809326171875, "loss": 0.0039, "losses/dpo": 1.3436955725865118e-07, "losses/sft": 0.5616378784179688, "losses/total": 1.3436955725865118e-07, "ref_logps/chosen": -201.3533477783203, "ref_logps/rejected": -203.03282165527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3012731075286865, "rewards/margins": 13.931256294250488, "rewards/rejected": -15.232528686523438, "step": 2606 }, { "epoch": 0.63, "learning_rate": 8.319999999999999e-08, "logps/chosen": -232.29586791992188, "logps/rejected": -380.7147216796875, "loss": 0.0026, "losses/dpo": 4.380565243877754e-09, "losses/sft": 0.5552980303764343, "losses/total": 4.380565243877754e-09, "ref_logps/chosen": -216.93434143066406, "ref_logps/rejected": -225.92919921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5361521244049072, "rewards/margins": 13.942399978637695, "rewards/rejected": -15.478551864624023, "step": 2607 }, { "epoch": 0.63, "learning_rate": 8.314666666666667e-08, "logps/chosen": -236.05914306640625, "logps/rejected": -351.7141418457031, "loss": 0.003, "losses/dpo": 6.493660720252592e-08, "losses/sft": 0.4378453493118286, "losses/total": 6.493660720252592e-08, "ref_logps/chosen": -222.22817993164062, "ref_logps/rejected": -214.61831665039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3830971717834473, "rewards/margins": 12.32648754119873, "rewards/rejected": -13.709585189819336, "step": 2608 }, { "epoch": 0.63, "learning_rate": 8.309333333333333e-08, "logps/chosen": -222.26190185546875, "logps/rejected": -369.6341552734375, "loss": 0.001, "losses/dpo": 3.8293873672046175e-07, "losses/sft": 0.7418511509895325, "losses/total": 3.8293873672046175e-07, "ref_logps/chosen": -210.72540283203125, "ref_logps/rejected": -223.28805541992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1536474227905273, "rewards/margins": 13.480962753295898, "rewards/rejected": -14.63460922241211, "step": 2609 }, { "epoch": 0.63, "learning_rate": 8.304e-08, "logps/chosen": -215.87539672851562, "logps/rejected": -353.50689697265625, "loss": 0.0008, "losses/dpo": 2.0062424255229416e-07, "losses/sft": 1.3188802003860474, "losses/total": 2.0062424255229416e-07, "ref_logps/chosen": -202.63478088378906, "ref_logps/rejected": -203.62637329101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3240617513656616, "rewards/margins": 13.663989067077637, "rewards/rejected": -14.988051414489746, "step": 2610 }, { "epoch": 0.63, "learning_rate": 8.298666666666666e-08, "logps/chosen": -247.3969268798828, "logps/rejected": -358.7458801269531, "loss": 0.0062, "losses/dpo": 1.6458510572192608e-06, "losses/sft": 0.5386102199554443, "losses/total": 1.6458510572192608e-06, "ref_logps/chosen": -228.72088623046875, "ref_logps/rejected": -208.89730834960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8676042556762695, "rewards/margins": 13.117252349853516, "rewards/rejected": -14.984856605529785, "step": 2611 }, { "epoch": 0.63, "learning_rate": 8.293333333333333e-08, "logps/chosen": -207.63751220703125, "logps/rejected": -354.69287109375, "loss": 0.0004, "losses/dpo": 8.256188621658112e-09, "losses/sft": 0.577063262462616, "losses/total": 8.256188621658112e-09, "ref_logps/chosen": -197.48553466796875, "ref_logps/rejected": -211.0398406982422, "rewards/accuracies": 1.0, "rewards/chosen": -1.0151982307434082, "rewards/margins": 13.350102424621582, "rewards/rejected": -14.365301132202148, "step": 2612 }, { "epoch": 0.63, "learning_rate": 8.288e-08, "logps/chosen": -259.5014953613281, "logps/rejected": -391.60052490234375, "loss": 0.0007, "losses/dpo": 2.7039068900558405e-11, "losses/sft": 0.7183183431625366, "losses/total": 2.7039068900558405e-11, "ref_logps/chosen": -244.43080139160156, "ref_logps/rejected": -227.969482421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5070712566375732, "rewards/margins": 14.856035232543945, "rewards/rejected": -16.36310577392578, "step": 2613 }, { "epoch": 0.63, "learning_rate": 8.282666666666666e-08, "logps/chosen": -226.6422119140625, "logps/rejected": -379.40679931640625, "loss": 0.0004, "losses/dpo": 6.9321745286288206e-06, "losses/sft": 0.5194158554077148, "losses/total": 6.9321745286288206e-06, "ref_logps/chosen": -217.09915161132812, "ref_logps/rejected": -231.94949340820312, "rewards/accuracies": 1.0, "rewards/chosen": -0.9543063044548035, "rewards/margins": 13.791421890258789, "rewards/rejected": -14.7457275390625, "step": 2614 }, { "epoch": 0.63, "learning_rate": 8.277333333333333e-08, "logps/chosen": -283.5508117675781, "logps/rejected": -416.8413391113281, "loss": 0.0025, "losses/dpo": 1.4111084965406917e-05, "losses/sft": 0.5746532678604126, "losses/total": 1.4111084965406917e-05, "ref_logps/chosen": -264.55364990234375, "ref_logps/rejected": -254.31825256347656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8997132778167725, "rewards/margins": 14.352598190307617, "rewards/rejected": -16.25231170654297, "step": 2615 }, { "epoch": 0.63, "learning_rate": 8.272e-08, "logps/chosen": -199.592529296875, "logps/rejected": -348.1997375488281, "loss": 0.0055, "losses/dpo": 2.814411573126563e-06, "losses/sft": 0.568132221698761, "losses/total": 2.814411573126563e-06, "ref_logps/chosen": -191.55380249023438, "ref_logps/rejected": -207.8323211669922, "rewards/accuracies": 1.0, "rewards/chosen": -0.8038707375526428, "rewards/margins": 13.23287296295166, "rewards/rejected": -14.0367431640625, "step": 2616 }, { "epoch": 0.63, "learning_rate": 8.266666666666667e-08, "logps/chosen": -197.78912353515625, "logps/rejected": -344.13543701171875, "loss": 0.0022, "losses/dpo": 4.4672114896116e-09, "losses/sft": 0.6382784843444824, "losses/total": 4.4672114896116e-09, "ref_logps/chosen": -186.21356201171875, "ref_logps/rejected": -199.75546264648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.157554030418396, "rewards/margins": 13.280441284179688, "rewards/rejected": -14.437994956970215, "step": 2617 }, { "epoch": 0.63, "learning_rate": 8.261333333333333e-08, "logps/chosen": -231.89297485351562, "logps/rejected": -361.5591125488281, "loss": 0.0183, "losses/dpo": 3.882126287102494e-10, "losses/sft": 0.595874547958374, "losses/total": 3.882126287102494e-10, "ref_logps/chosen": -216.64755249023438, "ref_logps/rejected": -207.64352416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5245404243469238, "rewards/margins": 13.867019653320312, "rewards/rejected": -15.391559600830078, "step": 2618 }, { "epoch": 0.63, "learning_rate": 8.255999999999999e-08, "logps/chosen": -278.203125, "logps/rejected": -382.1341247558594, "loss": 0.0126, "losses/dpo": 1.45621945057961e-10, "losses/sft": 0.6630565524101257, "losses/total": 1.45621945057961e-10, "ref_logps/chosen": -263.52606201171875, "ref_logps/rejected": -228.7919921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.467708706855774, "rewards/margins": 13.866504669189453, "rewards/rejected": -15.334214210510254, "step": 2619 }, { "epoch": 0.63, "learning_rate": 8.250666666666666e-08, "logps/chosen": -251.36817932128906, "logps/rejected": -422.33355712890625, "loss": 0.0015, "losses/dpo": 1.794063786419997e-09, "losses/sft": 0.49252671003341675, "losses/total": 1.794063786419997e-09, "ref_logps/chosen": -238.27865600585938, "ref_logps/rejected": -262.45904541015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3089526891708374, "rewards/margins": 14.678500175476074, "rewards/rejected": -15.987452507019043, "step": 2620 }, { "epoch": 0.63, "learning_rate": 8.245333333333333e-08, "logps/chosen": -217.30845642089844, "logps/rejected": -338.3323974609375, "loss": 0.0002, "losses/dpo": 3.2927044202324396e-08, "losses/sft": 0.6576458215713501, "losses/total": 3.2927044202324396e-08, "ref_logps/chosen": -206.96702575683594, "ref_logps/rejected": -199.69247436523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0341429710388184, "rewards/margins": 12.829851150512695, "rewards/rejected": -13.863994598388672, "step": 2621 }, { "epoch": 0.63, "learning_rate": 8.24e-08, "logps/chosen": -216.7017822265625, "logps/rejected": -359.85107421875, "loss": 0.0013, "losses/dpo": 1.8467439133473817e-08, "losses/sft": 0.5687446594238281, "losses/total": 1.8467439133473817e-08, "ref_logps/chosen": -200.4991455078125, "ref_logps/rejected": -204.6064453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6202634572982788, "rewards/margins": 13.904199600219727, "rewards/rejected": -15.524462699890137, "step": 2622 }, { "epoch": 0.63, "learning_rate": 8.234666666666666e-08, "logps/chosen": -213.0919189453125, "logps/rejected": -398.33160400390625, "loss": 0.0004, "losses/dpo": 3.445299157078807e-08, "losses/sft": 0.3852906823158264, "losses/total": 3.445299157078807e-08, "ref_logps/chosen": -200.406005859375, "ref_logps/rejected": -241.1290283203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.268592119216919, "rewards/margins": 14.451663970947266, "rewards/rejected": -15.720254898071289, "step": 2623 }, { "epoch": 0.63, "learning_rate": 8.229333333333332e-08, "logps/chosen": -218.81802368164062, "logps/rejected": -353.94085693359375, "loss": 0.0007, "losses/dpo": 2.6800901764545415e-07, "losses/sft": 1.0713340044021606, "losses/total": 2.6800901764545415e-07, "ref_logps/chosen": -206.4056396484375, "ref_logps/rejected": -205.0465850830078, "rewards/accuracies": 1.0, "rewards/chosen": -1.241237998008728, "rewards/margins": 13.648189544677734, "rewards/rejected": -14.889427185058594, "step": 2624 }, { "epoch": 0.63, "learning_rate": 8.224e-08, "logps/chosen": -245.85482788085938, "logps/rejected": -376.97039794921875, "loss": 0.0019, "losses/dpo": 4.839050689042779e-06, "losses/sft": 0.4792114198207855, "losses/total": 4.839050689042779e-06, "ref_logps/chosen": -231.5870361328125, "ref_logps/rejected": -217.46986389160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4267780780792236, "rewards/margins": 14.523275375366211, "rewards/rejected": -15.950055122375488, "step": 2625 }, { "epoch": 0.63, "learning_rate": 8.218666666666666e-08, "logps/chosen": -249.71847534179688, "logps/rejected": -374.64422607421875, "loss": 0.0005, "losses/dpo": 7.47577701076807e-07, "losses/sft": 0.5900368094444275, "losses/total": 7.47577701076807e-07, "ref_logps/chosen": -238.02804565429688, "ref_logps/rejected": -219.99481201171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1690428256988525, "rewards/margins": 14.295900344848633, "rewards/rejected": -15.464942932128906, "step": 2626 }, { "epoch": 0.63, "learning_rate": 8.213333333333333e-08, "logps/chosen": -246.2582244873047, "logps/rejected": -401.9337158203125, "loss": 0.0013, "losses/dpo": 8.600947154491223e-08, "losses/sft": 0.5045589804649353, "losses/total": 8.600947154491223e-08, "ref_logps/chosen": -227.51055908203125, "ref_logps/rejected": -244.33201599121094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8747661113739014, "rewards/margins": 13.885403633117676, "rewards/rejected": -15.760169982910156, "step": 2627 }, { "epoch": 0.63, "learning_rate": 8.207999999999999e-08, "logps/chosen": -225.89263916015625, "logps/rejected": -366.0123596191406, "loss": 0.0008, "losses/dpo": 1.3524415010124358e-07, "losses/sft": 0.545307993888855, "losses/total": 1.3524415010124358e-07, "ref_logps/chosen": -213.83453369140625, "ref_logps/rejected": -222.6425323486328, "rewards/accuracies": 1.0, "rewards/chosen": -1.2058110237121582, "rewards/margins": 13.131172180175781, "rewards/rejected": -14.336983680725098, "step": 2628 }, { "epoch": 0.63, "learning_rate": 8.202666666666667e-08, "logps/chosen": -243.2987060546875, "logps/rejected": -369.6585388183594, "loss": 0.0116, "losses/dpo": 8.82089423726029e-09, "losses/sft": 0.49878573417663574, "losses/total": 8.82089423726029e-09, "ref_logps/chosen": -229.98284912109375, "ref_logps/rejected": -225.8712921142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.331585168838501, "rewards/margins": 13.047138214111328, "rewards/rejected": -14.37872314453125, "step": 2629 }, { "epoch": 0.63, "learning_rate": 8.197333333333333e-08, "logps/chosen": -174.29574584960938, "logps/rejected": -338.0757141113281, "loss": 0.0233, "losses/dpo": 2.1959782316116616e-05, "losses/sft": 0.37745344638824463, "losses/total": 2.1959782316116616e-05, "ref_logps/chosen": -164.81292724609375, "ref_logps/rejected": -201.6544952392578, "rewards/accuracies": 1.0, "rewards/chosen": -0.9482825398445129, "rewards/margins": 12.693838119506836, "rewards/rejected": -13.642120361328125, "step": 2630 }, { "epoch": 0.63, "learning_rate": 8.192e-08, "logps/chosen": -243.88336181640625, "logps/rejected": -372.12353515625, "loss": 0.0058, "losses/dpo": 1.3300169143803942e-07, "losses/sft": 0.6704740524291992, "losses/total": 1.3300169143803942e-07, "ref_logps/chosen": -230.3245391845703, "ref_logps/rejected": -221.18212890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3558818101882935, "rewards/margins": 13.738256454467773, "rewards/rejected": -15.094139099121094, "step": 2631 }, { "epoch": 0.63, "learning_rate": 8.186666666666666e-08, "logps/chosen": -301.9359436035156, "logps/rejected": -424.47332763671875, "loss": 0.0008, "losses/dpo": 5.258904366201023e-06, "losses/sft": 0.6845031976699829, "losses/total": 5.258904366201023e-06, "ref_logps/chosen": -284.6892395019531, "ref_logps/rejected": -260.82147216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.724666714668274, "rewards/margins": 14.640518188476562, "rewards/rejected": -16.365184783935547, "step": 2632 }, { "epoch": 0.63, "learning_rate": 8.181333333333334e-08, "logps/chosen": -251.81118774414062, "logps/rejected": -394.3677062988281, "loss": 0.0002, "losses/dpo": 1.6490722121176304e-09, "losses/sft": 0.5668243169784546, "losses/total": 1.6490722121176304e-09, "ref_logps/chosen": -235.80413818359375, "ref_logps/rejected": -229.607666015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6007075309753418, "rewards/margins": 14.875293731689453, "rewards/rejected": -16.476001739501953, "step": 2633 }, { "epoch": 0.63, "learning_rate": 8.176e-08, "logps/chosen": -304.5751953125, "logps/rejected": -391.035400390625, "loss": 0.0003, "losses/dpo": 1.5518045515250378e-08, "losses/sft": 0.4674060046672821, "losses/total": 1.5518045515250378e-08, "ref_logps/chosen": -292.2821044921875, "ref_logps/rejected": -245.92279052734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2293097972869873, "rewards/margins": 13.281949996948242, "rewards/rejected": -14.511260986328125, "step": 2634 }, { "epoch": 0.63, "learning_rate": 8.170666666666666e-08, "logps/chosen": -225.95071411132812, "logps/rejected": -340.1015319824219, "loss": 0.0023, "losses/dpo": 1.4978631182605007e-11, "losses/sft": 0.9136857986450195, "losses/total": 1.4978631182605007e-11, "ref_logps/chosen": -208.56344604492188, "ref_logps/rejected": -198.50904846191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7387264966964722, "rewards/margins": 12.420522689819336, "rewards/rejected": -14.159248352050781, "step": 2635 }, { "epoch": 0.63, "learning_rate": 8.165333333333333e-08, "logps/chosen": -209.2057342529297, "logps/rejected": -401.8856201171875, "loss": 0.0014, "losses/dpo": 2.1389337234722916e-07, "losses/sft": 0.8455060124397278, "losses/total": 2.1389337234722916e-07, "ref_logps/chosen": -199.31800842285156, "ref_logps/rejected": -235.745849609375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9887728691101074, "rewards/margins": 15.625202178955078, "rewards/rejected": -16.613975524902344, "step": 2636 }, { "epoch": 0.63, "learning_rate": 8.159999999999999e-08, "logps/chosen": -248.6187744140625, "logps/rejected": -393.25421142578125, "loss": 0.0002, "losses/dpo": 4.039387491161506e-08, "losses/sft": 0.6557378768920898, "losses/total": 4.039387491161506e-08, "ref_logps/chosen": -237.86648559570312, "ref_logps/rejected": -244.5269775390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0752314329147339, "rewards/margins": 13.797492027282715, "rewards/rejected": -14.872723579406738, "step": 2637 }, { "epoch": 0.63, "learning_rate": 8.154666666666667e-08, "logps/chosen": -258.329833984375, "logps/rejected": -388.5717468261719, "loss": 0.0002, "losses/dpo": 5.117715695668323e-10, "losses/sft": 0.47359853982925415, "losses/total": 5.117715695668323e-10, "ref_logps/chosen": -242.81466674804688, "ref_logps/rejected": -228.90652465820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5515161752700806, "rewards/margins": 14.415002822875977, "rewards/rejected": -15.96651840209961, "step": 2638 }, { "epoch": 0.63, "learning_rate": 8.149333333333333e-08, "logps/chosen": -259.5225830078125, "logps/rejected": -383.0189208984375, "loss": 0.0007, "losses/dpo": 1.8530597500898693e-08, "losses/sft": 0.8655976057052612, "losses/total": 1.8530597500898693e-08, "ref_logps/chosen": -250.6000518798828, "ref_logps/rejected": -229.42193603515625, "rewards/accuracies": 1.0, "rewards/chosen": -0.8922525644302368, "rewards/margins": 14.467443466186523, "rewards/rejected": -15.359695434570312, "step": 2639 }, { "epoch": 0.63, "learning_rate": 8.144e-08, "logps/chosen": -229.18032836914062, "logps/rejected": -341.5494384765625, "loss": 0.0052, "losses/dpo": 1.154992901319929e-08, "losses/sft": 0.46575891971588135, "losses/total": 1.154992901319929e-08, "ref_logps/chosen": -216.39468383789062, "ref_logps/rejected": -199.78662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2785615921020508, "rewards/margins": 12.897722244262695, "rewards/rejected": -14.176284790039062, "step": 2640 }, { "epoch": 0.63, "learning_rate": 8.138666666666666e-08, "logps/chosen": -212.1249542236328, "logps/rejected": -351.297119140625, "loss": 0.0003, "losses/dpo": 4.069031092512887e-06, "losses/sft": 0.8895800709724426, "losses/total": 4.069031092512887e-06, "ref_logps/chosen": -198.87132263183594, "ref_logps/rejected": -207.61477661132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3253635168075562, "rewards/margins": 13.042868614196777, "rewards/rejected": -14.368231773376465, "step": 2641 }, { "epoch": 0.63, "learning_rate": 8.133333333333333e-08, "logps/chosen": -232.5130615234375, "logps/rejected": -328.3144836425781, "loss": 0.0031, "losses/dpo": 1.960926283572917e-06, "losses/sft": 0.863074004650116, "losses/total": 1.960926283572917e-06, "ref_logps/chosen": -220.01498413085938, "ref_logps/rejected": -200.45501708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.249807596206665, "rewards/margins": 11.536140441894531, "rewards/rejected": -12.785947799682617, "step": 2642 }, { "epoch": 0.63, "learning_rate": 8.128e-08, "logps/chosen": -290.0380554199219, "logps/rejected": -400.09918212890625, "loss": 0.003, "losses/dpo": 5.410297987396007e-09, "losses/sft": 0.5499901175498962, "losses/total": 5.410297987396007e-09, "ref_logps/chosen": -273.3919982910156, "ref_logps/rejected": -244.22360229492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6646060943603516, "rewards/margins": 13.922952651977539, "rewards/rejected": -15.58755874633789, "step": 2643 }, { "epoch": 0.63, "learning_rate": 8.122666666666666e-08, "logps/chosen": -223.64109802246094, "logps/rejected": -412.5223388671875, "loss": 0.0001, "losses/dpo": 3.4113877944719206e-08, "losses/sft": 0.641564667224884, "losses/total": 3.4113877944719206e-08, "ref_logps/chosen": -212.1126251220703, "ref_logps/rejected": -244.6995086669922, "rewards/accuracies": 1.0, "rewards/chosen": -1.152848243713379, "rewards/margins": 15.629434585571289, "rewards/rejected": -16.78228187561035, "step": 2644 }, { "epoch": 0.63, "learning_rate": 8.117333333333332e-08, "logps/chosen": -238.15878295898438, "logps/rejected": -399.82452392578125, "loss": 0.0043, "losses/dpo": 6.630842541710535e-10, "losses/sft": 0.5448419451713562, "losses/total": 6.630842541710535e-10, "ref_logps/chosen": -224.35751342773438, "ref_logps/rejected": -231.30276489257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3801274299621582, "rewards/margins": 15.472049713134766, "rewards/rejected": -16.852176666259766, "step": 2645 }, { "epoch": 0.63, "learning_rate": 8.112e-08, "logps/chosen": -296.89007568359375, "logps/rejected": -416.25860595703125, "loss": 0.0, "losses/dpo": 9.846671034097199e-09, "losses/sft": 0.6996808052062988, "losses/total": 9.846671034097199e-09, "ref_logps/chosen": -279.35089111328125, "ref_logps/rejected": -254.06512451171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7539153099060059, "rewards/margins": 14.465434074401855, "rewards/rejected": -16.219348907470703, "step": 2646 }, { "epoch": 0.64, "learning_rate": 8.106666666666666e-08, "logps/chosen": -257.4575500488281, "logps/rejected": -391.7779235839844, "loss": 0.0001, "losses/dpo": 3.166488227179798e-07, "losses/sft": 0.6454625129699707, "losses/total": 3.166488227179798e-07, "ref_logps/chosen": -246.15858459472656, "ref_logps/rejected": -238.15774536132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.129895806312561, "rewards/margins": 14.232121467590332, "rewards/rejected": -15.362017631530762, "step": 2647 }, { "epoch": 0.64, "learning_rate": 8.101333333333333e-08, "logps/chosen": -230.25453186035156, "logps/rejected": -430.8430480957031, "loss": 0.0001, "losses/dpo": 4.632044081631648e-09, "losses/sft": 0.5678019523620605, "losses/total": 4.632044081631648e-09, "ref_logps/chosen": -213.1623077392578, "ref_logps/rejected": -257.80767822265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7092232704162598, "rewards/margins": 15.59431266784668, "rewards/rejected": -17.30353546142578, "step": 2648 }, { "epoch": 0.64, "learning_rate": 8.095999999999999e-08, "logps/chosen": -247.31387329101562, "logps/rejected": -399.19927978515625, "loss": 0.0028, "losses/dpo": 2.7619780240684122e-08, "losses/sft": 0.9213975667953491, "losses/total": 2.7619780240684122e-08, "ref_logps/chosen": -229.26307678222656, "ref_logps/rejected": -229.45492553710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8050800561904907, "rewards/margins": 15.169357299804688, "rewards/rejected": -16.974437713623047, "step": 2649 }, { "epoch": 0.64, "learning_rate": 8.090666666666667e-08, "logps/chosen": -268.81292724609375, "logps/rejected": -373.5562744140625, "loss": 0.0124, "losses/dpo": 1.3395232656421285e-07, "losses/sft": 0.6370897889137268, "losses/total": 1.3395232656421285e-07, "ref_logps/chosen": -253.90489196777344, "ref_logps/rejected": -224.7471923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.490805745124817, "rewards/margins": 13.39010238647461, "rewards/rejected": -14.880908012390137, "step": 2650 }, { "epoch": 0.64, "learning_rate": 8.085333333333333e-08, "logps/chosen": -304.2880859375, "logps/rejected": -387.331787109375, "loss": 0.0047, "losses/dpo": 1.6277375834761187e-05, "losses/sft": 0.706890344619751, "losses/total": 1.6277375834761187e-05, "ref_logps/chosen": -285.59112548828125, "ref_logps/rejected": -227.92758178710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8696967363357544, "rewards/margins": 14.070724487304688, "rewards/rejected": -15.940421104431152, "step": 2651 }, { "epoch": 0.64, "learning_rate": 8.08e-08, "logps/chosen": -261.66180419921875, "logps/rejected": -369.7543029785156, "loss": 0.0032, "losses/dpo": 2.3407148546539247e-05, "losses/sft": 1.1541874408721924, "losses/total": 2.3407148546539247e-05, "ref_logps/chosen": -249.43527221679688, "ref_logps/rejected": -219.89117431640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2226555347442627, "rewards/margins": 13.763660430908203, "rewards/rejected": -14.986316680908203, "step": 2652 }, { "epoch": 0.64, "learning_rate": 8.074666666666666e-08, "logps/chosen": -234.89651489257812, "logps/rejected": -402.9244384765625, "loss": 0.0072, "losses/dpo": 5.7455720181565084e-09, "losses/sft": 0.5653026700019836, "losses/total": 5.7455720181565084e-09, "ref_logps/chosen": -223.37857055664062, "ref_logps/rejected": -234.14175415039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1517932415008545, "rewards/margins": 15.72647476196289, "rewards/rejected": -16.878267288208008, "step": 2653 }, { "epoch": 0.64, "learning_rate": 8.069333333333332e-08, "logps/chosen": -224.6970977783203, "logps/rejected": -344.3360290527344, "loss": 0.0151, "losses/dpo": 7.520220557388768e-11, "losses/sft": 0.9746740460395813, "losses/total": 7.520220557388768e-11, "ref_logps/chosen": -209.61325073242188, "ref_logps/rejected": -208.73867797851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.508385419845581, "rewards/margins": 12.051351547241211, "rewards/rejected": -13.559736251831055, "step": 2654 }, { "epoch": 0.64, "learning_rate": 8.064e-08, "logps/chosen": -325.47662353515625, "logps/rejected": -403.1207580566406, "loss": 0.0082, "losses/dpo": 1.5382958906684507e-08, "losses/sft": 0.9862791299819946, "losses/total": 1.5382958906684507e-08, "ref_logps/chosen": -310.3712463378906, "ref_logps/rejected": -255.12786865234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5105408430099487, "rewards/margins": 13.288749694824219, "rewards/rejected": -14.79928970336914, "step": 2655 }, { "epoch": 0.64, "learning_rate": 8.058666666666666e-08, "logps/chosen": -245.3616943359375, "logps/rejected": -386.3573913574219, "loss": 0.0012, "losses/dpo": 9.157742619780862e-11, "losses/sft": 0.46204403042793274, "losses/total": 9.157742619780862e-11, "ref_logps/chosen": -233.15066528320312, "ref_logps/rejected": -226.07716369628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.221101999282837, "rewards/margins": 14.80691909790039, "rewards/rejected": -16.02802085876465, "step": 2656 }, { "epoch": 0.64, "learning_rate": 8.053333333333333e-08, "logps/chosen": -257.37286376953125, "logps/rejected": -387.10101318359375, "loss": 0.0014, "losses/dpo": 4.5231354772745647e-10, "losses/sft": 0.5145624279975891, "losses/total": 4.5231354772745647e-10, "ref_logps/chosen": -242.18124389648438, "ref_logps/rejected": -228.99951171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5191619396209717, "rewards/margins": 14.290989875793457, "rewards/rejected": -15.810152053833008, "step": 2657 }, { "epoch": 0.64, "learning_rate": 8.047999999999999e-08, "logps/chosen": -284.73699951171875, "logps/rejected": -377.987060546875, "loss": 0.0011, "losses/dpo": 8.251685557070232e-09, "losses/sft": 0.5270417928695679, "losses/total": 8.251685557070232e-09, "ref_logps/chosen": -271.9695129394531, "ref_logps/rejected": -224.61138916015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2767468690872192, "rewards/margins": 14.060820579528809, "rewards/rejected": -15.337566375732422, "step": 2658 }, { "epoch": 0.64, "learning_rate": 8.042666666666667e-08, "logps/chosen": -238.311279296875, "logps/rejected": -403.22857666015625, "loss": 0.0002, "losses/dpo": 1.0074608525201256e-07, "losses/sft": 0.7448208332061768, "losses/total": 1.0074608525201256e-07, "ref_logps/chosen": -223.5196990966797, "ref_logps/rejected": -237.8289337158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.4791579246520996, "rewards/margins": 15.060807228088379, "rewards/rejected": -16.539966583251953, "step": 2659 }, { "epoch": 0.64, "learning_rate": 8.037333333333333e-08, "logps/chosen": -236.89697265625, "logps/rejected": -372.93670654296875, "loss": 0.0029, "losses/dpo": 9.087526287032688e-09, "losses/sft": 0.9157344102859497, "losses/total": 9.087526287032688e-09, "ref_logps/chosen": -224.50042724609375, "ref_logps/rejected": -224.19607543945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2396520376205444, "rewards/margins": 13.634410858154297, "rewards/rejected": -14.874062538146973, "step": 2660 }, { "epoch": 0.64, "learning_rate": 8.032e-08, "logps/chosen": -268.69482421875, "logps/rejected": -420.32635498046875, "loss": 0.0018, "losses/dpo": 1.3630827311317262e-07, "losses/sft": 0.5017516613006592, "losses/total": 1.3630827311317262e-07, "ref_logps/chosen": -256.18438720703125, "ref_logps/rejected": -263.40948486328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2510476112365723, "rewards/margins": 14.440641403198242, "rewards/rejected": -15.691688537597656, "step": 2661 }, { "epoch": 0.64, "learning_rate": 8.026666666666666e-08, "logps/chosen": -262.5874938964844, "logps/rejected": -390.1113586425781, "loss": 0.0072, "losses/dpo": 2.616720564674324e-07, "losses/sft": 0.5891592502593994, "losses/total": 2.616720564674324e-07, "ref_logps/chosen": -248.26707458496094, "ref_logps/rejected": -230.1810302734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4320447444915771, "rewards/margins": 14.560988426208496, "rewards/rejected": -15.993033409118652, "step": 2662 }, { "epoch": 0.64, "learning_rate": 8.021333333333333e-08, "logps/chosen": -300.69140625, "logps/rejected": -447.8079833984375, "loss": 0.0077, "losses/dpo": 1.2066330157267657e-07, "losses/sft": 0.5490847229957581, "losses/total": 1.2066330157267657e-07, "ref_logps/chosen": -281.98883056640625, "ref_logps/rejected": -265.5295104980469, "rewards/accuracies": 1.0, "rewards/chosen": -1.8702584505081177, "rewards/margins": 16.357589721679688, "rewards/rejected": -18.22784996032715, "step": 2663 }, { "epoch": 0.64, "learning_rate": 8.016e-08, "logps/chosen": -200.42422485351562, "logps/rejected": -380.5035400390625, "loss": 0.0036, "losses/dpo": 2.8477398217319205e-08, "losses/sft": 0.5537548661231995, "losses/total": 2.8477398217319205e-08, "ref_logps/chosen": -188.5023193359375, "ref_logps/rejected": -227.9166259765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1921913623809814, "rewards/margins": 14.06650161743164, "rewards/rejected": -15.25869369506836, "step": 2664 }, { "epoch": 0.64, "learning_rate": 8.010666666666666e-08, "logps/chosen": -195.37106323242188, "logps/rejected": -316.51885986328125, "loss": 0.0102, "losses/dpo": 1.670354663474427e-07, "losses/sft": 0.8010944724082947, "losses/total": 1.670354663474427e-07, "ref_logps/chosen": -185.26434326171875, "ref_logps/rejected": -184.96881103515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.0106723308563232, "rewards/margins": 12.144330978393555, "rewards/rejected": -13.155003547668457, "step": 2665 }, { "epoch": 0.64, "learning_rate": 8.005333333333332e-08, "logps/chosen": -219.46255493164062, "logps/rejected": -374.8087158203125, "loss": 0.0019, "losses/dpo": 2.284368224095279e-12, "losses/sft": 0.7337790131568909, "losses/total": 2.284368224095279e-12, "ref_logps/chosen": -208.57891845703125, "ref_logps/rejected": -230.5312957763672, "rewards/accuracies": 1.0, "rewards/chosen": -1.0883642435073853, "rewards/margins": 13.339380264282227, "rewards/rejected": -14.427743911743164, "step": 2666 }, { "epoch": 0.64, "learning_rate": 8e-08, "logps/chosen": -219.92465209960938, "logps/rejected": -366.2488098144531, "loss": 0.0003, "losses/dpo": 1.0210254686171538e-06, "losses/sft": 0.9889361262321472, "losses/total": 1.0210254686171538e-06, "ref_logps/chosen": -206.1148681640625, "ref_logps/rejected": -209.92727661132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3809775114059448, "rewards/margins": 14.251176834106445, "rewards/rejected": -15.632155418395996, "step": 2667 }, { "epoch": 0.64, "learning_rate": 7.994666666666667e-08, "logps/chosen": -246.6681671142578, "logps/rejected": -375.5209655761719, "loss": 0.0025, "losses/dpo": 5.5742439144523814e-06, "losses/sft": 0.8414515852928162, "losses/total": 5.5742439144523814e-06, "ref_logps/chosen": -233.36795043945312, "ref_logps/rejected": -235.07223510742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3300211429595947, "rewards/margins": 12.714853286743164, "rewards/rejected": -14.04487419128418, "step": 2668 }, { "epoch": 0.64, "learning_rate": 7.989333333333333e-08, "logps/chosen": -236.46041870117188, "logps/rejected": -394.06439208984375, "loss": 0.0204, "losses/dpo": 7.838590931896761e-07, "losses/sft": 0.7498183250427246, "losses/total": 7.838590931896761e-07, "ref_logps/chosen": -219.71786499023438, "ref_logps/rejected": -223.50538635253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6742541790008545, "rewards/margins": 15.381647109985352, "rewards/rejected": -17.05590057373047, "step": 2669 }, { "epoch": 0.64, "learning_rate": 7.983999999999999e-08, "logps/chosen": -246.22140502929688, "logps/rejected": -350.1307067871094, "loss": 0.0071, "losses/dpo": 4.830766897612193e-10, "losses/sft": 0.5077197551727295, "losses/total": 4.830766897612193e-10, "ref_logps/chosen": -233.7220458984375, "ref_logps/rejected": -199.59750366210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.2499371767044067, "rewards/margins": 13.803382873535156, "rewards/rejected": -15.05332088470459, "step": 2670 }, { "epoch": 0.64, "learning_rate": 7.978666666666666e-08, "logps/chosen": -233.82235717773438, "logps/rejected": -336.7143859863281, "loss": 0.0061, "losses/dpo": 8.599396750241795e-10, "losses/sft": 0.5824528336524963, "losses/total": 8.599396750241795e-10, "ref_logps/chosen": -220.67864990234375, "ref_logps/rejected": -190.62698364257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3143723011016846, "rewards/margins": 13.294368743896484, "rewards/rejected": -14.60874080657959, "step": 2671 }, { "epoch": 0.64, "learning_rate": 7.973333333333333e-08, "logps/chosen": -245.1529541015625, "logps/rejected": -405.85064697265625, "loss": 0.0033, "losses/dpo": 1.9982007160024295e-08, "losses/sft": 0.7429933547973633, "losses/total": 1.9982007160024295e-08, "ref_logps/chosen": -228.35023498535156, "ref_logps/rejected": -230.18423461914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6802711486816406, "rewards/margins": 15.886371612548828, "rewards/rejected": -17.56664276123047, "step": 2672 }, { "epoch": 0.64, "learning_rate": 7.968e-08, "logps/chosen": -247.91799926757812, "logps/rejected": -367.7476806640625, "loss": 0.0001, "losses/dpo": 1.0952567208732944e-05, "losses/sft": 0.7320864200592041, "losses/total": 1.0952567208732944e-05, "ref_logps/chosen": -231.8622283935547, "ref_logps/rejected": -206.03970336914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6055755615234375, "rewards/margins": 14.565221786499023, "rewards/rejected": -16.170795440673828, "step": 2673 }, { "epoch": 0.64, "learning_rate": 7.962666666666666e-08, "logps/chosen": -251.51004028320312, "logps/rejected": -387.6538391113281, "loss": 0.0001, "losses/dpo": 8.172853995347396e-06, "losses/sft": 0.8772715330123901, "losses/total": 8.172853995347396e-06, "ref_logps/chosen": -238.42831420898438, "ref_logps/rejected": -234.51364135742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.308172345161438, "rewards/margins": 14.00584888458252, "rewards/rejected": -15.314022064208984, "step": 2674 }, { "epoch": 0.64, "learning_rate": 7.957333333333332e-08, "logps/chosen": -240.0501708984375, "logps/rejected": -330.68218994140625, "loss": 0.0035, "losses/dpo": 2.1194588306627793e-09, "losses/sft": 0.5824681520462036, "losses/total": 2.1194588306627793e-09, "ref_logps/chosen": -226.22744750976562, "ref_logps/rejected": -193.87477111816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.3822721242904663, "rewards/margins": 12.298471450805664, "rewards/rejected": -13.680744171142578, "step": 2675 }, { "epoch": 0.64, "learning_rate": 7.952e-08, "logps/chosen": -238.94322204589844, "logps/rejected": -352.373779296875, "loss": 0.0029, "losses/dpo": 5.706536967409193e-07, "losses/sft": 0.608443558216095, "losses/total": 5.706536967409193e-07, "ref_logps/chosen": -228.16317749023438, "ref_logps/rejected": -214.46080017089844, "rewards/accuracies": 1.0, "rewards/chosen": -1.0780023336410522, "rewards/margins": 12.713294982910156, "rewards/rejected": -13.791297912597656, "step": 2676 }, { "epoch": 0.64, "learning_rate": 7.946666666666666e-08, "logps/chosen": -256.5945129394531, "logps/rejected": -369.916748046875, "loss": 0.0082, "losses/dpo": 6.200627012731275e-06, "losses/sft": 0.5830753445625305, "losses/total": 6.200627012731275e-06, "ref_logps/chosen": -238.60011291503906, "ref_logps/rejected": -211.67007446289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7994389533996582, "rewards/margins": 14.025226593017578, "rewards/rejected": -15.824665069580078, "step": 2677 }, { "epoch": 0.64, "learning_rate": 7.941333333333333e-08, "logps/chosen": -277.24884033203125, "logps/rejected": -420.92120361328125, "loss": 0.001, "losses/dpo": 1.4042937479530337e-08, "losses/sft": 0.5863664746284485, "losses/total": 1.4042937479530337e-08, "ref_logps/chosen": -258.7060546875, "ref_logps/rejected": -249.92373657226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8542799949645996, "rewards/margins": 15.245468139648438, "rewards/rejected": -17.099748611450195, "step": 2678 }, { "epoch": 0.64, "learning_rate": 7.935999999999999e-08, "logps/chosen": -220.3861846923828, "logps/rejected": -377.0479736328125, "loss": 0.0035, "losses/dpo": 7.415122960097165e-10, "losses/sft": 0.6529808044433594, "losses/total": 7.415122960097165e-10, "ref_logps/chosen": -208.14822387695312, "ref_logps/rejected": -225.35720825195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.223795771598816, "rewards/margins": 13.945279121398926, "rewards/rejected": -15.169075965881348, "step": 2679 }, { "epoch": 0.64, "learning_rate": 7.930666666666667e-08, "logps/chosen": -234.92105102539062, "logps/rejected": -361.18927001953125, "loss": 0.001, "losses/dpo": 6.289013754212647e-07, "losses/sft": 0.4949352443218231, "losses/total": 6.289013754212647e-07, "ref_logps/chosen": -220.649169921875, "ref_logps/rejected": -212.44009399414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4271876811981201, "rewards/margins": 13.447728157043457, "rewards/rejected": -14.874916076660156, "step": 2680 }, { "epoch": 0.64, "learning_rate": 7.925333333333333e-08, "logps/chosen": -241.81912231445312, "logps/rejected": -374.4825134277344, "loss": 0.0019, "losses/dpo": 2.095787432487839e-11, "losses/sft": 0.5210906267166138, "losses/total": 2.095787432487839e-11, "ref_logps/chosen": -230.6298828125, "ref_logps/rejected": -221.74765014648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1189241409301758, "rewards/margins": 14.154562950134277, "rewards/rejected": -15.273487091064453, "step": 2681 }, { "epoch": 0.64, "learning_rate": 7.92e-08, "logps/chosen": -262.11968994140625, "logps/rejected": -375.5576171875, "loss": 0.0004, "losses/dpo": 1.543389771541115e-05, "losses/sft": 0.7123439311981201, "losses/total": 1.543389771541115e-05, "ref_logps/chosen": -241.79220581054688, "ref_logps/rejected": -221.2032470703125, "rewards/accuracies": 1.0, "rewards/chosen": -2.032749891281128, "rewards/margins": 13.402685165405273, "rewards/rejected": -15.43543529510498, "step": 2682 }, { "epoch": 0.64, "learning_rate": 7.914666666666666e-08, "logps/chosen": -248.19207763671875, "logps/rejected": -383.90460205078125, "loss": 0.0012, "losses/dpo": 3.6017058224757648e-09, "losses/sft": 1.0304348468780518, "losses/total": 3.6017058224757648e-09, "ref_logps/chosen": -232.557373046875, "ref_logps/rejected": -219.62039184570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5634710788726807, "rewards/margins": 14.864950180053711, "rewards/rejected": -16.428421020507812, "step": 2683 }, { "epoch": 0.64, "learning_rate": 7.909333333333333e-08, "logps/chosen": -206.62535095214844, "logps/rejected": -366.87225341796875, "loss": 0.0031, "losses/dpo": 7.007682825133088e-07, "losses/sft": 0.5479745864868164, "losses/total": 7.007682825133088e-07, "ref_logps/chosen": -194.12051391601562, "ref_logps/rejected": -215.2145233154297, "rewards/accuracies": 1.0, "rewards/chosen": -1.2504839897155762, "rewards/margins": 13.915288925170898, "rewards/rejected": -15.165773391723633, "step": 2684 }, { "epoch": 0.64, "learning_rate": 7.904e-08, "logps/chosen": -251.7005157470703, "logps/rejected": -385.10748291015625, "loss": 0.0002, "losses/dpo": 9.058888394974929e-08, "losses/sft": 0.8251362442970276, "losses/total": 9.058888394974929e-08, "ref_logps/chosen": -236.5096893310547, "ref_logps/rejected": -232.74302673339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5190849304199219, "rewards/margins": 13.717361450195312, "rewards/rejected": -15.236446380615234, "step": 2685 }, { "epoch": 0.64, "learning_rate": 7.898666666666666e-08, "logps/chosen": -222.9251708984375, "logps/rejected": -361.707763671875, "loss": 0.0122, "losses/dpo": 2.126525714629679e-06, "losses/sft": 0.6553788185119629, "losses/total": 2.126525714629679e-06, "ref_logps/chosen": -210.56480407714844, "ref_logps/rejected": -219.44509887695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2360339164733887, "rewards/margins": 12.990232467651367, "rewards/rejected": -14.226264953613281, "step": 2686 }, { "epoch": 0.64, "learning_rate": 7.893333333333333e-08, "logps/chosen": -279.22955322265625, "logps/rejected": -398.65313720703125, "loss": 0.0006, "losses/dpo": 1.7899348847549845e-07, "losses/sft": 0.9266939163208008, "losses/total": 1.7899348847549845e-07, "ref_logps/chosen": -263.5242004394531, "ref_logps/rejected": -238.86094665527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5705327987670898, "rewards/margins": 14.408685684204102, "rewards/rejected": -15.979218482971191, "step": 2687 }, { "epoch": 0.65, "learning_rate": 7.887999999999999e-08, "logps/chosen": -238.13235473632812, "logps/rejected": -354.1825256347656, "loss": 0.0159, "losses/dpo": 1.7677322938780549e-09, "losses/sft": 0.6823322176933289, "losses/total": 1.7677322938780549e-09, "ref_logps/chosen": -222.98158264160156, "ref_logps/rejected": -212.17039489746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5150777101516724, "rewards/margins": 12.686136245727539, "rewards/rejected": -14.201213836669922, "step": 2688 }, { "epoch": 0.65, "learning_rate": 7.882666666666667e-08, "logps/chosen": -209.161865234375, "logps/rejected": -348.5656433105469, "loss": 0.0027, "losses/dpo": 1.3515840180389205e-07, "losses/sft": 0.5864468812942505, "losses/total": 1.3515840180389205e-07, "ref_logps/chosen": -195.96995544433594, "ref_logps/rejected": -204.4036865234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.319191813468933, "rewards/margins": 13.097002029418945, "rewards/rejected": -14.4161958694458, "step": 2689 }, { "epoch": 0.65, "learning_rate": 7.877333333333333e-08, "logps/chosen": -223.05520629882812, "logps/rejected": -376.9498291015625, "loss": 0.0012, "losses/dpo": 0.0001879497867776081, "losses/sft": 1.1062322854995728, "losses/total": 0.0001879497867776081, "ref_logps/chosen": -211.57098388671875, "ref_logps/rejected": -231.3128662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1484220027923584, "rewards/margins": 13.415274620056152, "rewards/rejected": -14.56369686126709, "step": 2690 }, { "epoch": 0.65, "learning_rate": 7.871999999999999e-08, "logps/chosen": -231.50997924804688, "logps/rejected": -362.416259765625, "loss": 0.0002, "losses/dpo": 3.77512515115086e-05, "losses/sft": 0.75555020570755, "losses/total": 3.77512515115086e-05, "ref_logps/chosen": -217.74359130859375, "ref_logps/rejected": -209.16851806640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3766381740570068, "rewards/margins": 13.948136329650879, "rewards/rejected": -15.324773788452148, "step": 2691 }, { "epoch": 0.65, "learning_rate": 7.866666666666666e-08, "logps/chosen": -214.359130859375, "logps/rejected": -357.0116271972656, "loss": 0.0106, "losses/dpo": 1.4265204129060294e-07, "losses/sft": 0.7224798202514648, "losses/total": 1.4265204129060294e-07, "ref_logps/chosen": -199.9131317138672, "ref_logps/rejected": -198.36044311523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4446008205413818, "rewards/margins": 14.420515060424805, "rewards/rejected": -15.865116119384766, "step": 2692 }, { "epoch": 0.65, "learning_rate": 7.861333333333333e-08, "logps/chosen": -228.35397338867188, "logps/rejected": -331.1730651855469, "loss": 0.0009, "losses/dpo": 7.706175253474612e-09, "losses/sft": 0.5076683163642883, "losses/total": 7.706175253474612e-09, "ref_logps/chosen": -216.587158203125, "ref_logps/rejected": -197.60516357421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.1766817569732666, "rewards/margins": 12.180109024047852, "rewards/rejected": -13.356790542602539, "step": 2693 }, { "epoch": 0.65, "learning_rate": 7.856e-08, "logps/chosen": -229.39056396484375, "logps/rejected": -354.7082824707031, "loss": 0.0008, "losses/dpo": 2.7720454454538412e-05, "losses/sft": 0.7180525660514832, "losses/total": 2.7720454454538412e-05, "ref_logps/chosen": -212.79010009765625, "ref_logps/rejected": -201.2152557373047, "rewards/accuracies": 1.0, "rewards/chosen": -1.660047173500061, "rewards/margins": 13.689253807067871, "rewards/rejected": -15.349300384521484, "step": 2694 }, { "epoch": 0.65, "learning_rate": 7.850666666666666e-08, "logps/chosen": -233.3737335205078, "logps/rejected": -403.4537353515625, "loss": 0.0004, "losses/dpo": 7.55515330297385e-08, "losses/sft": 0.559812605381012, "losses/total": 7.55515330297385e-08, "ref_logps/chosen": -220.96267700195312, "ref_logps/rejected": -231.29168701171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2411051988601685, "rewards/margins": 15.975098609924316, "rewards/rejected": -17.216203689575195, "step": 2695 }, { "epoch": 0.65, "learning_rate": 7.845333333333332e-08, "logps/chosen": -260.9588623046875, "logps/rejected": -372.4158935546875, "loss": 0.0011, "losses/dpo": 5.416248072265262e-08, "losses/sft": 0.7375234365463257, "losses/total": 5.416248072265262e-08, "ref_logps/chosen": -244.56219482421875, "ref_logps/rejected": -212.830810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6396664381027222, "rewards/margins": 14.318840026855469, "rewards/rejected": -15.958505630493164, "step": 2696 }, { "epoch": 0.65, "learning_rate": 7.84e-08, "logps/chosen": -204.90452575683594, "logps/rejected": -371.9442138671875, "loss": 0.0042, "losses/dpo": 1.050829996529501e-05, "losses/sft": 0.5148326754570007, "losses/total": 1.050829996529501e-05, "ref_logps/chosen": -193.65972900390625, "ref_logps/rejected": -219.6253662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.1244808435440063, "rewards/margins": 14.107404708862305, "rewards/rejected": -15.23188591003418, "step": 2697 }, { "epoch": 0.65, "learning_rate": 7.834666666666666e-08, "logps/chosen": -268.14251708984375, "logps/rejected": -379.69232177734375, "loss": 0.0064, "losses/dpo": 9.346771548734978e-06, "losses/sft": 0.6246466636657715, "losses/total": 9.346771548734978e-06, "ref_logps/chosen": -252.943359375, "ref_logps/rejected": -226.4244384765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5199167728424072, "rewards/margins": 13.80687141418457, "rewards/rejected": -15.326787948608398, "step": 2698 }, { "epoch": 0.65, "learning_rate": 7.829333333333333e-08, "logps/chosen": -193.54806518554688, "logps/rejected": -350.6060485839844, "loss": 0.004, "losses/dpo": 1.771767088598608e-08, "losses/sft": 0.6340698599815369, "losses/total": 1.771767088598608e-08, "ref_logps/chosen": -178.04664611816406, "ref_logps/rejected": -211.59976196289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.550144076347351, "rewards/margins": 12.350482940673828, "rewards/rejected": -13.900628089904785, "step": 2699 }, { "epoch": 0.65, "learning_rate": 7.823999999999999e-08, "logps/chosen": -250.53318786621094, "logps/rejected": -389.44329833984375, "loss": 0.0042, "losses/dpo": 2.557625862209534e-07, "losses/sft": 0.5530925989151001, "losses/total": 2.557625862209534e-07, "ref_logps/chosen": -234.3764190673828, "ref_logps/rejected": -229.75543212890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6156773567199707, "rewards/margins": 14.353109359741211, "rewards/rejected": -15.968785285949707, "step": 2700 }, { "epoch": 0.65, "learning_rate": 7.818666666666667e-08, "logps/chosen": -238.04193115234375, "logps/rejected": -396.4454040527344, "loss": 0.0004, "losses/dpo": 1.5599626479456674e-08, "losses/sft": 0.4494199752807617, "losses/total": 1.5599626479456674e-08, "ref_logps/chosen": -220.377685546875, "ref_logps/rejected": -239.0454864501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.7664239406585693, "rewards/margins": 13.973567008972168, "rewards/rejected": -15.739990234375, "step": 2701 }, { "epoch": 0.65, "learning_rate": 7.813333333333333e-08, "logps/chosen": -225.07321166992188, "logps/rejected": -355.23114013671875, "loss": 0.0071, "losses/dpo": 6.0139848656604045e-09, "losses/sft": 0.5390440225601196, "losses/total": 6.0139848656604045e-09, "ref_logps/chosen": -212.16781616210938, "ref_logps/rejected": -212.85072326660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2905409336090088, "rewards/margins": 12.947498321533203, "rewards/rejected": -14.238039016723633, "step": 2702 }, { "epoch": 0.65, "learning_rate": 7.808e-08, "logps/chosen": -226.6835479736328, "logps/rejected": -388.5356750488281, "loss": 0.0003, "losses/dpo": 8.062716574386286e-07, "losses/sft": 0.7061335444450378, "losses/total": 8.062716574386286e-07, "ref_logps/chosen": -211.37718200683594, "ref_logps/rejected": -221.52532958984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5306369066238403, "rewards/margins": 15.170398712158203, "rewards/rejected": -16.70103645324707, "step": 2703 }, { "epoch": 0.65, "learning_rate": 7.802666666666666e-08, "logps/chosen": -256.1248779296875, "logps/rejected": -426.000732421875, "loss": 0.0005, "losses/dpo": 4.352606275404014e-09, "losses/sft": 0.6175724864006042, "losses/total": 4.352606275404014e-09, "ref_logps/chosen": -242.91043090820312, "ref_logps/rejected": -253.91586303710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3214415311813354, "rewards/margins": 15.887044906616211, "rewards/rejected": -17.208486557006836, "step": 2704 }, { "epoch": 0.65, "learning_rate": 7.797333333333332e-08, "logps/chosen": -227.42672729492188, "logps/rejected": -342.1433410644531, "loss": 0.0004, "losses/dpo": 6.509521455200229e-08, "losses/sft": 0.5523036122322083, "losses/total": 6.509521455200229e-08, "ref_logps/chosen": -214.6903839111328, "ref_logps/rejected": -196.53460693359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.273634672164917, "rewards/margins": 13.287239074707031, "rewards/rejected": -14.560873985290527, "step": 2705 }, { "epoch": 0.65, "learning_rate": 7.792e-08, "logps/chosen": -248.30677795410156, "logps/rejected": -427.04449462890625, "loss": 0.0001, "losses/dpo": 2.9863276722608134e-05, "losses/sft": 0.7219273447990417, "losses/total": 2.9863276722608134e-05, "ref_logps/chosen": -230.62435913085938, "ref_logps/rejected": -248.1080780029297, "rewards/accuracies": 1.0, "rewards/chosen": -1.7682442665100098, "rewards/margins": 16.125396728515625, "rewards/rejected": -17.89364242553711, "step": 2706 }, { "epoch": 0.65, "learning_rate": 7.786666666666666e-08, "logps/chosen": -257.9132080078125, "logps/rejected": -379.8531799316406, "loss": 0.001, "losses/dpo": 1.1979826908259383e-08, "losses/sft": 0.5193655490875244, "losses/total": 1.1979826908259383e-08, "ref_logps/chosen": -246.69642639160156, "ref_logps/rejected": -226.94342041015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1216754913330078, "rewards/margins": 14.169301986694336, "rewards/rejected": -15.290977478027344, "step": 2707 }, { "epoch": 0.65, "learning_rate": 7.781333333333333e-08, "logps/chosen": -235.9889373779297, "logps/rejected": -349.419677734375, "loss": 0.0006, "losses/dpo": 2.4772774850134738e-05, "losses/sft": 0.5473957061767578, "losses/total": 2.4772774850134738e-05, "ref_logps/chosen": -220.12777709960938, "ref_logps/rejected": -206.7932891845703, "rewards/accuracies": 1.0, "rewards/chosen": -1.5861178636550903, "rewards/margins": 12.676521301269531, "rewards/rejected": -14.262638092041016, "step": 2708 }, { "epoch": 0.65, "learning_rate": 7.775999999999999e-08, "logps/chosen": -182.14683532714844, "logps/rejected": -362.68011474609375, "loss": 0.0018, "losses/dpo": 3.050259920200915e-07, "losses/sft": 0.6064096689224243, "losses/total": 3.050259920200915e-07, "ref_logps/chosen": -169.56198120117188, "ref_logps/rejected": -213.38973999023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.258484959602356, "rewards/margins": 13.670552253723145, "rewards/rejected": -14.929036140441895, "step": 2709 }, { "epoch": 0.65, "learning_rate": 7.770666666666667e-08, "logps/chosen": -226.8301544189453, "logps/rejected": -392.36724853515625, "loss": 0.0006, "losses/dpo": 3.2004154491005465e-06, "losses/sft": 0.5448986887931824, "losses/total": 3.2004154491005465e-06, "ref_logps/chosen": -212.5450439453125, "ref_logps/rejected": -235.5379180908203, "rewards/accuracies": 1.0, "rewards/chosen": -1.428511142730713, "rewards/margins": 14.25442123413086, "rewards/rejected": -15.68293285369873, "step": 2710 }, { "epoch": 0.65, "learning_rate": 7.765333333333333e-08, "logps/chosen": -205.14642333984375, "logps/rejected": -388.30810546875, "loss": 0.0001, "losses/dpo": 2.3220739819862501e-07, "losses/sft": 0.3477966785430908, "losses/total": 2.3220739819862501e-07, "ref_logps/chosen": -193.84060668945312, "ref_logps/rejected": -225.53515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1305828094482422, "rewards/margins": 15.146713256835938, "rewards/rejected": -16.27729606628418, "step": 2711 }, { "epoch": 0.65, "learning_rate": 7.76e-08, "logps/chosen": -223.80242919921875, "logps/rejected": -385.3640441894531, "loss": 0.0011, "losses/dpo": 3.1792155823495705e-06, "losses/sft": 0.7096987962722778, "losses/total": 3.1792155823495705e-06, "ref_logps/chosen": -209.69134521484375, "ref_logps/rejected": -223.40567016601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4111082553863525, "rewards/margins": 14.78472900390625, "rewards/rejected": -16.195837020874023, "step": 2712 }, { "epoch": 0.65, "learning_rate": 7.754666666666666e-08, "logps/chosen": -212.6829833984375, "logps/rejected": -418.6374816894531, "loss": 0.0004, "losses/dpo": 2.4342625692952424e-07, "losses/sft": 0.5945309400558472, "losses/total": 2.4342625692952424e-07, "ref_logps/chosen": -200.6993865966797, "ref_logps/rejected": -252.2823486328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1983613967895508, "rewards/margins": 15.437152862548828, "rewards/rejected": -16.635515213012695, "step": 2713 }, { "epoch": 0.65, "learning_rate": 7.749333333333333e-08, "logps/chosen": -257.3575134277344, "logps/rejected": -395.72802734375, "loss": 0.0002, "losses/dpo": 1.7253321402677102e-06, "losses/sft": 0.5763826370239258, "losses/total": 1.7253321402677102e-06, "ref_logps/chosen": -239.34747314453125, "ref_logps/rejected": -226.35809326171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8010019063949585, "rewards/margins": 15.135990142822266, "rewards/rejected": -16.936992645263672, "step": 2714 }, { "epoch": 0.65, "learning_rate": 7.744e-08, "logps/chosen": -273.5062255859375, "logps/rejected": -387.92730712890625, "loss": 0.0027, "losses/dpo": 1.9501922565723362e-07, "losses/sft": 0.6112452149391174, "losses/total": 1.9501922565723362e-07, "ref_logps/chosen": -255.12428283691406, "ref_logps/rejected": -227.252685546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8381937742233276, "rewards/margins": 14.229268074035645, "rewards/rejected": -16.067461013793945, "step": 2715 }, { "epoch": 0.65, "learning_rate": 7.738666666666666e-08, "logps/chosen": -251.00628662109375, "logps/rejected": -354.4610900878906, "loss": 0.0062, "losses/dpo": 1.1567064575501718e-05, "losses/sft": 0.6521955132484436, "losses/total": 1.1567064575501718e-05, "ref_logps/chosen": -237.80352783203125, "ref_logps/rejected": -216.55386352539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3202767372131348, "rewards/margins": 12.47044563293457, "rewards/rejected": -13.790721893310547, "step": 2716 }, { "epoch": 0.65, "learning_rate": 7.733333333333332e-08, "logps/chosen": -209.84854125976562, "logps/rejected": -382.6781921386719, "loss": 0.0002, "losses/dpo": 1.6921419501159107e-06, "losses/sft": 0.5530831813812256, "losses/total": 1.6921419501159107e-06, "ref_logps/chosen": -194.36825561523438, "ref_logps/rejected": -227.71124267578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5480289459228516, "rewards/margins": 13.9486665725708, "rewards/rejected": -15.496695518493652, "step": 2717 }, { "epoch": 0.65, "learning_rate": 7.728e-08, "logps/chosen": -234.3031005859375, "logps/rejected": -354.29840087890625, "loss": 0.0022, "losses/dpo": 1.623049001864274e-07, "losses/sft": 0.7052112817764282, "losses/total": 1.623049001864274e-07, "ref_logps/chosen": -221.23585510253906, "ref_logps/rejected": -209.24131774902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.3067249059677124, "rewards/margins": 13.198982238769531, "rewards/rejected": -14.505706787109375, "step": 2718 }, { "epoch": 0.65, "learning_rate": 7.722666666666666e-08, "logps/chosen": -242.67152404785156, "logps/rejected": -361.2890930175781, "loss": 0.0004, "losses/dpo": 5.831104772369144e-07, "losses/sft": 0.6686034798622131, "losses/total": 5.831104772369144e-07, "ref_logps/chosen": -225.84222412109375, "ref_logps/rejected": -212.86659240722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.6829299926757812, "rewards/margins": 13.159320831298828, "rewards/rejected": -14.84225082397461, "step": 2719 }, { "epoch": 0.65, "learning_rate": 7.717333333333333e-08, "logps/chosen": -219.23760986328125, "logps/rejected": -358.406982421875, "loss": 0.0049, "losses/dpo": 1.1128542354299498e-07, "losses/sft": 0.4391847550868988, "losses/total": 1.1128542354299498e-07, "ref_logps/chosen": -207.28948974609375, "ref_logps/rejected": -216.88192749023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1948102712631226, "rewards/margins": 12.957696914672852, "rewards/rejected": -14.152505874633789, "step": 2720 }, { "epoch": 0.65, "learning_rate": 7.711999999999999e-08, "logps/chosen": -207.95547485351562, "logps/rejected": -391.06903076171875, "loss": 0.0017, "losses/dpo": 6.146708142473756e-10, "losses/sft": 0.44035977125167847, "losses/total": 6.146708142473756e-10, "ref_logps/chosen": -197.21282958984375, "ref_logps/rejected": -223.4503173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.074263572692871, "rewards/margins": 15.68760871887207, "rewards/rejected": -16.761873245239258, "step": 2721 }, { "epoch": 0.65, "learning_rate": 7.706666666666667e-08, "logps/chosen": -240.5331268310547, "logps/rejected": -400.1267395019531, "loss": 0.0004, "losses/dpo": 1.7355363013393799e-07, "losses/sft": 0.6840642690658569, "losses/total": 1.7355363013393799e-07, "ref_logps/chosen": -225.5490264892578, "ref_logps/rejected": -233.0901336669922, "rewards/accuracies": 1.0, "rewards/chosen": -1.498410940170288, "rewards/margins": 15.205249786376953, "rewards/rejected": -16.703662872314453, "step": 2722 }, { "epoch": 0.65, "learning_rate": 7.701333333333333e-08, "logps/chosen": -222.21701049804688, "logps/rejected": -356.73822021484375, "loss": 0.0069, "losses/dpo": 1.7620432890552706e-09, "losses/sft": 0.5302571654319763, "losses/total": 1.7620432890552706e-09, "ref_logps/chosen": -207.5615997314453, "ref_logps/rejected": -201.00289916992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4655401706695557, "rewards/margins": 14.107993125915527, "rewards/rejected": -15.573533058166504, "step": 2723 }, { "epoch": 0.65, "learning_rate": 7.696e-08, "logps/chosen": -235.74130249023438, "logps/rejected": -394.01239013671875, "loss": 0.0003, "losses/dpo": 3.114361746270333e-08, "losses/sft": 0.5615527629852295, "losses/total": 3.114361746270333e-08, "ref_logps/chosen": -222.98855590820312, "ref_logps/rejected": -227.01812744140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2752764225006104, "rewards/margins": 15.42414665222168, "rewards/rejected": -16.699424743652344, "step": 2724 }, { "epoch": 0.65, "learning_rate": 7.690666666666666e-08, "logps/chosen": -241.95907592773438, "logps/rejected": -396.9065856933594, "loss": 0.001, "losses/dpo": 1.553267559017968e-08, "losses/sft": 0.6241295337677002, "losses/total": 1.553267559017968e-08, "ref_logps/chosen": -229.06553649902344, "ref_logps/rejected": -246.8953399658203, "rewards/accuracies": 1.0, "rewards/chosen": -1.289354920387268, "rewards/margins": 13.711771011352539, "rewards/rejected": -15.001126289367676, "step": 2725 }, { "epoch": 0.65, "learning_rate": 7.685333333333332e-08, "logps/chosen": -274.5227355957031, "logps/rejected": -430.88458251953125, "loss": 0.0006, "losses/dpo": 5.652270473888166e-08, "losses/sft": 0.6242626309394836, "losses/total": 5.652270473888166e-08, "ref_logps/chosen": -261.1727294921875, "ref_logps/rejected": -265.08306884765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3350019454956055, "rewards/margins": 15.245148658752441, "rewards/rejected": -16.580150604248047, "step": 2726 }, { "epoch": 0.65, "learning_rate": 7.68e-08, "logps/chosen": -237.47540283203125, "logps/rejected": -360.9921875, "loss": 0.0004, "losses/dpo": 3.754760413698932e-08, "losses/sft": 0.7057653665542603, "losses/total": 3.754760413698932e-08, "ref_logps/chosen": -224.78607177734375, "ref_logps/rejected": -212.42330932617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2689323425292969, "rewards/margins": 13.587953567504883, "rewards/rejected": -14.85688591003418, "step": 2727 }, { "epoch": 0.65, "learning_rate": 7.674666666666666e-08, "logps/chosen": -233.53521728515625, "logps/rejected": -355.89892578125, "loss": 0.0055, "losses/dpo": 1.9107540083496133e-06, "losses/sft": 0.47299060225486755, "losses/total": 1.9107540083496133e-06, "ref_logps/chosen": -219.46087646484375, "ref_logps/rejected": -221.50624084472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.4074338674545288, "rewards/margins": 12.031834602355957, "rewards/rejected": -13.439268112182617, "step": 2728 }, { "epoch": 0.65, "learning_rate": 7.669333333333333e-08, "logps/chosen": -236.76287841796875, "logps/rejected": -344.82061767578125, "loss": 0.0096, "losses/dpo": 2.6378475581623206e-07, "losses/sft": 0.5950477123260498, "losses/total": 2.6378475581623206e-07, "ref_logps/chosen": -224.87539672851562, "ref_logps/rejected": -208.17947387695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1887491941452026, "rewards/margins": 12.475366592407227, "rewards/rejected": -13.664114952087402, "step": 2729 }, { "epoch": 0.66, "learning_rate": 7.663999999999999e-08, "logps/chosen": -253.68157958984375, "logps/rejected": -391.7320861816406, "loss": 0.0001, "losses/dpo": 0.00010579751688055694, "losses/sft": 0.6624691486358643, "losses/total": 0.00010579751688055694, "ref_logps/chosen": -233.9149169921875, "ref_logps/rejected": -223.92437744140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.976666808128357, "rewards/margins": 14.804105758666992, "rewards/rejected": -16.780771255493164, "step": 2730 }, { "epoch": 0.66, "learning_rate": 7.658666666666667e-08, "logps/chosen": -215.77566528320312, "logps/rejected": -359.27020263671875, "loss": 0.0041, "losses/dpo": 4.329013734150067e-07, "losses/sft": 0.6487375497817993, "losses/total": 4.329013734150067e-07, "ref_logps/chosen": -198.4511260986328, "ref_logps/rejected": -204.73912048339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.7324515581130981, "rewards/margins": 13.720657348632812, "rewards/rejected": -15.453109741210938, "step": 2731 }, { "epoch": 0.66, "learning_rate": 7.653333333333333e-08, "logps/chosen": -237.5922088623047, "logps/rejected": -384.0703125, "loss": 0.0127, "losses/dpo": 2.4043433768383693e-06, "losses/sft": 0.8274433016777039, "losses/total": 2.4043433768383693e-06, "ref_logps/chosen": -223.448974609375, "ref_logps/rejected": -213.082763671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4143221378326416, "rewards/margins": 15.684433937072754, "rewards/rejected": -17.098756790161133, "step": 2732 }, { "epoch": 0.66, "learning_rate": 7.648e-08, "logps/chosen": -249.55929565429688, "logps/rejected": -380.4614562988281, "loss": 0.0196, "losses/dpo": 5.323788521138795e-09, "losses/sft": 0.7000470161437988, "losses/total": 5.323788521138795e-09, "ref_logps/chosen": -237.400634765625, "ref_logps/rejected": -221.81790161132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.215867042541504, "rewards/margins": 14.648487091064453, "rewards/rejected": -15.864355087280273, "step": 2733 }, { "epoch": 0.66, "learning_rate": 7.642666666666666e-08, "logps/chosen": -219.51324462890625, "logps/rejected": -367.32977294921875, "loss": 0.0009, "losses/dpo": 7.757964826105024e-10, "losses/sft": 0.5407132506370544, "losses/total": 7.757964826105024e-10, "ref_logps/chosen": -205.80145263671875, "ref_logps/rejected": -208.34519958496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3711779117584229, "rewards/margins": 14.5272798538208, "rewards/rejected": -15.898458480834961, "step": 2734 }, { "epoch": 0.66, "learning_rate": 7.637333333333333e-08, "logps/chosen": -249.20315551757812, "logps/rejected": -408.5445861816406, "loss": 0.0001, "losses/dpo": 1.005555532174185e-06, "losses/sft": 0.5418455600738525, "losses/total": 1.005555532174185e-06, "ref_logps/chosen": -232.9054412841797, "ref_logps/rejected": -242.50259399414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6297709941864014, "rewards/margins": 14.974427223205566, "rewards/rejected": -16.604198455810547, "step": 2735 }, { "epoch": 0.66, "learning_rate": 7.632e-08, "logps/chosen": -202.21319580078125, "logps/rejected": -361.53851318359375, "loss": 0.0009, "losses/dpo": 2.244087937697259e-07, "losses/sft": 0.4229632616043091, "losses/total": 2.244087937697259e-07, "ref_logps/chosen": -191.1613311767578, "ref_logps/rejected": -206.33486938476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.1051887273788452, "rewards/margins": 14.415178298950195, "rewards/rejected": -15.520367622375488, "step": 2736 }, { "epoch": 0.66, "learning_rate": 7.626666666666667e-08, "logps/chosen": -214.0995635986328, "logps/rejected": -377.98175048828125, "loss": 0.0016, "losses/dpo": 5.112888779024161e-09, "losses/sft": 0.5629317760467529, "losses/total": 5.112888779024161e-09, "ref_logps/chosen": -203.28073120117188, "ref_logps/rejected": -217.75643920898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0818824768066406, "rewards/margins": 14.94064712524414, "rewards/rejected": -16.02252960205078, "step": 2737 }, { "epoch": 0.66, "learning_rate": 7.621333333333332e-08, "logps/chosen": -259.016845703125, "logps/rejected": -371.28271484375, "loss": 0.0006, "losses/dpo": 1.5832277155958252e-10, "losses/sft": 0.9233028292655945, "losses/total": 1.5832277155958252e-10, "ref_logps/chosen": -238.51144409179688, "ref_logps/rejected": -219.91387939453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0505363941192627, "rewards/margins": 13.086345672607422, "rewards/rejected": -15.136882781982422, "step": 2738 }, { "epoch": 0.66, "learning_rate": 7.616e-08, "logps/chosen": -271.7879638671875, "logps/rejected": -371.5497131347656, "loss": 0.0004, "losses/dpo": 2.9772693324048305e-06, "losses/sft": 0.6991206407546997, "losses/total": 2.9772693324048305e-06, "ref_logps/chosen": -256.73626708984375, "ref_logps/rejected": -219.94839477539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.5051695108413696, "rewards/margins": 13.654961585998535, "rewards/rejected": -15.160130500793457, "step": 2739 }, { "epoch": 0.66, "learning_rate": 7.610666666666667e-08, "logps/chosen": -252.2594757080078, "logps/rejected": -390.6661376953125, "loss": 0.0003, "losses/dpo": 1.761315182591261e-08, "losses/sft": 1.2035889625549316, "losses/total": 1.761315182591261e-08, "ref_logps/chosen": -231.3419952392578, "ref_logps/rejected": -224.99818420410156, "rewards/accuracies": 1.0, "rewards/chosen": -2.091749429702759, "rewards/margins": 14.47504711151123, "rewards/rejected": -16.566795349121094, "step": 2740 }, { "epoch": 0.66, "learning_rate": 7.605333333333333e-08, "logps/chosen": -205.96238708496094, "logps/rejected": -346.2606201171875, "loss": 0.0004, "losses/dpo": 9.033050973883405e-10, "losses/sft": 0.4116910398006439, "losses/total": 9.033050973883405e-10, "ref_logps/chosen": -190.31912231445312, "ref_logps/rejected": -196.82237243652344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5643260478973389, "rewards/margins": 13.379500389099121, "rewards/rejected": -14.943826675415039, "step": 2741 }, { "epoch": 0.66, "learning_rate": 7.599999999999999e-08, "logps/chosen": -242.95941162109375, "logps/rejected": -360.47589111328125, "loss": 0.0048, "losses/dpo": 9.528255962010235e-10, "losses/sft": 0.8617374897003174, "losses/total": 9.528255962010235e-10, "ref_logps/chosen": -228.7780303955078, "ref_logps/rejected": -218.9791259765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4181393384933472, "rewards/margins": 12.731538772583008, "rewards/rejected": -14.149677276611328, "step": 2742 }, { "epoch": 0.66, "learning_rate": 7.594666666666666e-08, "logps/chosen": -203.59295654296875, "logps/rejected": -367.6116943359375, "loss": 0.0003, "losses/dpo": 1.9530743244899895e-08, "losses/sft": 0.39932677149772644, "losses/total": 1.9530743244899895e-08, "ref_logps/chosen": -193.06011962890625, "ref_logps/rejected": -212.305419921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.0532844066619873, "rewards/margins": 14.477344512939453, "rewards/rejected": -15.530628204345703, "step": 2743 }, { "epoch": 0.66, "learning_rate": 7.589333333333333e-08, "logps/chosen": -260.854736328125, "logps/rejected": -363.32659912109375, "loss": 0.0013, "losses/dpo": 1.9013599512618384e-06, "losses/sft": 0.677607536315918, "losses/total": 1.9013599512618384e-06, "ref_logps/chosen": -247.87649536132812, "ref_logps/rejected": -211.7073211669922, "rewards/accuracies": 1.0, "rewards/chosen": -1.297825574874878, "rewards/margins": 13.864103317260742, "rewards/rejected": -15.161928176879883, "step": 2744 }, { "epoch": 0.66, "learning_rate": 7.584e-08, "logps/chosen": -236.78900146484375, "logps/rejected": -335.2223815917969, "loss": 0.0183, "losses/dpo": 7.787719913388003e-10, "losses/sft": 0.5471977591514587, "losses/total": 7.787719913388003e-10, "ref_logps/chosen": -222.58224487304688, "ref_logps/rejected": -201.46780395507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.420676589012146, "rewards/margins": 11.954781532287598, "rewards/rejected": -13.375457763671875, "step": 2745 }, { "epoch": 0.66, "learning_rate": 7.578666666666666e-08, "logps/chosen": -247.95948791503906, "logps/rejected": -378.35833740234375, "loss": 0.0012, "losses/dpo": 3.193833981640637e-05, "losses/sft": 0.623752772808075, "losses/total": 3.193833981640637e-05, "ref_logps/chosen": -232.66505432128906, "ref_logps/rejected": -218.1106719970703, "rewards/accuracies": 1.0, "rewards/chosen": -1.5294435024261475, "rewards/margins": 14.495323181152344, "rewards/rejected": -16.02476692199707, "step": 2746 }, { "epoch": 0.66, "learning_rate": 7.573333333333332e-08, "logps/chosen": -203.63876342773438, "logps/rejected": -338.08135986328125, "loss": 0.0051, "losses/dpo": 1.381488146101617e-09, "losses/sft": 0.7223578691482544, "losses/total": 1.381488146101617e-09, "ref_logps/chosen": -191.1183624267578, "ref_logps/rejected": -194.16140747070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2520402669906616, "rewards/margins": 13.139955520629883, "rewards/rejected": -14.391996383666992, "step": 2747 }, { "epoch": 0.66, "learning_rate": 7.568e-08, "logps/chosen": -240.76449584960938, "logps/rejected": -360.4083251953125, "loss": 0.0135, "losses/dpo": 2.6055602120322874e-06, "losses/sft": 0.8986564874649048, "losses/total": 2.6055602120322874e-06, "ref_logps/chosen": -223.20855712890625, "ref_logps/rejected": -219.07183837890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7555921077728271, "rewards/margins": 12.378057479858398, "rewards/rejected": -14.133649826049805, "step": 2748 }, { "epoch": 0.66, "learning_rate": 7.562666666666666e-08, "logps/chosen": -272.914794921875, "logps/rejected": -357.60699462890625, "loss": 0.0012, "losses/dpo": 0.0015359405661001801, "losses/sft": 0.9022651314735413, "losses/total": 0.0015359405661001801, "ref_logps/chosen": -256.93939208984375, "ref_logps/rejected": -219.7994384765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5975396633148193, "rewards/margins": 12.18321418762207, "rewards/rejected": -13.780754089355469, "step": 2749 }, { "epoch": 0.66, "learning_rate": 7.557333333333333e-08, "logps/chosen": -234.05361938476562, "logps/rejected": -356.97247314453125, "loss": 0.0007, "losses/dpo": 5.987812983221374e-07, "losses/sft": 0.8279770016670227, "losses/total": 5.987812983221374e-07, "ref_logps/chosen": -220.02383422851562, "ref_logps/rejected": -209.04885864257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4029803276062012, "rewards/margins": 13.389379501342773, "rewards/rejected": -14.79236125946045, "step": 2750 }, { "epoch": 0.66, "learning_rate": 7.551999999999999e-08, "logps/chosen": -189.72653198242188, "logps/rejected": -348.14703369140625, "loss": 0.0048, "losses/dpo": 2.487746769475052e-06, "losses/sft": 0.9303142428398132, "losses/total": 2.487746769475052e-06, "ref_logps/chosen": -177.91921997070312, "ref_logps/rejected": -206.28073120117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1807312965393066, "rewards/margins": 13.005901336669922, "rewards/rejected": -14.186633110046387, "step": 2751 }, { "epoch": 0.66, "learning_rate": 7.546666666666667e-08, "logps/chosen": -236.67819213867188, "logps/rejected": -350.00146484375, "loss": 0.0028, "losses/dpo": 2.8067538551113103e-06, "losses/sft": 1.4299299716949463, "losses/total": 2.8067538551113103e-06, "ref_logps/chosen": -222.52923583984375, "ref_logps/rejected": -200.54148864746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4148969650268555, "rewards/margins": 13.53110122680664, "rewards/rejected": -14.94599723815918, "step": 2752 }, { "epoch": 0.66, "learning_rate": 7.541333333333333e-08, "logps/chosen": -270.3222351074219, "logps/rejected": -392.13836669921875, "loss": 0.0086, "losses/dpo": 2.031665280455286e-09, "losses/sft": 0.652421772480011, "losses/total": 2.031665280455286e-09, "ref_logps/chosen": -254.74942016601562, "ref_logps/rejected": -234.4583740234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5572800636291504, "rewards/margins": 14.21071720123291, "rewards/rejected": -15.767997741699219, "step": 2753 }, { "epoch": 0.66, "learning_rate": 7.536000000000001e-08, "logps/chosen": -212.17066955566406, "logps/rejected": -392.8315734863281, "loss": 0.0102, "losses/dpo": 2.2587537387153134e-06, "losses/sft": 0.5841712355613708, "losses/total": 2.2587537387153134e-06, "ref_logps/chosen": -197.8037872314453, "ref_logps/rejected": -229.37570190429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4366884231567383, "rewards/margins": 14.90889835357666, "rewards/rejected": -16.3455867767334, "step": 2754 }, { "epoch": 0.66, "learning_rate": 7.530666666666666e-08, "logps/chosen": -214.715087890625, "logps/rejected": -374.46923828125, "loss": 0.0007, "losses/dpo": 2.141729282811866e-08, "losses/sft": 0.7594180107116699, "losses/total": 2.141729282811866e-08, "ref_logps/chosen": -204.03414916992188, "ref_logps/rejected": -228.4016876220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.0680960416793823, "rewards/margins": 13.538656234741211, "rewards/rejected": -14.606752395629883, "step": 2755 }, { "epoch": 0.66, "learning_rate": 7.525333333333333e-08, "logps/chosen": -251.9751739501953, "logps/rejected": -358.9111633300781, "loss": 0.0072, "losses/dpo": 4.3833060203724017e-07, "losses/sft": 0.6205087304115295, "losses/total": 4.3833060203724017e-07, "ref_logps/chosen": -234.766845703125, "ref_logps/rejected": -193.52928161621094, "rewards/accuracies": 1.0, "rewards/chosen": -1.7208311557769775, "rewards/margins": 14.817355155944824, "rewards/rejected": -16.53818702697754, "step": 2756 }, { "epoch": 0.66, "learning_rate": 7.52e-08, "logps/chosen": -216.0153045654297, "logps/rejected": -345.97088623046875, "loss": 0.0043, "losses/dpo": 4.612872362486087e-05, "losses/sft": 0.3629561960697174, "losses/total": 4.612872362486087e-05, "ref_logps/chosen": -202.4451904296875, "ref_logps/rejected": -196.38807678222656, "rewards/accuracies": 1.0, "rewards/chosen": -1.3570117950439453, "rewards/margins": 13.601268768310547, "rewards/rejected": -14.958281517028809, "step": 2757 }, { "epoch": 0.66, "learning_rate": 7.514666666666666e-08, "logps/chosen": -249.4560546875, "logps/rejected": -374.65020751953125, "loss": 0.001, "losses/dpo": 1.972348826484449e-07, "losses/sft": 0.706177294254303, "losses/total": 1.972348826484449e-07, "ref_logps/chosen": -233.54281616210938, "ref_logps/rejected": -224.355712890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5913238525390625, "rewards/margins": 13.438127517700195, "rewards/rejected": -15.029451370239258, "step": 2758 }, { "epoch": 0.66, "learning_rate": 7.509333333333333e-08, "logps/chosen": -232.46815490722656, "logps/rejected": -375.0450439453125, "loss": 0.0011, "losses/dpo": 4.114794762699603e-07, "losses/sft": 0.5048618912696838, "losses/total": 4.114794762699603e-07, "ref_logps/chosen": -217.19822692871094, "ref_logps/rejected": -224.1824493408203, "rewards/accuracies": 1.0, "rewards/chosen": -1.5269925594329834, "rewards/margins": 13.559268951416016, "rewards/rejected": -15.086259841918945, "step": 2759 }, { "epoch": 0.66, "learning_rate": 7.503999999999999e-08, "logps/chosen": -241.0010986328125, "logps/rejected": -394.29058837890625, "loss": 0.0005, "losses/dpo": 1.447781528440828e-06, "losses/sft": 0.5441401600837708, "losses/total": 1.447781528440828e-06, "ref_logps/chosen": -226.744140625, "ref_logps/rejected": -229.83692932128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.425695776939392, "rewards/margins": 15.019672393798828, "rewards/rejected": -16.445369720458984, "step": 2760 }, { "epoch": 0.66, "learning_rate": 7.498666666666667e-08, "logps/chosen": -211.87954711914062, "logps/rejected": -364.6490478515625, "loss": 0.0003, "losses/dpo": 1.6405836504418403e-05, "losses/sft": 0.9453028440475464, "losses/total": 1.6405836504418403e-05, "ref_logps/chosen": -201.9436492919922, "ref_logps/rejected": -220.97512817382812, "rewards/accuracies": 1.0, "rewards/chosen": -0.9935891628265381, "rewards/margins": 13.373802185058594, "rewards/rejected": -14.367391586303711, "step": 2761 }, { "epoch": 0.66, "learning_rate": 7.493333333333333e-08, "logps/chosen": -245.39422607421875, "logps/rejected": -414.9714660644531, "loss": 0.0051, "losses/dpo": 2.1732004640284686e-08, "losses/sft": 0.7564847469329834, "losses/total": 2.1732004640284686e-08, "ref_logps/chosen": -230.21173095703125, "ref_logps/rejected": -238.20741271972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.51824951171875, "rewards/margins": 16.158157348632812, "rewards/rejected": -17.676406860351562, "step": 2762 }, { "epoch": 0.66, "learning_rate": 7.487999999999999e-08, "logps/chosen": -289.2324523925781, "logps/rejected": -395.076171875, "loss": 0.0007, "losses/dpo": 9.922521826410957e-08, "losses/sft": 0.7161190509796143, "losses/total": 9.922521826410957e-08, "ref_logps/chosen": -274.2120361328125, "ref_logps/rejected": -222.15505981445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5020391941070557, "rewards/margins": 15.790075302124023, "rewards/rejected": -17.292112350463867, "step": 2763 }, { "epoch": 0.66, "learning_rate": 7.482666666666666e-08, "logps/chosen": -188.4949493408203, "logps/rejected": -354.74200439453125, "loss": 0.0001, "losses/dpo": 1.7633360016588995e-07, "losses/sft": 0.5259307622909546, "losses/total": 1.7633360016588995e-07, "ref_logps/chosen": -170.8844757080078, "ref_logps/rejected": -202.08981323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7610453367233276, "rewards/margins": 13.504173278808594, "rewards/rejected": -15.265217781066895, "step": 2764 }, { "epoch": 0.66, "learning_rate": 7.477333333333333e-08, "logps/chosen": -250.9267578125, "logps/rejected": -371.6160888671875, "loss": 0.03, "losses/dpo": 3.274030699329522e-11, "losses/sft": 0.5489125847816467, "losses/total": 3.274030699329522e-11, "ref_logps/chosen": -237.05670166015625, "ref_logps/rejected": -220.2842254638672, "rewards/accuracies": 0.96875, "rewards/chosen": -1.3870066404342651, "rewards/margins": 13.74618148803711, "rewards/rejected": -15.133188247680664, "step": 2765 }, { "epoch": 0.66, "learning_rate": 7.472e-08, "logps/chosen": -231.10658264160156, "logps/rejected": -368.59423828125, "loss": 0.0001, "losses/dpo": 8.814776464305396e-08, "losses/sft": 0.8308603763580322, "losses/total": 8.814776464305396e-08, "ref_logps/chosen": -215.6041259765625, "ref_logps/rejected": -214.80699157714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5502458810806274, "rewards/margins": 13.82847785949707, "rewards/rejected": -15.378724098205566, "step": 2766 }, { "epoch": 0.66, "learning_rate": 7.466666666666666e-08, "logps/chosen": -297.8423767089844, "logps/rejected": -409.0821533203125, "loss": 0.0004, "losses/dpo": 2.047731868515257e-06, "losses/sft": 0.6505066752433777, "losses/total": 2.047731868515257e-06, "ref_logps/chosen": -283.2508544921875, "ref_logps/rejected": -251.77378845214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.459152340888977, "rewards/margins": 14.271683692932129, "rewards/rejected": -15.730835914611816, "step": 2767 }, { "epoch": 0.66, "learning_rate": 7.461333333333332e-08, "logps/chosen": -254.35459899902344, "logps/rejected": -396.95587158203125, "loss": 0.0026, "losses/dpo": 3.781402924118993e-08, "losses/sft": 0.771155059337616, "losses/total": 3.781402924118993e-08, "ref_logps/chosen": -237.23342895507812, "ref_logps/rejected": -233.29885864257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7121168375015259, "rewards/margins": 14.653583526611328, "rewards/rejected": -16.365699768066406, "step": 2768 }, { "epoch": 0.66, "learning_rate": 7.456e-08, "logps/chosen": -246.9232177734375, "logps/rejected": -401.5460205078125, "loss": 0.0058, "losses/dpo": 0.18279264867305756, "losses/sft": 0.47285202145576477, "losses/total": 0.18279264867305756, "ref_logps/chosen": -233.11233520507812, "ref_logps/rejected": -231.7586212158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.381089687347412, "rewards/margins": 15.597654342651367, "rewards/rejected": -16.978742599487305, "step": 2769 }, { "epoch": 0.66, "learning_rate": 7.450666666666666e-08, "logps/chosen": -244.06358337402344, "logps/rejected": -428.70611572265625, "loss": 0.0007, "losses/dpo": 1.272728500367748e-08, "losses/sft": 0.5016013979911804, "losses/total": 1.272728500367748e-08, "ref_logps/chosen": -227.13629150390625, "ref_logps/rejected": -264.8075866699219, "rewards/accuracies": 1.0, "rewards/chosen": -1.6927294731140137, "rewards/margins": 14.697121620178223, "rewards/rejected": -16.389850616455078, "step": 2770 }, { "epoch": 0.66, "learning_rate": 7.445333333333334e-08, "logps/chosen": -202.0186309814453, "logps/rejected": -376.2176208496094, "loss": 0.0027, "losses/dpo": 3.5806618781109023e-10, "losses/sft": 0.5165377855300903, "losses/total": 3.5806618781109023e-10, "ref_logps/chosen": -187.8802947998047, "ref_logps/rejected": -210.42041015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4138320684432983, "rewards/margins": 15.165886878967285, "rewards/rejected": -16.57971954345703, "step": 2771 }, { "epoch": 0.67, "learning_rate": 7.439999999999999e-08, "logps/chosen": -240.9132843017578, "logps/rejected": -384.36444091796875, "loss": 0.0013, "losses/dpo": 3.2741120321588824e-06, "losses/sft": 1.0345726013183594, "losses/total": 3.2741120321588824e-06, "ref_logps/chosen": -228.83364868164062, "ref_logps/rejected": -227.70993041992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.207962989807129, "rewards/margins": 14.457490921020508, "rewards/rejected": -15.665453910827637, "step": 2772 }, { "epoch": 0.67, "learning_rate": 7.434666666666667e-08, "logps/chosen": -245.48892211914062, "logps/rejected": -366.2760009765625, "loss": 0.0011, "losses/dpo": 2.4197649963753065e-06, "losses/sft": 0.7236236333847046, "losses/total": 2.4197649963753065e-06, "ref_logps/chosen": -228.32826232910156, "ref_logps/rejected": -208.22409057617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7160640954971313, "rewards/margins": 14.089129447937012, "rewards/rejected": -15.805192947387695, "step": 2773 }, { "epoch": 0.67, "learning_rate": 7.429333333333333e-08, "logps/chosen": -217.96226501464844, "logps/rejected": -348.89239501953125, "loss": 0.0033, "losses/dpo": 4.76226006185243e-07, "losses/sft": 0.7545977830886841, "losses/total": 4.76226006185243e-07, "ref_logps/chosen": -205.0191650390625, "ref_logps/rejected": -204.3870849609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.294310212135315, "rewards/margins": 13.15622329711914, "rewards/rejected": -14.450531959533691, "step": 2774 }, { "epoch": 0.67, "learning_rate": 7.424e-08, "logps/chosen": -222.06483459472656, "logps/rejected": -354.20489501953125, "loss": 0.0001, "losses/dpo": 6.17118134371708e-09, "losses/sft": 0.8853685259819031, "losses/total": 6.17118134371708e-09, "ref_logps/chosen": -206.67926025390625, "ref_logps/rejected": -212.15882873535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.5385589599609375, "rewards/margins": 12.666049003601074, "rewards/rejected": -14.204607009887695, "step": 2775 }, { "epoch": 0.67, "learning_rate": 7.418666666666666e-08, "logps/chosen": -253.83177185058594, "logps/rejected": -411.9518127441406, "loss": 0.0007, "losses/dpo": 1.5392724606044794e-07, "losses/sft": 0.7911475300788879, "losses/total": 1.5392724606044794e-07, "ref_logps/chosen": -238.23643493652344, "ref_logps/rejected": -257.91436767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5595344305038452, "rewards/margins": 13.844215393066406, "rewards/rejected": -15.403749465942383, "step": 2776 }, { "epoch": 0.67, "learning_rate": 7.413333333333332e-08, "logps/chosen": -225.96832275390625, "logps/rejected": -395.1840515136719, "loss": 0.0006, "losses/dpo": 6.517711370612744e-11, "losses/sft": 0.48650938272476196, "losses/total": 6.517711370612744e-11, "ref_logps/chosen": -211.97830200195312, "ref_logps/rejected": -236.0324249267578, "rewards/accuracies": 1.0, "rewards/chosen": -1.3990044593811035, "rewards/margins": 14.516160011291504, "rewards/rejected": -15.915165901184082, "step": 2777 }, { "epoch": 0.67, "learning_rate": 7.408e-08, "logps/chosen": -230.9328155517578, "logps/rejected": -359.3879089355469, "loss": 0.0029, "losses/dpo": 4.707942480308702e-06, "losses/sft": 0.42101988196372986, "losses/total": 4.707942480308702e-06, "ref_logps/chosen": -219.034912109375, "ref_logps/rejected": -214.24777221679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1897895336151123, "rewards/margins": 13.324224472045898, "rewards/rejected": -14.514013290405273, "step": 2778 }, { "epoch": 0.67, "learning_rate": 7.402666666666666e-08, "logps/chosen": -202.31573486328125, "logps/rejected": -353.1336669921875, "loss": 0.0043, "losses/dpo": 3.984672147794299e-09, "losses/sft": 0.6100955009460449, "losses/total": 3.984672147794299e-09, "ref_logps/chosen": -189.3427276611328, "ref_logps/rejected": -201.16207885742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2972999811172485, "rewards/margins": 13.899858474731445, "rewards/rejected": -15.197158813476562, "step": 2779 }, { "epoch": 0.67, "learning_rate": 7.397333333333333e-08, "logps/chosen": -223.6563720703125, "logps/rejected": -362.7452697753906, "loss": 0.0015, "losses/dpo": 7.661779477530217e-07, "losses/sft": 0.5653785467147827, "losses/total": 7.661779477530217e-07, "ref_logps/chosen": -211.26419067382812, "ref_logps/rejected": -222.30934143066406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2392191886901855, "rewards/margins": 12.804370880126953, "rewards/rejected": -14.043590545654297, "step": 2780 }, { "epoch": 0.67, "learning_rate": 7.391999999999999e-08, "logps/chosen": -230.43484497070312, "logps/rejected": -381.576904296875, "loss": 0.0004, "losses/dpo": 1.4334776032853824e-08, "losses/sft": 0.7064616680145264, "losses/total": 1.4334776032853824e-08, "ref_logps/chosen": -215.5785369873047, "ref_logps/rejected": -216.7882537841797, "rewards/accuracies": 1.0, "rewards/chosen": -1.4856295585632324, "rewards/margins": 14.993240356445312, "rewards/rejected": -16.478870391845703, "step": 2781 }, { "epoch": 0.67, "learning_rate": 7.386666666666667e-08, "logps/chosen": -283.05364990234375, "logps/rejected": -390.40875244140625, "loss": 0.0002, "losses/dpo": 3.336656106966984e-08, "losses/sft": 1.0434088706970215, "losses/total": 3.336656106966984e-08, "ref_logps/chosen": -270.7557373046875, "ref_logps/rejected": -233.91317749023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.2297903299331665, "rewards/margins": 14.419767379760742, "rewards/rejected": -15.649558067321777, "step": 2782 }, { "epoch": 0.67, "learning_rate": 7.381333333333333e-08, "logps/chosen": -219.56626892089844, "logps/rejected": -393.853271484375, "loss": 0.0015, "losses/dpo": 1.1812055618065642e-06, "losses/sft": 0.7555860280990601, "losses/total": 1.1812055618065642e-06, "ref_logps/chosen": -205.28054809570312, "ref_logps/rejected": -233.09820556640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4285719394683838, "rewards/margins": 14.646933555603027, "rewards/rejected": -16.07550621032715, "step": 2783 }, { "epoch": 0.67, "learning_rate": 7.376e-08, "logps/chosen": -231.85142517089844, "logps/rejected": -403.0119323730469, "loss": 0.0017, "losses/dpo": 1.6237671362050589e-15, "losses/sft": 0.7842759490013123, "losses/total": 1.6237671362050589e-15, "ref_logps/chosen": -216.70260620117188, "ref_logps/rejected": -220.48162841796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5148813724517822, "rewards/margins": 16.738147735595703, "rewards/rejected": -18.253032684326172, "step": 2784 }, { "epoch": 0.67, "learning_rate": 7.370666666666666e-08, "logps/chosen": -232.54681396484375, "logps/rejected": -373.8544921875, "loss": 0.0045, "losses/dpo": 2.0792211008480166e-10, "losses/sft": 0.665946900844574, "losses/total": 2.0792211008480166e-10, "ref_logps/chosen": -215.63113403320312, "ref_logps/rejected": -218.9615936279297, "rewards/accuracies": 1.0, "rewards/chosen": -1.6915690898895264, "rewards/margins": 13.797723770141602, "rewards/rejected": -15.48929214477539, "step": 2785 }, { "epoch": 0.67, "learning_rate": 7.365333333333333e-08, "logps/chosen": -235.98092651367188, "logps/rejected": -338.541015625, "loss": 0.0017, "losses/dpo": 5.087842964712763e-07, "losses/sft": 1.0364751815795898, "losses/total": 5.087842964712763e-07, "ref_logps/chosen": -221.5592498779297, "ref_logps/rejected": -197.60708618164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4421658515930176, "rewards/margins": 12.651229858398438, "rewards/rejected": -14.093395233154297, "step": 2786 }, { "epoch": 0.67, "learning_rate": 7.36e-08, "logps/chosen": -242.8510284423828, "logps/rejected": -403.5357666015625, "loss": 0.0017, "losses/dpo": 0.0005242894403636456, "losses/sft": 0.6204997897148132, "losses/total": 0.0005242894403636456, "ref_logps/chosen": -230.66891479492188, "ref_logps/rejected": -248.21292114257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.2182115316390991, "rewards/margins": 14.314070701599121, "rewards/rejected": -15.532281875610352, "step": 2787 }, { "epoch": 0.67, "learning_rate": 7.354666666666667e-08, "logps/chosen": -221.09608459472656, "logps/rejected": -393.15289306640625, "loss": 0.0005, "losses/dpo": 1.4522163382935105e-07, "losses/sft": 0.43388304114341736, "losses/total": 1.4522163382935105e-07, "ref_logps/chosen": -205.56968688964844, "ref_logps/rejected": -232.73773193359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5526387691497803, "rewards/margins": 14.48887825012207, "rewards/rejected": -16.04151725769043, "step": 2788 }, { "epoch": 0.67, "learning_rate": 7.349333333333332e-08, "logps/chosen": -256.93267822265625, "logps/rejected": -387.8260192871094, "loss": 0.0004, "losses/dpo": 2.599750459353345e-08, "losses/sft": 0.4201700687408447, "losses/total": 2.599750459353345e-08, "ref_logps/chosen": -242.6312255859375, "ref_logps/rejected": -230.67491149902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.4301480054855347, "rewards/margins": 14.28496265411377, "rewards/rejected": -15.715110778808594, "step": 2789 }, { "epoch": 0.67, "learning_rate": 7.344e-08, "logps/chosen": -244.3152618408203, "logps/rejected": -412.66986083984375, "loss": 0.0002, "losses/dpo": 7.696083770269979e-09, "losses/sft": 0.518929660320282, "losses/total": 7.696083770269979e-09, "ref_logps/chosen": -227.96104431152344, "ref_logps/rejected": -237.40106201171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6354210376739502, "rewards/margins": 15.891457557678223, "rewards/rejected": -17.526878356933594, "step": 2790 }, { "epoch": 0.67, "learning_rate": 7.338666666666666e-08, "logps/chosen": -242.71151733398438, "logps/rejected": -378.4720764160156, "loss": 0.0121, "losses/dpo": 0.37500420212745667, "losses/sft": 0.5943918824195862, "losses/total": 0.37500420212745667, "ref_logps/chosen": -223.420654296875, "ref_logps/rejected": -220.30052185058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.9290876388549805, "rewards/margins": 13.888067245483398, "rewards/rejected": -15.817155838012695, "step": 2791 }, { "epoch": 0.67, "learning_rate": 7.333333333333333e-08, "logps/chosen": -231.31683349609375, "logps/rejected": -387.7380065917969, "loss": 0.0039, "losses/dpo": 3.288920302212963e-15, "losses/sft": 0.7932576537132263, "losses/total": 3.288920302212963e-15, "ref_logps/chosen": -216.3850555419922, "ref_logps/rejected": -225.76156616210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4931795597076416, "rewards/margins": 14.704463958740234, "rewards/rejected": -16.197643280029297, "step": 2792 }, { "epoch": 0.67, "learning_rate": 7.327999999999999e-08, "logps/chosen": -205.46246337890625, "logps/rejected": -335.3683776855469, "loss": 0.0022, "losses/dpo": 6.437252064728227e-09, "losses/sft": 0.6321721076965332, "losses/total": 6.437252064728227e-09, "ref_logps/chosen": -187.8211212158203, "ref_logps/rejected": -186.3533477783203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7641329765319824, "rewards/margins": 13.137370109558105, "rewards/rejected": -14.90150260925293, "step": 2793 }, { "epoch": 0.67, "learning_rate": 7.322666666666666e-08, "logps/chosen": -247.4796600341797, "logps/rejected": -381.441162109375, "loss": 0.003, "losses/dpo": 5.033258098502413e-12, "losses/sft": 0.8294782042503357, "losses/total": 5.033258098502413e-12, "ref_logps/chosen": -232.0845184326172, "ref_logps/rejected": -213.1763916015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.539513111114502, "rewards/margins": 15.286965370178223, "rewards/rejected": -16.826478958129883, "step": 2794 }, { "epoch": 0.67, "learning_rate": 7.317333333333333e-08, "logps/chosen": -243.52879333496094, "logps/rejected": -368.30023193359375, "loss": 0.0009, "losses/dpo": 6.491862158952699e-09, "losses/sft": 0.6504318118095398, "losses/total": 6.491862158952699e-09, "ref_logps/chosen": -222.4209747314453, "ref_logps/rejected": -213.5924835205078, "rewards/accuracies": 1.0, "rewards/chosen": -2.1107804775238037, "rewards/margins": 13.359992980957031, "rewards/rejected": -15.470773696899414, "step": 2795 }, { "epoch": 0.67, "learning_rate": 7.312e-08, "logps/chosen": -196.92889404296875, "logps/rejected": -340.090087890625, "loss": 0.0021, "losses/dpo": 4.934501252051859e-09, "losses/sft": 0.427021861076355, "losses/total": 4.934501252051859e-09, "ref_logps/chosen": -183.13267517089844, "ref_logps/rejected": -200.4329376220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.379622459411621, "rewards/margins": 12.58609390258789, "rewards/rejected": -13.965716361999512, "step": 2796 }, { "epoch": 0.67, "learning_rate": 7.306666666666666e-08, "logps/chosen": -211.69790649414062, "logps/rejected": -395.5250244140625, "loss": 0.0003, "losses/dpo": 1.2243098353792448e-05, "losses/sft": 0.41490960121154785, "losses/total": 1.2243098353792448e-05, "ref_logps/chosen": -196.25816345214844, "ref_logps/rejected": -223.10769653320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5439739227294922, "rewards/margins": 15.697758674621582, "rewards/rejected": -17.24173355102539, "step": 2797 }, { "epoch": 0.67, "learning_rate": 7.301333333333332e-08, "logps/chosen": -200.11660766601562, "logps/rejected": -365.933349609375, "loss": 0.0004, "losses/dpo": 4.311140244794842e-08, "losses/sft": 0.6096054911613464, "losses/total": 4.311140244794842e-08, "ref_logps/chosen": -185.86964416503906, "ref_logps/rejected": -209.60699462890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4246940612792969, "rewards/margins": 14.207942962646484, "rewards/rejected": -15.632637977600098, "step": 2798 }, { "epoch": 0.67, "learning_rate": 7.296e-08, "logps/chosen": -268.7724914550781, "logps/rejected": -388.6920166015625, "loss": 0.0022, "losses/dpo": 5.417829811449337e-07, "losses/sft": 0.48959678411483765, "losses/total": 5.417829811449337e-07, "ref_logps/chosen": -252.7098388671875, "ref_logps/rejected": -236.4072265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6062674522399902, "rewards/margins": 13.622210502624512, "rewards/rejected": -15.228479385375977, "step": 2799 }, { "epoch": 0.67, "learning_rate": 7.290666666666666e-08, "logps/chosen": -237.2550811767578, "logps/rejected": -389.0709228515625, "loss": 0.0002, "losses/dpo": 7.447166494589652e-11, "losses/sft": 0.5744878053665161, "losses/total": 7.447166494589652e-11, "ref_logps/chosen": -223.88265991210938, "ref_logps/rejected": -223.992431640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3372430801391602, "rewards/margins": 15.17060661315918, "rewards/rejected": -16.507850646972656, "step": 2800 }, { "epoch": 0.67, "learning_rate": 7.285333333333334e-08, "logps/chosen": -270.78851318359375, "logps/rejected": -345.92535400390625, "loss": 0.0008, "losses/dpo": 6.272547352637048e-07, "losses/sft": 0.5519476532936096, "losses/total": 6.272547352637048e-07, "ref_logps/chosen": -254.74215698242188, "ref_logps/rejected": -200.28770446777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.60463285446167, "rewards/margins": 12.95913314819336, "rewards/rejected": -14.563765525817871, "step": 2801 }, { "epoch": 0.67, "learning_rate": 7.279999999999999e-08, "logps/chosen": -242.26171875, "logps/rejected": -360.7890625, "loss": 0.0041, "losses/dpo": 1.329032300878552e-10, "losses/sft": 0.5469934940338135, "losses/total": 1.329032300878552e-10, "ref_logps/chosen": -224.6832275390625, "ref_logps/rejected": -200.9990234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7578486204147339, "rewards/margins": 14.221156120300293, "rewards/rejected": -15.979004859924316, "step": 2802 }, { "epoch": 0.67, "learning_rate": 7.274666666666667e-08, "logps/chosen": -253.71478271484375, "logps/rejected": -376.2602844238281, "loss": 0.004, "losses/dpo": 2.0810577552765608e-05, "losses/sft": 0.6590160727500916, "losses/total": 2.0810577552765608e-05, "ref_logps/chosen": -238.29647827148438, "ref_logps/rejected": -218.16342163085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5418331623077393, "rewards/margins": 14.267853736877441, "rewards/rejected": -15.809687614440918, "step": 2803 }, { "epoch": 0.67, "learning_rate": 7.269333333333333e-08, "logps/chosen": -223.87020874023438, "logps/rejected": -362.9547424316406, "loss": 0.0049, "losses/dpo": 4.017010335477522e-10, "losses/sft": 0.5933305621147156, "losses/total": 4.017010335477522e-10, "ref_logps/chosen": -209.0724639892578, "ref_logps/rejected": -218.64523315429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4797756671905518, "rewards/margins": 12.951173782348633, "rewards/rejected": -14.430950164794922, "step": 2804 }, { "epoch": 0.67, "learning_rate": 7.264000000000001e-08, "logps/chosen": -234.1326904296875, "logps/rejected": -374.1747131347656, "loss": 0.0004, "losses/dpo": 3.850624263890268e-09, "losses/sft": 0.6223868131637573, "losses/total": 3.850624263890268e-09, "ref_logps/chosen": -221.77401733398438, "ref_logps/rejected": -217.51304626464844, "rewards/accuracies": 1.0, "rewards/chosen": -1.235865831375122, "rewards/margins": 14.430299758911133, "rewards/rejected": -15.66616439819336, "step": 2805 }, { "epoch": 0.67, "learning_rate": 7.258666666666666e-08, "logps/chosen": -269.9523620605469, "logps/rejected": -382.7477722167969, "loss": 0.0007, "losses/dpo": 2.8767834336917986e-08, "losses/sft": 0.72968590259552, "losses/total": 2.8767834336917986e-08, "ref_logps/chosen": -253.9569091796875, "ref_logps/rejected": -219.35369873046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5995452404022217, "rewards/margins": 14.739863395690918, "rewards/rejected": -16.33940887451172, "step": 2806 }, { "epoch": 0.67, "learning_rate": 7.253333333333333e-08, "logps/chosen": -212.0599822998047, "logps/rejected": -349.7183837890625, "loss": 0.0004, "losses/dpo": 0.00016287306789308786, "losses/sft": 0.6514090299606323, "losses/total": 0.00016287306789308786, "ref_logps/chosen": -197.36451721191406, "ref_logps/rejected": -199.64849853515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4695467948913574, "rewards/margins": 13.537442207336426, "rewards/rejected": -15.006989479064941, "step": 2807 }, { "epoch": 0.67, "learning_rate": 7.248e-08, "logps/chosen": -247.5115509033203, "logps/rejected": -375.5611572265625, "loss": 0.0012, "losses/dpo": 9.968746716992882e-09, "losses/sft": 0.5767730474472046, "losses/total": 9.968746716992882e-09, "ref_logps/chosen": -229.28237915039062, "ref_logps/rejected": -211.62583923339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.8229191303253174, "rewards/margins": 14.570611953735352, "rewards/rejected": -16.393531799316406, "step": 2808 }, { "epoch": 0.67, "learning_rate": 7.242666666666666e-08, "logps/chosen": -236.4341583251953, "logps/rejected": -361.9564208984375, "loss": 0.01, "losses/dpo": 2.45292897105287e-09, "losses/sft": 0.7211565971374512, "losses/total": 2.45292897105287e-09, "ref_logps/chosen": -221.22216796875, "ref_logps/rejected": -218.6099395751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.52119779586792, "rewards/margins": 12.813450813293457, "rewards/rejected": -14.334648132324219, "step": 2809 }, { "epoch": 0.67, "learning_rate": 7.237333333333332e-08, "logps/chosen": -246.19007873535156, "logps/rejected": -372.453125, "loss": 0.0031, "losses/dpo": 3.6499141486956432e-09, "losses/sft": 0.584047794342041, "losses/total": 3.6499141486956432e-09, "ref_logps/chosen": -226.0930633544922, "ref_logps/rejected": -217.27670288085938, "rewards/accuracies": 1.0, "rewards/chosen": -2.009702682495117, "rewards/margins": 13.507936477661133, "rewards/rejected": -15.517641067504883, "step": 2810 }, { "epoch": 0.67, "learning_rate": 7.231999999999999e-08, "logps/chosen": -245.19424438476562, "logps/rejected": -358.545166015625, "loss": 0.0002, "losses/dpo": 6.411192998712067e-08, "losses/sft": 0.6765973567962646, "losses/total": 6.411192998712067e-08, "ref_logps/chosen": -232.5487060546875, "ref_logps/rejected": -219.50537109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2645543813705444, "rewards/margins": 12.639425277709961, "rewards/rejected": -13.903980255126953, "step": 2811 }, { "epoch": 0.67, "learning_rate": 7.226666666666667e-08, "logps/chosen": -262.72930908203125, "logps/rejected": -379.51593017578125, "loss": 0.0051, "losses/dpo": 1.0674187922443679e-10, "losses/sft": 0.7296695709228516, "losses/total": 1.0674187922443679e-10, "ref_logps/chosen": -244.1405029296875, "ref_logps/rejected": -226.70877075195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.8588792085647583, "rewards/margins": 13.421833038330078, "rewards/rejected": -15.280713081359863, "step": 2812 }, { "epoch": 0.68, "learning_rate": 7.221333333333333e-08, "logps/chosen": -243.78506469726562, "logps/rejected": -432.0267333984375, "loss": 0.0003, "losses/dpo": 7.756989930385316e-07, "losses/sft": 0.7973130941390991, "losses/total": 7.756989930385316e-07, "ref_logps/chosen": -226.6387176513672, "ref_logps/rejected": -253.18505859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7146334648132324, "rewards/margins": 16.169538497924805, "rewards/rejected": -17.884174346923828, "step": 2813 }, { "epoch": 0.68, "learning_rate": 7.215999999999999e-08, "logps/chosen": -187.91314697265625, "logps/rejected": -350.52166748046875, "loss": 0.0008, "losses/dpo": 5.712288420056666e-09, "losses/sft": 0.5585101842880249, "losses/total": 5.712288420056666e-09, "ref_logps/chosen": -175.3816375732422, "ref_logps/rejected": -202.8912353515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2531521320343018, "rewards/margins": 13.509891510009766, "rewards/rejected": -14.763044357299805, "step": 2814 }, { "epoch": 0.68, "learning_rate": 7.210666666666666e-08, "logps/chosen": -230.3196563720703, "logps/rejected": -348.9541931152344, "loss": 0.0025, "losses/dpo": 3.169620654830396e-08, "losses/sft": 0.5692175030708313, "losses/total": 3.169620654830396e-08, "ref_logps/chosen": -213.7891845703125, "ref_logps/rejected": -204.98048400878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6530481576919556, "rewards/margins": 12.744325637817383, "rewards/rejected": -14.397374153137207, "step": 2815 }, { "epoch": 0.68, "learning_rate": 7.205333333333333e-08, "logps/chosen": -243.68856811523438, "logps/rejected": -341.2588195800781, "loss": 0.0012, "losses/dpo": 7.024363526397792e-07, "losses/sft": 0.6906898617744446, "losses/total": 7.024363526397792e-07, "ref_logps/chosen": -227.1466522216797, "ref_logps/rejected": -194.84036254882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6541943550109863, "rewards/margins": 12.987653732299805, "rewards/rejected": -14.641847610473633, "step": 2816 }, { "epoch": 0.68, "learning_rate": 7.2e-08, "logps/chosen": -281.570068359375, "logps/rejected": -385.17132568359375, "loss": 0.0049, "losses/dpo": 3.611633292166516e-05, "losses/sft": 0.8522354960441589, "losses/total": 3.611633292166516e-05, "ref_logps/chosen": -264.9147644042969, "ref_logps/rejected": -234.14559936523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.665529489517212, "rewards/margins": 13.437042236328125, "rewards/rejected": -15.102571487426758, "step": 2817 }, { "epoch": 0.68, "learning_rate": 7.194666666666667e-08, "logps/chosen": -207.1126708984375, "logps/rejected": -381.74334716796875, "loss": 0.0016, "losses/dpo": 1.3539371934712108e-07, "losses/sft": 0.7053784728050232, "losses/total": 1.3539371934712108e-07, "ref_logps/chosen": -191.47100830078125, "ref_logps/rejected": -217.89755249023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5641652345657349, "rewards/margins": 14.820416450500488, "rewards/rejected": -16.384580612182617, "step": 2818 }, { "epoch": 0.68, "learning_rate": 7.189333333333332e-08, "logps/chosen": -284.873046875, "logps/rejected": -402.1562194824219, "loss": 0.0001, "losses/dpo": 9.18336340305359e-08, "losses/sft": 0.5135020613670349, "losses/total": 9.18336340305359e-08, "ref_logps/chosen": -264.70745849609375, "ref_logps/rejected": -227.71267700195312, "rewards/accuracies": 1.0, "rewards/chosen": -2.0165624618530273, "rewards/margins": 15.4277925491333, "rewards/rejected": -17.444355010986328, "step": 2819 }, { "epoch": 0.68, "learning_rate": 7.184e-08, "logps/chosen": -219.82412719726562, "logps/rejected": -389.622802734375, "loss": 0.0004, "losses/dpo": 1.4820870225529248e-12, "losses/sft": 0.7164180278778076, "losses/total": 1.4820870225529248e-12, "ref_logps/chosen": -203.54393005371094, "ref_logps/rejected": -223.76766967773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6280194520950317, "rewards/margins": 14.957491874694824, "rewards/rejected": -16.585512161254883, "step": 2820 }, { "epoch": 0.68, "learning_rate": 7.178666666666666e-08, "logps/chosen": -189.48619079589844, "logps/rejected": -365.50262451171875, "loss": 0.0003, "losses/dpo": 5.169212045075255e-07, "losses/sft": 0.6002659797668457, "losses/total": 5.169212045075255e-07, "ref_logps/chosen": -174.47055053710938, "ref_logps/rejected": -210.60891723632812, "rewards/accuracies": 1.0, "rewards/chosen": -1.50156569480896, "rewards/margins": 13.987807273864746, "rewards/rejected": -15.489372253417969, "step": 2821 }, { "epoch": 0.68, "learning_rate": 7.173333333333334e-08, "logps/chosen": -245.71502685546875, "logps/rejected": -378.9700927734375, "loss": 0.0006, "losses/dpo": 7.344890917693192e-08, "losses/sft": 0.6231539249420166, "losses/total": 7.344890917693192e-08, "ref_logps/chosen": -234.2681884765625, "ref_logps/rejected": -220.2763671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.144683599472046, "rewards/margins": 14.724691390991211, "rewards/rejected": -15.869375228881836, "step": 2822 }, { "epoch": 0.68, "learning_rate": 7.167999999999999e-08, "logps/chosen": -227.3431854248047, "logps/rejected": -394.9544372558594, "loss": 0.0005, "losses/dpo": 2.9474769291937264e-08, "losses/sft": 0.5970489382743835, "losses/total": 2.9474769291937264e-08, "ref_logps/chosen": -211.0482177734375, "ref_logps/rejected": -223.74058532714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6294962167739868, "rewards/margins": 15.491889953613281, "rewards/rejected": -17.12138557434082, "step": 2823 }, { "epoch": 0.68, "learning_rate": 7.162666666666667e-08, "logps/chosen": -283.8501892089844, "logps/rejected": -435.87982177734375, "loss": 0.0007, "losses/dpo": 1.3003258914068283e-08, "losses/sft": 0.5822546482086182, "losses/total": 1.3003258914068283e-08, "ref_logps/chosen": -265.99395751953125, "ref_logps/rejected": -265.18902587890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7856212854385376, "rewards/margins": 15.283458709716797, "rewards/rejected": -17.069080352783203, "step": 2824 }, { "epoch": 0.68, "learning_rate": 7.157333333333333e-08, "logps/chosen": -288.44769287109375, "logps/rejected": -394.7734375, "loss": 0.0003, "losses/dpo": 1.311439667972536e-08, "losses/sft": 0.9940028786659241, "losses/total": 1.311439667972536e-08, "ref_logps/chosen": -268.401611328125, "ref_logps/rejected": -236.31004333496094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0046069622039795, "rewards/margins": 13.841732025146484, "rewards/rejected": -15.846339225769043, "step": 2825 }, { "epoch": 0.68, "learning_rate": 7.152e-08, "logps/chosen": -237.78164672851562, "logps/rejected": -378.23193359375, "loss": 0.0001, "losses/dpo": 2.9060785777801357e-08, "losses/sft": 0.695504903793335, "losses/total": 2.9060785777801357e-08, "ref_logps/chosen": -224.79635620117188, "ref_logps/rejected": -226.26673889160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.2985292673110962, "rewards/margins": 13.897990226745605, "rewards/rejected": -15.19651985168457, "step": 2826 }, { "epoch": 0.68, "learning_rate": 7.146666666666666e-08, "logps/chosen": -282.9804992675781, "logps/rejected": -399.5784606933594, "loss": 0.0086, "losses/dpo": 0.00013162086543161422, "losses/sft": 0.5868927836418152, "losses/total": 0.00013162086543161422, "ref_logps/chosen": -267.9090576171875, "ref_logps/rejected": -224.26303100585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5071423053741455, "rewards/margins": 16.024402618408203, "rewards/rejected": -17.531543731689453, "step": 2827 }, { "epoch": 0.68, "learning_rate": 7.141333333333332e-08, "logps/chosen": -246.1436767578125, "logps/rejected": -367.5550231933594, "loss": 0.002, "losses/dpo": 1.8568198090207488e-09, "losses/sft": 0.4787437319755554, "losses/total": 1.8568198090207488e-09, "ref_logps/chosen": -233.27874755859375, "ref_logps/rejected": -225.16343688964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.2864943742752075, "rewards/margins": 12.952662467956543, "rewards/rejected": -14.239157676696777, "step": 2828 }, { "epoch": 0.68, "learning_rate": 7.136e-08, "logps/chosen": -236.5603485107422, "logps/rejected": -358.107666015625, "loss": 0.0008, "losses/dpo": 1.582408572176064e-06, "losses/sft": 0.6286494731903076, "losses/total": 1.582408572176064e-06, "ref_logps/chosen": -221.6136932373047, "ref_logps/rejected": -209.59869384765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4946675300598145, "rewards/margins": 13.356228828430176, "rewards/rejected": -14.850896835327148, "step": 2829 }, { "epoch": 0.68, "learning_rate": 7.130666666666666e-08, "logps/chosen": -218.7021484375, "logps/rejected": -375.98260498046875, "loss": 0.0006, "losses/dpo": 1.4161502193132947e-08, "losses/sft": 0.4861396551132202, "losses/total": 1.4161502193132947e-08, "ref_logps/chosen": -204.13623046875, "ref_logps/rejected": -219.3883819580078, "rewards/accuracies": 1.0, "rewards/chosen": -1.4565919637680054, "rewards/margins": 14.202829360961914, "rewards/rejected": -15.659421920776367, "step": 2830 }, { "epoch": 0.68, "learning_rate": 7.125333333333333e-08, "logps/chosen": -224.67239379882812, "logps/rejected": -356.5744934082031, "loss": 0.018, "losses/dpo": 7.926084677478684e-09, "losses/sft": 0.46613362431526184, "losses/total": 7.926084677478684e-09, "ref_logps/chosen": -215.185302734375, "ref_logps/rejected": -205.97885131835938, "rewards/accuracies": 1.0, "rewards/chosen": -0.948710560798645, "rewards/margins": 14.110854148864746, "rewards/rejected": -15.059565544128418, "step": 2831 }, { "epoch": 0.68, "learning_rate": 7.119999999999999e-08, "logps/chosen": -214.40386962890625, "logps/rejected": -418.39697265625, "loss": 0.0036, "losses/dpo": 4.5055063679910745e-08, "losses/sft": 0.6331080794334412, "losses/total": 4.5055063679910745e-08, "ref_logps/chosen": -198.65745544433594, "ref_logps/rejected": -248.93890380859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5746413469314575, "rewards/margins": 15.371167182922363, "rewards/rejected": -16.94580841064453, "step": 2832 }, { "epoch": 0.68, "learning_rate": 7.114666666666667e-08, "logps/chosen": -269.40679931640625, "logps/rejected": -369.5074462890625, "loss": 0.0004, "losses/dpo": 1.1703446034516674e-07, "losses/sft": 1.2092280387878418, "losses/total": 1.1703446034516674e-07, "ref_logps/chosen": -253.00726318359375, "ref_logps/rejected": -218.7852783203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6399521827697754, "rewards/margins": 13.432266235351562, "rewards/rejected": -15.07221794128418, "step": 2833 }, { "epoch": 0.68, "learning_rate": 7.109333333333333e-08, "logps/chosen": -244.1639404296875, "logps/rejected": -378.816650390625, "loss": 0.0006, "losses/dpo": 4.5918432078906335e-07, "losses/sft": 0.4566803574562073, "losses/total": 4.5918432078906335e-07, "ref_logps/chosen": -225.3189697265625, "ref_logps/rejected": -220.23580932617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8844963312149048, "rewards/margins": 13.97358512878418, "rewards/rejected": -15.858081817626953, "step": 2834 }, { "epoch": 0.68, "learning_rate": 7.104e-08, "logps/chosen": -305.26385498046875, "logps/rejected": -398.37384033203125, "loss": 0.0003, "losses/dpo": 3.303887581296294e-07, "losses/sft": 0.6047285199165344, "losses/total": 3.303887581296294e-07, "ref_logps/chosen": -290.53558349609375, "ref_logps/rejected": -246.4016571044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.4728264808654785, "rewards/margins": 13.724393844604492, "rewards/rejected": -15.197219848632812, "step": 2835 }, { "epoch": 0.68, "learning_rate": 7.098666666666666e-08, "logps/chosen": -249.06924438476562, "logps/rejected": -406.6490478515625, "loss": 0.0, "losses/dpo": 3.013430159626296e-06, "losses/sft": 0.5084461569786072, "losses/total": 3.013430159626296e-06, "ref_logps/chosen": -236.88314819335938, "ref_logps/rejected": -234.7386016845703, "rewards/accuracies": 1.0, "rewards/chosen": -1.2186083793640137, "rewards/margins": 15.972434997558594, "rewards/rejected": -17.191043853759766, "step": 2836 }, { "epoch": 0.68, "learning_rate": 7.093333333333333e-08, "logps/chosen": -279.89996337890625, "logps/rejected": -421.4607238769531, "loss": 0.0013, "losses/dpo": 8.934410544725324e-09, "losses/sft": 0.5571203231811523, "losses/total": 8.934410544725324e-09, "ref_logps/chosen": -262.5610656738281, "ref_logps/rejected": -250.71896362304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7338879108428955, "rewards/margins": 15.340287208557129, "rewards/rejected": -17.074176788330078, "step": 2837 }, { "epoch": 0.68, "learning_rate": 7.088e-08, "logps/chosen": -251.97140502929688, "logps/rejected": -350.06451416015625, "loss": 0.0009, "losses/dpo": 3.294388051244823e-08, "losses/sft": 0.5914376378059387, "losses/total": 3.294388051244823e-08, "ref_logps/chosen": -237.20159912109375, "ref_logps/rejected": -204.66773986816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.47697913646698, "rewards/margins": 13.062698364257812, "rewards/rejected": -14.539677619934082, "step": 2838 }, { "epoch": 0.68, "learning_rate": 7.082666666666667e-08, "logps/chosen": -210.26119995117188, "logps/rejected": -367.0761413574219, "loss": 0.0002, "losses/dpo": 2.226935613691694e-08, "losses/sft": 0.9032076001167297, "losses/total": 2.226935613691694e-08, "ref_logps/chosen": -197.37844848632812, "ref_logps/rejected": -214.1616973876953, "rewards/accuracies": 1.0, "rewards/chosen": -1.2882747650146484, "rewards/margins": 14.003171920776367, "rewards/rejected": -15.291446685791016, "step": 2839 }, { "epoch": 0.68, "learning_rate": 7.077333333333332e-08, "logps/chosen": -272.39306640625, "logps/rejected": -390.0110168457031, "loss": 0.0011, "losses/dpo": 0.00016669082106091082, "losses/sft": 1.4795600175857544, "losses/total": 0.00016669082106091082, "ref_logps/chosen": -256.0879211425781, "ref_logps/rejected": -231.80996704101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6305100917816162, "rewards/margins": 14.189594268798828, "rewards/rejected": -15.82010555267334, "step": 2840 }, { "epoch": 0.68, "learning_rate": 7.072e-08, "logps/chosen": -210.10328674316406, "logps/rejected": -355.22088623046875, "loss": 0.0034, "losses/dpo": 6.344513394651585e-07, "losses/sft": 0.5591238141059875, "losses/total": 6.344513394651585e-07, "ref_logps/chosen": -194.04507446289062, "ref_logps/rejected": -216.58184814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6058201789855957, "rewards/margins": 12.258081436157227, "rewards/rejected": -13.863903045654297, "step": 2841 }, { "epoch": 0.68, "learning_rate": 7.066666666666666e-08, "logps/chosen": -238.13992309570312, "logps/rejected": -389.19158935546875, "loss": 0.0014, "losses/dpo": 2.674652023415547e-06, "losses/sft": 0.6460161805152893, "losses/total": 2.674652023415547e-06, "ref_logps/chosen": -226.7434539794922, "ref_logps/rejected": -240.44137573242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1396452188491821, "rewards/margins": 13.73537826538086, "rewards/rejected": -14.875022888183594, "step": 2842 }, { "epoch": 0.68, "learning_rate": 7.061333333333333e-08, "logps/chosen": -227.982177734375, "logps/rejected": -349.0655517578125, "loss": 0.0038, "losses/dpo": 1.8562639070296427e-07, "losses/sft": 0.5710725784301758, "losses/total": 1.8562639070296427e-07, "ref_logps/chosen": -216.7921905517578, "ref_logps/rejected": -211.24099731445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1189992427825928, "rewards/margins": 12.663455963134766, "rewards/rejected": -13.782455444335938, "step": 2843 }, { "epoch": 0.68, "learning_rate": 7.055999999999999e-08, "logps/chosen": -231.5380401611328, "logps/rejected": -386.448486328125, "loss": 0.0075, "losses/dpo": 3.525643842294812e-05, "losses/sft": 0.7343424558639526, "losses/total": 3.525643842294812e-05, "ref_logps/chosen": -215.40338134765625, "ref_logps/rejected": -222.87210083007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6134660243988037, "rewards/margins": 14.744173049926758, "rewards/rejected": -16.35763931274414, "step": 2844 }, { "epoch": 0.68, "learning_rate": 7.050666666666665e-08, "logps/chosen": -236.6239471435547, "logps/rejected": -356.48345947265625, "loss": 0.0055, "losses/dpo": 7.723144790361403e-08, "losses/sft": 0.5209023952484131, "losses/total": 7.723144790361403e-08, "ref_logps/chosen": -219.67640686035156, "ref_logps/rejected": -201.8258514404297, "rewards/accuracies": 1.0, "rewards/chosen": -1.6947537660598755, "rewards/margins": 13.771005630493164, "rewards/rejected": -15.46575927734375, "step": 2845 }, { "epoch": 0.68, "learning_rate": 7.045333333333333e-08, "logps/chosen": -229.55799865722656, "logps/rejected": -377.47589111328125, "loss": 0.0065, "losses/dpo": 7.997277319538032e-12, "losses/sft": 0.5952551960945129, "losses/total": 7.997277319538032e-12, "ref_logps/chosen": -215.021728515625, "ref_logps/rejected": -222.51846313476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4536256790161133, "rewards/margins": 14.042118072509766, "rewards/rejected": -15.495742797851562, "step": 2846 }, { "epoch": 0.68, "learning_rate": 7.04e-08, "logps/chosen": -267.560546875, "logps/rejected": -396.40692138671875, "loss": 0.0005, "losses/dpo": 3.10043162699003e-07, "losses/sft": 1.1306182146072388, "losses/total": 3.10043162699003e-07, "ref_logps/chosen": -254.613037109375, "ref_logps/rejected": -227.5005645751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.294750690460205, "rewards/margins": 15.595888137817383, "rewards/rejected": -16.890640258789062, "step": 2847 }, { "epoch": 0.68, "learning_rate": 7.034666666666666e-08, "logps/chosen": -278.4498291015625, "logps/rejected": -418.34765625, "loss": 0.0003, "losses/dpo": 3.76847623329013e-07, "losses/sft": 1.2214407920837402, "losses/total": 3.76847623329013e-07, "ref_logps/chosen": -263.1769104003906, "ref_logps/rejected": -255.561767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5272936820983887, "rewards/margins": 14.751293182373047, "rewards/rejected": -16.278587341308594, "step": 2848 }, { "epoch": 0.68, "learning_rate": 7.029333333333332e-08, "logps/chosen": -259.050048828125, "logps/rejected": -425.49688720703125, "loss": 0.0001, "losses/dpo": 3.287675838237192e-07, "losses/sft": 0.42791253328323364, "losses/total": 3.287675838237192e-07, "ref_logps/chosen": -242.84906005859375, "ref_logps/rejected": -257.2586364746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.6200995445251465, "rewards/margins": 15.203723907470703, "rewards/rejected": -16.823822021484375, "step": 2849 }, { "epoch": 0.68, "learning_rate": 7.024e-08, "logps/chosen": -261.2701110839844, "logps/rejected": -351.970947265625, "loss": 0.0027, "losses/dpo": 0.038985975086688995, "losses/sft": 0.4055805504322052, "losses/total": 0.038985975086688995, "ref_logps/chosen": -246.3310546875, "ref_logps/rejected": -197.4893341064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.4939059019088745, "rewards/margins": 13.954254150390625, "rewards/rejected": -15.448160171508789, "step": 2850 }, { "epoch": 0.68, "learning_rate": 7.018666666666666e-08, "logps/chosen": -247.37709045410156, "logps/rejected": -420.5168151855469, "loss": 0.0004, "losses/dpo": 8.089167558011923e-09, "losses/sft": 0.6606017351150513, "losses/total": 8.089167558011923e-09, "ref_logps/chosen": -234.04696655273438, "ref_logps/rejected": -253.9906463623047, "rewards/accuracies": 1.0, "rewards/chosen": -1.3330113887786865, "rewards/margins": 15.31960391998291, "rewards/rejected": -16.65261459350586, "step": 2851 }, { "epoch": 0.68, "learning_rate": 7.013333333333334e-08, "logps/chosen": -224.15948486328125, "logps/rejected": -388.677001953125, "loss": 0.001, "losses/dpo": 5.518813739335826e-10, "losses/sft": 0.5457571148872375, "losses/total": 5.518813739335826e-10, "ref_logps/chosen": -210.11170959472656, "ref_logps/rejected": -229.54815673828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4047791957855225, "rewards/margins": 14.508106231689453, "rewards/rejected": -15.912883758544922, "step": 2852 }, { "epoch": 0.68, "learning_rate": 7.007999999999999e-08, "logps/chosen": -231.91555786132812, "logps/rejected": -381.84271240234375, "loss": 0.0003, "losses/dpo": 3.9858352174348965e-09, "losses/sft": 0.8429479598999023, "losses/total": 3.9858352174348965e-09, "ref_logps/chosen": -218.34230041503906, "ref_logps/rejected": -217.2205810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3573274612426758, "rewards/margins": 15.104887008666992, "rewards/rejected": -16.462215423583984, "step": 2853 }, { "epoch": 0.68, "learning_rate": 7.002666666666667e-08, "logps/chosen": -247.24560546875, "logps/rejected": -351.81805419921875, "loss": 0.0005, "losses/dpo": 2.551217903601355e-08, "losses/sft": 0.7151646614074707, "losses/total": 2.551217903601355e-08, "ref_logps/chosen": -230.689697265625, "ref_logps/rejected": -197.6855926513672, "rewards/accuracies": 1.0, "rewards/chosen": -1.655590534210205, "rewards/margins": 13.757654190063477, "rewards/rejected": -15.413246154785156, "step": 2854 }, { "epoch": 0.69, "learning_rate": 6.997333333333333e-08, "logps/chosen": -270.2198791503906, "logps/rejected": -397.0694580078125, "loss": 0.0003, "losses/dpo": 3.7022500691819005e-06, "losses/sft": 1.1309940814971924, "losses/total": 3.7022500691819005e-06, "ref_logps/chosen": -255.9229278564453, "ref_logps/rejected": -239.9974822998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4296929836273193, "rewards/margins": 14.277502059936523, "rewards/rejected": -15.707197189331055, "step": 2855 }, { "epoch": 0.69, "learning_rate": 6.992000000000001e-08, "logps/chosen": -183.81005859375, "logps/rejected": -376.71160888671875, "loss": 0.0005, "losses/dpo": 4.388602974358946e-05, "losses/sft": 1.3629754781723022, "losses/total": 4.388602974358946e-05, "ref_logps/chosen": -174.75320434570312, "ref_logps/rejected": -219.1119384765625, "rewards/accuracies": 1.0, "rewards/chosen": -0.905683159828186, "rewards/margins": 14.854286193847656, "rewards/rejected": -15.759967803955078, "step": 2856 }, { "epoch": 0.69, "learning_rate": 6.986666666666666e-08, "logps/chosen": -281.65057373046875, "logps/rejected": -393.22918701171875, "loss": 0.0027, "losses/dpo": 4.827861133094302e-08, "losses/sft": 0.42599037289619446, "losses/total": 4.827861133094302e-08, "ref_logps/chosen": -265.93048095703125, "ref_logps/rejected": -231.8197021484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.572007179260254, "rewards/margins": 14.568941116333008, "rewards/rejected": -16.140947341918945, "step": 2857 }, { "epoch": 0.69, "learning_rate": 6.981333333333333e-08, "logps/chosen": -229.0779571533203, "logps/rejected": -375.3690490722656, "loss": 0.0018, "losses/dpo": 0.0017685170751065016, "losses/sft": 0.7142726182937622, "losses/total": 0.0017685170751065016, "ref_logps/chosen": -215.38803100585938, "ref_logps/rejected": -225.70848083496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3689910173416138, "rewards/margins": 13.597065925598145, "rewards/rejected": -14.966055870056152, "step": 2858 }, { "epoch": 0.69, "learning_rate": 6.976e-08, "logps/chosen": -278.31109619140625, "logps/rejected": -396.78759765625, "loss": 0.0009, "losses/dpo": 6.0234448540086305e-09, "losses/sft": 0.5805127620697021, "losses/total": 6.0234448540086305e-09, "ref_logps/chosen": -260.3315124511719, "ref_logps/rejected": -231.34396362304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7979563474655151, "rewards/margins": 14.746408462524414, "rewards/rejected": -16.54436492919922, "step": 2859 }, { "epoch": 0.69, "learning_rate": 6.970666666666666e-08, "logps/chosen": -249.9180908203125, "logps/rejected": -386.9043884277344, "loss": 0.0001, "losses/dpo": 9.767765485335644e-10, "losses/sft": 0.6845131516456604, "losses/total": 9.767765485335644e-10, "ref_logps/chosen": -233.55624389648438, "ref_logps/rejected": -224.19766235351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6361836194992065, "rewards/margins": 14.634489059448242, "rewards/rejected": -16.270671844482422, "step": 2860 }, { "epoch": 0.69, "learning_rate": 6.965333333333332e-08, "logps/chosen": -224.9002227783203, "logps/rejected": -370.40106201171875, "loss": 0.0003, "losses/dpo": 4.5953756853123195e-06, "losses/sft": 0.5948551893234253, "losses/total": 4.5953756853123195e-06, "ref_logps/chosen": -209.906494140625, "ref_logps/rejected": -212.2843780517578, "rewards/accuracies": 1.0, "rewards/chosen": -1.4993737936019897, "rewards/margins": 14.312295913696289, "rewards/rejected": -15.81166934967041, "step": 2861 }, { "epoch": 0.69, "learning_rate": 6.959999999999999e-08, "logps/chosen": -262.33184814453125, "logps/rejected": -412.06890869140625, "loss": 0.0002, "losses/dpo": 3.7936094265944575e-09, "losses/sft": 0.5569926500320435, "losses/total": 3.7936094265944575e-09, "ref_logps/chosen": -247.16270446777344, "ref_logps/rejected": -250.8943634033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.5169148445129395, "rewards/margins": 14.600540161132812, "rewards/rejected": -16.117454528808594, "step": 2862 }, { "epoch": 0.69, "learning_rate": 6.954666666666666e-08, "logps/chosen": -256.5555419921875, "logps/rejected": -433.9830017089844, "loss": 0.0001, "losses/dpo": 8.828564546092821e-10, "losses/sft": 0.5876707434654236, "losses/total": 8.828564546092821e-10, "ref_logps/chosen": -242.28134155273438, "ref_logps/rejected": -242.7264404296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.427418828010559, "rewards/margins": 17.698238372802734, "rewards/rejected": -19.125656127929688, "step": 2863 }, { "epoch": 0.69, "learning_rate": 6.949333333333333e-08, "logps/chosen": -203.2036895751953, "logps/rejected": -379.45111083984375, "loss": 0.0005, "losses/dpo": 1.4797476577399493e-09, "losses/sft": 0.7377405166625977, "losses/total": 1.4797476577399493e-09, "ref_logps/chosen": -187.2958221435547, "ref_logps/rejected": -216.43374633789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.590787649154663, "rewards/margins": 14.710948944091797, "rewards/rejected": -16.30173683166504, "step": 2864 }, { "epoch": 0.69, "learning_rate": 6.944e-08, "logps/chosen": -247.97021484375, "logps/rejected": -393.2159729003906, "loss": 0.0002, "losses/dpo": 3.6390913010109216e-05, "losses/sft": 0.6109196543693542, "losses/total": 3.6390913010109216e-05, "ref_logps/chosen": -229.27378845214844, "ref_logps/rejected": -233.75814819335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8696414232254028, "rewards/margins": 14.076141357421875, "rewards/rejected": -15.945781707763672, "step": 2865 }, { "epoch": 0.69, "learning_rate": 6.938666666666666e-08, "logps/chosen": -233.61471557617188, "logps/rejected": -349.5015869140625, "loss": 0.0203, "losses/dpo": 1.0132757779501844e-05, "losses/sft": 1.1755938529968262, "losses/total": 1.0132757779501844e-05, "ref_logps/chosen": -219.43312072753906, "ref_logps/rejected": -201.37657165527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.41815984249115, "rewards/margins": 13.394342422485352, "rewards/rejected": -14.812501907348633, "step": 2866 }, { "epoch": 0.69, "learning_rate": 6.933333333333333e-08, "logps/chosen": -229.46347045898438, "logps/rejected": -374.43304443359375, "loss": 0.0035, "losses/dpo": 4.643289841510523e-08, "losses/sft": 0.7513282299041748, "losses/total": 4.643289841510523e-08, "ref_logps/chosen": -212.36953735351562, "ref_logps/rejected": -221.83908081054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7093925476074219, "rewards/margins": 13.550003051757812, "rewards/rejected": -15.259395599365234, "step": 2867 }, { "epoch": 0.69, "learning_rate": 6.928e-08, "logps/chosen": -211.1050567626953, "logps/rejected": -344.13690185546875, "loss": 0.0005, "losses/dpo": 1.0957337259753785e-09, "losses/sft": 0.7145963311195374, "losses/total": 1.0957337259753785e-09, "ref_logps/chosen": -200.32247924804688, "ref_logps/rejected": -191.95162963867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.0782575607299805, "rewards/margins": 14.140270233154297, "rewards/rejected": -15.218528747558594, "step": 2868 }, { "epoch": 0.69, "learning_rate": 6.922666666666667e-08, "logps/chosen": -280.2318115234375, "logps/rejected": -388.0779113769531, "loss": 0.0026, "losses/dpo": 6.501357319166345e-08, "losses/sft": 0.609988272190094, "losses/total": 6.501357319166345e-08, "ref_logps/chosen": -261.1234436035156, "ref_logps/rejected": -226.5567169189453, "rewards/accuracies": 1.0, "rewards/chosen": -1.910838007926941, "rewards/margins": 14.241283416748047, "rewards/rejected": -16.15212059020996, "step": 2869 }, { "epoch": 0.69, "learning_rate": 6.917333333333332e-08, "logps/chosen": -253.08731079101562, "logps/rejected": -409.593017578125, "loss": 0.0001, "losses/dpo": 1.8889847908099e-08, "losses/sft": 0.5506288409233093, "losses/total": 1.8889847908099e-08, "ref_logps/chosen": -238.67922973632812, "ref_logps/rejected": -239.95086669921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.440808653831482, "rewards/margins": 15.523406028747559, "rewards/rejected": -16.964214324951172, "step": 2870 }, { "epoch": 0.69, "learning_rate": 6.912e-08, "logps/chosen": -236.19308471679688, "logps/rejected": -416.05523681640625, "loss": 0.0001, "losses/dpo": 6.188817991414908e-08, "losses/sft": 0.5764992833137512, "losses/total": 6.188817991414908e-08, "ref_logps/chosen": -222.02304077148438, "ref_logps/rejected": -232.0347900390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4170050621032715, "rewards/margins": 16.98503875732422, "rewards/rejected": -18.402042388916016, "step": 2871 }, { "epoch": 0.69, "learning_rate": 6.906666666666666e-08, "logps/chosen": -226.64813232421875, "logps/rejected": -358.81036376953125, "loss": 0.0163, "losses/dpo": 1.3187669765102328e-06, "losses/sft": 0.6806116104125977, "losses/total": 1.3187669765102328e-06, "ref_logps/chosen": -211.37493896484375, "ref_logps/rejected": -203.74668884277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5273188352584839, "rewards/margins": 13.979049682617188, "rewards/rejected": -15.506368637084961, "step": 2872 }, { "epoch": 0.69, "learning_rate": 6.901333333333334e-08, "logps/chosen": -269.8950500488281, "logps/rejected": -407.3785705566406, "loss": 0.0009, "losses/dpo": 2.4427333045196065e-08, "losses/sft": 0.5470179915428162, "losses/total": 2.4427333045196065e-08, "ref_logps/chosen": -250.4482421875, "ref_logps/rejected": -245.01637268066406, "rewards/accuracies": 1.0, "rewards/chosen": -1.9446792602539062, "rewards/margins": 14.291542053222656, "rewards/rejected": -16.236221313476562, "step": 2873 }, { "epoch": 0.69, "learning_rate": 6.895999999999999e-08, "logps/chosen": -207.58021545410156, "logps/rejected": -372.92059326171875, "loss": 0.0016, "losses/dpo": 3.095735578995118e-08, "losses/sft": 0.652857780456543, "losses/total": 3.095735578995118e-08, "ref_logps/chosen": -190.9667205810547, "ref_logps/rejected": -219.85821533203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6613504886627197, "rewards/margins": 13.64488410949707, "rewards/rejected": -15.306236267089844, "step": 2874 }, { "epoch": 0.69, "learning_rate": 6.890666666666667e-08, "logps/chosen": -263.38092041015625, "logps/rejected": -383.1024169921875, "loss": 0.0009, "losses/dpo": 1.9657363736769184e-06, "losses/sft": 0.5434017777442932, "losses/total": 1.9657363736769184e-06, "ref_logps/chosen": -249.18714904785156, "ref_logps/rejected": -234.9380340576172, "rewards/accuracies": 1.0, "rewards/chosen": -1.4193761348724365, "rewards/margins": 13.397062301635742, "rewards/rejected": -14.816437721252441, "step": 2875 }, { "epoch": 0.69, "learning_rate": 6.885333333333333e-08, "logps/chosen": -260.58740234375, "logps/rejected": -408.7444763183594, "loss": 0.01, "losses/dpo": 0.0028533085715025663, "losses/sft": 0.6059852838516235, "losses/total": 0.0028533085715025663, "ref_logps/chosen": -244.41343688964844, "ref_logps/rejected": -241.27822875976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6173973083496094, "rewards/margins": 15.129229545593262, "rewards/rejected": -16.746625900268555, "step": 2876 }, { "epoch": 0.69, "learning_rate": 6.88e-08, "logps/chosen": -234.3332977294922, "logps/rejected": -376.0960693359375, "loss": 0.0027, "losses/dpo": 8.626869885119959e-09, "losses/sft": 0.5475190877914429, "losses/total": 8.626869885119959e-09, "ref_logps/chosen": -221.34266662597656, "ref_logps/rejected": -223.69703674316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2990624904632568, "rewards/margins": 13.940839767456055, "rewards/rejected": -15.23990249633789, "step": 2877 }, { "epoch": 0.69, "learning_rate": 6.874666666666666e-08, "logps/chosen": -248.37477111816406, "logps/rejected": -397.44403076171875, "loss": 0.0074, "losses/dpo": 7.339443186538119e-08, "losses/sft": 0.7726445198059082, "losses/total": 7.339443186538119e-08, "ref_logps/chosen": -230.6895294189453, "ref_logps/rejected": -224.95440673828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7685229778289795, "rewards/margins": 15.480438232421875, "rewards/rejected": -17.24896240234375, "step": 2878 }, { "epoch": 0.69, "learning_rate": 6.869333333333332e-08, "logps/chosen": -215.32504272460938, "logps/rejected": -374.9479064941406, "loss": 0.001, "losses/dpo": 1.9551920559024438e-05, "losses/sft": 0.40029653906822205, "losses/total": 1.9551920559024438e-05, "ref_logps/chosen": -201.96102905273438, "ref_logps/rejected": -210.26177978515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3364003896713257, "rewards/margins": 15.13221263885498, "rewards/rejected": -16.468612670898438, "step": 2879 }, { "epoch": 0.69, "learning_rate": 6.864e-08, "logps/chosen": -225.48684692382812, "logps/rejected": -401.46307373046875, "loss": 0.0007, "losses/dpo": 6.496936877375958e-10, "losses/sft": 0.5628843307495117, "losses/total": 6.496936877375958e-10, "ref_logps/chosen": -212.7427215576172, "ref_logps/rejected": -237.321044921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.274411678314209, "rewards/margins": 15.139791488647461, "rewards/rejected": -16.414203643798828, "step": 2880 }, { "epoch": 0.69, "learning_rate": 6.858666666666666e-08, "logps/chosen": -243.14517211914062, "logps/rejected": -395.64215087890625, "loss": 0.0003, "losses/dpo": 9.851164861629513e-08, "losses/sft": 0.7954573631286621, "losses/total": 9.851164861629513e-08, "ref_logps/chosen": -226.5669403076172, "ref_logps/rejected": -225.21377563476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.657822847366333, "rewards/margins": 15.385015487670898, "rewards/rejected": -17.04283905029297, "step": 2881 }, { "epoch": 0.69, "learning_rate": 6.853333333333334e-08, "logps/chosen": -207.68576049804688, "logps/rejected": -365.9462890625, "loss": 0.0006, "losses/dpo": 7.199266960355999e-09, "losses/sft": 0.714634895324707, "losses/total": 7.199266960355999e-09, "ref_logps/chosen": -196.41551208496094, "ref_logps/rejected": -200.3193359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.127024531364441, "rewards/margins": 15.43567180633545, "rewards/rejected": -16.56269645690918, "step": 2882 }, { "epoch": 0.69, "learning_rate": 6.847999999999999e-08, "logps/chosen": -268.2600402832031, "logps/rejected": -382.3830871582031, "loss": 0.0, "losses/dpo": 6.5618497302466494e-09, "losses/sft": 0.5706751346588135, "losses/total": 6.5618497302466494e-09, "ref_logps/chosen": -252.05616760253906, "ref_logps/rejected": -223.64474487304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.620387077331543, "rewards/margins": 14.253451347351074, "rewards/rejected": -15.873836517333984, "step": 2883 }, { "epoch": 0.69, "learning_rate": 6.842666666666667e-08, "logps/chosen": -260.08856201171875, "logps/rejected": -415.8165588378906, "loss": 0.0006, "losses/dpo": 2.479852261039195e-12, "losses/sft": 0.7244076132774353, "losses/total": 2.479852261039195e-12, "ref_logps/chosen": -243.49163818359375, "ref_logps/rejected": -246.05892944335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596946716308594, "rewards/margins": 15.316065788269043, "rewards/rejected": -16.975759506225586, "step": 2884 }, { "epoch": 0.69, "learning_rate": 6.837333333333333e-08, "logps/chosen": -281.357421875, "logps/rejected": -417.0627746582031, "loss": 0.0002, "losses/dpo": 7.3426460467374e-09, "losses/sft": 0.455869197845459, "losses/total": 7.3426460467374e-09, "ref_logps/chosen": -263.7887268066406, "ref_logps/rejected": -244.99526977539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7568682432174683, "rewards/margins": 15.44987964630127, "rewards/rejected": -17.206748962402344, "step": 2885 }, { "epoch": 0.69, "learning_rate": 6.832e-08, "logps/chosen": -255.4770965576172, "logps/rejected": -385.3888854980469, "loss": 0.0037, "losses/dpo": 1.763631196638471e-08, "losses/sft": 0.6088932156562805, "losses/total": 1.763631196638471e-08, "ref_logps/chosen": -239.8722686767578, "ref_logps/rejected": -235.0405731201172, "rewards/accuracies": 1.0, "rewards/chosen": -1.5604807138442993, "rewards/margins": 13.474348068237305, "rewards/rejected": -15.034829139709473, "step": 2886 }, { "epoch": 0.69, "learning_rate": 6.826666666666666e-08, "logps/chosen": -245.70266723632812, "logps/rejected": -402.4307861328125, "loss": 0.0001, "losses/dpo": 3.0333278111527306e-09, "losses/sft": 0.6580431461334229, "losses/total": 3.0333278111527306e-09, "ref_logps/chosen": -227.96932983398438, "ref_logps/rejected": -236.73460388183594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7733365297317505, "rewards/margins": 14.796279907226562, "rewards/rejected": -16.569616317749023, "step": 2887 }, { "epoch": 0.69, "learning_rate": 6.821333333333333e-08, "logps/chosen": -213.95953369140625, "logps/rejected": -347.9738464355469, "loss": 0.0376, "losses/dpo": 1.4671599046778283e-06, "losses/sft": 0.4920559823513031, "losses/total": 1.4671599046778283e-06, "ref_logps/chosen": -194.46536254882812, "ref_logps/rejected": -198.47030639648438, "rewards/accuracies": 0.96875, "rewards/chosen": -1.949416160583496, "rewards/margins": 13.000936508178711, "rewards/rejected": -14.950353622436523, "step": 2888 }, { "epoch": 0.69, "learning_rate": 6.816e-08, "logps/chosen": -271.0352783203125, "logps/rejected": -384.2379150390625, "loss": 0.0511, "losses/dpo": 2.7084762210805025e-10, "losses/sft": 0.7481120824813843, "losses/total": 2.7084762210805025e-10, "ref_logps/chosen": -250.41348266601562, "ref_logps/rejected": -214.37100219726562, "rewards/accuracies": 0.96875, "rewards/chosen": -2.0621821880340576, "rewards/margins": 14.924508094787598, "rewards/rejected": -16.986690521240234, "step": 2889 }, { "epoch": 0.69, "learning_rate": 6.810666666666667e-08, "logps/chosen": -242.4361572265625, "logps/rejected": -376.3067626953125, "loss": 0.0036, "losses/dpo": 1.2772619584211498e-07, "losses/sft": 0.30284959077835083, "losses/total": 1.2772619584211498e-07, "ref_logps/chosen": -231.60781860351562, "ref_logps/rejected": -236.18856811523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0828328132629395, "rewards/margins": 12.928990364074707, "rewards/rejected": -14.011823654174805, "step": 2890 }, { "epoch": 0.69, "learning_rate": 6.805333333333332e-08, "logps/chosen": -208.18702697753906, "logps/rejected": -383.2355651855469, "loss": 0.0087, "losses/dpo": 1.225359674839055e-10, "losses/sft": 0.4788980484008789, "losses/total": 1.225359674839055e-10, "ref_logps/chosen": -196.10743713378906, "ref_logps/rejected": -221.00982666015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2079567909240723, "rewards/margins": 15.014617919921875, "rewards/rejected": -16.222576141357422, "step": 2891 }, { "epoch": 0.69, "learning_rate": 6.8e-08, "logps/chosen": -212.2808380126953, "logps/rejected": -363.9945068359375, "loss": 0.0083, "losses/dpo": 4.1739493263602545e-13, "losses/sft": 0.617884635925293, "losses/total": 4.1739493263602545e-13, "ref_logps/chosen": -198.18008422851562, "ref_logps/rejected": -204.49368286132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4100773334503174, "rewards/margins": 14.540006637573242, "rewards/rejected": -15.950084686279297, "step": 2892 }, { "epoch": 0.69, "learning_rate": 6.794666666666666e-08, "logps/chosen": -257.6595764160156, "logps/rejected": -371.98443603515625, "loss": 0.0011, "losses/dpo": 1.580152186875239e-08, "losses/sft": 0.6522312760353088, "losses/total": 1.580152186875239e-08, "ref_logps/chosen": -240.57974243164062, "ref_logps/rejected": -212.84881591796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7079845666885376, "rewards/margins": 14.205575942993164, "rewards/rejected": -15.91356086730957, "step": 2893 }, { "epoch": 0.69, "learning_rate": 6.789333333333334e-08, "logps/chosen": -227.16603088378906, "logps/rejected": -390.1265563964844, "loss": 0.0035, "losses/dpo": 5.431693494983847e-08, "losses/sft": 0.7841598987579346, "losses/total": 5.431693494983847e-08, "ref_logps/chosen": -211.52365112304688, "ref_logps/rejected": -230.98703002929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.5642387866973877, "rewards/margins": 14.349716186523438, "rewards/rejected": -15.913954734802246, "step": 2894 }, { "epoch": 0.69, "learning_rate": 6.783999999999999e-08, "logps/chosen": -245.2891845703125, "logps/rejected": -316.86273193359375, "loss": 0.0262, "losses/dpo": 8.83752218214795e-05, "losses/sft": 0.8020130395889282, "losses/total": 8.83752218214795e-05, "ref_logps/chosen": -227.8045196533203, "ref_logps/rejected": -190.4285888671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7484662532806396, "rewards/margins": 10.894946098327637, "rewards/rejected": -12.643412590026855, "step": 2895 }, { "epoch": 0.69, "learning_rate": 6.778666666666665e-08, "logps/chosen": -240.22894287109375, "logps/rejected": -393.35382080078125, "loss": 0.0015, "losses/dpo": 9.390872435233177e-08, "losses/sft": 0.7759676575660706, "losses/total": 9.390872435233177e-08, "ref_logps/chosen": -227.82611083984375, "ref_logps/rejected": -226.49185180664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2402832508087158, "rewards/margins": 15.445917129516602, "rewards/rejected": -16.686199188232422, "step": 2896 }, { "epoch": 0.7, "learning_rate": 6.773333333333333e-08, "logps/chosen": -244.5709228515625, "logps/rejected": -364.33648681640625, "loss": 0.0004, "losses/dpo": 5.350461762532177e-08, "losses/sft": 0.6365132331848145, "losses/total": 5.350461762532177e-08, "ref_logps/chosen": -227.87933349609375, "ref_logps/rejected": -205.12832641601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6691573858261108, "rewards/margins": 14.251656532287598, "rewards/rejected": -15.920814514160156, "step": 2897 }, { "epoch": 0.7, "learning_rate": 6.768e-08, "logps/chosen": -268.611083984375, "logps/rejected": -427.0044860839844, "loss": 0.0022, "losses/dpo": 3.216437471564859e-05, "losses/sft": 0.6349978446960449, "losses/total": 3.216437471564859e-05, "ref_logps/chosen": -250.15835571289062, "ref_logps/rejected": -249.2617950439453, "rewards/accuracies": 1.0, "rewards/chosen": -1.8452740907669067, "rewards/margins": 15.928995132446289, "rewards/rejected": -17.774269104003906, "step": 2898 }, { "epoch": 0.7, "learning_rate": 6.762666666666667e-08, "logps/chosen": -218.96315002441406, "logps/rejected": -332.97210693359375, "loss": 0.0076, "losses/dpo": 2.67915766016813e-07, "losses/sft": 0.5196946263313293, "losses/total": 2.67915766016813e-07, "ref_logps/chosen": -205.5115966796875, "ref_logps/rejected": -191.27670288085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.345154047012329, "rewards/margins": 12.824384689331055, "rewards/rejected": -14.169538497924805, "step": 2899 }, { "epoch": 0.7, "learning_rate": 6.757333333333332e-08, "logps/chosen": -266.7840881347656, "logps/rejected": -367.54296875, "loss": 0.0024, "losses/dpo": 8.750129509849103e-09, "losses/sft": 0.9176406860351562, "losses/total": 8.750129509849103e-09, "ref_logps/chosen": -250.91680908203125, "ref_logps/rejected": -214.228759765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.586726188659668, "rewards/margins": 13.744695663452148, "rewards/rejected": -15.331422805786133, "step": 2900 }, { "epoch": 0.7, "learning_rate": 6.752e-08, "logps/chosen": -197.90133666992188, "logps/rejected": -351.38134765625, "loss": 0.0072, "losses/dpo": 1.059738039543845e-07, "losses/sft": 1.1141220331192017, "losses/total": 1.059738039543845e-07, "ref_logps/chosen": -183.00106811523438, "ref_logps/rejected": -195.55853271484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4900262355804443, "rewards/margins": 14.092257499694824, "rewards/rejected": -15.582283020019531, "step": 2901 }, { "epoch": 0.7, "learning_rate": 6.746666666666666e-08, "logps/chosen": -244.57562255859375, "logps/rejected": -359.0419006347656, "loss": 0.0042, "losses/dpo": 1.2734635390643234e-07, "losses/sft": 0.6559257507324219, "losses/total": 1.2734635390643234e-07, "ref_logps/chosen": -227.58966064453125, "ref_logps/rejected": -208.6125946044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.698596477508545, "rewards/margins": 13.344334602355957, "rewards/rejected": -15.042930603027344, "step": 2902 }, { "epoch": 0.7, "learning_rate": 6.741333333333334e-08, "logps/chosen": -246.14622497558594, "logps/rejected": -389.7471923828125, "loss": 0.0003, "losses/dpo": 7.131153001438406e-09, "losses/sft": 0.6988133788108826, "losses/total": 7.131153001438406e-09, "ref_logps/chosen": -231.62628173828125, "ref_logps/rejected": -221.90264892578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4519940614700317, "rewards/margins": 15.332460403442383, "rewards/rejected": -16.784454345703125, "step": 2903 }, { "epoch": 0.7, "learning_rate": 6.735999999999999e-08, "logps/chosen": -226.87240600585938, "logps/rejected": -341.7292785644531, "loss": 0.0136, "losses/dpo": 2.28418093684013e-06, "losses/sft": 0.5204680562019348, "losses/total": 2.28418093684013e-06, "ref_logps/chosen": -214.31863403320312, "ref_logps/rejected": -198.94798278808594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2553783655166626, "rewards/margins": 13.022750854492188, "rewards/rejected": -14.278129577636719, "step": 2904 }, { "epoch": 0.7, "learning_rate": 6.730666666666667e-08, "logps/chosen": -175.11412048339844, "logps/rejected": -333.37860107421875, "loss": 0.0007, "losses/dpo": 3.3079852745920846e-10, "losses/sft": 0.5704410076141357, "losses/total": 3.3079852745920846e-10, "ref_logps/chosen": -162.5743408203125, "ref_logps/rejected": -177.35397338867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.253976821899414, "rewards/margins": 14.348487854003906, "rewards/rejected": -15.602463722229004, "step": 2905 }, { "epoch": 0.7, "learning_rate": 6.725333333333333e-08, "logps/chosen": -230.15643310546875, "logps/rejected": -387.491943359375, "loss": 0.0001, "losses/dpo": 1.0279917432853836e-06, "losses/sft": 0.6914904713630676, "losses/total": 1.0279917432853836e-06, "ref_logps/chosen": -214.69754028320312, "ref_logps/rejected": -225.2200164794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.5458908081054688, "rewards/margins": 14.68129825592041, "rewards/rejected": -16.227190017700195, "step": 2906 }, { "epoch": 0.7, "learning_rate": 6.72e-08, "logps/chosen": -214.55780029296875, "logps/rejected": -386.1656494140625, "loss": 0.0002, "losses/dpo": 4.752497007043388e-11, "losses/sft": 0.4688982367515564, "losses/total": 4.752497007043388e-11, "ref_logps/chosen": -202.06573486328125, "ref_logps/rejected": -226.48362731933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2492079734802246, "rewards/margins": 14.718994140625, "rewards/rejected": -15.968202590942383, "step": 2907 }, { "epoch": 0.7, "learning_rate": 6.714666666666666e-08, "logps/chosen": -176.91796875, "logps/rejected": -323.2365417480469, "loss": 0.0011, "losses/dpo": 3.444220820006394e-12, "losses/sft": 0.6250882744789124, "losses/total": 3.444220820006394e-12, "ref_logps/chosen": -166.05477905273438, "ref_logps/rejected": -190.64071655273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.0863194465637207, "rewards/margins": 12.173262596130371, "rewards/rejected": -13.259583473205566, "step": 2908 }, { "epoch": 0.7, "learning_rate": 6.709333333333333e-08, "logps/chosen": -252.7950897216797, "logps/rejected": -411.7117004394531, "loss": 0.0042, "losses/dpo": 6.859816263204283e-14, "losses/sft": 0.9001317620277405, "losses/total": 6.859816263204283e-14, "ref_logps/chosen": -236.64691162109375, "ref_logps/rejected": -243.11151123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6148171424865723, "rewards/margins": 15.245203018188477, "rewards/rejected": -16.86001968383789, "step": 2909 }, { "epoch": 0.7, "learning_rate": 6.704e-08, "logps/chosen": -255.15721130371094, "logps/rejected": -410.56976318359375, "loss": 0.0045, "losses/dpo": 1.8890712993879788e-08, "losses/sft": 0.8043617606163025, "losses/total": 1.8890712993879788e-08, "ref_logps/chosen": -240.3349609375, "ref_logps/rejected": -236.23931884765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4822251796722412, "rewards/margins": 15.95081901550293, "rewards/rejected": -17.43304443359375, "step": 2910 }, { "epoch": 0.7, "learning_rate": 6.698666666666667e-08, "logps/chosen": -267.44171142578125, "logps/rejected": -390.19964599609375, "loss": 0.0125, "losses/dpo": 0.0032274911645799875, "losses/sft": 0.6438911557197571, "losses/total": 0.0032274911645799875, "ref_logps/chosen": -254.29690551757812, "ref_logps/rejected": -235.32400512695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3144816160202026, "rewards/margins": 14.17308235168457, "rewards/rejected": -15.487564086914062, "step": 2911 }, { "epoch": 0.7, "learning_rate": 6.693333333333334e-08, "logps/chosen": -225.1309051513672, "logps/rejected": -369.46282958984375, "loss": 0.0003, "losses/dpo": 4.830467759120438e-08, "losses/sft": 0.8147609233856201, "losses/total": 4.830467759120438e-08, "ref_logps/chosen": -211.78921508789062, "ref_logps/rejected": -210.31097412109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3341693878173828, "rewards/margins": 14.581018447875977, "rewards/rejected": -15.915186882019043, "step": 2912 }, { "epoch": 0.7, "learning_rate": 6.687999999999999e-08, "logps/chosen": -266.57257080078125, "logps/rejected": -403.49688720703125, "loss": 0.0, "losses/dpo": 5.541641030504252e-07, "losses/sft": 0.7712023854255676, "losses/total": 5.541641030504252e-07, "ref_logps/chosen": -252.08273315429688, "ref_logps/rejected": -237.96763610839844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4489846229553223, "rewards/margins": 15.103938102722168, "rewards/rejected": -16.55292320251465, "step": 2913 }, { "epoch": 0.7, "learning_rate": 6.682666666666666e-08, "logps/chosen": -238.04074096679688, "logps/rejected": -372.101318359375, "loss": 0.0025, "losses/dpo": 4.577807715122617e-07, "losses/sft": 0.5761622190475464, "losses/total": 4.577807715122617e-07, "ref_logps/chosen": -223.08837890625, "ref_logps/rejected": -217.99832153320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4952373504638672, "rewards/margins": 13.915061950683594, "rewards/rejected": -15.410299301147461, "step": 2914 }, { "epoch": 0.7, "learning_rate": 6.677333333333333e-08, "logps/chosen": -243.58229064941406, "logps/rejected": -370.37567138671875, "loss": 0.002, "losses/dpo": 8.323997741088718e-11, "losses/sft": 0.5978314876556396, "losses/total": 8.323997741088718e-11, "ref_logps/chosen": -232.03530883789062, "ref_logps/rejected": -219.51339721679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1546990871429443, "rewards/margins": 13.93152904510498, "rewards/rejected": -15.086228370666504, "step": 2915 }, { "epoch": 0.7, "learning_rate": 6.672e-08, "logps/chosen": -216.7068634033203, "logps/rejected": -401.29791259765625, "loss": 0.0004, "losses/dpo": 3.9002591734060843e-07, "losses/sft": 0.45557770133018494, "losses/total": 3.9002591734060843e-07, "ref_logps/chosen": -205.5843505859375, "ref_logps/rejected": -225.3549041748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.11225426197052, "rewards/margins": 16.4820499420166, "rewards/rejected": -17.594303131103516, "step": 2916 }, { "epoch": 0.7, "learning_rate": 6.666666666666665e-08, "logps/chosen": -212.13571166992188, "logps/rejected": -381.59912109375, "loss": 0.0025, "losses/dpo": 3.1760211527398496e-07, "losses/sft": 0.5378157496452332, "losses/total": 3.1760211527398496e-07, "ref_logps/chosen": -198.16424560546875, "ref_logps/rejected": -219.81199645996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.397148609161377, "rewards/margins": 14.78156566619873, "rewards/rejected": -16.178712844848633, "step": 2917 }, { "epoch": 0.7, "learning_rate": 6.661333333333333e-08, "logps/chosen": -267.822021484375, "logps/rejected": -421.6392822265625, "loss": 0.0015, "losses/dpo": 1.923804404668772e-07, "losses/sft": 0.42795684933662415, "losses/total": 1.923804404668772e-07, "ref_logps/chosen": -250.72314453125, "ref_logps/rejected": -254.62069702148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7098877429962158, "rewards/margins": 14.991971969604492, "rewards/rejected": -16.701858520507812, "step": 2918 }, { "epoch": 0.7, "learning_rate": 6.656e-08, "logps/chosen": -245.3384552001953, "logps/rejected": -388.7682189941406, "loss": 0.0003, "losses/dpo": 1.3034174628501205e-08, "losses/sft": 0.5966114401817322, "losses/total": 1.3034174628501205e-08, "ref_logps/chosen": -231.21694946289062, "ref_logps/rejected": -222.42178344726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4121499061584473, "rewards/margins": 15.222494125366211, "rewards/rejected": -16.6346435546875, "step": 2919 }, { "epoch": 0.7, "learning_rate": 6.650666666666667e-08, "logps/chosen": -233.31295776367188, "logps/rejected": -380.0196838378906, "loss": 0.0014, "losses/dpo": 1.5661506722608465e-06, "losses/sft": 0.505538284778595, "losses/total": 1.5661506722608465e-06, "ref_logps/chosen": -218.8023223876953, "ref_logps/rejected": -222.43927001953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.451066017150879, "rewards/margins": 14.306978225708008, "rewards/rejected": -15.758045196533203, "step": 2920 }, { "epoch": 0.7, "learning_rate": 6.645333333333332e-08, "logps/chosen": -247.68817138671875, "logps/rejected": -368.95758056640625, "loss": 0.0081, "losses/dpo": 9.875730455632947e-09, "losses/sft": 0.7400521039962769, "losses/total": 9.875730455632947e-09, "ref_logps/chosen": -230.82321166992188, "ref_logps/rejected": -220.95059204101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6864969730377197, "rewards/margins": 13.114202499389648, "rewards/rejected": -14.800700187683105, "step": 2921 }, { "epoch": 0.7, "learning_rate": 6.64e-08, "logps/chosen": -248.8460235595703, "logps/rejected": -353.479736328125, "loss": 0.0036, "losses/dpo": 2.2870573957334273e-05, "losses/sft": 0.6823410391807556, "losses/total": 2.2870573957334273e-05, "ref_logps/chosen": -228.1117401123047, "ref_logps/rejected": -202.9570770263672, "rewards/accuracies": 1.0, "rewards/chosen": -2.0734288692474365, "rewards/margins": 12.978837966918945, "rewards/rejected": -15.052267074584961, "step": 2922 }, { "epoch": 0.7, "learning_rate": 6.634666666666666e-08, "logps/chosen": -228.5235137939453, "logps/rejected": -378.532958984375, "loss": 0.0053, "losses/dpo": 1.5918644180601405e-07, "losses/sft": 0.5587152242660522, "losses/total": 1.5918644180601405e-07, "ref_logps/chosen": -214.26036071777344, "ref_logps/rejected": -224.2455291748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.4263174533843994, "rewards/margins": 14.002424240112305, "rewards/rejected": -15.428742408752441, "step": 2923 }, { "epoch": 0.7, "learning_rate": 6.629333333333334e-08, "logps/chosen": -271.5759582519531, "logps/rejected": -387.558837890625, "loss": 0.0231, "losses/dpo": 2.3318382602610654e-07, "losses/sft": 1.2867642641067505, "losses/total": 2.3318382602610654e-07, "ref_logps/chosen": -256.07745361328125, "ref_logps/rejected": -227.1412811279297, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5498535633087158, "rewards/margins": 14.491903305053711, "rewards/rejected": -16.04175567626953, "step": 2924 }, { "epoch": 0.7, "learning_rate": 6.623999999999999e-08, "logps/chosen": -218.53207397460938, "logps/rejected": -370.600341796875, "loss": 0.0054, "losses/dpo": 6.244252337417322e-11, "losses/sft": 0.637803852558136, "losses/total": 6.244252337417322e-11, "ref_logps/chosen": -204.96975708007812, "ref_logps/rejected": -216.80223083496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3562325239181519, "rewards/margins": 14.023578643798828, "rewards/rejected": -15.379810333251953, "step": 2925 }, { "epoch": 0.7, "learning_rate": 6.618666666666667e-08, "logps/chosen": -250.450927734375, "logps/rejected": -382.68182373046875, "loss": 0.0021, "losses/dpo": 2.036225277068449e-15, "losses/sft": 0.7592039108276367, "losses/total": 2.036225277068449e-15, "ref_logps/chosen": -235.0809783935547, "ref_logps/rejected": -221.69146728515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5369940996170044, "rewards/margins": 14.562042236328125, "rewards/rejected": -16.099035263061523, "step": 2926 }, { "epoch": 0.7, "learning_rate": 6.613333333333333e-08, "logps/chosen": -226.05166625976562, "logps/rejected": -373.6939697265625, "loss": 0.0007, "losses/dpo": 2.0660833399688272e-07, "losses/sft": 0.5339536666870117, "losses/total": 2.0660833399688272e-07, "ref_logps/chosen": -212.75625610351562, "ref_logps/rejected": -216.27635192871094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3295395374298096, "rewards/margins": 14.412220001220703, "rewards/rejected": -15.741759300231934, "step": 2927 }, { "epoch": 0.7, "learning_rate": 6.608000000000001e-08, "logps/chosen": -299.396728515625, "logps/rejected": -412.00341796875, "loss": 0.0001, "losses/dpo": 3.5181329849365284e-08, "losses/sft": 0.7083300948143005, "losses/total": 3.5181329849365284e-08, "ref_logps/chosen": -282.0892333984375, "ref_logps/rejected": -250.824462890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7307496070861816, "rewards/margins": 14.387149810791016, "rewards/rejected": -16.11789894104004, "step": 2928 }, { "epoch": 0.7, "learning_rate": 6.602666666666667e-08, "logps/chosen": -252.21963500976562, "logps/rejected": -377.8438720703125, "loss": 0.0005, "losses/dpo": 2.3828217177168654e-09, "losses/sft": 0.5956133604049683, "losses/total": 2.3828217177168654e-09, "ref_logps/chosen": -231.194580078125, "ref_logps/rejected": -213.47703552246094, "rewards/accuracies": 1.0, "rewards/chosen": -2.1025052070617676, "rewards/margins": 14.334179878234863, "rewards/rejected": -16.436683654785156, "step": 2929 }, { "epoch": 0.7, "learning_rate": 6.597333333333332e-08, "logps/chosen": -245.55245971679688, "logps/rejected": -365.59649658203125, "loss": 0.0008, "losses/dpo": 9.487136480856861e-07, "losses/sft": 0.880591630935669, "losses/total": 9.487136480856861e-07, "ref_logps/chosen": -232.355712890625, "ref_logps/rejected": -220.61459350585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3196747303009033, "rewards/margins": 13.178515434265137, "rewards/rejected": -14.498190879821777, "step": 2930 }, { "epoch": 0.7, "learning_rate": 6.592e-08, "logps/chosen": -278.77032470703125, "logps/rejected": -404.5990905761719, "loss": 0.0024, "losses/dpo": 3.3395701848348835e-07, "losses/sft": 0.8500970602035522, "losses/total": 3.3395701848348835e-07, "ref_logps/chosen": -263.1935729980469, "ref_logps/rejected": -239.6925048828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5576772689819336, "rewards/margins": 14.9329833984375, "rewards/rejected": -16.490657806396484, "step": 2931 }, { "epoch": 0.7, "learning_rate": 6.586666666666666e-08, "logps/chosen": -202.17910766601562, "logps/rejected": -383.27227783203125, "loss": 0.0006, "losses/dpo": 5.741051012364551e-08, "losses/sft": 0.7244478464126587, "losses/total": 5.741051012364551e-08, "ref_logps/chosen": -189.99774169921875, "ref_logps/rejected": -226.52349853515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2181360721588135, "rewards/margins": 14.456741333007812, "rewards/rejected": -15.674875259399414, "step": 2932 }, { "epoch": 0.7, "learning_rate": 6.581333333333334e-08, "logps/chosen": -242.64993286132812, "logps/rejected": -364.2058410644531, "loss": 0.0031, "losses/dpo": 7.749047514771235e-10, "losses/sft": 0.6949943900108337, "losses/total": 7.749047514771235e-10, "ref_logps/chosen": -230.8692626953125, "ref_logps/rejected": -226.37185668945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.1780683994293213, "rewards/margins": 12.605330467224121, "rewards/rejected": -13.783398628234863, "step": 2933 }, { "epoch": 0.7, "learning_rate": 6.575999999999999e-08, "logps/chosen": -287.1042175292969, "logps/rejected": -410.5334777832031, "loss": 0.0007, "losses/dpo": 3.957133731802287e-09, "losses/sft": 0.7101190686225891, "losses/total": 3.957133731802287e-09, "ref_logps/chosen": -270.7315368652344, "ref_logps/rejected": -243.28684997558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.6372679471969604, "rewards/margins": 15.087394714355469, "rewards/rejected": -16.72466278076172, "step": 2934 }, { "epoch": 0.7, "learning_rate": 6.570666666666666e-08, "logps/chosen": -237.0970458984375, "logps/rejected": -377.684814453125, "loss": 0.0013, "losses/dpo": 4.12600002164254e-06, "losses/sft": 0.5759851932525635, "losses/total": 4.12600002164254e-06, "ref_logps/chosen": -219.6580810546875, "ref_logps/rejected": -210.6109619140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7438945770263672, "rewards/margins": 14.963491439819336, "rewards/rejected": -16.707386016845703, "step": 2935 }, { "epoch": 0.7, "learning_rate": 6.565333333333333e-08, "logps/chosen": -233.54498291015625, "logps/rejected": -392.1181945800781, "loss": 0.0042, "losses/dpo": 2.325330683561333e-08, "losses/sft": 0.6388527154922485, "losses/total": 2.325330683561333e-08, "ref_logps/chosen": -217.23007202148438, "ref_logps/rejected": -228.5554962158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.6314914226531982, "rewards/margins": 14.724778175354004, "rewards/rejected": -16.35626983642578, "step": 2936 }, { "epoch": 0.7, "learning_rate": 6.56e-08, "logps/chosen": -267.33807373046875, "logps/rejected": -345.8489990234375, "loss": 0.0007, "losses/dpo": 2.4010034394450486e-05, "losses/sft": 0.934158205986023, "losses/total": 2.4010034394450486e-05, "ref_logps/chosen": -253.69683837890625, "ref_logps/rejected": -200.96226501464844, "rewards/accuracies": 1.0, "rewards/chosen": -1.3641235828399658, "rewards/margins": 13.12454891204834, "rewards/rejected": -14.488672256469727, "step": 2937 }, { "epoch": 0.71, "learning_rate": 6.554666666666666e-08, "logps/chosen": -267.0343017578125, "logps/rejected": -391.39617919921875, "loss": 0.0004, "losses/dpo": 8.797454938758165e-06, "losses/sft": 0.5672616362571716, "losses/total": 8.797454938758165e-06, "ref_logps/chosen": -251.69320678710938, "ref_logps/rejected": -234.24795532226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.5341119766235352, "rewards/margins": 14.180709838867188, "rewards/rejected": -15.714822769165039, "step": 2938 }, { "epoch": 0.71, "learning_rate": 6.549333333333333e-08, "logps/chosen": -265.7948303222656, "logps/rejected": -419.88580322265625, "loss": 0.0002, "losses/dpo": 5.799033715447877e-07, "losses/sft": 0.5939426422119141, "losses/total": 5.799033715447877e-07, "ref_logps/chosen": -252.44558715820312, "ref_logps/rejected": -262.3111572265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3349264860153198, "rewards/margins": 14.422536849975586, "rewards/rejected": -15.757463455200195, "step": 2939 }, { "epoch": 0.71, "learning_rate": 6.544e-08, "logps/chosen": -271.89752197265625, "logps/rejected": -371.96624755859375, "loss": 0.0015, "losses/dpo": 4.937334985299913e-09, "losses/sft": 0.8913024067878723, "losses/total": 4.937334985299913e-09, "ref_logps/chosen": -257.667236328125, "ref_logps/rejected": -222.31410217285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4230270385742188, "rewards/margins": 13.542184829711914, "rewards/rejected": -14.965213775634766, "step": 2940 }, { "epoch": 0.71, "learning_rate": 6.538666666666667e-08, "logps/chosen": -225.69464111328125, "logps/rejected": -350.900390625, "loss": 0.0015, "losses/dpo": 0.00023530701582785696, "losses/sft": 0.8966333866119385, "losses/total": 0.00023530701582785696, "ref_logps/chosen": -214.72421264648438, "ref_logps/rejected": -211.91323852539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.0970429182052612, "rewards/margins": 12.801673889160156, "rewards/rejected": -13.898715019226074, "step": 2941 }, { "epoch": 0.71, "learning_rate": 6.533333333333332e-08, "logps/chosen": -234.14492797851562, "logps/rejected": -373.32159423828125, "loss": 0.0025, "losses/dpo": 1.786084613542016e-09, "losses/sft": 0.4022860825061798, "losses/total": 1.786084613542016e-09, "ref_logps/chosen": -215.60882568359375, "ref_logps/rejected": -215.31297302246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8536098003387451, "rewards/margins": 13.947250366210938, "rewards/rejected": -15.800859451293945, "step": 2942 }, { "epoch": 0.71, "learning_rate": 6.528e-08, "logps/chosen": -248.78099060058594, "logps/rejected": -386.52490234375, "loss": 0.0013, "losses/dpo": 1.8966657080454752e-06, "losses/sft": 0.684781551361084, "losses/total": 1.8966657080454752e-06, "ref_logps/chosen": -229.70664978027344, "ref_logps/rejected": -227.18092346191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.9074339866638184, "rewards/margins": 14.026966094970703, "rewards/rejected": -15.934398651123047, "step": 2943 }, { "epoch": 0.71, "learning_rate": 6.522666666666666e-08, "logps/chosen": -251.9311981201172, "logps/rejected": -342.59356689453125, "loss": 0.0014, "losses/dpo": 2.808247359098459e-07, "losses/sft": 0.7288301587104797, "losses/total": 2.808247359098459e-07, "ref_logps/chosen": -233.16049194335938, "ref_logps/rejected": -194.5128631591797, "rewards/accuracies": 1.0, "rewards/chosen": -1.8770709037780762, "rewards/margins": 12.931001663208008, "rewards/rejected": -14.808073997497559, "step": 2944 }, { "epoch": 0.71, "learning_rate": 6.517333333333334e-08, "logps/chosen": -293.2246398925781, "logps/rejected": -407.44659423828125, "loss": 0.0154, "losses/dpo": 1.5321400814727326e-09, "losses/sft": 0.6004728078842163, "losses/total": 1.5321400814727326e-09, "ref_logps/chosen": -275.93896484375, "ref_logps/rejected": -238.29945373535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7285654544830322, "rewards/margins": 15.186149597167969, "rewards/rejected": -16.914714813232422, "step": 2945 }, { "epoch": 0.71, "learning_rate": 6.512e-08, "logps/chosen": -261.5355529785156, "logps/rejected": -401.5315246582031, "loss": 0.0065, "losses/dpo": 3.639571843905287e-08, "losses/sft": 0.5312297344207764, "losses/total": 3.639571843905287e-08, "ref_logps/chosen": -245.18524169921875, "ref_logps/rejected": -238.67799377441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6350295543670654, "rewards/margins": 14.650322914123535, "rewards/rejected": -16.28535270690918, "step": 2946 }, { "epoch": 0.71, "learning_rate": 6.506666666666665e-08, "logps/chosen": -223.6868896484375, "logps/rejected": -375.39617919921875, "loss": 0.0044, "losses/dpo": 1.8113062438818162e-10, "losses/sft": 0.44995272159576416, "losses/total": 1.8113062438818162e-10, "ref_logps/chosen": -209.13230895996094, "ref_logps/rejected": -216.75912475585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4554568529129028, "rewards/margins": 14.40825080871582, "rewards/rejected": -15.863706588745117, "step": 2947 }, { "epoch": 0.71, "learning_rate": 6.501333333333333e-08, "logps/chosen": -221.0892333984375, "logps/rejected": -365.1075439453125, "loss": 0.012, "losses/dpo": 8.438291843049228e-07, "losses/sft": 0.6734924912452698, "losses/total": 8.438291843049228e-07, "ref_logps/chosen": -207.161376953125, "ref_logps/rejected": -212.08384704589844, "rewards/accuracies": 1.0, "rewards/chosen": -1.392783522605896, "rewards/margins": 13.909584999084473, "rewards/rejected": -15.302369117736816, "step": 2948 }, { "epoch": 0.71, "learning_rate": 6.496e-08, "logps/chosen": -225.38192749023438, "logps/rejected": -402.95660400390625, "loss": 0.0005, "losses/dpo": 5.477412196341902e-05, "losses/sft": 0.7767971754074097, "losses/total": 5.477412196341902e-05, "ref_logps/chosen": -210.53167724609375, "ref_logps/rejected": -233.2450714111328, "rewards/accuracies": 1.0, "rewards/chosen": -1.4850244522094727, "rewards/margins": 15.486129760742188, "rewards/rejected": -16.971153259277344, "step": 2949 }, { "epoch": 0.71, "learning_rate": 6.490666666666667e-08, "logps/chosen": -256.083984375, "logps/rejected": -398.8038330078125, "loss": 0.0241, "losses/dpo": 1.6865246266206668e-07, "losses/sft": 0.44324424862861633, "losses/total": 1.6865246266206668e-07, "ref_logps/chosen": -241.88690185546875, "ref_logps/rejected": -229.02577209472656, "rewards/accuracies": 0.96875, "rewards/chosen": -1.4197111129760742, "rewards/margins": 15.558094024658203, "rewards/rejected": -16.977806091308594, "step": 2950 }, { "epoch": 0.71, "learning_rate": 6.485333333333332e-08, "logps/chosen": -230.66236877441406, "logps/rejected": -359.0621032714844, "loss": 0.0004, "losses/dpo": 1.6933037003813434e-12, "losses/sft": 0.775347113609314, "losses/total": 1.6933037003813434e-12, "ref_logps/chosen": -216.2656707763672, "ref_logps/rejected": -206.4904327392578, "rewards/accuracies": 1.0, "rewards/chosen": -1.4396697282791138, "rewards/margins": 13.817495346069336, "rewards/rejected": -15.25716495513916, "step": 2951 }, { "epoch": 0.71, "learning_rate": 6.48e-08, "logps/chosen": -224.4795684814453, "logps/rejected": -350.44049072265625, "loss": 0.0027, "losses/dpo": 1.0248622857034206e-06, "losses/sft": 1.0290040969848633, "losses/total": 1.0248622857034206e-06, "ref_logps/chosen": -212.69833374023438, "ref_logps/rejected": -200.99642944335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.178122878074646, "rewards/margins": 13.76628303527832, "rewards/rejected": -14.944405555725098, "step": 2952 }, { "epoch": 0.71, "learning_rate": 6.474666666666666e-08, "logps/chosen": -292.766845703125, "logps/rejected": -442.489501953125, "loss": 0.0002, "losses/dpo": 1.7526598128370097e-07, "losses/sft": 0.5988019108772278, "losses/total": 1.7526598128370097e-07, "ref_logps/chosen": -273.17803955078125, "ref_logps/rejected": -268.8455810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.958881139755249, "rewards/margins": 15.405512809753418, "rewards/rejected": -17.36439323425293, "step": 2953 }, { "epoch": 0.71, "learning_rate": 6.469333333333334e-08, "logps/chosen": -255.9879150390625, "logps/rejected": -423.2003479003906, "loss": 0.0002, "losses/dpo": 1.4039074347493852e-09, "losses/sft": 0.3876265585422516, "losses/total": 1.4039074347493852e-09, "ref_logps/chosen": -239.71615600585938, "ref_logps/rejected": -242.59104919433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.6271746158599854, "rewards/margins": 16.433753967285156, "rewards/rejected": -18.060928344726562, "step": 2954 }, { "epoch": 0.71, "learning_rate": 6.463999999999999e-08, "logps/chosen": -217.10501098632812, "logps/rejected": -372.75396728515625, "loss": 0.0017, "losses/dpo": 4.242355355899008e-10, "losses/sft": 0.8190194368362427, "losses/total": 4.242355355899008e-10, "ref_logps/chosen": -204.3765869140625, "ref_logps/rejected": -220.12380981445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2728426456451416, "rewards/margins": 13.990174293518066, "rewards/rejected": -15.263017654418945, "step": 2955 }, { "epoch": 0.71, "learning_rate": 6.458666666666667e-08, "logps/chosen": -298.4643249511719, "logps/rejected": -437.78546142578125, "loss": 0.0001, "losses/dpo": 9.81441462499788e-06, "losses/sft": 0.5293275117874146, "losses/total": 9.81441462499788e-06, "ref_logps/chosen": -278.31390380859375, "ref_logps/rejected": -242.44918823242188, "rewards/accuracies": 1.0, "rewards/chosen": -2.015043020248413, "rewards/margins": 17.518585205078125, "rewards/rejected": -19.533628463745117, "step": 2956 }, { "epoch": 0.71, "learning_rate": 6.453333333333333e-08, "logps/chosen": -266.953369140625, "logps/rejected": -385.32452392578125, "loss": 0.0004, "losses/dpo": 1.914983505102441e-09, "losses/sft": 0.702484130859375, "losses/total": 1.914983505102441e-09, "ref_logps/chosen": -249.29547119140625, "ref_logps/rejected": -214.5974884033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7657898664474487, "rewards/margins": 15.306909561157227, "rewards/rejected": -17.07269859313965, "step": 2957 }, { "epoch": 0.71, "learning_rate": 6.448e-08, "logps/chosen": -282.9254150390625, "logps/rejected": -362.47235107421875, "loss": 0.006, "losses/dpo": 7.613552099883236e-08, "losses/sft": 0.4187239408493042, "losses/total": 7.613552099883236e-08, "ref_logps/chosen": -263.2778625488281, "ref_logps/rejected": -213.84405517578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9647538661956787, "rewards/margins": 12.898078918457031, "rewards/rejected": -14.862832069396973, "step": 2958 }, { "epoch": 0.71, "learning_rate": 6.442666666666666e-08, "logps/chosen": -264.8803405761719, "logps/rejected": -415.10394287109375, "loss": 0.0009, "losses/dpo": 4.79427053612369e-10, "losses/sft": 0.6057167649269104, "losses/total": 4.79427053612369e-10, "ref_logps/chosen": -247.2928466796875, "ref_logps/rejected": -246.2809600830078, "rewards/accuracies": 1.0, "rewards/chosen": -1.7587506771087646, "rewards/margins": 15.123544692993164, "rewards/rejected": -16.882295608520508, "step": 2959 }, { "epoch": 0.71, "learning_rate": 6.437333333333333e-08, "logps/chosen": -255.79733276367188, "logps/rejected": -387.1834411621094, "loss": 0.0, "losses/dpo": 6.575643585193802e-09, "losses/sft": 0.5744213461875916, "losses/total": 6.575643585193802e-09, "ref_logps/chosen": -238.11898803710938, "ref_logps/rejected": -216.17520141601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7678346633911133, "rewards/margins": 15.332988739013672, "rewards/rejected": -17.10082244873047, "step": 2960 }, { "epoch": 0.71, "learning_rate": 6.432e-08, "logps/chosen": -270.626708984375, "logps/rejected": -425.187744140625, "loss": 0.0011, "losses/dpo": 5.229829724839874e-08, "losses/sft": 0.6055815815925598, "losses/total": 5.229829724839874e-08, "ref_logps/chosen": -253.93666076660156, "ref_logps/rejected": -252.7337646484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6690078973770142, "rewards/margins": 15.576391220092773, "rewards/rejected": -17.245399475097656, "step": 2961 }, { "epoch": 0.71, "learning_rate": 6.426666666666667e-08, "logps/chosen": -221.1980438232422, "logps/rejected": -425.90240478515625, "loss": 0.0001, "losses/dpo": 2.0205213502322295e-08, "losses/sft": 0.6699194312095642, "losses/total": 2.0205213502322295e-08, "ref_logps/chosen": -204.93655395507812, "ref_logps/rejected": -231.2295379638672, "rewards/accuracies": 1.0, "rewards/chosen": -1.6261475086212158, "rewards/margins": 17.841142654418945, "rewards/rejected": -19.467288970947266, "step": 2962 }, { "epoch": 0.71, "learning_rate": 6.421333333333334e-08, "logps/chosen": -266.10845947265625, "logps/rejected": -379.994873046875, "loss": 0.0004, "losses/dpo": 1.6559189575104938e-08, "losses/sft": 0.7054398059844971, "losses/total": 1.6559189575104938e-08, "ref_logps/chosen": -249.2591552734375, "ref_logps/rejected": -220.08251953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6849281787872314, "rewards/margins": 14.306306838989258, "rewards/rejected": -15.991235733032227, "step": 2963 }, { "epoch": 0.71, "learning_rate": 6.415999999999999e-08, "logps/chosen": -218.3438262939453, "logps/rejected": -387.10748291015625, "loss": 0.0017, "losses/dpo": 1.827466803661082e-05, "losses/sft": 0.8808196187019348, "losses/total": 1.827466803661082e-05, "ref_logps/chosen": -205.63487243652344, "ref_logps/rejected": -231.98428344726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.2708959579467773, "rewards/margins": 14.24142074584961, "rewards/rejected": -15.512317657470703, "step": 2964 }, { "epoch": 0.71, "learning_rate": 6.410666666666666e-08, "logps/chosen": -220.80149841308594, "logps/rejected": -373.5385437011719, "loss": 0.0001, "losses/dpo": 1.9607389323272884e-10, "losses/sft": 0.4772378206253052, "losses/total": 1.9607389323272884e-10, "ref_logps/chosen": -205.19775390625, "ref_logps/rejected": -221.363037109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5603734254837036, "rewards/margins": 13.657176971435547, "rewards/rejected": -15.217550277709961, "step": 2965 }, { "epoch": 0.71, "learning_rate": 6.405333333333333e-08, "logps/chosen": -232.95925903320312, "logps/rejected": -357.6673583984375, "loss": 0.004, "losses/dpo": 3.1960098567651585e-08, "losses/sft": 0.963789165019989, "losses/total": 3.1960098567651585e-08, "ref_logps/chosen": -218.54818725585938, "ref_logps/rejected": -210.542724609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4411083459854126, "rewards/margins": 13.271352767944336, "rewards/rejected": -14.712461471557617, "step": 2966 }, { "epoch": 0.71, "learning_rate": 6.4e-08, "logps/chosen": -246.38397216796875, "logps/rejected": -387.0699462890625, "loss": 0.0008, "losses/dpo": 2.817474076266535e-08, "losses/sft": 0.47435903549194336, "losses/total": 2.817474076266535e-08, "ref_logps/chosen": -228.8514862060547, "ref_logps/rejected": -225.32655334472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7532495260238647, "rewards/margins": 14.421092987060547, "rewards/rejected": -16.17434310913086, "step": 2967 }, { "epoch": 0.71, "learning_rate": 6.394666666666665e-08, "logps/chosen": -276.43743896484375, "logps/rejected": -428.0321960449219, "loss": 0.0071, "losses/dpo": 1.0919245951868106e-08, "losses/sft": 0.8909705877304077, "losses/total": 1.0919245951868106e-08, "ref_logps/chosen": -259.2986145019531, "ref_logps/rejected": -250.64044189453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7138798236846924, "rewards/margins": 16.025297164916992, "rewards/rejected": -17.739177703857422, "step": 2968 }, { "epoch": 0.71, "learning_rate": 6.389333333333333e-08, "logps/chosen": -226.931640625, "logps/rejected": -385.4866638183594, "loss": 0.0002, "losses/dpo": 6.964897103500789e-09, "losses/sft": 0.6304705142974854, "losses/total": 6.964897103500789e-09, "ref_logps/chosen": -212.72415161132812, "ref_logps/rejected": -228.37405395507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.420748233795166, "rewards/margins": 14.29051399230957, "rewards/rejected": -15.711261749267578, "step": 2969 }, { "epoch": 0.71, "learning_rate": 6.384e-08, "logps/chosen": -257.2867431640625, "logps/rejected": -372.557861328125, "loss": 0.0054, "losses/dpo": 3.2694867257987426e-08, "losses/sft": 0.953050434589386, "losses/total": 3.2694867257987426e-08, "ref_logps/chosen": -239.7035675048828, "ref_logps/rejected": -216.69705200195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7583180665969849, "rewards/margins": 13.827760696411133, "rewards/rejected": -15.586077690124512, "step": 2970 }, { "epoch": 0.71, "learning_rate": 6.378666666666667e-08, "logps/chosen": -206.09698486328125, "logps/rejected": -381.49420166015625, "loss": 0.0003, "losses/dpo": 2.267672272182608e-08, "losses/sft": 0.5786144137382507, "losses/total": 2.267672272182608e-08, "ref_logps/chosen": -194.90011596679688, "ref_logps/rejected": -223.5653533935547, "rewards/accuracies": 1.0, "rewards/chosen": -1.119686484336853, "rewards/margins": 14.673202514648438, "rewards/rejected": -15.792888641357422, "step": 2971 }, { "epoch": 0.71, "learning_rate": 6.373333333333332e-08, "logps/chosen": -233.75906372070312, "logps/rejected": -417.825927734375, "loss": 0.0004, "losses/dpo": 6.735771052035489e-08, "losses/sft": 0.7819118499755859, "losses/total": 6.735771052035489e-08, "ref_logps/chosen": -216.9157257080078, "ref_logps/rejected": -236.0186767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6843345165252686, "rewards/margins": 16.49639320373535, "rewards/rejected": -18.180727005004883, "step": 2972 }, { "epoch": 0.71, "learning_rate": 6.368e-08, "logps/chosen": -308.6373291015625, "logps/rejected": -396.4114990234375, "loss": 0.0002, "losses/dpo": 3.295048998097627e-07, "losses/sft": 0.4529604911804199, "losses/total": 3.295048998097627e-07, "ref_logps/chosen": -292.2327575683594, "ref_logps/rejected": -234.6851348876953, "rewards/accuracies": 1.0, "rewards/chosen": -1.6404550075531006, "rewards/margins": 14.53217887878418, "rewards/rejected": -16.17263412475586, "step": 2973 }, { "epoch": 0.71, "learning_rate": 6.362666666666666e-08, "logps/chosen": -235.367431640625, "logps/rejected": -398.0397644042969, "loss": 0.0001, "losses/dpo": 2.5707258544116485e-09, "losses/sft": 0.8763350248336792, "losses/total": 2.5707258544116485e-09, "ref_logps/chosen": -220.7000274658203, "ref_logps/rejected": -225.08103942871094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4667408466339111, "rewards/margins": 15.829132080078125, "rewards/rejected": -17.295873641967773, "step": 2974 }, { "epoch": 0.71, "learning_rate": 6.357333333333334e-08, "logps/chosen": -254.88909912109375, "logps/rejected": -397.9168701171875, "loss": 0.0021, "losses/dpo": 6.03656968162225e-12, "losses/sft": 0.7724855542182922, "losses/total": 6.03656968162225e-12, "ref_logps/chosen": -239.09405517578125, "ref_logps/rejected": -230.6595458984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5795049667358398, "rewards/margins": 15.146230697631836, "rewards/rejected": -16.72573471069336, "step": 2975 }, { "epoch": 0.71, "learning_rate": 6.352e-08, "logps/chosen": -236.50234985351562, "logps/rejected": -379.1544189453125, "loss": 0.0015, "losses/dpo": 8.575079313288825e-09, "losses/sft": 0.9420257806777954, "losses/total": 8.575079313288825e-09, "ref_logps/chosen": -224.89382934570312, "ref_logps/rejected": -221.63790893554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.1608526706695557, "rewards/margins": 14.590797424316406, "rewards/rejected": -15.751649856567383, "step": 2976 }, { "epoch": 0.71, "learning_rate": 6.346666666666667e-08, "logps/chosen": -231.23837280273438, "logps/rejected": -324.28289794921875, "loss": 0.0019, "losses/dpo": 1.3917676733310458e-10, "losses/sft": 0.5141953825950623, "losses/total": 1.3917676733310458e-10, "ref_logps/chosen": -216.71466064453125, "ref_logps/rejected": -184.24732971191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.4523714780807495, "rewards/margins": 12.551186561584473, "rewards/rejected": -14.003559112548828, "step": 2977 }, { "epoch": 0.71, "learning_rate": 6.341333333333333e-08, "logps/chosen": -242.01019287109375, "logps/rejected": -393.8915710449219, "loss": 0.0001, "losses/dpo": 2.493418287485838e-05, "losses/sft": 0.9238871932029724, "losses/total": 2.493418287485838e-05, "ref_logps/chosen": -227.32078552246094, "ref_logps/rejected": -233.08641052246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4689419269561768, "rewards/margins": 14.611576080322266, "rewards/rejected": -16.08051872253418, "step": 2978 }, { "epoch": 0.71, "learning_rate": 6.336e-08, "logps/chosen": -232.58096313476562, "logps/rejected": -425.279052734375, "loss": 0.0, "losses/dpo": 1.0690584702210959e-11, "losses/sft": 0.5680596828460693, "losses/total": 1.0690584702210959e-11, "ref_logps/chosen": -216.50723266601562, "ref_logps/rejected": -246.37261962890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6073729991912842, "rewards/margins": 16.28326988220215, "rewards/rejected": -17.890644073486328, "step": 2979 }, { "epoch": 0.72, "learning_rate": 6.330666666666667e-08, "logps/chosen": -240.26417541503906, "logps/rejected": -397.9005126953125, "loss": 0.0008, "losses/dpo": 4.7200410335790366e-05, "losses/sft": 0.6139506101608276, "losses/total": 4.7200410335790366e-05, "ref_logps/chosen": -222.367919921875, "ref_logps/rejected": -228.73855590820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7896238565444946, "rewards/margins": 15.126572608947754, "rewards/rejected": -16.916194915771484, "step": 2980 }, { "epoch": 0.72, "learning_rate": 6.325333333333332e-08, "logps/chosen": -267.2680969238281, "logps/rejected": -390.4936828613281, "loss": 0.0125, "losses/dpo": 4.432827882538959e-09, "losses/sft": 0.2780210077762604, "losses/total": 4.432827882538959e-09, "ref_logps/chosen": -251.0361328125, "ref_logps/rejected": -226.6484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6231952905654907, "rewards/margins": 14.761330604553223, "rewards/rejected": -16.384525299072266, "step": 2981 }, { "epoch": 0.72, "learning_rate": 6.32e-08, "logps/chosen": -280.11572265625, "logps/rejected": -420.1015625, "loss": 0.0001, "losses/dpo": 3.058473296846387e-08, "losses/sft": 0.6656902432441711, "losses/total": 3.058473296846387e-08, "ref_logps/chosen": -261.32794189453125, "ref_logps/rejected": -247.87518310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8787766695022583, "rewards/margins": 15.343860626220703, "rewards/rejected": -17.222637176513672, "step": 2982 }, { "epoch": 0.72, "learning_rate": 6.314666666666666e-08, "logps/chosen": -240.32032775878906, "logps/rejected": -353.7478942871094, "loss": 0.0106, "losses/dpo": 2.8868378174062848e-11, "losses/sft": 0.6426172852516174, "losses/total": 2.8868378174062848e-11, "ref_logps/chosen": -224.25096130371094, "ref_logps/rejected": -204.89578247070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6069366931915283, "rewards/margins": 13.278276443481445, "rewards/rejected": -14.885212898254395, "step": 2983 }, { "epoch": 0.72, "learning_rate": 6.309333333333334e-08, "logps/chosen": -201.92337036132812, "logps/rejected": -365.0804443359375, "loss": 0.0024, "losses/dpo": 1.6213620668281692e-08, "losses/sft": 0.6881344318389893, "losses/total": 1.6213620668281692e-08, "ref_logps/chosen": -187.4097900390625, "ref_logps/rejected": -203.26661682128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.4513581991195679, "rewards/margins": 14.730025291442871, "rewards/rejected": -16.18138313293457, "step": 2984 }, { "epoch": 0.72, "learning_rate": 6.303999999999999e-08, "logps/chosen": -258.90826416015625, "logps/rejected": -386.32061767578125, "loss": 0.0016, "losses/dpo": 6.993954304590488e-10, "losses/sft": 0.49741944670677185, "losses/total": 6.993954304590488e-10, "ref_logps/chosen": -240.79562377929688, "ref_logps/rejected": -224.89064025878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.8112645149230957, "rewards/margins": 14.331731796264648, "rewards/rejected": -16.142995834350586, "step": 2985 }, { "epoch": 0.72, "learning_rate": 6.298666666666666e-08, "logps/chosen": -257.36395263671875, "logps/rejected": -387.19696044921875, "loss": 0.0039, "losses/dpo": 4.405080460401223e-07, "losses/sft": 0.5754361748695374, "losses/total": 4.405080460401223e-07, "ref_logps/chosen": -240.4801025390625, "ref_logps/rejected": -225.6656036376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.6883846521377563, "rewards/margins": 14.464752197265625, "rewards/rejected": -16.15313720703125, "step": 2986 }, { "epoch": 0.72, "learning_rate": 6.293333333333333e-08, "logps/chosen": -315.10430908203125, "logps/rejected": -456.298095703125, "loss": 0.0, "losses/dpo": 1.3677939705303288e-06, "losses/sft": 0.5858655571937561, "losses/total": 1.3677939705303288e-06, "ref_logps/chosen": -300.1410217285156, "ref_logps/rejected": -270.88214111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.496328592300415, "rewards/margins": 17.045270919799805, "rewards/rejected": -18.54159927368164, "step": 2987 }, { "epoch": 0.72, "learning_rate": 6.288e-08, "logps/chosen": -249.23133850097656, "logps/rejected": -389.0323791503906, "loss": 0.0016, "losses/dpo": 1.1090262009520041e-13, "losses/sft": 0.8677173256874084, "losses/total": 1.1090262009520041e-13, "ref_logps/chosen": -232.61935424804688, "ref_logps/rejected": -219.95645141601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6611993312835693, "rewards/margins": 15.246393203735352, "rewards/rejected": -16.9075927734375, "step": 2988 }, { "epoch": 0.72, "learning_rate": 6.282666666666665e-08, "logps/chosen": -263.07275390625, "logps/rejected": -396.8138427734375, "loss": 0.0007, "losses/dpo": 1.1426783075307867e-08, "losses/sft": 0.6002727150917053, "losses/total": 1.1426783075307867e-08, "ref_logps/chosen": -247.2364501953125, "ref_logps/rejected": -223.55941772460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.583628535270691, "rewards/margins": 15.741816520690918, "rewards/rejected": -17.3254451751709, "step": 2989 }, { "epoch": 0.72, "learning_rate": 6.277333333333333e-08, "logps/chosen": -200.5232696533203, "logps/rejected": -334.056640625, "loss": 0.0002, "losses/dpo": 1.0252787507880612e-09, "losses/sft": 0.6027934551239014, "losses/total": 1.0252787507880612e-09, "ref_logps/chosen": -185.89483642578125, "ref_logps/rejected": -197.72515869140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4628437757492065, "rewards/margins": 12.170300483703613, "rewards/rejected": -13.63314437866211, "step": 2990 }, { "epoch": 0.72, "learning_rate": 6.272e-08, "logps/chosen": -261.4381103515625, "logps/rejected": -383.39031982421875, "loss": 0.0009, "losses/dpo": 0.00013063164078630507, "losses/sft": 0.633308470249176, "losses/total": 0.00013063164078630507, "ref_logps/chosen": -250.68759155273438, "ref_logps/rejected": -221.54293823242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.0750519037246704, "rewards/margins": 15.109686851501465, "rewards/rejected": -16.184738159179688, "step": 2991 }, { "epoch": 0.72, "learning_rate": 6.266666666666667e-08, "logps/chosen": -227.06651306152344, "logps/rejected": -375.8355712890625, "loss": 0.0001, "losses/dpo": 4.571401868158631e-10, "losses/sft": 1.0099576711654663, "losses/total": 4.571401868158631e-10, "ref_logps/chosen": -212.91358947753906, "ref_logps/rejected": -212.95169067382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4152928590774536, "rewards/margins": 14.87309455871582, "rewards/rejected": -16.288387298583984, "step": 2992 }, { "epoch": 0.72, "learning_rate": 6.261333333333334e-08, "logps/chosen": -286.1430969238281, "logps/rejected": -401.53955078125, "loss": 0.0008, "losses/dpo": 1.2557729855977584e-10, "losses/sft": 0.6043654680252075, "losses/total": 1.2557729855977584e-10, "ref_logps/chosen": -269.1602783203125, "ref_logps/rejected": -231.52978515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6982810497283936, "rewards/margins": 15.302695274353027, "rewards/rejected": -17.0009765625, "step": 2993 }, { "epoch": 0.72, "learning_rate": 6.256e-08, "logps/chosen": -235.49822998046875, "logps/rejected": -360.89007568359375, "loss": 0.0005, "losses/dpo": 5.92901088225517e-08, "losses/sft": 0.6094950437545776, "losses/total": 5.92901088225517e-08, "ref_logps/chosen": -216.206787109375, "ref_logps/rejected": -204.9479217529297, "rewards/accuracies": 1.0, "rewards/chosen": -1.9291434288024902, "rewards/margins": 13.665071487426758, "rewards/rejected": -15.594215393066406, "step": 2994 }, { "epoch": 0.72, "learning_rate": 6.250666666666666e-08, "logps/chosen": -255.32846069335938, "logps/rejected": -395.5396423339844, "loss": 0.0064, "losses/dpo": 5.011355219686209e-10, "losses/sft": 0.6249167323112488, "losses/total": 5.011355219686209e-10, "ref_logps/chosen": -233.765869140625, "ref_logps/rejected": -226.98837280273438, "rewards/accuracies": 1.0, "rewards/chosen": -2.1562604904174805, "rewards/margins": 14.698867797851562, "rewards/rejected": -16.85512924194336, "step": 2995 }, { "epoch": 0.72, "learning_rate": 6.245333333333334e-08, "logps/chosen": -200.39117431640625, "logps/rejected": -351.222900390625, "loss": 0.0067, "losses/dpo": 3.7257095186760125e-09, "losses/sft": 0.5517131686210632, "losses/total": 3.7257095186760125e-09, "ref_logps/chosen": -183.16104125976562, "ref_logps/rejected": -202.729248046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7230144739151, "rewards/margins": 13.126348495483398, "rewards/rejected": -14.84936237335205, "step": 2996 }, { "epoch": 0.72, "learning_rate": 6.24e-08, "logps/chosen": -237.18682861328125, "logps/rejected": -355.3611145019531, "loss": 0.0019, "losses/dpo": 5.239745064500312e-07, "losses/sft": 0.5615024566650391, "losses/total": 5.239745064500312e-07, "ref_logps/chosen": -221.33502197265625, "ref_logps/rejected": -203.01675415039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.5851800441741943, "rewards/margins": 13.649255752563477, "rewards/rejected": -15.23443603515625, "step": 2997 }, { "epoch": 0.72, "learning_rate": 6.234666666666665e-08, "logps/chosen": -204.81202697753906, "logps/rejected": -372.41937255859375, "loss": 0.0025, "losses/dpo": 4.250653518056424e-08, "losses/sft": 0.6252569556236267, "losses/total": 4.250653518056424e-08, "ref_logps/chosen": -191.27174377441406, "ref_logps/rejected": -215.67384338378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.3540291786193848, "rewards/margins": 14.320525169372559, "rewards/rejected": -15.674553871154785, "step": 2998 }, { "epoch": 0.72, "learning_rate": 6.229333333333333e-08, "logps/chosen": -255.12281799316406, "logps/rejected": -402.67449951171875, "loss": 0.0011, "losses/dpo": 8.232658728957176e-06, "losses/sft": 0.7690080404281616, "losses/total": 8.232658728957176e-06, "ref_logps/chosen": -240.60350036621094, "ref_logps/rejected": -235.4475555419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.451932668685913, "rewards/margins": 15.270763397216797, "rewards/rejected": -16.722694396972656, "step": 2999 }, { "epoch": 0.72, "learning_rate": 6.224e-08, "logps/chosen": -228.3297119140625, "logps/rejected": -386.4849853515625, "loss": 0.0006, "losses/dpo": 2.2518958076034323e-07, "losses/sft": 0.8804810047149658, "losses/total": 2.2518958076034323e-07, "ref_logps/chosen": -216.62796020507812, "ref_logps/rejected": -233.17442321777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1701743602752686, "rewards/margins": 14.160882949829102, "rewards/rejected": -15.33105754852295, "step": 3000 }, { "epoch": 0.72, "learning_rate": 6.218666666666667e-08, "logps/chosen": -233.7404022216797, "logps/rejected": -407.9372253417969, "loss": 0.0011, "losses/dpo": 8.512635929491807e-08, "losses/sft": 0.6119607090950012, "losses/total": 8.512635929491807e-08, "ref_logps/chosen": -218.15133666992188, "ref_logps/rejected": -241.0629425048828, "rewards/accuracies": 1.0, "rewards/chosen": -1.5589094161987305, "rewards/margins": 15.128517150878906, "rewards/rejected": -16.687427520751953, "step": 3001 }, { "epoch": 0.72, "learning_rate": 6.213333333333332e-08, "logps/chosen": -211.58389282226562, "logps/rejected": -357.81622314453125, "loss": 0.0067, "losses/dpo": 3.550294059095904e-05, "losses/sft": 0.6739386916160583, "losses/total": 3.550294059095904e-05, "ref_logps/chosen": -198.27122497558594, "ref_logps/rejected": -209.36587524414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.331266164779663, "rewards/margins": 13.513769149780273, "rewards/rejected": -14.845035552978516, "step": 3002 }, { "epoch": 0.72, "learning_rate": 6.208e-08, "logps/chosen": -231.51097106933594, "logps/rejected": -404.9263610839844, "loss": 0.0002, "losses/dpo": 1.3408690335836582e-07, "losses/sft": 0.4495912790298462, "losses/total": 1.3408690335836582e-07, "ref_logps/chosen": -214.44627380371094, "ref_logps/rejected": -229.368408203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7064696550369263, "rewards/margins": 15.849324226379395, "rewards/rejected": -17.55579376220703, "step": 3003 }, { "epoch": 0.72, "learning_rate": 6.202666666666666e-08, "logps/chosen": -242.09857177734375, "logps/rejected": -392.8101806640625, "loss": 0.0021, "losses/dpo": 6.088297141104704e-06, "losses/sft": 0.5196855664253235, "losses/total": 6.088297141104704e-06, "ref_logps/chosen": -226.96328735351562, "ref_logps/rejected": -225.05616760253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.5135283470153809, "rewards/margins": 15.261873245239258, "rewards/rejected": -16.775402069091797, "step": 3004 }, { "epoch": 0.72, "learning_rate": 6.197333333333334e-08, "logps/chosen": -234.9093780517578, "logps/rejected": -331.8186950683594, "loss": 0.0112, "losses/dpo": 2.259861139464192e-05, "losses/sft": 0.578220784664154, "losses/total": 2.259861139464192e-05, "ref_logps/chosen": -216.2509765625, "ref_logps/rejected": -189.75315856933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.8658390045166016, "rewards/margins": 12.340715408325195, "rewards/rejected": -14.206554412841797, "step": 3005 }, { "epoch": 0.72, "learning_rate": 6.191999999999999e-08, "logps/chosen": -261.9598083496094, "logps/rejected": -418.2833251953125, "loss": 0.0008, "losses/dpo": 3.7623539700382125e-11, "losses/sft": 0.6537788510322571, "losses/total": 3.7623539700382125e-11, "ref_logps/chosen": -242.22531127929688, "ref_logps/rejected": -239.5977020263672, "rewards/accuracies": 1.0, "rewards/chosen": -1.9734482765197754, "rewards/margins": 15.895111083984375, "rewards/rejected": -17.868560791015625, "step": 3006 }, { "epoch": 0.72, "learning_rate": 6.186666666666666e-08, "logps/chosen": -285.9158935546875, "logps/rejected": -435.8902587890625, "loss": 0.0001, "losses/dpo": 4.637646655591965e-10, "losses/sft": 0.5653272867202759, "losses/total": 4.637646655591965e-10, "ref_logps/chosen": -271.25250244140625, "ref_logps/rejected": -260.5496826171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.466339349746704, "rewards/margins": 16.06771469116211, "rewards/rejected": -17.534053802490234, "step": 3007 }, { "epoch": 0.72, "learning_rate": 6.181333333333333e-08, "logps/chosen": -278.9303283691406, "logps/rejected": -398.129638671875, "loss": 0.0002, "losses/dpo": 4.243287943239693e-09, "losses/sft": 0.6062456965446472, "losses/total": 4.243287943239693e-09, "ref_logps/chosen": -260.34637451171875, "ref_logps/rejected": -234.73500061035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.858396053314209, "rewards/margins": 14.481067657470703, "rewards/rejected": -16.33946418762207, "step": 3008 }, { "epoch": 0.72, "learning_rate": 6.176e-08, "logps/chosen": -252.5132293701172, "logps/rejected": -413.454833984375, "loss": 0.0002, "losses/dpo": 2.4208594240349157e-08, "losses/sft": 0.6219801902770996, "losses/total": 2.4208594240349157e-08, "ref_logps/chosen": -236.11309814453125, "ref_logps/rejected": -237.56375122070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6400120258331299, "rewards/margins": 15.949098587036133, "rewards/rejected": -17.589109420776367, "step": 3009 }, { "epoch": 0.72, "learning_rate": 6.170666666666667e-08, "logps/chosen": -293.75299072265625, "logps/rejected": -397.40252685546875, "loss": 0.001, "losses/dpo": 1.0232089380224352e-06, "losses/sft": 0.7252843976020813, "losses/total": 1.0232089380224352e-06, "ref_logps/chosen": -275.67218017578125, "ref_logps/rejected": -238.38427734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8080835342407227, "rewards/margins": 14.093738555908203, "rewards/rejected": -15.90182113647461, "step": 3010 }, { "epoch": 0.72, "learning_rate": 6.165333333333333e-08, "logps/chosen": -244.137451171875, "logps/rejected": -448.06866455078125, "loss": 0.0003, "losses/dpo": 1.728934932998527e-07, "losses/sft": 0.5288990139961243, "losses/total": 1.728934932998527e-07, "ref_logps/chosen": -232.3067626953125, "ref_logps/rejected": -256.09185791015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.1830682754516602, "rewards/margins": 18.014610290527344, "rewards/rejected": -19.197677612304688, "step": 3011 }, { "epoch": 0.72, "learning_rate": 6.16e-08, "logps/chosen": -266.8443603515625, "logps/rejected": -410.3909912109375, "loss": 0.0014, "losses/dpo": 1.9189030364685777e-09, "losses/sft": 0.5128068923950195, "losses/total": 1.9189030364685777e-09, "ref_logps/chosen": -253.7377471923828, "ref_logps/rejected": -252.853759765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.310659408569336, "rewards/margins": 14.443063735961914, "rewards/rejected": -15.75372314453125, "step": 3012 }, { "epoch": 0.72, "learning_rate": 6.154666666666667e-08, "logps/chosen": -234.93563842773438, "logps/rejected": -435.44635009765625, "loss": 0.0, "losses/dpo": 2.6417344006768317e-11, "losses/sft": 0.7183496952056885, "losses/total": 2.6417344006768317e-11, "ref_logps/chosen": -217.66702270507812, "ref_logps/rejected": -253.8595428466797, "rewards/accuracies": 1.0, "rewards/chosen": -1.7268582582473755, "rewards/margins": 16.43182373046875, "rewards/rejected": -18.158679962158203, "step": 3013 }, { "epoch": 0.72, "learning_rate": 6.149333333333334e-08, "logps/chosen": -250.45645141601562, "logps/rejected": -414.8089599609375, "loss": 0.0009, "losses/dpo": 3.7026310906185245e-08, "losses/sft": 0.486914724111557, "losses/total": 3.7026310906185245e-08, "ref_logps/chosen": -231.35226440429688, "ref_logps/rejected": -244.04922485351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.9104201793670654, "rewards/margins": 15.165548324584961, "rewards/rejected": -17.075969696044922, "step": 3014 }, { "epoch": 0.72, "learning_rate": 6.143999999999999e-08, "logps/chosen": -247.69573974609375, "logps/rejected": -414.0513000488281, "loss": 0.0055, "losses/dpo": 3.214711696841732e-08, "losses/sft": 0.6144576668739319, "losses/total": 3.214711696841732e-08, "ref_logps/chosen": -231.78546142578125, "ref_logps/rejected": -239.51199340820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5910274982452393, "rewards/margins": 15.86290454864502, "rewards/rejected": -17.45393180847168, "step": 3015 }, { "epoch": 0.72, "learning_rate": 6.138666666666666e-08, "logps/chosen": -261.96588134765625, "logps/rejected": -361.89544677734375, "loss": 0.0301, "losses/dpo": 4.3888306322514836e-08, "losses/sft": 1.1431106328964233, "losses/total": 4.3888306322514836e-08, "ref_logps/chosen": -246.94985961914062, "ref_logps/rejected": -215.50665283203125, "rewards/accuracies": 0.96875, "rewards/chosen": -1.5016019344329834, "rewards/margins": 13.137276649475098, "rewards/rejected": -14.638877868652344, "step": 3016 }, { "epoch": 0.72, "learning_rate": 6.133333333333333e-08, "logps/chosen": -220.40155029296875, "logps/rejected": -387.6556091308594, "loss": 0.0028, "losses/dpo": 1.97447843675036e-06, "losses/sft": 0.7303394675254822, "losses/total": 1.97447843675036e-06, "ref_logps/chosen": -202.721923828125, "ref_logps/rejected": -224.0131378173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.7679632902145386, "rewards/margins": 14.596284866333008, "rewards/rejected": -16.364248275756836, "step": 3017 }, { "epoch": 0.72, "learning_rate": 6.128e-08, "logps/chosen": -233.55848693847656, "logps/rejected": -383.27935791015625, "loss": 0.002, "losses/dpo": 5.988222070985039e-12, "losses/sft": 0.5285466313362122, "losses/total": 5.988222070985039e-12, "ref_logps/chosen": -218.8360137939453, "ref_logps/rejected": -231.63319396972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.4722477197647095, "rewards/margins": 13.69236946105957, "rewards/rejected": -15.164616584777832, "step": 3018 }, { "epoch": 0.72, "learning_rate": 6.122666666666665e-08, "logps/chosen": -210.1376953125, "logps/rejected": -374.38916015625, "loss": 0.0051, "losses/dpo": 2.001964048758964e-06, "losses/sft": 0.7666177749633789, "losses/total": 2.001964048758964e-06, "ref_logps/chosen": -193.65530395507812, "ref_logps/rejected": -209.8081817626953, "rewards/accuracies": 1.0, "rewards/chosen": -1.6482372283935547, "rewards/margins": 14.809864044189453, "rewards/rejected": -16.458101272583008, "step": 3019 }, { "epoch": 0.72, "learning_rate": 6.117333333333333e-08, "logps/chosen": -246.21636962890625, "logps/rejected": -401.45135498046875, "loss": 0.0016, "losses/dpo": 1.8194086806033738e-05, "losses/sft": 0.4969121217727661, "losses/total": 1.8194086806033738e-05, "ref_logps/chosen": -226.18626403808594, "ref_logps/rejected": -235.89547729492188, "rewards/accuracies": 1.0, "rewards/chosen": -2.0030100345611572, "rewards/margins": 14.552579879760742, "rewards/rejected": -16.55558967590332, "step": 3020 }, { "epoch": 0.72, "learning_rate": 6.112e-08, "logps/chosen": -271.2095031738281, "logps/rejected": -424.771728515625, "loss": 0.0001, "losses/dpo": 2.8004365049127955e-08, "losses/sft": 0.4146760106086731, "losses/total": 2.8004365049127955e-08, "ref_logps/chosen": -257.47686767578125, "ref_logps/rejected": -248.41256713867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3732601404190063, "rewards/margins": 16.26266098022461, "rewards/rejected": -17.63591957092285, "step": 3021 }, { "epoch": 0.73, "learning_rate": 6.106666666666667e-08, "logps/chosen": -285.32421875, "logps/rejected": -428.7833557128906, "loss": 0.002, "losses/dpo": 5.688213788879182e-10, "losses/sft": 1.0538239479064941, "losses/total": 5.688213788879182e-10, "ref_logps/chosen": -271.543701171875, "ref_logps/rejected": -261.4596862792969, "rewards/accuracies": 1.0, "rewards/chosen": -1.3780508041381836, "rewards/margins": 15.354316711425781, "rewards/rejected": -16.73236846923828, "step": 3022 }, { "epoch": 0.73, "learning_rate": 6.101333333333332e-08, "logps/chosen": -240.723388671875, "logps/rejected": -382.8076171875, "loss": 0.0009, "losses/dpo": 1.0685712731328678e-11, "losses/sft": 0.4360954761505127, "losses/total": 1.0685712731328678e-11, "ref_logps/chosen": -222.94100952148438, "ref_logps/rejected": -214.05374145507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7782349586486816, "rewards/margins": 15.097153663635254, "rewards/rejected": -16.875389099121094, "step": 3023 }, { "epoch": 0.73, "learning_rate": 6.096e-08, "logps/chosen": -201.45065307617188, "logps/rejected": -334.49249267578125, "loss": 0.0003, "losses/dpo": 5.2236881487033315e-08, "losses/sft": 1.2064944505691528, "losses/total": 5.2236881487033315e-08, "ref_logps/chosen": -188.92233276367188, "ref_logps/rejected": -193.38510131835938, "rewards/accuracies": 1.0, "rewards/chosen": -1.2528319358825684, "rewards/margins": 12.857908248901367, "rewards/rejected": -14.110738754272461, "step": 3024 }, { "epoch": 0.73, "learning_rate": 6.090666666666666e-08, "logps/chosen": -256.31512451171875, "logps/rejected": -380.34808349609375, "loss": 0.0077, "losses/dpo": 6.540396043419605e-06, "losses/sft": 0.788582980632782, "losses/total": 6.540396043419605e-06, "ref_logps/chosen": -238.67274475097656, "ref_logps/rejected": -230.79733276367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7642371654510498, "rewards/margins": 13.190835952758789, "rewards/rejected": -14.955074310302734, "step": 3025 }, { "epoch": 0.73, "learning_rate": 6.085333333333334e-08, "logps/chosen": -217.16571044921875, "logps/rejected": -389.1166076660156, "loss": 0.0004, "losses/dpo": 1.9341422685670295e-08, "losses/sft": 0.5713791847229004, "losses/total": 1.9341422685670295e-08, "ref_logps/chosen": -206.5950469970703, "ref_logps/rejected": -229.52085876464844, "rewards/accuracies": 1.0, "rewards/chosen": -1.057065486907959, "rewards/margins": 14.902510643005371, "rewards/rejected": -15.959575653076172, "step": 3026 }, { "epoch": 0.73, "learning_rate": 6.08e-08, "logps/chosen": -255.80206298828125, "logps/rejected": -408.53741455078125, "loss": 0.001, "losses/dpo": 2.7725167228709324e-07, "losses/sft": 0.7252696752548218, "losses/total": 2.7725167228709324e-07, "ref_logps/chosen": -236.43270874023438, "ref_logps/rejected": -233.059326171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9369356632232666, "rewards/margins": 15.610873222351074, "rewards/rejected": -17.547809600830078, "step": 3027 }, { "epoch": 0.73, "learning_rate": 6.074666666666667e-08, "logps/chosen": -296.79107666015625, "logps/rejected": -378.73602294921875, "loss": 0.0021, "losses/dpo": 5.928140126343351e-06, "losses/sft": 0.7347967028617859, "losses/total": 5.928140126343351e-06, "ref_logps/chosen": -279.60028076171875, "ref_logps/rejected": -229.17965698242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7190756797790527, "rewards/margins": 13.23655891418457, "rewards/rejected": -14.955635070800781, "step": 3028 }, { "epoch": 0.73, "learning_rate": 6.069333333333333e-08, "logps/chosen": -219.64898681640625, "logps/rejected": -395.25927734375, "loss": 0.0007, "losses/dpo": 6.4489785039922864e-12, "losses/sft": 0.5448210835456848, "losses/total": 6.4489785039922864e-12, "ref_logps/chosen": -205.07318115234375, "ref_logps/rejected": -227.7557830810547, "rewards/accuracies": 1.0, "rewards/chosen": -1.457580327987671, "rewards/margins": 15.292766571044922, "rewards/rejected": -16.750347137451172, "step": 3029 }, { "epoch": 0.73, "learning_rate": 6.064e-08, "logps/chosen": -253.86883544921875, "logps/rejected": -382.86083984375, "loss": 0.0, "losses/dpo": 2.190990699091344e-06, "losses/sft": 1.021280288696289, "losses/total": 2.190990699091344e-06, "ref_logps/chosen": -238.506103515625, "ref_logps/rejected": -225.856201171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5362730026245117, "rewards/margins": 14.164189338684082, "rewards/rejected": -15.700461387634277, "step": 3030 }, { "epoch": 0.73, "learning_rate": 6.058666666666667e-08, "logps/chosen": -266.7141418457031, "logps/rejected": -402.3363037109375, "loss": 0.0015, "losses/dpo": 7.07873204497389e-10, "losses/sft": 0.6662260293960571, "losses/total": 7.07873204497389e-10, "ref_logps/chosen": -249.8252716064453, "ref_logps/rejected": -235.24378967285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6888879537582397, "rewards/margins": 15.020362854003906, "rewards/rejected": -16.70924949645996, "step": 3031 }, { "epoch": 0.73, "learning_rate": 6.053333333333332e-08, "logps/chosen": -253.05487060546875, "logps/rejected": -368.5594482421875, "loss": 0.002, "losses/dpo": 7.614199262206967e-07, "losses/sft": 0.9295206665992737, "losses/total": 7.614199262206967e-07, "ref_logps/chosen": -235.75863647460938, "ref_logps/rejected": -212.1963653564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.7296209335327148, "rewards/margins": 13.90669059753418, "rewards/rejected": -15.636310577392578, "step": 3032 }, { "epoch": 0.73, "learning_rate": 6.048e-08, "logps/chosen": -232.278076171875, "logps/rejected": -378.0439453125, "loss": 0.0005, "losses/dpo": 3.3211286498691095e-10, "losses/sft": 1.1478503942489624, "losses/total": 3.3211286498691095e-10, "ref_logps/chosen": -213.4884033203125, "ref_logps/rejected": -208.56918334960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8789680004119873, "rewards/margins": 15.06850814819336, "rewards/rejected": -16.94747543334961, "step": 3033 }, { "epoch": 0.73, "learning_rate": 6.042666666666666e-08, "logps/chosen": -239.90386962890625, "logps/rejected": -359.84161376953125, "loss": 0.0084, "losses/dpo": 1.9503409998122834e-09, "losses/sft": 0.6185787916183472, "losses/total": 1.9503409998122834e-09, "ref_logps/chosen": -225.00369262695312, "ref_logps/rejected": -206.08438110351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4900184869766235, "rewards/margins": 13.885704040527344, "rewards/rejected": -15.37572193145752, "step": 3034 }, { "epoch": 0.73, "learning_rate": 6.037333333333334e-08, "logps/chosen": -274.86962890625, "logps/rejected": -417.6570129394531, "loss": 0.0, "losses/dpo": 5.0371108955005184e-05, "losses/sft": 0.4680729806423187, "losses/total": 5.0371108955005184e-05, "ref_logps/chosen": -258.74884033203125, "ref_logps/rejected": -243.3790740966797, "rewards/accuracies": 1.0, "rewards/chosen": -1.6120755672454834, "rewards/margins": 15.815720558166504, "rewards/rejected": -17.42779541015625, "step": 3035 }, { "epoch": 0.73, "learning_rate": 6.031999999999999e-08, "logps/chosen": -233.00360107421875, "logps/rejected": -359.29241943359375, "loss": 0.0043, "losses/dpo": 1.641763702764365e-08, "losses/sft": 0.5854521989822388, "losses/total": 1.641763702764365e-08, "ref_logps/chosen": -218.53204345703125, "ref_logps/rejected": -208.9547882080078, "rewards/accuracies": 1.0, "rewards/chosen": -1.4471532106399536, "rewards/margins": 13.586609840393066, "rewards/rejected": -15.033761978149414, "step": 3036 }, { "epoch": 0.73, "learning_rate": 6.026666666666666e-08, "logps/chosen": -248.29742431640625, "logps/rejected": -377.9998779296875, "loss": 0.0036, "losses/dpo": 1.195288173994058e-10, "losses/sft": 0.486015647649765, "losses/total": 1.195288173994058e-10, "ref_logps/chosen": -230.64010620117188, "ref_logps/rejected": -225.09194946289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7657318115234375, "rewards/margins": 13.525060653686523, "rewards/rejected": -15.290792465209961, "step": 3037 }, { "epoch": 0.73, "learning_rate": 6.021333333333333e-08, "logps/chosen": -255.3173370361328, "logps/rejected": -406.69537353515625, "loss": 0.0007, "losses/dpo": 2.6701714173071878e-09, "losses/sft": 0.5535905957221985, "losses/total": 2.6701714173071878e-09, "ref_logps/chosen": -241.22142028808594, "ref_logps/rejected": -241.2442626953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4095911979675293, "rewards/margins": 15.135520935058594, "rewards/rejected": -16.54511260986328, "step": 3038 }, { "epoch": 0.73, "learning_rate": 6.016e-08, "logps/chosen": -201.97512817382812, "logps/rejected": -382.7515869140625, "loss": 0.0019, "losses/dpo": 2.304780082340585e-06, "losses/sft": 0.8172578811645508, "losses/total": 2.304780082340585e-06, "ref_logps/chosen": -188.9693145751953, "ref_logps/rejected": -228.47044372558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.300581693649292, "rewards/margins": 14.12753677368164, "rewards/rejected": -15.428118705749512, "step": 3039 }, { "epoch": 0.73, "learning_rate": 6.010666666666667e-08, "logps/chosen": -235.34912109375, "logps/rejected": -374.1011657714844, "loss": 0.0054, "losses/dpo": 3.2027571705839364e-07, "losses/sft": 0.9292404055595398, "losses/total": 3.2027571705839364e-07, "ref_logps/chosen": -223.2943115234375, "ref_logps/rejected": -211.225341796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2054797410964966, "rewards/margins": 15.08210277557373, "rewards/rejected": -16.287580490112305, "step": 3040 }, { "epoch": 0.73, "learning_rate": 6.005333333333333e-08, "logps/chosen": -273.78863525390625, "logps/rejected": -444.5646057128906, "loss": 0.0002, "losses/dpo": 2.170432500392394e-09, "losses/sft": 0.6312301158905029, "losses/total": 2.170432500392394e-09, "ref_logps/chosen": -258.34283447265625, "ref_logps/rejected": -270.62896728515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5445795059204102, "rewards/margins": 15.84898567199707, "rewards/rejected": -17.393564224243164, "step": 3041 }, { "epoch": 0.73, "learning_rate": 6e-08, "logps/chosen": -249.45289611816406, "logps/rejected": -428.11712646484375, "loss": 0.0, "losses/dpo": 2.216150534195549e-07, "losses/sft": 0.816923201084137, "losses/total": 2.216150534195549e-07, "ref_logps/chosen": -230.76290893554688, "ref_logps/rejected": -250.48973083496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8689980506896973, "rewards/margins": 15.893741607666016, "rewards/rejected": -17.762741088867188, "step": 3042 }, { "epoch": 0.73, "learning_rate": 5.994666666666667e-08, "logps/chosen": -274.668701171875, "logps/rejected": -403.81011962890625, "loss": 0.0002, "losses/dpo": 1.380245784332601e-08, "losses/sft": 0.6402283310890198, "losses/total": 1.380245784332601e-08, "ref_logps/chosen": -251.14651489257812, "ref_logps/rejected": -231.09027099609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.3522162437438965, "rewards/margins": 14.919767379760742, "rewards/rejected": -17.271984100341797, "step": 3043 }, { "epoch": 0.73, "learning_rate": 5.989333333333333e-08, "logps/chosen": -251.95639038085938, "logps/rejected": -385.8038330078125, "loss": 0.0008, "losses/dpo": 1.1610741523782053e-07, "losses/sft": 1.0826334953308105, "losses/total": 1.1610741523782053e-07, "ref_logps/chosen": -235.0647735595703, "ref_logps/rejected": -222.54647827148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.689159870147705, "rewards/margins": 14.636577606201172, "rewards/rejected": -16.32573699951172, "step": 3044 }, { "epoch": 0.73, "learning_rate": 5.984e-08, "logps/chosen": -268.8955078125, "logps/rejected": -421.73028564453125, "loss": 0.0, "losses/dpo": 2.9251177124933747e-07, "losses/sft": 0.7264569997787476, "losses/total": 2.9251177124933747e-07, "ref_logps/chosen": -252.33226013183594, "ref_logps/rejected": -255.57522583007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.656324028968811, "rewards/margins": 14.959181785583496, "rewards/rejected": -16.615507125854492, "step": 3045 }, { "epoch": 0.73, "learning_rate": 5.978666666666666e-08, "logps/chosen": -219.83924865722656, "logps/rejected": -375.3699035644531, "loss": 0.0018, "losses/dpo": 2.2650038999927347e-07, "losses/sft": 0.5905054211616516, "losses/total": 2.2650038999927347e-07, "ref_logps/chosen": -207.94586181640625, "ref_logps/rejected": -211.26528930664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.1893396377563477, "rewards/margins": 15.221121788024902, "rewards/rejected": -16.41046142578125, "step": 3046 }, { "epoch": 0.73, "learning_rate": 5.973333333333334e-08, "logps/chosen": -191.18264770507812, "logps/rejected": -367.79327392578125, "loss": 0.003, "losses/dpo": 1.4100847693043761e-05, "losses/sft": 0.38608357310295105, "losses/total": 1.4100847693043761e-05, "ref_logps/chosen": -178.10641479492188, "ref_logps/rejected": -218.4461669921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3076237440109253, "rewards/margins": 13.627090454101562, "rewards/rejected": -14.934713363647461, "step": 3047 }, { "epoch": 0.73, "learning_rate": 5.968e-08, "logps/chosen": -224.65684509277344, "logps/rejected": -387.62176513671875, "loss": 0.0007, "losses/dpo": 2.7406409586205882e-08, "losses/sft": 0.5282044410705566, "losses/total": 2.7406409586205882e-08, "ref_logps/chosen": -210.240478515625, "ref_logps/rejected": -231.8208465576172, "rewards/accuracies": 1.0, "rewards/chosen": -1.4416366815567017, "rewards/margins": 14.138458251953125, "rewards/rejected": -15.580095291137695, "step": 3048 }, { "epoch": 0.73, "learning_rate": 5.962666666666665e-08, "logps/chosen": -259.270263671875, "logps/rejected": -398.3517150878906, "loss": 0.0018, "losses/dpo": 2.244843547161146e-11, "losses/sft": 0.3972098231315613, "losses/total": 2.244843547161146e-11, "ref_logps/chosen": -243.02835083007812, "ref_logps/rejected": -224.12969970703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.624192476272583, "rewards/margins": 15.79800796508789, "rewards/rejected": -17.42220115661621, "step": 3049 }, { "epoch": 0.73, "learning_rate": 5.957333333333333e-08, "logps/chosen": -242.94821166992188, "logps/rejected": -396.3998107910156, "loss": 0.0017, "losses/dpo": 3.0954996788068456e-08, "losses/sft": 0.45683565735816956, "losses/total": 3.0954996788068456e-08, "ref_logps/chosen": -223.90396118164062, "ref_logps/rejected": -227.74700927734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9044238328933716, "rewards/margins": 14.960857391357422, "rewards/rejected": -16.865280151367188, "step": 3050 }, { "epoch": 0.73, "learning_rate": 5.951999999999999e-08, "logps/chosen": -171.87142944335938, "logps/rejected": -341.9065856933594, "loss": 0.0015, "losses/dpo": 1.6689355106791481e-06, "losses/sft": 0.4698904752731323, "losses/total": 1.6689355106791481e-06, "ref_logps/chosen": -160.1012725830078, "ref_logps/rejected": -194.2662353515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.177015781402588, "rewards/margins": 13.587021827697754, "rewards/rejected": -14.764036178588867, "step": 3051 }, { "epoch": 0.73, "learning_rate": 5.946666666666666e-08, "logps/chosen": -270.48089599609375, "logps/rejected": -399.3010559082031, "loss": 0.0009, "losses/dpo": 5.432631837720692e-07, "losses/sft": 0.7411627173423767, "losses/total": 5.432631837720692e-07, "ref_logps/chosen": -249.97027587890625, "ref_logps/rejected": -232.41375732421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0510611534118652, "rewards/margins": 14.637669563293457, "rewards/rejected": -16.688730239868164, "step": 3052 }, { "epoch": 0.73, "learning_rate": 5.9413333333333327e-08, "logps/chosen": -189.37269592285156, "logps/rejected": -409.2838134765625, "loss": 0.0003, "losses/dpo": 5.218982468413458e-10, "losses/sft": 0.4775984585285187, "losses/total": 5.218982468413458e-10, "ref_logps/chosen": -175.28627014160156, "ref_logps/rejected": -233.37612915039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4086430072784424, "rewards/margins": 16.182125091552734, "rewards/rejected": -17.590770721435547, "step": 3053 }, { "epoch": 0.73, "learning_rate": 5.936e-08, "logps/chosen": -274.4596252441406, "logps/rejected": -368.1010437011719, "loss": 0.0018, "losses/dpo": 3.590917785345482e-08, "losses/sft": 0.6466270089149475, "losses/total": 3.590917785345482e-08, "ref_logps/chosen": -259.0924072265625, "ref_logps/rejected": -220.72586059570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5367223024368286, "rewards/margins": 13.20079517364502, "rewards/rejected": -14.737518310546875, "step": 3054 }, { "epoch": 0.73, "learning_rate": 5.930666666666666e-08, "logps/chosen": -267.8638916015625, "logps/rejected": -436.72509765625, "loss": 0.0, "losses/dpo": 1.6858813012277096e-07, "losses/sft": 0.6801276803016663, "losses/total": 1.6858813012277096e-07, "ref_logps/chosen": -248.50987243652344, "ref_logps/rejected": -257.2386474609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9354033470153809, "rewards/margins": 16.013240814208984, "rewards/rejected": -17.948644638061523, "step": 3055 }, { "epoch": 0.73, "learning_rate": 5.925333333333333e-08, "logps/chosen": -237.62940979003906, "logps/rejected": -386.7899169921875, "loss": 0.0018, "losses/dpo": 9.250924648540604e-08, "losses/sft": 0.6035469770431519, "losses/total": 9.250924648540604e-08, "ref_logps/chosen": -219.05551147460938, "ref_logps/rejected": -213.48672485351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8573894500732422, "rewards/margins": 15.472929954528809, "rewards/rejected": -17.330318450927734, "step": 3056 }, { "epoch": 0.73, "learning_rate": 5.9199999999999994e-08, "logps/chosen": -286.1883239746094, "logps/rejected": -380.02301025390625, "loss": 0.0002, "losses/dpo": 1.7615908518564538e-06, "losses/sft": 1.113499641418457, "losses/total": 1.7615908518564538e-06, "ref_logps/chosen": -266.70263671875, "ref_logps/rejected": -222.3311767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9485677480697632, "rewards/margins": 13.8206148147583, "rewards/rejected": -15.769182205200195, "step": 3057 }, { "epoch": 0.73, "learning_rate": 5.9146666666666664e-08, "logps/chosen": -255.96693420410156, "logps/rejected": -390.8895568847656, "loss": 0.0006, "losses/dpo": 1.4053467722874302e-08, "losses/sft": 0.5451884269714355, "losses/total": 1.4053467722874302e-08, "ref_logps/chosen": -236.4529266357422, "ref_logps/rejected": -223.30368041992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.9514015913009644, "rewards/margins": 14.807188987731934, "rewards/rejected": -16.758590698242188, "step": 3058 }, { "epoch": 0.73, "learning_rate": 5.909333333333333e-08, "logps/chosen": -252.18289184570312, "logps/rejected": -360.66436767578125, "loss": 0.0021, "losses/dpo": 9.555711812936352e-07, "losses/sft": 0.5282284021377563, "losses/total": 9.555711812936352e-07, "ref_logps/chosen": -237.5963592529297, "ref_logps/rejected": -214.3684539794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.4586540460586548, "rewards/margins": 13.170938491821289, "rewards/rejected": -14.629591941833496, "step": 3059 }, { "epoch": 0.73, "learning_rate": 5.904e-08, "logps/chosen": -239.16885375976562, "logps/rejected": -413.27691650390625, "loss": 0.0002, "losses/dpo": 2.192130921230273e-07, "losses/sft": 0.49347230792045593, "losses/total": 2.192130921230273e-07, "ref_logps/chosen": -225.43533325195312, "ref_logps/rejected": -247.83255004882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.373353362083435, "rewards/margins": 15.17108154296875, "rewards/rejected": -16.54443359375, "step": 3060 }, { "epoch": 0.73, "learning_rate": 5.898666666666666e-08, "logps/chosen": -275.1767578125, "logps/rejected": -373.3331298828125, "loss": 0.0011, "losses/dpo": 1.3030612535658292e-06, "losses/sft": 0.5183713436126709, "losses/total": 1.3030612535658292e-06, "ref_logps/chosen": -258.730224609375, "ref_logps/rejected": -218.92410278320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6446539163589478, "rewards/margins": 13.796249389648438, "rewards/rejected": -15.440902709960938, "step": 3061 }, { "epoch": 0.73, "learning_rate": 5.893333333333333e-08, "logps/chosen": -235.53961181640625, "logps/rejected": -350.8050231933594, "loss": 0.0026, "losses/dpo": 6.446733459597453e-05, "losses/sft": 0.7274771928787231, "losses/total": 6.446733459597453e-05, "ref_logps/chosen": -221.17982482910156, "ref_logps/rejected": -192.21717834472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.435978651046753, "rewards/margins": 14.422805786132812, "rewards/rejected": -15.858784675598145, "step": 3062 }, { "epoch": 0.74, "learning_rate": 5.8879999999999995e-08, "logps/chosen": -248.49691772460938, "logps/rejected": -383.35101318359375, "loss": 0.0003, "losses/dpo": 5.542856911233685e-07, "losses/sft": 0.7336254119873047, "losses/total": 5.542856911233685e-07, "ref_logps/chosen": -236.1009521484375, "ref_logps/rejected": -231.7750701904297, "rewards/accuracies": 1.0, "rewards/chosen": -1.2395970821380615, "rewards/margins": 13.917999267578125, "rewards/rejected": -15.15759563446045, "step": 3063 }, { "epoch": 0.74, "learning_rate": 5.882666666666667e-08, "logps/chosen": -259.5725402832031, "logps/rejected": -368.695556640625, "loss": 0.0004, "losses/dpo": 4.7501003130889785e-09, "losses/sft": 0.7465517520904541, "losses/total": 4.7501003130889785e-09, "ref_logps/chosen": -244.80667114257812, "ref_logps/rejected": -206.78021240234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4765853881835938, "rewards/margins": 14.714950561523438, "rewards/rejected": -16.19153594970703, "step": 3064 }, { "epoch": 0.74, "learning_rate": 5.877333333333333e-08, "logps/chosen": -192.00863647460938, "logps/rejected": -347.58087158203125, "loss": 0.0149, "losses/dpo": 1.9009592051588697e-06, "losses/sft": 0.6731716990470886, "losses/total": 1.9009592051588697e-06, "ref_logps/chosen": -179.00015258789062, "ref_logps/rejected": -214.42527770996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3008501529693604, "rewards/margins": 12.014711380004883, "rewards/rejected": -13.31556224822998, "step": 3065 }, { "epoch": 0.74, "learning_rate": 5.8720000000000006e-08, "logps/chosen": -278.91082763671875, "logps/rejected": -401.0870361328125, "loss": 0.0004, "losses/dpo": 6.427003427234013e-06, "losses/sft": 0.8360316157341003, "losses/total": 6.427003427234013e-06, "ref_logps/chosen": -255.92662048339844, "ref_logps/rejected": -216.38290405273438, "rewards/accuracies": 1.0, "rewards/chosen": -2.298417091369629, "rewards/margins": 16.1719970703125, "rewards/rejected": -18.470413208007812, "step": 3066 }, { "epoch": 0.74, "learning_rate": 5.866666666666666e-08, "logps/chosen": -255.93943786621094, "logps/rejected": -396.45489501953125, "loss": 0.0004, "losses/dpo": 0.00016746272740419954, "losses/sft": 1.1888031959533691, "losses/total": 0.00016746272740419954, "ref_logps/chosen": -238.28294372558594, "ref_logps/rejected": -220.10769653320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7656500339508057, "rewards/margins": 15.869067192077637, "rewards/rejected": -17.634716033935547, "step": 3067 }, { "epoch": 0.74, "learning_rate": 5.8613333333333326e-08, "logps/chosen": -249.731689453125, "logps/rejected": -384.675048828125, "loss": 0.0011, "losses/dpo": 4.959485044864209e-10, "losses/sft": 0.6468859910964966, "losses/total": 4.959485044864209e-10, "ref_logps/chosen": -236.00425720214844, "ref_logps/rejected": -216.20797729492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3727431297302246, "rewards/margins": 15.473962783813477, "rewards/rejected": -16.84670639038086, "step": 3068 }, { "epoch": 0.74, "learning_rate": 5.8559999999999996e-08, "logps/chosen": -217.63021850585938, "logps/rejected": -378.9023742675781, "loss": 0.0005, "losses/dpo": 6.429015098063928e-10, "losses/sft": 0.7569068074226379, "losses/total": 6.429015098063928e-10, "ref_logps/chosen": -205.13589477539062, "ref_logps/rejected": -222.125244140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2494347095489502, "rewards/margins": 14.428277015686035, "rewards/rejected": -15.677711486816406, "step": 3069 }, { "epoch": 0.74, "learning_rate": 5.850666666666666e-08, "logps/chosen": -231.92066955566406, "logps/rejected": -348.7017517089844, "loss": 0.0066, "losses/dpo": 1.369620666880067e-10, "losses/sft": 0.643143355846405, "losses/total": 1.369620666880067e-10, "ref_logps/chosen": -218.17263793945312, "ref_logps/rejected": -202.1125946044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.3748050928115845, "rewards/margins": 13.284111022949219, "rewards/rejected": -14.658916473388672, "step": 3070 }, { "epoch": 0.74, "learning_rate": 5.845333333333333e-08, "logps/chosen": -263.69012451171875, "logps/rejected": -385.7288818359375, "loss": 0.0008, "losses/dpo": 3.43342527742152e-08, "losses/sft": 0.5833773016929626, "losses/total": 3.43342527742152e-08, "ref_logps/chosen": -249.6316680908203, "ref_logps/rejected": -227.40277099609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4058459997177124, "rewards/margins": 14.426763534545898, "rewards/rejected": -15.832609176635742, "step": 3071 }, { "epoch": 0.74, "learning_rate": 5.8399999999999994e-08, "logps/chosen": -288.26763916015625, "logps/rejected": -385.93463134765625, "loss": 0.0001, "losses/dpo": 1.7272257935019297e-07, "losses/sft": 0.6780739426612854, "losses/total": 1.7272257935019297e-07, "ref_logps/chosen": -269.13238525390625, "ref_logps/rejected": -225.07162475585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.9135279655456543, "rewards/margins": 14.172773361206055, "rewards/rejected": -16.086299896240234, "step": 3072 }, { "epoch": 0.74, "learning_rate": 5.8346666666666664e-08, "logps/chosen": -260.4457702636719, "logps/rejected": -368.4257507324219, "loss": 0.0065, "losses/dpo": 2.869066539901155e-09, "losses/sft": 0.6356397867202759, "losses/total": 2.869066539901155e-09, "ref_logps/chosen": -244.5172119140625, "ref_logps/rejected": -214.86534118652344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5928562879562378, "rewards/margins": 13.76318359375, "rewards/rejected": -15.356039047241211, "step": 3073 }, { "epoch": 0.74, "learning_rate": 5.829333333333333e-08, "logps/chosen": -246.093994140625, "logps/rejected": -421.5569152832031, "loss": 0.0001, "losses/dpo": 8.600156142790638e-09, "losses/sft": 0.5254058241844177, "losses/total": 8.600156142790638e-09, "ref_logps/chosen": -229.59930419921875, "ref_logps/rejected": -238.27877807617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6494661569595337, "rewards/margins": 16.678348541259766, "rewards/rejected": -18.32781410217285, "step": 3074 }, { "epoch": 0.74, "learning_rate": 5.824e-08, "logps/chosen": -200.20242309570312, "logps/rejected": -351.9754333496094, "loss": 0.0015, "losses/dpo": 0.0001841103658080101, "losses/sft": 0.46059590578079224, "losses/total": 0.0001841103658080101, "ref_logps/chosen": -189.682373046875, "ref_logps/rejected": -204.38626098632812, "rewards/accuracies": 1.0, "rewards/chosen": -1.052004337310791, "rewards/margins": 13.706914901733398, "rewards/rejected": -14.758920669555664, "step": 3075 }, { "epoch": 0.74, "learning_rate": 5.818666666666666e-08, "logps/chosen": -247.40830993652344, "logps/rejected": -409.19140625, "loss": 0.0002, "losses/dpo": 1.0536482477618847e-06, "losses/sft": 0.8125726580619812, "losses/total": 1.0536482477618847e-06, "ref_logps/chosen": -230.024658203125, "ref_logps/rejected": -222.73255920410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7383649349212646, "rewards/margins": 16.907516479492188, "rewards/rejected": -18.64588165283203, "step": 3076 }, { "epoch": 0.74, "learning_rate": 5.813333333333333e-08, "logps/chosen": -224.65426635742188, "logps/rejected": -386.9288635253906, "loss": 0.003, "losses/dpo": 6.084745485246401e-10, "losses/sft": 0.6528034210205078, "losses/total": 6.084745485246401e-10, "ref_logps/chosen": -211.71554565429688, "ref_logps/rejected": -226.3240203857422, "rewards/accuracies": 1.0, "rewards/chosen": -1.2938718795776367, "rewards/margins": 14.766613960266113, "rewards/rejected": -16.06048583984375, "step": 3077 }, { "epoch": 0.74, "learning_rate": 5.8079999999999995e-08, "logps/chosen": -246.2775115966797, "logps/rejected": -393.7344055175781, "loss": 0.0009, "losses/dpo": 0.00026737424195744097, "losses/sft": 0.5449992418289185, "losses/total": 0.00026737424195744097, "ref_logps/chosen": -224.76199340820312, "ref_logps/rejected": -227.35818481445312, "rewards/accuracies": 1.0, "rewards/chosen": -2.1515517234802246, "rewards/margins": 14.48607349395752, "rewards/rejected": -16.63762664794922, "step": 3078 }, { "epoch": 0.74, "learning_rate": 5.802666666666667e-08, "logps/chosen": -265.9767150878906, "logps/rejected": -367.794921875, "loss": 0.0021, "losses/dpo": 8.182724720029455e-09, "losses/sft": 0.6295614242553711, "losses/total": 8.182724720029455e-09, "ref_logps/chosen": -248.9132080078125, "ref_logps/rejected": -205.5928497314453, "rewards/accuracies": 1.0, "rewards/chosen": -1.70635187625885, "rewards/margins": 14.513856887817383, "rewards/rejected": -16.2202091217041, "step": 3079 }, { "epoch": 0.74, "learning_rate": 5.797333333333333e-08, "logps/chosen": -249.6543731689453, "logps/rejected": -356.5787353515625, "loss": 0.0022, "losses/dpo": 1.5029033573199513e-10, "losses/sft": 0.6239222288131714, "losses/total": 1.5029033573199513e-10, "ref_logps/chosen": -232.52114868164062, "ref_logps/rejected": -203.73770141601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7133228778839111, "rewards/margins": 13.570779800415039, "rewards/rejected": -15.284101486206055, "step": 3080 }, { "epoch": 0.74, "learning_rate": 5.7920000000000005e-08, "logps/chosen": -309.865478515625, "logps/rejected": -396.5211486816406, "loss": 0.0127, "losses/dpo": 2.086615547014503e-09, "losses/sft": 0.7970070242881775, "losses/total": 2.086615547014503e-09, "ref_logps/chosen": -290.095947265625, "ref_logps/rejected": -236.79981994628906, "rewards/accuracies": 1.0, "rewards/chosen": -1.976956844329834, "rewards/margins": 13.99517822265625, "rewards/rejected": -15.972135543823242, "step": 3081 }, { "epoch": 0.74, "learning_rate": 5.786666666666666e-08, "logps/chosen": -206.614990234375, "logps/rejected": -369.2935485839844, "loss": 0.0002, "losses/dpo": 0.00018672751320991665, "losses/sft": 0.8446696400642395, "losses/total": 0.00018672751320991665, "ref_logps/chosen": -192.79763793945312, "ref_logps/rejected": -211.87770080566406, "rewards/accuracies": 1.0, "rewards/chosen": -1.3817343711853027, "rewards/margins": 14.359851837158203, "rewards/rejected": -15.741586685180664, "step": 3082 }, { "epoch": 0.74, "learning_rate": 5.781333333333334e-08, "logps/chosen": -254.13134765625, "logps/rejected": -422.65985107421875, "loss": 0.0008, "losses/dpo": 3.3289626610866208e-09, "losses/sft": 0.7978817224502563, "losses/total": 3.3289626610866208e-09, "ref_logps/chosen": -239.33352661132812, "ref_logps/rejected": -252.42782592773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4797818660736084, "rewards/margins": 15.543417930603027, "rewards/rejected": -17.0231990814209, "step": 3083 }, { "epoch": 0.74, "learning_rate": 5.7759999999999996e-08, "logps/chosen": -247.70361328125, "logps/rejected": -390.635009765625, "loss": 0.0001, "losses/dpo": 5.4810622707179846e-09, "losses/sft": 0.7134917974472046, "losses/total": 5.4810622707179846e-09, "ref_logps/chosen": -233.21896362304688, "ref_logps/rejected": -233.47669982910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4484668970108032, "rewards/margins": 14.267364501953125, "rewards/rejected": -15.715831756591797, "step": 3084 }, { "epoch": 0.74, "learning_rate": 5.770666666666666e-08, "logps/chosen": -223.21636962890625, "logps/rejected": -363.07647705078125, "loss": 0.0012, "losses/dpo": 4.720989377204887e-09, "losses/sft": 0.7385419011116028, "losses/total": 4.720989377204887e-09, "ref_logps/chosen": -213.54269409179688, "ref_logps/rejected": -215.87493896484375, "rewards/accuracies": 1.0, "rewards/chosen": -0.9673680067062378, "rewards/margins": 13.752784729003906, "rewards/rejected": -14.72015380859375, "step": 3085 }, { "epoch": 0.74, "learning_rate": 5.765333333333333e-08, "logps/chosen": -244.46896362304688, "logps/rejected": -383.23675537109375, "loss": 0.0028, "losses/dpo": 7.419824310517242e-09, "losses/sft": 0.6275902986526489, "losses/total": 7.419824310517242e-09, "ref_logps/chosen": -225.48788452148438, "ref_logps/rejected": -219.45413208007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.8981071710586548, "rewards/margins": 14.480157852172852, "rewards/rejected": -16.378265380859375, "step": 3086 }, { "epoch": 0.74, "learning_rate": 5.759999999999999e-08, "logps/chosen": -244.09109497070312, "logps/rejected": -374.1014404296875, "loss": 0.001, "losses/dpo": 7.942397850513316e-09, "losses/sft": 0.5714029669761658, "losses/total": 7.942397850513316e-09, "ref_logps/chosen": -230.02041625976562, "ref_logps/rejected": -219.408203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4070677757263184, "rewards/margins": 14.062256813049316, "rewards/rejected": -15.469324111938477, "step": 3087 }, { "epoch": 0.74, "learning_rate": 5.754666666666666e-08, "logps/chosen": -289.9110107421875, "logps/rejected": -405.35272216796875, "loss": 0.0004, "losses/dpo": 4.616601017914945e-06, "losses/sft": 1.228859782218933, "losses/total": 4.616601017914945e-06, "ref_logps/chosen": -275.1400451660156, "ref_logps/rejected": -240.36302185058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.4770944118499756, "rewards/margins": 15.02187728881836, "rewards/rejected": -16.498971939086914, "step": 3088 }, { "epoch": 0.74, "learning_rate": 5.749333333333333e-08, "logps/chosen": -248.7489013671875, "logps/rejected": -358.031494140625, "loss": 0.0071, "losses/dpo": 4.938577831126167e-07, "losses/sft": 0.6932435035705566, "losses/total": 4.938577831126167e-07, "ref_logps/chosen": -236.5509033203125, "ref_logps/rejected": -222.11126708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2197998762130737, "rewards/margins": 12.37222671508789, "rewards/rejected": -13.59202766418457, "step": 3089 }, { "epoch": 0.74, "learning_rate": 5.744e-08, "logps/chosen": -271.74267578125, "logps/rejected": -414.2676086425781, "loss": 0.0009, "losses/dpo": 9.558170227519369e-12, "losses/sft": 0.8263328075408936, "losses/total": 9.558170227519369e-12, "ref_logps/chosen": -254.37681579589844, "ref_logps/rejected": -243.9011993408203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7365844249725342, "rewards/margins": 15.300058364868164, "rewards/rejected": -17.03664207458496, "step": 3090 }, { "epoch": 0.74, "learning_rate": 5.738666666666666e-08, "logps/chosen": -255.46084594726562, "logps/rejected": -391.7716064453125, "loss": 0.0026, "losses/dpo": 6.23731125415361e-07, "losses/sft": 0.598911702632904, "losses/total": 6.23731125415361e-07, "ref_logps/chosen": -239.55075073242188, "ref_logps/rejected": -222.90643310546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5910098552703857, "rewards/margins": 15.295507431030273, "rewards/rejected": -16.886518478393555, "step": 3091 }, { "epoch": 0.74, "learning_rate": 5.733333333333333e-08, "logps/chosen": -259.3294677734375, "logps/rejected": -398.36279296875, "loss": 0.0046, "losses/dpo": 1.4380891226115078e-12, "losses/sft": 0.4358474910259247, "losses/total": 1.4380891226115078e-12, "ref_logps/chosen": -246.89866638183594, "ref_logps/rejected": -232.08033752441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.2430797815322876, "rewards/margins": 15.385165214538574, "rewards/rejected": -16.628246307373047, "step": 3092 }, { "epoch": 0.74, "learning_rate": 5.7279999999999994e-08, "logps/chosen": -207.3104248046875, "logps/rejected": -390.831298828125, "loss": 0.0002, "losses/dpo": 2.4201762371944824e-08, "losses/sft": 0.8991681933403015, "losses/total": 2.4201762371944824e-08, "ref_logps/chosen": -194.13125610351562, "ref_logps/rejected": -234.39541625976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3179163932800293, "rewards/margins": 14.325671195983887, "rewards/rejected": -15.643587112426758, "step": 3093 }, { "epoch": 0.74, "learning_rate": 5.7226666666666664e-08, "logps/chosen": -247.07913208007812, "logps/rejected": -390.41986083984375, "loss": 0.003, "losses/dpo": 9.89440543386344e-11, "losses/sft": 0.5721003413200378, "losses/total": 9.89440543386344e-11, "ref_logps/chosen": -231.91299438476562, "ref_logps/rejected": -238.700439453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.516615867614746, "rewards/margins": 13.655323028564453, "rewards/rejected": -15.1719388961792, "step": 3094 }, { "epoch": 0.74, "learning_rate": 5.717333333333333e-08, "logps/chosen": -229.10958862304688, "logps/rejected": -373.49237060546875, "loss": 0.0016, "losses/dpo": 1.7436180277385915e-10, "losses/sft": 0.49398908019065857, "losses/total": 1.7436180277385915e-10, "ref_logps/chosen": -208.328125, "ref_logps/rejected": -209.2806396484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.078145980834961, "rewards/margins": 14.343025207519531, "rewards/rejected": -16.421171188354492, "step": 3095 }, { "epoch": 0.74, "learning_rate": 5.7120000000000005e-08, "logps/chosen": -219.70155334472656, "logps/rejected": -410.029541015625, "loss": 0.0096, "losses/dpo": 5.668558173965721e-07, "losses/sft": 1.2514619827270508, "losses/total": 5.668558173965721e-07, "ref_logps/chosen": -207.52166748046875, "ref_logps/rejected": -234.47755432128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.2179875373840332, "rewards/margins": 16.337215423583984, "rewards/rejected": -17.55520248413086, "step": 3096 }, { "epoch": 0.74, "learning_rate": 5.706666666666666e-08, "logps/chosen": -226.3955535888672, "logps/rejected": -407.58184814453125, "loss": 0.0003, "losses/dpo": 1.7370824707541033e-06, "losses/sft": 0.5459921360015869, "losses/total": 1.7370824707541033e-06, "ref_logps/chosen": -210.5128936767578, "ref_logps/rejected": -236.36740112304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.5882642269134521, "rewards/margins": 15.533180236816406, "rewards/rejected": -17.121444702148438, "step": 3097 }, { "epoch": 0.74, "learning_rate": 5.701333333333334e-08, "logps/chosen": -237.31373596191406, "logps/rejected": -396.62640380859375, "loss": 0.0014, "losses/dpo": 1.7885254521843308e-07, "losses/sft": 0.688169538974762, "losses/total": 1.7885254521843308e-07, "ref_logps/chosen": -221.35659790039062, "ref_logps/rejected": -228.48658752441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5957133769989014, "rewards/margins": 15.218270301818848, "rewards/rejected": -16.813983917236328, "step": 3098 }, { "epoch": 0.74, "learning_rate": 5.6959999999999995e-08, "logps/chosen": -279.10467529296875, "logps/rejected": -430.23309326171875, "loss": 0.0005, "losses/dpo": 2.0724661746397872e-14, "losses/sft": 0.7912597060203552, "losses/total": 2.0724661746397872e-14, "ref_logps/chosen": -267.4696960449219, "ref_logps/rejected": -257.55401611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1634997129440308, "rewards/margins": 16.104412078857422, "rewards/rejected": -17.26791000366211, "step": 3099 }, { "epoch": 0.74, "learning_rate": 5.690666666666667e-08, "logps/chosen": -255.3303680419922, "logps/rejected": -388.70391845703125, "loss": 0.0023, "losses/dpo": 2.411742139329931e-09, "losses/sft": 0.39372435212135315, "losses/total": 2.411742139329931e-09, "ref_logps/chosen": -240.42666625976562, "ref_logps/rejected": -217.16690063476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4903717041015625, "rewards/margins": 15.663334846496582, "rewards/rejected": -17.153705596923828, "step": 3100 }, { "epoch": 0.74, "learning_rate": 5.685333333333333e-08, "logps/chosen": -282.73529052734375, "logps/rejected": -438.58856201171875, "loss": 0.0, "losses/dpo": 1.2680995951086516e-06, "losses/sft": 0.6326246857643127, "losses/total": 1.2680995951086516e-06, "ref_logps/chosen": -262.34149169921875, "ref_logps/rejected": -254.71917724609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0393810272216797, "rewards/margins": 16.347557067871094, "rewards/rejected": -18.386938095092773, "step": 3101 }, { "epoch": 0.74, "learning_rate": 5.679999999999999e-08, "logps/chosen": -234.31747436523438, "logps/rejected": -384.9892883300781, "loss": 0.0044, "losses/dpo": 7.829440207274274e-09, "losses/sft": 0.7591637372970581, "losses/total": 7.829440207274274e-09, "ref_logps/chosen": -216.73265075683594, "ref_logps/rejected": -207.86402893066406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7584794759750366, "rewards/margins": 15.954047203063965, "rewards/rejected": -17.712528228759766, "step": 3102 }, { "epoch": 0.74, "learning_rate": 5.674666666666666e-08, "logps/chosen": -244.8667449951172, "logps/rejected": -415.303466796875, "loss": 0.0, "losses/dpo": 3.8106224842238134e-09, "losses/sft": 0.505969226360321, "losses/total": 3.8106224842238134e-09, "ref_logps/chosen": -232.1643829345703, "ref_logps/rejected": -236.06332397460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.2702360153198242, "rewards/margins": 16.653778076171875, "rewards/rejected": -17.924015045166016, "step": 3103 }, { "epoch": 0.74, "learning_rate": 5.6693333333333326e-08, "logps/chosen": -242.0594024658203, "logps/rejected": -350.5035400390625, "loss": 0.0027, "losses/dpo": 1.738707169351983e-07, "losses/sft": 0.762179970741272, "losses/total": 1.738707169351983e-07, "ref_logps/chosen": -232.58285522460938, "ref_logps/rejected": -206.87730407714844, "rewards/accuracies": 1.0, "rewards/chosen": -0.9476550221443176, "rewards/margins": 13.414968490600586, "rewards/rejected": -14.362624168395996, "step": 3104 }, { "epoch": 0.75, "learning_rate": 5.6639999999999996e-08, "logps/chosen": -206.40843200683594, "logps/rejected": -374.6959533691406, "loss": 0.0024, "losses/dpo": 3.7031162491985015e-07, "losses/sft": 0.6349008083343506, "losses/total": 3.7031162491985015e-07, "ref_logps/chosen": -194.1262969970703, "ref_logps/rejected": -215.55653381347656, "rewards/accuracies": 1.0, "rewards/chosen": -1.2282142639160156, "rewards/margins": 14.685728073120117, "rewards/rejected": -15.913942337036133, "step": 3105 }, { "epoch": 0.75, "learning_rate": 5.658666666666666e-08, "logps/chosen": -230.17713928222656, "logps/rejected": -364.77325439453125, "loss": 0.001, "losses/dpo": 2.3472923658118816e-06, "losses/sft": 0.8047228455543518, "losses/total": 2.3472923658118816e-06, "ref_logps/chosen": -216.1284942626953, "ref_logps/rejected": -217.13343811035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4048634767532349, "rewards/margins": 13.359121322631836, "rewards/rejected": -14.763983726501465, "step": 3106 }, { "epoch": 0.75, "learning_rate": 5.653333333333333e-08, "logps/chosen": -279.44134521484375, "logps/rejected": -411.3541564941406, "loss": 0.0015, "losses/dpo": 8.547302421391123e-09, "losses/sft": 0.6622213125228882, "losses/total": 8.547302421391123e-09, "ref_logps/chosen": -260.16668701171875, "ref_logps/rejected": -239.6875762939453, "rewards/accuracies": 1.0, "rewards/chosen": -1.9274649620056152, "rewards/margins": 15.239192962646484, "rewards/rejected": -17.166658401489258, "step": 3107 }, { "epoch": 0.75, "learning_rate": 5.6479999999999994e-08, "logps/chosen": -241.4237060546875, "logps/rejected": -371.696533203125, "loss": 0.0004, "losses/dpo": 2.2856835357742966e-07, "losses/sft": 0.670749306678772, "losses/total": 2.2856835357742966e-07, "ref_logps/chosen": -227.49981689453125, "ref_logps/rejected": -209.702392578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.392387866973877, "rewards/margins": 14.807027816772461, "rewards/rejected": -16.19941520690918, "step": 3108 }, { "epoch": 0.75, "learning_rate": 5.6426666666666664e-08, "logps/chosen": -240.6011199951172, "logps/rejected": -391.6698303222656, "loss": 0.0108, "losses/dpo": 1.0972107666873399e-08, "losses/sft": 0.5322502255439758, "losses/total": 1.0972107666873399e-08, "ref_logps/chosen": -226.81735229492188, "ref_logps/rejected": -236.08529663085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3783748149871826, "rewards/margins": 14.180078506469727, "rewards/rejected": -15.558454513549805, "step": 3109 }, { "epoch": 0.75, "learning_rate": 5.637333333333333e-08, "logps/chosen": -249.7357177734375, "logps/rejected": -406.14727783203125, "loss": 0.0003, "losses/dpo": 4.885992242173237e-15, "losses/sft": 0.6954367160797119, "losses/total": 4.885992242173237e-15, "ref_logps/chosen": -233.38906860351562, "ref_logps/rejected": -231.53817749023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6346653699874878, "rewards/margins": 15.826245307922363, "rewards/rejected": -17.46091079711914, "step": 3110 }, { "epoch": 0.75, "learning_rate": 5.6320000000000004e-08, "logps/chosen": -276.60662841796875, "logps/rejected": -417.81842041015625, "loss": 0.0009, "losses/dpo": 9.133617473455757e-11, "losses/sft": 0.7287015318870544, "losses/total": 9.133617473455757e-11, "ref_logps/chosen": -259.61279296875, "ref_logps/rejected": -248.19436645507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6993860006332397, "rewards/margins": 15.263018608093262, "rewards/rejected": -16.962404251098633, "step": 3111 }, { "epoch": 0.75, "learning_rate": 5.626666666666666e-08, "logps/chosen": -239.98922729492188, "logps/rejected": -379.9514465332031, "loss": 0.0009, "losses/dpo": 1.4846696672066173e-07, "losses/sft": 0.6989594101905823, "losses/total": 1.4846696672066173e-07, "ref_logps/chosen": -220.6123504638672, "ref_logps/rejected": -219.19247436523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9376856088638306, "rewards/margins": 14.138214111328125, "rewards/rejected": -16.07590103149414, "step": 3112 }, { "epoch": 0.75, "learning_rate": 5.621333333333334e-08, "logps/chosen": -244.29071044921875, "logps/rejected": -425.229736328125, "loss": 0.0018, "losses/dpo": 2.94802487976753e-10, "losses/sft": 0.5094478726387024, "losses/total": 2.94802487976753e-10, "ref_logps/chosen": -226.5318603515625, "ref_logps/rejected": -244.92758178710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.775883436203003, "rewards/margins": 16.254329681396484, "rewards/rejected": -18.03021240234375, "step": 3113 }, { "epoch": 0.75, "learning_rate": 5.6159999999999995e-08, "logps/chosen": -260.1151123046875, "logps/rejected": -391.25787353515625, "loss": 0.0029, "losses/dpo": 3.066753240243969e-10, "losses/sft": 0.6227025985717773, "losses/total": 3.066753240243969e-10, "ref_logps/chosen": -244.05450439453125, "ref_logps/rejected": -230.80850219726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6060634851455688, "rewards/margins": 14.43887710571289, "rewards/rejected": -16.044940948486328, "step": 3114 }, { "epoch": 0.75, "learning_rate": 5.610666666666667e-08, "logps/chosen": -281.46112060546875, "logps/rejected": -445.4922790527344, "loss": 0.0039, "losses/dpo": 4.1516126647600515e-10, "losses/sft": 0.5312244296073914, "losses/total": 4.1516126647600515e-10, "ref_logps/chosen": -261.8384704589844, "ref_logps/rejected": -260.74664306640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.96226167678833, "rewards/margins": 16.51230239868164, "rewards/rejected": -18.474565505981445, "step": 3115 }, { "epoch": 0.75, "learning_rate": 5.605333333333333e-08, "logps/chosen": -211.7803955078125, "logps/rejected": -380.09942626953125, "loss": 0.0026, "losses/dpo": 9.800974268281415e-12, "losses/sft": 0.7540871500968933, "losses/total": 9.800974268281415e-12, "ref_logps/chosen": -193.85809326171875, "ref_logps/rejected": -216.36122131347656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7922297716140747, "rewards/margins": 14.581591606140137, "rewards/rejected": -16.373821258544922, "step": 3116 }, { "epoch": 0.75, "learning_rate": 5.6000000000000005e-08, "logps/chosen": -247.14334106445312, "logps/rejected": -374.9432067871094, "loss": 0.0005, "losses/dpo": 1.482158493160135e-11, "losses/sft": 0.7647855281829834, "losses/total": 1.482158493160135e-11, "ref_logps/chosen": -232.17657470703125, "ref_logps/rejected": -219.0228271484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4966773986816406, "rewards/margins": 14.095361709594727, "rewards/rejected": -15.592040061950684, "step": 3117 }, { "epoch": 0.75, "learning_rate": 5.594666666666666e-08, "logps/chosen": -201.698974609375, "logps/rejected": -412.7813720703125, "loss": 0.0025, "losses/dpo": 1.9329020606306813e-09, "losses/sft": 0.5585728883743286, "losses/total": 1.9329020606306813e-09, "ref_logps/chosen": -184.86109924316406, "ref_logps/rejected": -237.6831817626953, "rewards/accuracies": 1.0, "rewards/chosen": -1.68378746509552, "rewards/margins": 15.826028823852539, "rewards/rejected": -17.509815216064453, "step": 3118 }, { "epoch": 0.75, "learning_rate": 5.5893333333333326e-08, "logps/chosen": -273.8260192871094, "logps/rejected": -418.5015869140625, "loss": 0.0011, "losses/dpo": 9.670987566323674e-08, "losses/sft": 0.5048779845237732, "losses/total": 9.670987566323674e-08, "ref_logps/chosen": -257.4313659667969, "ref_logps/rejected": -243.9115753173828, "rewards/accuracies": 1.0, "rewards/chosen": -1.639465093612671, "rewards/margins": 15.819540023803711, "rewards/rejected": -17.45900535583496, "step": 3119 }, { "epoch": 0.75, "learning_rate": 5.5839999999999996e-08, "logps/chosen": -199.86080932617188, "logps/rejected": -383.6782531738281, "loss": 0.0008, "losses/dpo": 2.333788440278539e-10, "losses/sft": 0.4184778928756714, "losses/total": 2.333788440278539e-10, "ref_logps/chosen": -189.95960998535156, "ref_logps/rejected": -220.2191925048828, "rewards/accuracies": 1.0, "rewards/chosen": -0.9901188611984253, "rewards/margins": 15.35578727722168, "rewards/rejected": -16.345905303955078, "step": 3120 }, { "epoch": 0.75, "learning_rate": 5.578666666666666e-08, "logps/chosen": -233.10983276367188, "logps/rejected": -388.7635498046875, "loss": 0.0015, "losses/dpo": 3.8535541424522535e-10, "losses/sft": 0.5638095736503601, "losses/total": 3.8535541424522535e-10, "ref_logps/chosen": -219.6275177001953, "ref_logps/rejected": -220.4798126220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.3482319116592407, "rewards/margins": 15.480142593383789, "rewards/rejected": -16.828372955322266, "step": 3121 }, { "epoch": 0.75, "learning_rate": 5.573333333333333e-08, "logps/chosen": -283.6314697265625, "logps/rejected": -393.1962585449219, "loss": 0.0001, "losses/dpo": 1.0317790355429679e-07, "losses/sft": 0.573711097240448, "losses/total": 1.0317790355429679e-07, "ref_logps/chosen": -266.9202880859375, "ref_logps/rejected": -224.55599975585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6711187362670898, "rewards/margins": 15.192907333374023, "rewards/rejected": -16.864025115966797, "step": 3122 }, { "epoch": 0.75, "learning_rate": 5.567999999999999e-08, "logps/chosen": -287.3785400390625, "logps/rejected": -422.3664855957031, "loss": 0.0005, "losses/dpo": 1.1618750406228173e-09, "losses/sft": 0.5906394124031067, "losses/total": 1.1618750406228173e-09, "ref_logps/chosen": -271.70550537109375, "ref_logps/rejected": -255.57290649414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.5673071146011353, "rewards/margins": 15.112051010131836, "rewards/rejected": -16.679357528686523, "step": 3123 }, { "epoch": 0.75, "learning_rate": 5.562666666666666e-08, "logps/chosen": -224.68309020996094, "logps/rejected": -334.9400329589844, "loss": 0.0055, "losses/dpo": 0.0033769558649510145, "losses/sft": 0.6459147334098816, "losses/total": 0.0033769558649510145, "ref_logps/chosen": -209.80908203125, "ref_logps/rejected": -182.36996459960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.487401008605957, "rewards/margins": 13.769607543945312, "rewards/rejected": -15.25700855255127, "step": 3124 }, { "epoch": 0.75, "learning_rate": 5.557333333333333e-08, "logps/chosen": -266.2928771972656, "logps/rejected": -404.040283203125, "loss": 0.0001, "losses/dpo": 8.09047939753782e-09, "losses/sft": 0.6703627705574036, "losses/total": 8.09047939753782e-09, "ref_logps/chosen": -249.48533630371094, "ref_logps/rejected": -231.80770874023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.680753231048584, "rewards/margins": 15.542505264282227, "rewards/rejected": -17.22325897216797, "step": 3125 }, { "epoch": 0.75, "learning_rate": 5.552e-08, "logps/chosen": -272.38018798828125, "logps/rejected": -412.51202392578125, "loss": 0.0019, "losses/dpo": 1.5770973416096012e-10, "losses/sft": 0.5935007929801941, "losses/total": 1.5770973416096012e-10, "ref_logps/chosen": -256.31317138671875, "ref_logps/rejected": -234.2935791015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6067007780075073, "rewards/margins": 16.21514320373535, "rewards/rejected": -17.82184410095215, "step": 3126 }, { "epoch": 0.75, "learning_rate": 5.546666666666666e-08, "logps/chosen": -249.58938598632812, "logps/rejected": -402.8857727050781, "loss": 0.0036, "losses/dpo": 2.080018475680845e-06, "losses/sft": 0.32410866022109985, "losses/total": 2.080018475680845e-06, "ref_logps/chosen": -234.3629608154297, "ref_logps/rejected": -242.20388793945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.522641897201538, "rewards/margins": 14.545546531677246, "rewards/rejected": -16.06818962097168, "step": 3127 }, { "epoch": 0.75, "learning_rate": 5.541333333333334e-08, "logps/chosen": -251.94705200195312, "logps/rejected": -353.53759765625, "loss": 0.0031, "losses/dpo": 0.05483265966176987, "losses/sft": 0.759585440158844, "losses/total": 0.05483265966176987, "ref_logps/chosen": -232.9617156982422, "ref_logps/rejected": -199.8502655029297, "rewards/accuracies": 1.0, "rewards/chosen": -1.8985342979431152, "rewards/margins": 13.470197677612305, "rewards/rejected": -15.368732452392578, "step": 3128 }, { "epoch": 0.75, "learning_rate": 5.5359999999999994e-08, "logps/chosen": -230.30499267578125, "logps/rejected": -378.79461669921875, "loss": 0.0033, "losses/dpo": 3.2648689085590377e-08, "losses/sft": 0.6376828551292419, "losses/total": 3.2648689085590377e-08, "ref_logps/chosen": -213.56756591796875, "ref_logps/rejected": -212.97262573242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6737415790557861, "rewards/margins": 14.908458709716797, "rewards/rejected": -16.582199096679688, "step": 3129 }, { "epoch": 0.75, "learning_rate": 5.530666666666667e-08, "logps/chosen": -252.50363159179688, "logps/rejected": -373.9195556640625, "loss": 0.0005, "losses/dpo": 1.653820656810634e-11, "losses/sft": 0.5831024050712585, "losses/total": 1.653820656810634e-11, "ref_logps/chosen": -235.8415985107422, "ref_logps/rejected": -217.69708251953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6662060022354126, "rewards/margins": 13.956039428710938, "rewards/rejected": -15.622245788574219, "step": 3130 }, { "epoch": 0.75, "learning_rate": 5.525333333333333e-08, "logps/chosen": -244.14447021484375, "logps/rejected": -348.494140625, "loss": 0.0003, "losses/dpo": 1.3689083971257787e-05, "losses/sft": 0.4868246614933014, "losses/total": 1.3689083971257787e-05, "ref_logps/chosen": -227.12677001953125, "ref_logps/rejected": -197.6048126220703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7017712593078613, "rewards/margins": 13.387165069580078, "rewards/rejected": -15.088935852050781, "step": 3131 }, { "epoch": 0.75, "learning_rate": 5.5200000000000005e-08, "logps/chosen": -255.02020263671875, "logps/rejected": -384.99322509765625, "loss": 0.002, "losses/dpo": 4.855787949509249e-08, "losses/sft": 0.6138442158699036, "losses/total": 4.855787949509249e-08, "ref_logps/chosen": -237.5617218017578, "ref_logps/rejected": -222.42596435546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7458467483520508, "rewards/margins": 14.510883331298828, "rewards/rejected": -16.256729125976562, "step": 3132 }, { "epoch": 0.75, "learning_rate": 5.514666666666666e-08, "logps/chosen": -263.3918762207031, "logps/rejected": -397.58978271484375, "loss": 0.0003, "losses/dpo": 4.189542934796009e-09, "losses/sft": 0.6730222105979919, "losses/total": 4.189542934796009e-09, "ref_logps/chosen": -248.42835998535156, "ref_logps/rejected": -229.4434356689453, "rewards/accuracies": 1.0, "rewards/chosen": -1.4963539838790894, "rewards/margins": 15.318283081054688, "rewards/rejected": -16.81463623046875, "step": 3133 }, { "epoch": 0.75, "learning_rate": 5.509333333333334e-08, "logps/chosen": -254.06275939941406, "logps/rejected": -425.04010009765625, "loss": 0.0001, "losses/dpo": 6.353127659369795e-12, "losses/sft": 0.6414142847061157, "losses/total": 6.353127659369795e-12, "ref_logps/chosen": -230.33047485351562, "ref_logps/rejected": -243.8625030517578, "rewards/accuracies": 1.0, "rewards/chosen": -2.373225450515747, "rewards/margins": 15.74453353881836, "rewards/rejected": -18.117759704589844, "step": 3134 }, { "epoch": 0.75, "learning_rate": 5.5039999999999995e-08, "logps/chosen": -243.21417236328125, "logps/rejected": -408.29876708984375, "loss": 0.0004, "losses/dpo": 3.9364650206152874e-07, "losses/sft": 0.5934849977493286, "losses/total": 3.9364650206152874e-07, "ref_logps/chosen": -228.0868682861328, "ref_logps/rejected": -242.6295623779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.5127310752868652, "rewards/margins": 15.05418872833252, "rewards/rejected": -16.56692123413086, "step": 3135 }, { "epoch": 0.75, "learning_rate": 5.498666666666666e-08, "logps/chosen": -242.34677124023438, "logps/rejected": -403.4368896484375, "loss": 0.0009, "losses/dpo": 4.6775920026220774e-08, "losses/sft": 0.8469385504722595, "losses/total": 4.6775920026220774e-08, "ref_logps/chosen": -229.4105224609375, "ref_logps/rejected": -233.03057861328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2936257123947144, "rewards/margins": 15.747004508972168, "rewards/rejected": -17.040630340576172, "step": 3136 }, { "epoch": 0.75, "learning_rate": 5.493333333333333e-08, "logps/chosen": -235.47137451171875, "logps/rejected": -369.2446594238281, "loss": 0.0009, "losses/dpo": 2.8501471405206757e-08, "losses/sft": 1.0265207290649414, "losses/total": 2.8501471405206757e-08, "ref_logps/chosen": -221.7578887939453, "ref_logps/rejected": -213.43089294433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.3713490962982178, "rewards/margins": 14.210030555725098, "rewards/rejected": -15.581379890441895, "step": 3137 }, { "epoch": 0.75, "learning_rate": 5.487999999999999e-08, "logps/chosen": -252.57598876953125, "logps/rejected": -391.1086120605469, "loss": 0.0035, "losses/dpo": 1.9078368052038996e-10, "losses/sft": 0.5926855802536011, "losses/total": 1.9078368052038996e-10, "ref_logps/chosen": -235.6430206298828, "ref_logps/rejected": -228.79861450195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6932978630065918, "rewards/margins": 14.537700653076172, "rewards/rejected": -16.230998992919922, "step": 3138 }, { "epoch": 0.75, "learning_rate": 5.482666666666666e-08, "logps/chosen": -224.1503448486328, "logps/rejected": -316.25347900390625, "loss": 0.0036, "losses/dpo": 6.927426410285875e-10, "losses/sft": 0.9622558951377869, "losses/total": 6.927426410285875e-10, "ref_logps/chosen": -210.44635009765625, "ref_logps/rejected": -176.24267578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3703999519348145, "rewards/margins": 12.630681037902832, "rewards/rejected": -14.001080513000488, "step": 3139 }, { "epoch": 0.75, "learning_rate": 5.4773333333333326e-08, "logps/chosen": -252.44412231445312, "logps/rejected": -376.40087890625, "loss": 0.0017, "losses/dpo": 1.342019273486983e-09, "losses/sft": 0.7289301156997681, "losses/total": 1.342019273486983e-09, "ref_logps/chosen": -236.87643432617188, "ref_logps/rejected": -213.14971923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.556769609451294, "rewards/margins": 14.76834487915039, "rewards/rejected": -16.32511329650879, "step": 3140 }, { "epoch": 0.75, "learning_rate": 5.4719999999999996e-08, "logps/chosen": -226.120361328125, "logps/rejected": -406.2663879394531, "loss": 0.0033, "losses/dpo": 1.613629905250491e-07, "losses/sft": 0.6464628577232361, "losses/total": 1.613629905250491e-07, "ref_logps/chosen": -208.61190795898438, "ref_logps/rejected": -240.4949493408203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7508469820022583, "rewards/margins": 14.826297760009766, "rewards/rejected": -16.577144622802734, "step": 3141 }, { "epoch": 0.75, "learning_rate": 5.466666666666666e-08, "logps/chosen": -250.19110107421875, "logps/rejected": -432.0926513671875, "loss": 0.0021, "losses/dpo": 2.0747188855807508e-08, "losses/sft": 0.4456334114074707, "losses/total": 2.0747188855807508e-08, "ref_logps/chosen": -234.2620086669922, "ref_logps/rejected": -243.15457153320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.592910647392273, "rewards/margins": 17.30089569091797, "rewards/rejected": -18.89380645751953, "step": 3142 }, { "epoch": 0.75, "learning_rate": 5.4613333333333337e-08, "logps/chosen": -292.55841064453125, "logps/rejected": -423.3252868652344, "loss": 0.0003, "losses/dpo": 3.5809591736324364e-08, "losses/sft": 0.5517544746398926, "losses/total": 3.5809591736324364e-08, "ref_logps/chosen": -273.9399719238281, "ref_logps/rejected": -252.42782592773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8618422746658325, "rewards/margins": 15.227900505065918, "rewards/rejected": -17.089744567871094, "step": 3143 }, { "epoch": 0.75, "learning_rate": 5.4559999999999994e-08, "logps/chosen": -261.0091552734375, "logps/rejected": -387.130126953125, "loss": 0.0006, "losses/dpo": 5.3776071581523865e-05, "losses/sft": 0.6427079439163208, "losses/total": 5.3776071581523865e-05, "ref_logps/chosen": -245.1374969482422, "ref_logps/rejected": -225.0380859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5871630907058716, "rewards/margins": 14.62204360961914, "rewards/rejected": -16.20920753479004, "step": 3144 }, { "epoch": 0.75, "learning_rate": 5.450666666666667e-08, "logps/chosen": -292.7342529296875, "logps/rejected": -425.5049133300781, "loss": 0.0005, "losses/dpo": 1.8358128905937576e-10, "losses/sft": 0.5699817538261414, "losses/total": 1.8358128905937576e-10, "ref_logps/chosen": -280.4925537109375, "ref_logps/rejected": -248.45571899414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.2241730690002441, "rewards/margins": 16.48074722290039, "rewards/rejected": -17.704919815063477, "step": 3145 }, { "epoch": 0.75, "learning_rate": 5.445333333333333e-08, "logps/chosen": -270.612060546875, "logps/rejected": -401.49017333984375, "loss": 0.0002, "losses/dpo": 1.769086566127953e-07, "losses/sft": 1.21287202835083, "losses/total": 1.769086566127953e-07, "ref_logps/chosen": -253.19573974609375, "ref_logps/rejected": -225.86459350585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.7416342496871948, "rewards/margins": 15.820921897888184, "rewards/rejected": -17.56255531311035, "step": 3146 }, { "epoch": 0.76, "learning_rate": 5.4400000000000004e-08, "logps/chosen": -229.62359619140625, "logps/rejected": -378.4679260253906, "loss": 0.0018, "losses/dpo": 4.422206600906975e-08, "losses/sft": 0.4760112762451172, "losses/total": 4.422206600906975e-08, "ref_logps/chosen": -217.5887451171875, "ref_logps/rejected": -204.96527099609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2034852504730225, "rewards/margins": 16.14678192138672, "rewards/rejected": -17.35026741027832, "step": 3147 }, { "epoch": 0.76, "learning_rate": 5.434666666666666e-08, "logps/chosen": -240.71888732910156, "logps/rejected": -413.80316162109375, "loss": 0.0028, "losses/dpo": 2.89894643401567e-07, "losses/sft": 0.624625027179718, "losses/total": 2.89894643401567e-07, "ref_logps/chosen": -223.37362670898438, "ref_logps/rejected": -240.11241149902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7345259189605713, "rewards/margins": 15.634549140930176, "rewards/rejected": -17.369075775146484, "step": 3148 }, { "epoch": 0.76, "learning_rate": 5.429333333333334e-08, "logps/chosen": -213.3736572265625, "logps/rejected": -400.1468505859375, "loss": 0.0004, "losses/dpo": 7.728755413438648e-09, "losses/sft": 0.4721347391605377, "losses/total": 7.728755413438648e-09, "ref_logps/chosen": -195.7849578857422, "ref_logps/rejected": -229.77401733398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.75886869430542, "rewards/margins": 15.278414726257324, "rewards/rejected": -17.037282943725586, "step": 3149 }, { "epoch": 0.76, "learning_rate": 5.4239999999999995e-08, "logps/chosen": -229.55067443847656, "logps/rejected": -357.65557861328125, "loss": 0.0002, "losses/dpo": 1.1325869309075642e-07, "losses/sft": 0.5208719968795776, "losses/total": 1.1325869309075642e-07, "ref_logps/chosen": -210.95162963867188, "ref_logps/rejected": -202.80810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8599051237106323, "rewards/margins": 13.624844551086426, "rewards/rejected": -15.484748840332031, "step": 3150 }, { "epoch": 0.76, "learning_rate": 5.418666666666667e-08, "logps/chosen": -257.42230224609375, "logps/rejected": -390.46722412109375, "loss": 0.0001, "losses/dpo": 8.827437114611314e-10, "losses/sft": 0.4873930811882019, "losses/total": 8.827437114611314e-10, "ref_logps/chosen": -243.57907104492188, "ref_logps/rejected": -232.58529663085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3843233585357666, "rewards/margins": 14.403865814208984, "rewards/rejected": -15.788188934326172, "step": 3151 }, { "epoch": 0.76, "learning_rate": 5.413333333333333e-08, "logps/chosen": -240.11007690429688, "logps/rejected": -377.958251953125, "loss": 0.0007, "losses/dpo": 3.850259489013297e-08, "losses/sft": 0.6392300724983215, "losses/total": 3.850259489013297e-08, "ref_logps/chosen": -222.83123779296875, "ref_logps/rejected": -214.09127807617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.727885365486145, "rewards/margins": 14.658811569213867, "rewards/rejected": -16.38669776916504, "step": 3152 }, { "epoch": 0.76, "learning_rate": 5.407999999999999e-08, "logps/chosen": -241.4322509765625, "logps/rejected": -398.55255126953125, "loss": 0.0001, "losses/dpo": 3.802171910649577e-09, "losses/sft": 0.5967295169830322, "losses/total": 3.802171910649577e-09, "ref_logps/chosen": -225.3203125, "ref_logps/rejected": -226.32440185546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6111937761306763, "rewards/margins": 15.611623764038086, "rewards/rejected": -17.222816467285156, "step": 3153 }, { "epoch": 0.76, "learning_rate": 5.402666666666666e-08, "logps/chosen": -222.02696228027344, "logps/rejected": -353.9848937988281, "loss": 0.0026, "losses/dpo": 8.931534267730967e-08, "losses/sft": 0.599562406539917, "losses/total": 8.931534267730967e-08, "ref_logps/chosen": -207.42579650878906, "ref_logps/rejected": -204.73219299316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.4601185321807861, "rewards/margins": 13.465152740478516, "rewards/rejected": -14.925272941589355, "step": 3154 }, { "epoch": 0.76, "learning_rate": 5.3973333333333326e-08, "logps/chosen": -241.8824005126953, "logps/rejected": -391.1502685546875, "loss": 0.0004, "losses/dpo": 1.6014253390039812e-07, "losses/sft": 0.6661319136619568, "losses/total": 1.6014253390039812e-07, "ref_logps/chosen": -226.6103973388672, "ref_logps/rejected": -230.02195739746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.527200698852539, "rewards/margins": 14.58563232421875, "rewards/rejected": -16.11283302307129, "step": 3155 }, { "epoch": 0.76, "learning_rate": 5.3919999999999996e-08, "logps/chosen": -214.96888732910156, "logps/rejected": -373.1434326171875, "loss": 0.0004, "losses/dpo": 2.928839194282773e-06, "losses/sft": 0.6352789998054504, "losses/total": 2.928839194282773e-06, "ref_logps/chosen": -201.1116485595703, "ref_logps/rejected": -218.5321807861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.3857251405715942, "rewards/margins": 14.075399398803711, "rewards/rejected": -15.461124420166016, "step": 3156 }, { "epoch": 0.76, "learning_rate": 5.386666666666666e-08, "logps/chosen": -242.9777069091797, "logps/rejected": -404.28460693359375, "loss": 0.0003, "losses/dpo": 1.3087776551401475e-07, "losses/sft": 0.4548391103744507, "losses/total": 1.3087776551401475e-07, "ref_logps/chosen": -228.01828002929688, "ref_logps/rejected": -232.98626708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4959404468536377, "rewards/margins": 15.633891105651855, "rewards/rejected": -17.129831314086914, "step": 3157 }, { "epoch": 0.76, "learning_rate": 5.3813333333333336e-08, "logps/chosen": -260.96539306640625, "logps/rejected": -396.999267578125, "loss": 0.0012, "losses/dpo": 2.3771471546751854e-07, "losses/sft": 0.6040354371070862, "losses/total": 2.3771471546751854e-07, "ref_logps/chosen": -241.71804809570312, "ref_logps/rejected": -233.95269775390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.924736738204956, "rewards/margins": 14.37992000579834, "rewards/rejected": -16.304656982421875, "step": 3158 }, { "epoch": 0.76, "learning_rate": 5.375999999999999e-08, "logps/chosen": -266.71661376953125, "logps/rejected": -418.26171875, "loss": 0.0062, "losses/dpo": 2.7838513005917775e-07, "losses/sft": 0.6501189470291138, "losses/total": 2.7838513005917775e-07, "ref_logps/chosen": -249.6021270751953, "ref_logps/rejected": -240.01734924316406, "rewards/accuracies": 1.0, "rewards/chosen": -1.7114481925964355, "rewards/margins": 16.112991333007812, "rewards/rejected": -17.824440002441406, "step": 3159 }, { "epoch": 0.76, "learning_rate": 5.370666666666667e-08, "logps/chosen": -258.1700439453125, "logps/rejected": -374.2591857910156, "loss": 0.0002, "losses/dpo": 7.415284198941663e-05, "losses/sft": 0.40051689743995667, "losses/total": 7.415284198941663e-05, "ref_logps/chosen": -242.46127319335938, "ref_logps/rejected": -210.63601684570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5708774328231812, "rewards/margins": 14.7914400100708, "rewards/rejected": -16.36231803894043, "step": 3160 }, { "epoch": 0.76, "learning_rate": 5.365333333333333e-08, "logps/chosen": -262.57586669921875, "logps/rejected": -358.6148376464844, "loss": 0.0033, "losses/dpo": 1.1358817175732838e-07, "losses/sft": 0.5521600246429443, "losses/total": 1.1358817175732838e-07, "ref_logps/chosen": -250.47213745117188, "ref_logps/rejected": -206.10061645507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.2103716135025024, "rewards/margins": 14.041051864624023, "rewards/rejected": -15.251422882080078, "step": 3161 }, { "epoch": 0.76, "learning_rate": 5.3600000000000004e-08, "logps/chosen": -275.365234375, "logps/rejected": -412.74951171875, "loss": 0.0035, "losses/dpo": 3.040420346045458e-11, "losses/sft": 0.6142351627349854, "losses/total": 3.040420346045458e-11, "ref_logps/chosen": -259.402587890625, "ref_logps/rejected": -235.5118408203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5962674617767334, "rewards/margins": 16.127498626708984, "rewards/rejected": -17.723766326904297, "step": 3162 }, { "epoch": 0.76, "learning_rate": 5.354666666666666e-08, "logps/chosen": -190.86666870117188, "logps/rejected": -362.12744140625, "loss": 0.0003, "losses/dpo": 3.364049234733102e-07, "losses/sft": 1.0455715656280518, "losses/total": 3.364049234733102e-07, "ref_logps/chosen": -179.05322265625, "ref_logps/rejected": -206.65843200683594, "rewards/accuracies": 1.0, "rewards/chosen": -1.1813462972640991, "rewards/margins": 14.365556716918945, "rewards/rejected": -15.54690170288086, "step": 3163 }, { "epoch": 0.76, "learning_rate": 5.349333333333334e-08, "logps/chosen": -271.4342956542969, "logps/rejected": -423.4690856933594, "loss": 0.0004, "losses/dpo": 1.3623731032197384e-09, "losses/sft": 0.6971099376678467, "losses/total": 1.3623731032197384e-09, "ref_logps/chosen": -252.589599609375, "ref_logps/rejected": -237.4918670654297, "rewards/accuracies": 1.0, "rewards/chosen": -1.884467601776123, "rewards/margins": 16.7132568359375, "rewards/rejected": -18.59772300720215, "step": 3164 }, { "epoch": 0.76, "learning_rate": 5.3439999999999994e-08, "logps/chosen": -231.75100708007812, "logps/rejected": -386.9609069824219, "loss": 0.0026, "losses/dpo": 6.06820288462373e-11, "losses/sft": 0.5290213823318481, "losses/total": 6.06820288462373e-11, "ref_logps/chosen": -215.48110961914062, "ref_logps/rejected": -223.2525634765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6269879341125488, "rewards/margins": 14.74384593963623, "rewards/rejected": -16.370834350585938, "step": 3165 }, { "epoch": 0.76, "learning_rate": 5.338666666666667e-08, "logps/chosen": -265.34912109375, "logps/rejected": -394.1059875488281, "loss": 0.0002, "losses/dpo": 1.1368674002198986e-08, "losses/sft": 0.6493476629257202, "losses/total": 1.1368674002198986e-08, "ref_logps/chosen": -248.75465393066406, "ref_logps/rejected": -227.91818237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6594481468200684, "rewards/margins": 14.959335327148438, "rewards/rejected": -16.61878204345703, "step": 3166 }, { "epoch": 0.76, "learning_rate": 5.333333333333333e-08, "logps/chosen": -196.71612548828125, "logps/rejected": -408.69793701171875, "loss": 0.0005, "losses/dpo": 6.005735286862546e-08, "losses/sft": 0.9687275886535645, "losses/total": 6.005735286862546e-08, "ref_logps/chosen": -182.138916015625, "ref_logps/rejected": -231.63917541503906, "rewards/accuracies": 1.0, "rewards/chosen": -1.4577202796936035, "rewards/margins": 16.248157501220703, "rewards/rejected": -17.705875396728516, "step": 3167 }, { "epoch": 0.76, "learning_rate": 5.3280000000000005e-08, "logps/chosen": -224.04237365722656, "logps/rejected": -384.9669494628906, "loss": 0.0133, "losses/dpo": 2.660527576026084e-09, "losses/sft": 0.8903811573982239, "losses/total": 2.660527576026084e-09, "ref_logps/chosen": -210.081298828125, "ref_logps/rejected": -228.38255310058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.3961066007614136, "rewards/margins": 14.262333869934082, "rewards/rejected": -15.658439636230469, "step": 3168 }, { "epoch": 0.76, "learning_rate": 5.322666666666666e-08, "logps/chosen": -257.4254455566406, "logps/rejected": -367.54705810546875, "loss": 0.0018, "losses/dpo": 8.25971426365868e-07, "losses/sft": 0.3647644519805908, "losses/total": 8.25971426365868e-07, "ref_logps/chosen": -240.80543518066406, "ref_logps/rejected": -211.1455841064453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6619994640350342, "rewards/margins": 13.978147506713867, "rewards/rejected": -15.640148162841797, "step": 3169 }, { "epoch": 0.76, "learning_rate": 5.3173333333333325e-08, "logps/chosen": -257.5028076171875, "logps/rejected": -387.98443603515625, "loss": 0.0186, "losses/dpo": 7.236730681370318e-08, "losses/sft": 1.089744210243225, "losses/total": 7.236730681370318e-08, "ref_logps/chosen": -238.07318115234375, "ref_logps/rejected": -222.40707397460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.9429646730422974, "rewards/margins": 14.61476993560791, "rewards/rejected": -16.557735443115234, "step": 3170 }, { "epoch": 0.76, "learning_rate": 5.3119999999999995e-08, "logps/chosen": -275.94073486328125, "logps/rejected": -402.025390625, "loss": 0.0047, "losses/dpo": 1.947113720257576e-10, "losses/sft": 0.8067202568054199, "losses/total": 1.947113720257576e-10, "ref_logps/chosen": -260.9243469238281, "ref_logps/rejected": -230.00897216796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5016398429870605, "rewards/margins": 15.699999809265137, "rewards/rejected": -17.20163917541504, "step": 3171 }, { "epoch": 0.76, "learning_rate": 5.306666666666666e-08, "logps/chosen": -223.46324157714844, "logps/rejected": -369.23162841796875, "loss": 0.0002, "losses/dpo": 1.1491765875617241e-15, "losses/sft": 0.5992441773414612, "losses/total": 1.1491765875617241e-15, "ref_logps/chosen": -209.3466033935547, "ref_logps/rejected": -206.16921997070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4116626977920532, "rewards/margins": 14.89457893371582, "rewards/rejected": -16.306241989135742, "step": 3172 }, { "epoch": 0.76, "learning_rate": 5.301333333333333e-08, "logps/chosen": -249.44509887695312, "logps/rejected": -390.6567687988281, "loss": 0.0036, "losses/dpo": 9.308324706580606e-07, "losses/sft": 0.9056822657585144, "losses/total": 9.308324706580606e-07, "ref_logps/chosen": -234.13986206054688, "ref_logps/rejected": -229.67523193359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.530524492263794, "rewards/margins": 14.567628860473633, "rewards/rejected": -16.09815216064453, "step": 3173 }, { "epoch": 0.76, "learning_rate": 5.295999999999999e-08, "logps/chosen": -265.5770263671875, "logps/rejected": -406.18865966796875, "loss": 0.0028, "losses/dpo": 3.3196414733538404e-05, "losses/sft": 0.9073420166969299, "losses/total": 3.3196414733538404e-05, "ref_logps/chosen": -248.17245483398438, "ref_logps/rejected": -243.48159790039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.740455985069275, "rewards/margins": 14.530251502990723, "rewards/rejected": -16.270708084106445, "step": 3174 }, { "epoch": 0.76, "learning_rate": 5.290666666666667e-08, "logps/chosen": -254.31777954101562, "logps/rejected": -393.7252502441406, "loss": 0.0019, "losses/dpo": 1.8009171931510082e-09, "losses/sft": 0.657630443572998, "losses/total": 1.8009171931510082e-09, "ref_logps/chosen": -236.62953186035156, "ref_logps/rejected": -229.9942626953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7688262462615967, "rewards/margins": 14.604269981384277, "rewards/rejected": -16.373096466064453, "step": 3175 }, { "epoch": 0.76, "learning_rate": 5.2853333333333326e-08, "logps/chosen": -289.4176025390625, "logps/rejected": -412.96881103515625, "loss": 0.0007, "losses/dpo": 1.3230845070211217e-05, "losses/sft": 0.9243308901786804, "losses/total": 1.3230845070211217e-05, "ref_logps/chosen": -275.2911376953125, "ref_logps/rejected": -235.5026397705078, "rewards/accuracies": 1.0, "rewards/chosen": -1.4126503467559814, "rewards/margins": 16.333969116210938, "rewards/rejected": -17.746620178222656, "step": 3176 }, { "epoch": 0.76, "learning_rate": 5.28e-08, "logps/chosen": -209.57078552246094, "logps/rejected": -380.23211669921875, "loss": 0.0005, "losses/dpo": 2.0757245238200994e-06, "losses/sft": 1.2830673456192017, "losses/total": 2.0757245238200994e-06, "ref_logps/chosen": -193.9088134765625, "ref_logps/rejected": -214.66270446777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5661978721618652, "rewards/margins": 14.990743637084961, "rewards/rejected": -16.556941986083984, "step": 3177 }, { "epoch": 0.76, "learning_rate": 5.274666666666666e-08, "logps/chosen": -235.60154724121094, "logps/rejected": -360.0416564941406, "loss": 0.0009, "losses/dpo": 1.393181179309977e-07, "losses/sft": 0.8515773415565491, "losses/total": 1.393181179309977e-07, "ref_logps/chosen": -222.0101318359375, "ref_logps/rejected": -205.7661895751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.3591413497924805, "rewards/margins": 14.068404197692871, "rewards/rejected": -15.427545547485352, "step": 3178 }, { "epoch": 0.76, "learning_rate": 5.2693333333333337e-08, "logps/chosen": -236.91307067871094, "logps/rejected": -400.73162841796875, "loss": 0.0001, "losses/dpo": 3.5194295833207434e-07, "losses/sft": 0.9075988531112671, "losses/total": 3.5194295833207434e-07, "ref_logps/chosen": -223.6811065673828, "ref_logps/rejected": -227.36441040039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.323197841644287, "rewards/margins": 16.01352310180664, "rewards/rejected": -17.336719512939453, "step": 3179 }, { "epoch": 0.76, "learning_rate": 5.2639999999999994e-08, "logps/chosen": -246.99005126953125, "logps/rejected": -422.4268493652344, "loss": 0.0003, "losses/dpo": 8.987097732671145e-10, "losses/sft": 0.581421434879303, "losses/total": 8.987097732671145e-10, "ref_logps/chosen": -226.6702423095703, "ref_logps/rejected": -238.22421264648438, "rewards/accuracies": 1.0, "rewards/chosen": -2.0319793224334717, "rewards/margins": 16.38828468322754, "rewards/rejected": -18.420265197753906, "step": 3180 }, { "epoch": 0.76, "learning_rate": 5.258666666666667e-08, "logps/chosen": -218.92108154296875, "logps/rejected": -364.8084411621094, "loss": 0.0016, "losses/dpo": 5.161373195505803e-08, "losses/sft": 0.4456452429294586, "losses/total": 5.161373195505803e-08, "ref_logps/chosen": -202.18832397460938, "ref_logps/rejected": -206.365966796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6732778549194336, "rewards/margins": 14.170970916748047, "rewards/rejected": -15.84424877166748, "step": 3181 }, { "epoch": 0.76, "learning_rate": 5.253333333333333e-08, "logps/chosen": -228.5843048095703, "logps/rejected": -396.49041748046875, "loss": 0.0039, "losses/dpo": 8.721906009923064e-12, "losses/sft": 0.7803520560264587, "losses/total": 8.721906009923064e-12, "ref_logps/chosen": -211.23544311523438, "ref_logps/rejected": -231.38792419433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7348852157592773, "rewards/margins": 14.775365829467773, "rewards/rejected": -16.51025390625, "step": 3182 }, { "epoch": 0.76, "learning_rate": 5.2480000000000004e-08, "logps/chosen": -265.4773864746094, "logps/rejected": -397.4713439941406, "loss": 0.0101, "losses/dpo": 1.679556405065341e-08, "losses/sft": 0.6477747559547424, "losses/total": 1.679556405065341e-08, "ref_logps/chosen": -245.3030242919922, "ref_logps/rejected": -233.9954833984375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0174365043640137, "rewards/margins": 14.330148696899414, "rewards/rejected": -16.347585678100586, "step": 3183 }, { "epoch": 0.76, "learning_rate": 5.242666666666666e-08, "logps/chosen": -235.6628875732422, "logps/rejected": -394.2304992675781, "loss": 0.0007, "losses/dpo": 1.4863243968932238e-09, "losses/sft": 0.6377862691879272, "losses/total": 1.4863243968932238e-09, "ref_logps/chosen": -220.62777709960938, "ref_logps/rejected": -230.41363525390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5035115480422974, "rewards/margins": 14.878171920776367, "rewards/rejected": -16.381683349609375, "step": 3184 }, { "epoch": 0.76, "learning_rate": 5.237333333333334e-08, "logps/chosen": -232.080810546875, "logps/rejected": -394.6467590332031, "loss": 0.0002, "losses/dpo": 3.113714441838056e-08, "losses/sft": 0.5767074823379517, "losses/total": 3.113714441838056e-08, "ref_logps/chosen": -216.4274444580078, "ref_logps/rejected": -219.146240234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5653369426727295, "rewards/margins": 15.98471450805664, "rewards/rejected": -17.550052642822266, "step": 3185 }, { "epoch": 0.76, "learning_rate": 5.2319999999999995e-08, "logps/chosen": -281.4280090332031, "logps/rejected": -453.8689270019531, "loss": 0.0, "losses/dpo": 9.039781190267604e-08, "losses/sft": 0.49126800894737244, "losses/total": 9.039781190267604e-08, "ref_logps/chosen": -267.3287048339844, "ref_logps/rejected": -264.211181640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4099304676055908, "rewards/margins": 17.55584716796875, "rewards/rejected": -18.965776443481445, "step": 3186 }, { "epoch": 0.76, "learning_rate": 5.226666666666666e-08, "logps/chosen": -273.0982971191406, "logps/rejected": -397.4013671875, "loss": 0.0, "losses/dpo": 2.3143618577137204e-08, "losses/sft": 0.44014525413513184, "losses/total": 2.3143618577137204e-08, "ref_logps/chosen": -255.06080627441406, "ref_logps/rejected": -221.97003173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8037480115890503, "rewards/margins": 15.739386558532715, "rewards/rejected": -17.543134689331055, "step": 3187 }, { "epoch": 0.77, "learning_rate": 5.221333333333333e-08, "logps/chosen": -211.65316772460938, "logps/rejected": -362.6563720703125, "loss": 0.0008, "losses/dpo": 3.768037881712871e-09, "losses/sft": 0.5777065753936768, "losses/total": 3.768037881712871e-09, "ref_logps/chosen": -192.23153686523438, "ref_logps/rejected": -204.92733764648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9421615600585938, "rewards/margins": 13.830745697021484, "rewards/rejected": -15.772907257080078, "step": 3188 }, { "epoch": 0.77, "learning_rate": 5.215999999999999e-08, "logps/chosen": -258.044921875, "logps/rejected": -418.85467529296875, "loss": 0.0001, "losses/dpo": 6.724380709373368e-13, "losses/sft": 0.5156835913658142, "losses/total": 6.724380709373368e-13, "ref_logps/chosen": -239.57540893554688, "ref_logps/rejected": -238.5620574951172, "rewards/accuracies": 1.0, "rewards/chosen": -1.8469523191452026, "rewards/margins": 16.182308197021484, "rewards/rejected": -18.02926254272461, "step": 3189 }, { "epoch": 0.77, "learning_rate": 5.210666666666667e-08, "logps/chosen": -253.4611358642578, "logps/rejected": -397.6723327636719, "loss": 0.001, "losses/dpo": 1.2696577122994768e-10, "losses/sft": 0.6156488656997681, "losses/total": 1.2696577122994768e-10, "ref_logps/chosen": -237.12997436523438, "ref_logps/rejected": -225.48428344726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.633115530014038, "rewards/margins": 15.585687637329102, "rewards/rejected": -17.21880340576172, "step": 3190 }, { "epoch": 0.77, "learning_rate": 5.2053333333333326e-08, "logps/chosen": -283.91229248046875, "logps/rejected": -426.98095703125, "loss": 0.0003, "losses/dpo": 7.320185013703906e-10, "losses/sft": 0.5515352487564087, "losses/total": 7.320185013703906e-10, "ref_logps/chosen": -267.47607421875, "ref_logps/rejected": -259.666015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6436233520507812, "rewards/margins": 15.087867736816406, "rewards/rejected": -16.731491088867188, "step": 3191 }, { "epoch": 0.77, "learning_rate": 5.2e-08, "logps/chosen": -211.74009704589844, "logps/rejected": -384.2419738769531, "loss": 0.0012, "losses/dpo": 6.8167920197481635e-09, "losses/sft": 0.4369056522846222, "losses/total": 6.8167920197481635e-09, "ref_logps/chosen": -198.47897338867188, "ref_logps/rejected": -219.7960968017578, "rewards/accuracies": 1.0, "rewards/chosen": -1.3261128664016724, "rewards/margins": 15.118476867675781, "rewards/rejected": -16.444589614868164, "step": 3192 }, { "epoch": 0.77, "learning_rate": 5.194666666666666e-08, "logps/chosen": -256.40338134765625, "logps/rejected": -411.31793212890625, "loss": 0.0012, "losses/dpo": 8.687704422527531e-08, "losses/sft": 1.141959309577942, "losses/total": 8.687704422527531e-08, "ref_logps/chosen": -238.99249267578125, "ref_logps/rejected": -236.86599731445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7410876750946045, "rewards/margins": 15.70410442352295, "rewards/rejected": -17.445192337036133, "step": 3193 }, { "epoch": 0.77, "learning_rate": 5.1893333333333336e-08, "logps/chosen": -245.73046875, "logps/rejected": -406.05157470703125, "loss": 0.0004, "losses/dpo": 4.683534768901154e-07, "losses/sft": 0.618025004863739, "losses/total": 4.683534768901154e-07, "ref_logps/chosen": -228.15512084960938, "ref_logps/rejected": -241.6677703857422, "rewards/accuracies": 1.0, "rewards/chosen": -1.7575352191925049, "rewards/margins": 14.6808443069458, "rewards/rejected": -16.438379287719727, "step": 3194 }, { "epoch": 0.77, "learning_rate": 5.183999999999999e-08, "logps/chosen": -228.00555419921875, "logps/rejected": -390.28973388671875, "loss": 0.0002, "losses/dpo": 2.1012308212831066e-11, "losses/sft": 0.7489068508148193, "losses/total": 2.1012308212831066e-11, "ref_logps/chosen": -209.38864135742188, "ref_logps/rejected": -214.06964111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8616920709609985, "rewards/margins": 15.760318756103516, "rewards/rejected": -17.62200927734375, "step": 3195 }, { "epoch": 0.77, "learning_rate": 5.178666666666667e-08, "logps/chosen": -225.03515625, "logps/rejected": -433.74542236328125, "loss": 0.0005, "losses/dpo": 3.4819336303115733e-10, "losses/sft": 0.6543349027633667, "losses/total": 3.4819336303115733e-10, "ref_logps/chosen": -208.2648468017578, "ref_logps/rejected": -239.92025756835938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6770308017730713, "rewards/margins": 17.705486297607422, "rewards/rejected": -19.382516860961914, "step": 3196 }, { "epoch": 0.77, "learning_rate": 5.173333333333333e-08, "logps/chosen": -247.973876953125, "logps/rejected": -404.38311767578125, "loss": 0.0, "losses/dpo": 1.1762897079992118e-11, "losses/sft": 0.7109394073486328, "losses/total": 1.1762897079992118e-11, "ref_logps/chosen": -228.1167449951172, "ref_logps/rejected": -229.04986572265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.985711932182312, "rewards/margins": 15.547615051269531, "rewards/rejected": -17.533329010009766, "step": 3197 }, { "epoch": 0.77, "learning_rate": 5.1680000000000004e-08, "logps/chosen": -270.9088439941406, "logps/rejected": -415.300048828125, "loss": 0.0004, "losses/dpo": 9.002129445434548e-06, "losses/sft": 0.7552345395088196, "losses/total": 9.002129445434548e-06, "ref_logps/chosen": -254.3256378173828, "ref_logps/rejected": -242.81178283691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6583209037780762, "rewards/margins": 15.590503692626953, "rewards/rejected": -17.248825073242188, "step": 3198 }, { "epoch": 0.77, "learning_rate": 5.162666666666666e-08, "logps/chosen": -208.3226776123047, "logps/rejected": -347.8011474609375, "loss": 0.0036, "losses/dpo": 3.1504113554015056e-12, "losses/sft": 0.5668967366218567, "losses/total": 3.1504113554015056e-12, "ref_logps/chosen": -194.35621643066406, "ref_logps/rejected": -197.6932373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3966459035873413, "rewards/margins": 13.614147186279297, "rewards/rejected": -15.010793685913086, "step": 3199 }, { "epoch": 0.77, "learning_rate": 5.157333333333334e-08, "logps/chosen": -244.10302734375, "logps/rejected": -369.9036560058594, "loss": 0.0041, "losses/dpo": 2.7815600489589087e-08, "losses/sft": 1.2679113149642944, "losses/total": 2.7815600489589087e-08, "ref_logps/chosen": -225.12252807617188, "ref_logps/rejected": -208.15394592285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8980505466461182, "rewards/margins": 14.276918411254883, "rewards/rejected": -16.174968719482422, "step": 3200 }, { "epoch": 0.77, "learning_rate": 5.1519999999999994e-08, "logps/chosen": -245.6568603515625, "logps/rejected": -390.64935302734375, "loss": 0.0119, "losses/dpo": 7.260538126274696e-08, "losses/sft": 0.6522844433784485, "losses/total": 7.260538126274696e-08, "ref_logps/chosen": -231.4637451171875, "ref_logps/rejected": -222.24951171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4193127155303955, "rewards/margins": 15.420670509338379, "rewards/rejected": -16.839982986450195, "step": 3201 }, { "epoch": 0.77, "learning_rate": 5.146666666666667e-08, "logps/chosen": -243.6846160888672, "logps/rejected": -315.4874267578125, "loss": 0.0044, "losses/dpo": 0.005087747238576412, "losses/sft": 0.6680824160575867, "losses/total": 0.005087747238576412, "ref_logps/chosen": -227.70265197753906, "ref_logps/rejected": -178.28851318359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5981963872909546, "rewards/margins": 12.121698379516602, "rewards/rejected": -13.719894409179688, "step": 3202 }, { "epoch": 0.77, "learning_rate": 5.141333333333333e-08, "logps/chosen": -292.08111572265625, "logps/rejected": -390.9398193359375, "loss": 0.0018, "losses/dpo": 2.6757306439972872e-11, "losses/sft": 0.4507642388343811, "losses/total": 2.6757306439972872e-11, "ref_logps/chosen": -271.5535583496094, "ref_logps/rejected": -226.96322631835938, "rewards/accuracies": 1.0, "rewards/chosen": -2.0527567863464355, "rewards/margins": 14.344900131225586, "rewards/rejected": -16.39765739440918, "step": 3203 }, { "epoch": 0.77, "learning_rate": 5.135999999999999e-08, "logps/chosen": -268.3292236328125, "logps/rejected": -428.694091796875, "loss": 0.0001, "losses/dpo": 2.3456489373074874e-07, "losses/sft": 0.5333341360092163, "losses/total": 2.3456489373074874e-07, "ref_logps/chosen": -248.89573669433594, "ref_logps/rejected": -240.64456176757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9433484077453613, "rewards/margins": 16.86160659790039, "rewards/rejected": -18.804954528808594, "step": 3204 }, { "epoch": 0.77, "learning_rate": 5.130666666666666e-08, "logps/chosen": -290.7008972167969, "logps/rejected": -416.1033935546875, "loss": 0.0024, "losses/dpo": 2.7795197183877463e-06, "losses/sft": 0.5456452965736389, "losses/total": 2.7795197183877463e-06, "ref_logps/chosen": -270.5681457519531, "ref_logps/rejected": -242.75498962402344, "rewards/accuracies": 1.0, "rewards/chosen": -2.013275623321533, "rewards/margins": 15.321561813354492, "rewards/rejected": -17.3348388671875, "step": 3205 }, { "epoch": 0.77, "learning_rate": 5.1253333333333325e-08, "logps/chosen": -250.13168334960938, "logps/rejected": -421.8016357421875, "loss": 0.0042, "losses/dpo": 2.9333984130630597e-09, "losses/sft": 0.6807904243469238, "losses/total": 2.9333984130630597e-09, "ref_logps/chosen": -231.91030883789062, "ref_logps/rejected": -245.94224548339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.822138786315918, "rewards/margins": 15.763797760009766, "rewards/rejected": -17.5859375, "step": 3206 }, { "epoch": 0.77, "learning_rate": 5.12e-08, "logps/chosen": -243.00811767578125, "logps/rejected": -366.50567626953125, "loss": 0.0008, "losses/dpo": 3.883141914684529e-07, "losses/sft": 0.5799081325531006, "losses/total": 3.883141914684529e-07, "ref_logps/chosen": -226.8514404296875, "ref_logps/rejected": -205.4623260498047, "rewards/accuracies": 1.0, "rewards/chosen": -1.6156656742095947, "rewards/margins": 14.488669395446777, "rewards/rejected": -16.10433578491211, "step": 3207 }, { "epoch": 0.77, "learning_rate": 5.114666666666666e-08, "logps/chosen": -191.2130126953125, "logps/rejected": -375.266845703125, "loss": 0.0021, "losses/dpo": 1.9818582885910985e-10, "losses/sft": 0.6146008968353271, "losses/total": 1.9818582885910985e-10, "ref_logps/chosen": -177.4017333984375, "ref_logps/rejected": -215.19107055664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3811290264129639, "rewards/margins": 14.62645149230957, "rewards/rejected": -16.007579803466797, "step": 3208 }, { "epoch": 0.77, "learning_rate": 5.1093333333333336e-08, "logps/chosen": -210.99868774414062, "logps/rejected": -370.54718017578125, "loss": 0.0001, "losses/dpo": 1.948250883287983e-06, "losses/sft": 0.6793597340583801, "losses/total": 1.948250883287983e-06, "ref_logps/chosen": -196.7518310546875, "ref_logps/rejected": -209.35443115234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4246864318847656, "rewards/margins": 14.694588661193848, "rewards/rejected": -16.119274139404297, "step": 3209 }, { "epoch": 0.77, "learning_rate": 5.103999999999999e-08, "logps/chosen": -261.8464660644531, "logps/rejected": -371.69879150390625, "loss": 0.0029, "losses/dpo": 5.195366803434354e-09, "losses/sft": 1.0368742942810059, "losses/total": 5.195366803434354e-09, "ref_logps/chosen": -248.4447479248047, "ref_logps/rejected": -212.82733154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.340171217918396, "rewards/margins": 14.546974182128906, "rewards/rejected": -15.88714599609375, "step": 3210 }, { "epoch": 0.77, "learning_rate": 5.098666666666667e-08, "logps/chosen": -250.09796142578125, "logps/rejected": -388.7271423339844, "loss": 0.0021, "losses/dpo": 2.36159442695838e-11, "losses/sft": 0.7355927228927612, "losses/total": 2.36159442695838e-11, "ref_logps/chosen": -232.67877197265625, "ref_logps/rejected": -217.97341918945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7419185638427734, "rewards/margins": 15.333455085754395, "rewards/rejected": -17.07537269592285, "step": 3211 }, { "epoch": 0.77, "learning_rate": 5.0933333333333326e-08, "logps/chosen": -276.09466552734375, "logps/rejected": -419.79290771484375, "loss": 0.0075, "losses/dpo": 3.953248395305309e-09, "losses/sft": 0.808053731918335, "losses/total": 3.953248395305309e-09, "ref_logps/chosen": -252.55374145507812, "ref_logps/rejected": -239.884033203125, "rewards/accuracies": 1.0, "rewards/chosen": -2.3540914058685303, "rewards/margins": 15.636796951293945, "rewards/rejected": -17.990888595581055, "step": 3212 }, { "epoch": 0.77, "learning_rate": 5.088e-08, "logps/chosen": -237.55618286132812, "logps/rejected": -414.58349609375, "loss": 0.0003, "losses/dpo": 4.551081929093925e-06, "losses/sft": 1.0888365507125854, "losses/total": 4.551081929093925e-06, "ref_logps/chosen": -222.05364990234375, "ref_logps/rejected": -253.98834228515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5502541065216064, "rewards/margins": 14.509258270263672, "rewards/rejected": -16.059513092041016, "step": 3213 }, { "epoch": 0.77, "learning_rate": 5.082666666666666e-08, "logps/chosen": -276.4786682128906, "logps/rejected": -386.1014404296875, "loss": 0.0033, "losses/dpo": 2.773460073512979e-05, "losses/sft": 0.6280934810638428, "losses/total": 2.773460073512979e-05, "ref_logps/chosen": -256.36029052734375, "ref_logps/rejected": -221.13009643554688, "rewards/accuracies": 1.0, "rewards/chosen": -2.0118367671966553, "rewards/margins": 14.48530101776123, "rewards/rejected": -16.49713897705078, "step": 3214 }, { "epoch": 0.77, "learning_rate": 5.0773333333333337e-08, "logps/chosen": -250.92446899414062, "logps/rejected": -387.68621826171875, "loss": 0.0004, "losses/dpo": 3.227318501330956e-08, "losses/sft": 0.6688028573989868, "losses/total": 3.227318501330956e-08, "ref_logps/chosen": -231.09808349609375, "ref_logps/rejected": -216.08897399902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.9826383590698242, "rewards/margins": 15.177082061767578, "rewards/rejected": -17.15972137451172, "step": 3215 }, { "epoch": 0.77, "learning_rate": 5.0719999999999994e-08, "logps/chosen": -231.09344482421875, "logps/rejected": -411.1605224609375, "loss": 0.0022, "losses/dpo": 1.4924832481000294e-09, "losses/sft": 0.7242698669433594, "losses/total": 1.4924832481000294e-09, "ref_logps/chosen": -216.29998779296875, "ref_logps/rejected": -237.87954711914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4793484210968018, "rewards/margins": 15.848747253417969, "rewards/rejected": -17.328094482421875, "step": 3216 }, { "epoch": 0.77, "learning_rate": 5.066666666666667e-08, "logps/chosen": -249.94728088378906, "logps/rejected": -411.906494140625, "loss": 0.0, "losses/dpo": 2.317764036252612e-11, "losses/sft": 0.6074188947677612, "losses/total": 2.317764036252612e-11, "ref_logps/chosen": -237.0310821533203, "ref_logps/rejected": -241.78369140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2916197776794434, "rewards/margins": 15.720661163330078, "rewards/rejected": -17.01228141784668, "step": 3217 }, { "epoch": 0.77, "learning_rate": 5.061333333333333e-08, "logps/chosen": -271.41497802734375, "logps/rejected": -437.3722229003906, "loss": 0.0, "losses/dpo": 1.1475292716056629e-08, "losses/sft": 0.5385633111000061, "losses/total": 1.1475292716056629e-08, "ref_logps/chosen": -255.12826538085938, "ref_logps/rejected": -257.60791015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6286710500717163, "rewards/margins": 16.347763061523438, "rewards/rejected": -17.97643280029297, "step": 3218 }, { "epoch": 0.77, "learning_rate": 5.0560000000000004e-08, "logps/chosen": -256.2149658203125, "logps/rejected": -403.00433349609375, "loss": 0.0038, "losses/dpo": 1.8403644164166622e-09, "losses/sft": 0.4947461783885956, "losses/total": 1.8403644164166622e-09, "ref_logps/chosen": -240.7987823486328, "ref_logps/rejected": -229.61183166503906, "rewards/accuracies": 1.0, "rewards/chosen": -1.5416183471679688, "rewards/margins": 15.79763412475586, "rewards/rejected": -17.339252471923828, "step": 3219 }, { "epoch": 0.77, "learning_rate": 5.050666666666666e-08, "logps/chosen": -186.89447021484375, "logps/rejected": -370.4468994140625, "loss": 0.0002, "losses/dpo": 6.28372631705787e-11, "losses/sft": 0.8789721131324768, "losses/total": 6.28372631705787e-11, "ref_logps/chosen": -176.1622314453125, "ref_logps/rejected": -215.55099487304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.0732245445251465, "rewards/margins": 14.41636848449707, "rewards/rejected": -15.489592552185059, "step": 3220 }, { "epoch": 0.77, "learning_rate": 5.0453333333333325e-08, "logps/chosen": -214.64857482910156, "logps/rejected": -387.12823486328125, "loss": 0.0018, "losses/dpo": 8.908754125513951e-07, "losses/sft": 0.649171769618988, "losses/total": 8.908754125513951e-07, "ref_logps/chosen": -201.22244262695312, "ref_logps/rejected": -233.67041015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3426121473312378, "rewards/margins": 14.003170013427734, "rewards/rejected": -15.345782279968262, "step": 3221 }, { "epoch": 0.77, "learning_rate": 5.04e-08, "logps/chosen": -270.651123046875, "logps/rejected": -428.03582763671875, "loss": 0.0, "losses/dpo": 1.756543577258185e-09, "losses/sft": 0.705480694770813, "losses/total": 1.756543577258185e-09, "ref_logps/chosen": -249.65029907226562, "ref_logps/rejected": -235.56146240234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.100080966949463, "rewards/margins": 17.147356033325195, "rewards/rejected": -19.2474365234375, "step": 3222 }, { "epoch": 0.77, "learning_rate": 5.034666666666666e-08, "logps/chosen": -252.1560516357422, "logps/rejected": -383.4437255859375, "loss": 0.0223, "losses/dpo": 4.853816335526062e-07, "losses/sft": 0.7521703243255615, "losses/total": 4.853816335526062e-07, "ref_logps/chosen": -234.78887939453125, "ref_logps/rejected": -222.62887573242188, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7367161512374878, "rewards/margins": 14.344771385192871, "rewards/rejected": -16.08148765563965, "step": 3223 }, { "epoch": 0.77, "learning_rate": 5.0293333333333335e-08, "logps/chosen": -247.93447875976562, "logps/rejected": -405.5945129394531, "loss": 0.0002, "losses/dpo": 7.169521154537506e-08, "losses/sft": 0.7116907238960266, "losses/total": 7.169521154537506e-08, "ref_logps/chosen": -229.64285278320312, "ref_logps/rejected": -229.66683959960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8291621208190918, "rewards/margins": 15.763603210449219, "rewards/rejected": -17.59276580810547, "step": 3224 }, { "epoch": 0.77, "learning_rate": 5.023999999999999e-08, "logps/chosen": -244.3087921142578, "logps/rejected": -372.004638671875, "loss": 0.002, "losses/dpo": 2.275058613721015e-12, "losses/sft": 0.5661515593528748, "losses/total": 2.275058613721015e-12, "ref_logps/chosen": -227.0626220703125, "ref_logps/rejected": -210.82952880859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7246160507202148, "rewards/margins": 14.392894744873047, "rewards/rejected": -16.117511749267578, "step": 3225 }, { "epoch": 0.77, "learning_rate": 5.018666666666667e-08, "logps/chosen": -220.92282104492188, "logps/rejected": -379.55853271484375, "loss": 0.0022, "losses/dpo": 3.665375913897151e-08, "losses/sft": 0.414384126663208, "losses/total": 3.665375913897151e-08, "ref_logps/chosen": -203.28457641601562, "ref_logps/rejected": -209.40199279785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7638250589370728, "rewards/margins": 15.251832008361816, "rewards/rejected": -17.015655517578125, "step": 3226 }, { "epoch": 0.77, "learning_rate": 5.0133333333333326e-08, "logps/chosen": -247.98867797851562, "logps/rejected": -376.3617248535156, "loss": 0.0007, "losses/dpo": 3.0846356580127576e-09, "losses/sft": 0.7287662029266357, "losses/total": 3.0846356580127576e-09, "ref_logps/chosen": -232.8673095703125, "ref_logps/rejected": -204.8412322998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.512137770652771, "rewards/margins": 15.639911651611328, "rewards/rejected": -17.152050018310547, "step": 3227 }, { "epoch": 0.77, "learning_rate": 5.008e-08, "logps/chosen": -311.79864501953125, "logps/rejected": -417.6563415527344, "loss": 0.0004, "losses/dpo": 3.897811438946519e-06, "losses/sft": 0.6993193030357361, "losses/total": 3.897811438946519e-06, "ref_logps/chosen": -290.63043212890625, "ref_logps/rejected": -243.19503784179688, "rewards/accuracies": 1.0, "rewards/chosen": -2.1168198585510254, "rewards/margins": 15.32931137084961, "rewards/rejected": -17.446128845214844, "step": 3228 }, { "epoch": 0.77, "learning_rate": 5.002666666666666e-08, "logps/chosen": -254.27130126953125, "logps/rejected": -413.8863220214844, "loss": 0.0005, "losses/dpo": 1.7201196023108878e-09, "losses/sft": 0.7021967172622681, "losses/total": 1.7201196023108878e-09, "ref_logps/chosen": -237.22195434570312, "ref_logps/rejected": -242.6737518310547, "rewards/accuracies": 1.0, "rewards/chosen": -1.7049338817596436, "rewards/margins": 15.416322708129883, "rewards/rejected": -17.121257781982422, "step": 3229 }, { "epoch": 0.78, "learning_rate": 4.997333333333333e-08, "logps/chosen": -257.2760009765625, "logps/rejected": -392.3512878417969, "loss": 0.0013, "losses/dpo": 1.2201602778822394e-15, "losses/sft": 0.6227611899375916, "losses/total": 1.2201602778822394e-15, "ref_logps/chosen": -236.03753662109375, "ref_logps/rejected": -212.48663330078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1238460540771484, "rewards/margins": 15.862619400024414, "rewards/rejected": -17.986465454101562, "step": 3230 }, { "epoch": 0.78, "learning_rate": 4.991999999999999e-08, "logps/chosen": -209.92410278320312, "logps/rejected": -387.37652587890625, "loss": 0.002, "losses/dpo": 0.00019996405171696097, "losses/sft": 1.0145069360733032, "losses/total": 0.00019996405171696097, "ref_logps/chosen": -193.94500732421875, "ref_logps/rejected": -215.16592407226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.597909688949585, "rewards/margins": 15.623151779174805, "rewards/rejected": -17.221059799194336, "step": 3231 }, { "epoch": 0.78, "learning_rate": 4.986666666666666e-08, "logps/chosen": -240.1548309326172, "logps/rejected": -368.07501220703125, "loss": 0.0105, "losses/dpo": 9.982022675103508e-07, "losses/sft": 0.47934290766716003, "losses/total": 9.982022675103508e-07, "ref_logps/chosen": -224.59506225585938, "ref_logps/rejected": -216.15789794921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5559782981872559, "rewards/margins": 13.635736465454102, "rewards/rejected": -15.1917142868042, "step": 3232 }, { "epoch": 0.78, "learning_rate": 4.981333333333333e-08, "logps/chosen": -230.22128295898438, "logps/rejected": -366.931640625, "loss": 0.0016, "losses/dpo": 1.4109647850091278e-07, "losses/sft": 0.7825839519500732, "losses/total": 1.4109647850091278e-07, "ref_logps/chosen": -213.48202514648438, "ref_logps/rejected": -219.646240234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6739251613616943, "rewards/margins": 13.054615020751953, "rewards/rejected": -14.728541374206543, "step": 3233 }, { "epoch": 0.78, "learning_rate": 4.976e-08, "logps/chosen": -209.80023193359375, "logps/rejected": -349.0672302246094, "loss": 0.0027, "losses/dpo": 1.3664269715718547e-10, "losses/sft": 0.6120718717575073, "losses/total": 1.3664269715718547e-10, "ref_logps/chosen": -195.88192749023438, "ref_logps/rejected": -206.18849182128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.3918309211730957, "rewards/margins": 12.89604377746582, "rewards/rejected": -14.287874221801758, "step": 3234 }, { "epoch": 0.78, "learning_rate": 4.970666666666666e-08, "logps/chosen": -243.7559051513672, "logps/rejected": -392.5699462890625, "loss": 0.0014, "losses/dpo": 8.728847546990437e-07, "losses/sft": 0.5450760722160339, "losses/total": 8.728847546990437e-07, "ref_logps/chosen": -225.23529052734375, "ref_logps/rejected": -227.28439331054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.8520598411560059, "rewards/margins": 14.67650032043457, "rewards/rejected": -16.528560638427734, "step": 3235 }, { "epoch": 0.78, "learning_rate": 4.965333333333333e-08, "logps/chosen": -273.1053466796875, "logps/rejected": -400.9811706542969, "loss": 0.0001, "losses/dpo": 5.743016062709216e-10, "losses/sft": 0.5483406186103821, "losses/total": 5.743016062709216e-10, "ref_logps/chosen": -250.20730590820312, "ref_logps/rejected": -226.04928588867188, "rewards/accuracies": 1.0, "rewards/chosen": -2.289804220199585, "rewards/margins": 15.203381538391113, "rewards/rejected": -17.493186950683594, "step": 3236 }, { "epoch": 0.78, "learning_rate": 4.9599999999999994e-08, "logps/chosen": -205.9437255859375, "logps/rejected": -356.15576171875, "loss": 0.003, "losses/dpo": 3.6448284390644403e-07, "losses/sft": 0.5770791172981262, "losses/total": 3.6448284390644403e-07, "ref_logps/chosen": -191.04229736328125, "ref_logps/rejected": -193.05062866210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4901436567306519, "rewards/margins": 14.820369720458984, "rewards/rejected": -16.310514450073242, "step": 3237 }, { "epoch": 0.78, "learning_rate": 4.9546666666666664e-08, "logps/chosen": -228.6337890625, "logps/rejected": -362.05157470703125, "loss": 0.0088, "losses/dpo": 3.029303456969501e-07, "losses/sft": 0.6534504294395447, "losses/total": 3.029303456969501e-07, "ref_logps/chosen": -212.0228271484375, "ref_logps/rejected": -204.0037841796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6610982418060303, "rewards/margins": 14.143682479858398, "rewards/rejected": -15.804780960083008, "step": 3238 }, { "epoch": 0.78, "learning_rate": 4.9493333333333334e-08, "logps/chosen": -214.1943359375, "logps/rejected": -360.94976806640625, "loss": 0.0043, "losses/dpo": 1.0218873285339214e-05, "losses/sft": 0.6349053382873535, "losses/total": 1.0218873285339214e-05, "ref_logps/chosen": -198.7015380859375, "ref_logps/rejected": -205.59304809570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5492788553237915, "rewards/margins": 13.986392974853516, "rewards/rejected": -15.535672187805176, "step": 3239 }, { "epoch": 0.78, "learning_rate": 4.944e-08, "logps/chosen": -256.7555847167969, "logps/rejected": -391.3272399902344, "loss": 0.0002, "losses/dpo": 5.374851674844194e-10, "losses/sft": 0.7017624378204346, "losses/total": 5.374851674844194e-10, "ref_logps/chosen": -237.58895874023438, "ref_logps/rejected": -215.00241088867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.9166635274887085, "rewards/margins": 15.715818405151367, "rewards/rejected": -17.63248062133789, "step": 3240 }, { "epoch": 0.78, "learning_rate": 4.938666666666667e-08, "logps/chosen": -242.9679412841797, "logps/rejected": -356.6468811035156, "loss": 0.0006, "losses/dpo": 0.00013919365301262587, "losses/sft": 1.022947072982788, "losses/total": 0.00013919365301262587, "ref_logps/chosen": -224.2841033935547, "ref_logps/rejected": -200.42227172851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8683823347091675, "rewards/margins": 13.754079818725586, "rewards/rejected": -15.622462272644043, "step": 3241 }, { "epoch": 0.78, "learning_rate": 4.933333333333333e-08, "logps/chosen": -228.60598754882812, "logps/rejected": -385.8021240234375, "loss": 0.0004, "losses/dpo": 1.8918765221087597e-08, "losses/sft": 0.7639707326889038, "losses/total": 1.8918765221087597e-08, "ref_logps/chosen": -210.91958618164062, "ref_logps/rejected": -221.2677001953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7686409950256348, "rewards/margins": 14.684802055358887, "rewards/rejected": -16.453441619873047, "step": 3242 }, { "epoch": 0.78, "learning_rate": 4.928e-08, "logps/chosen": -232.46949768066406, "logps/rejected": -342.99786376953125, "loss": 0.0031, "losses/dpo": 1.4085991487805538e-12, "losses/sft": 0.5982807874679565, "losses/total": 1.4085991487805538e-12, "ref_logps/chosen": -217.57855224609375, "ref_logps/rejected": -190.52005004882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4890952110290527, "rewards/margins": 13.758687019348145, "rewards/rejected": -15.247780799865723, "step": 3243 }, { "epoch": 0.78, "learning_rate": 4.9226666666666665e-08, "logps/chosen": -252.65362548828125, "logps/rejected": -416.0131530761719, "loss": 0.0001, "losses/dpo": 8.397876144483618e-12, "losses/sft": 0.5479270219802856, "losses/total": 8.397876144483618e-12, "ref_logps/chosen": -234.97055053710938, "ref_logps/rejected": -227.07546997070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7683053016662598, "rewards/margins": 17.125463485717773, "rewards/rejected": -18.893768310546875, "step": 3244 }, { "epoch": 0.78, "learning_rate": 4.9173333333333336e-08, "logps/chosen": -245.24102783203125, "logps/rejected": -427.9669189453125, "loss": 0.0008, "losses/dpo": 3.8584413886155744e-08, "losses/sft": 0.4919750690460205, "losses/total": 3.8584413886155744e-08, "ref_logps/chosen": -230.36073303222656, "ref_logps/rejected": -240.35191345214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4880285263061523, "rewards/margins": 17.27347183227539, "rewards/rejected": -18.761499404907227, "step": 3245 }, { "epoch": 0.78, "learning_rate": 4.912e-08, "logps/chosen": -262.72540283203125, "logps/rejected": -416.06402587890625, "loss": 0.0001, "losses/dpo": 4.785265900864033e-06, "losses/sft": 0.6765075922012329, "losses/total": 4.785265900864033e-06, "ref_logps/chosen": -239.74949645996094, "ref_logps/rejected": -234.56573486328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.297589063644409, "rewards/margins": 15.852240562438965, "rewards/rejected": -18.149829864501953, "step": 3246 }, { "epoch": 0.78, "learning_rate": 4.906666666666666e-08, "logps/chosen": -230.7233428955078, "logps/rejected": -363.3648681640625, "loss": 0.0031, "losses/dpo": 4.289290345127483e-08, "losses/sft": 0.9612492918968201, "losses/total": 4.289290345127483e-08, "ref_logps/chosen": -215.51715087890625, "ref_logps/rejected": -208.59005737304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.5206167697906494, "rewards/margins": 13.956863403320312, "rewards/rejected": -15.477479934692383, "step": 3247 }, { "epoch": 0.78, "learning_rate": 4.9013333333333326e-08, "logps/chosen": -241.3734130859375, "logps/rejected": -396.529296875, "loss": 0.0022, "losses/dpo": 3.7810612930552767e-13, "losses/sft": 0.6597289443016052, "losses/total": 3.7810612930552767e-13, "ref_logps/chosen": -223.41485595703125, "ref_logps/rejected": -230.1977081298828, "rewards/accuracies": 1.0, "rewards/chosen": -1.7958564758300781, "rewards/margins": 14.837303161621094, "rewards/rejected": -16.633159637451172, "step": 3248 }, { "epoch": 0.78, "learning_rate": 4.8959999999999996e-08, "logps/chosen": -249.615234375, "logps/rejected": -421.84326171875, "loss": 0.0003, "losses/dpo": 3.3767844076493247e-09, "losses/sft": 0.6408440470695496, "losses/total": 3.3767844076493247e-09, "ref_logps/chosen": -231.4321746826172, "ref_logps/rejected": -236.2296142578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.818306565284729, "rewards/margins": 16.743057250976562, "rewards/rejected": -18.561363220214844, "step": 3249 }, { "epoch": 0.78, "learning_rate": 4.890666666666666e-08, "logps/chosen": -244.92178344726562, "logps/rejected": -370.08514404296875, "loss": 0.0009, "losses/dpo": 2.4751398086664267e-07, "losses/sft": 0.624944806098938, "losses/total": 2.4751398086664267e-07, "ref_logps/chosen": -228.65447998046875, "ref_logps/rejected": -205.9083709716797, "rewards/accuracies": 1.0, "rewards/chosen": -1.626732587814331, "rewards/margins": 14.790945053100586, "rewards/rejected": -16.417678833007812, "step": 3250 }, { "epoch": 0.78, "learning_rate": 4.885333333333333e-08, "logps/chosen": -260.57403564453125, "logps/rejected": -423.1791687011719, "loss": 0.0001, "losses/dpo": 3.817999072452949e-07, "losses/sft": 0.5636202692985535, "losses/total": 3.817999072452949e-07, "ref_logps/chosen": -245.5852813720703, "ref_logps/rejected": -240.6829833984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4988741874694824, "rewards/margins": 16.75074577331543, "rewards/rejected": -18.24962043762207, "step": 3251 }, { "epoch": 0.78, "learning_rate": 4.8799999999999994e-08, "logps/chosen": -241.51266479492188, "logps/rejected": -374.8704833984375, "loss": 0.0026, "losses/dpo": 4.601751788868569e-05, "losses/sft": 0.5347001552581787, "losses/total": 4.601751788868569e-05, "ref_logps/chosen": -226.83416748046875, "ref_logps/rejected": -210.4130401611328, "rewards/accuracies": 1.0, "rewards/chosen": -1.467851161956787, "rewards/margins": 14.977892875671387, "rewards/rejected": -16.445743560791016, "step": 3252 }, { "epoch": 0.78, "learning_rate": 4.8746666666666664e-08, "logps/chosen": -241.42604064941406, "logps/rejected": -393.59576416015625, "loss": 0.0036, "losses/dpo": 8.132091666723795e-10, "losses/sft": 0.8428794741630554, "losses/total": 8.132091666723795e-10, "ref_logps/chosen": -227.21435546875, "ref_logps/rejected": -220.93499755859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4211667776107788, "rewards/margins": 15.844913482666016, "rewards/rejected": -17.266080856323242, "step": 3253 }, { "epoch": 0.78, "learning_rate": 4.8693333333333334e-08, "logps/chosen": -266.23480224609375, "logps/rejected": -357.15594482421875, "loss": 0.0115, "losses/dpo": 6.995061752057552e-10, "losses/sft": 0.6320765614509583, "losses/total": 6.995061752057552e-10, "ref_logps/chosen": -250.95089721679688, "ref_logps/rejected": -192.4412384033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.528388261795044, "rewards/margins": 14.943082809448242, "rewards/rejected": -16.471471786499023, "step": 3254 }, { "epoch": 0.78, "learning_rate": 4.864e-08, "logps/chosen": -264.684814453125, "logps/rejected": -420.0491027832031, "loss": 0.0001, "losses/dpo": 9.24432665833086e-12, "losses/sft": 0.6947389245033264, "losses/total": 9.24432665833086e-12, "ref_logps/chosen": -246.9912109375, "ref_logps/rejected": -248.43426513671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7693605422973633, "rewards/margins": 15.392122268676758, "rewards/rejected": -17.161483764648438, "step": 3255 }, { "epoch": 0.78, "learning_rate": 4.858666666666667e-08, "logps/chosen": -239.4973602294922, "logps/rejected": -384.8236083984375, "loss": 0.0005, "losses/dpo": 1.899659457516023e-10, "losses/sft": 0.6856856346130371, "losses/total": 1.899659457516023e-10, "ref_logps/chosen": -223.53079223632812, "ref_logps/rejected": -224.80752563476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.5966575145721436, "rewards/margins": 14.404953002929688, "rewards/rejected": -16.001609802246094, "step": 3256 }, { "epoch": 0.78, "learning_rate": 4.853333333333333e-08, "logps/chosen": -258.048095703125, "logps/rejected": -398.1763610839844, "loss": 0.0005, "losses/dpo": 1.2489265373005765e-06, "losses/sft": 0.7999780178070068, "losses/total": 1.2489265373005765e-06, "ref_logps/chosen": -247.33316040039062, "ref_logps/rejected": -240.88894653320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.0714936256408691, "rewards/margins": 14.657245635986328, "rewards/rejected": -15.728738784790039, "step": 3257 }, { "epoch": 0.78, "learning_rate": 4.848e-08, "logps/chosen": -237.79278564453125, "logps/rejected": -375.7903747558594, "loss": 0.0013, "losses/dpo": 1.7695281258056639e-06, "losses/sft": 0.5102381110191345, "losses/total": 1.7695281258056639e-06, "ref_logps/chosen": -222.74166870117188, "ref_logps/rejected": -224.34449768066406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5051108598709106, "rewards/margins": 13.639474868774414, "rewards/rejected": -15.144586563110352, "step": 3258 }, { "epoch": 0.78, "learning_rate": 4.8426666666666665e-08, "logps/chosen": -287.7313232421875, "logps/rejected": -417.2522888183594, "loss": 0.0003, "losses/dpo": 2.5587605367860533e-09, "losses/sft": 0.6811719536781311, "losses/total": 2.5587605367860533e-09, "ref_logps/chosen": -265.94622802734375, "ref_logps/rejected": -243.03062438964844, "rewards/accuracies": 1.0, "rewards/chosen": -2.1785125732421875, "rewards/margins": 15.243654251098633, "rewards/rejected": -17.42216682434082, "step": 3259 }, { "epoch": 0.78, "learning_rate": 4.8373333333333335e-08, "logps/chosen": -261.74200439453125, "logps/rejected": -403.08929443359375, "loss": 0.0006, "losses/dpo": 1.2889393019577255e-07, "losses/sft": 1.2855483293533325, "losses/total": 1.2889393019577255e-07, "ref_logps/chosen": -242.56674194335938, "ref_logps/rejected": -228.2509307861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.9175283908843994, "rewards/margins": 15.566309928894043, "rewards/rejected": -17.483837127685547, "step": 3260 }, { "epoch": 0.78, "learning_rate": 4.832e-08, "logps/chosen": -259.65826416015625, "logps/rejected": -374.06072998046875, "loss": 0.0008, "losses/dpo": 2.6061786062570036e-09, "losses/sft": 0.7395490407943726, "losses/total": 2.6061786062570036e-09, "ref_logps/chosen": -243.75875854492188, "ref_logps/rejected": -209.8492889404297, "rewards/accuracies": 1.0, "rewards/chosen": -1.5899507999420166, "rewards/margins": 14.831195831298828, "rewards/rejected": -16.4211483001709, "step": 3261 }, { "epoch": 0.78, "learning_rate": 4.826666666666667e-08, "logps/chosen": -277.42608642578125, "logps/rejected": -430.47808837890625, "loss": 0.0002, "losses/dpo": 3.5100372315355344e-06, "losses/sft": 0.6006344556808472, "losses/total": 3.5100372315355344e-06, "ref_logps/chosen": -255.98094177246094, "ref_logps/rejected": -239.7141571044922, "rewards/accuracies": 1.0, "rewards/chosen": -2.1445140838623047, "rewards/margins": 16.9318790435791, "rewards/rejected": -19.076393127441406, "step": 3262 }, { "epoch": 0.78, "learning_rate": 4.821333333333333e-08, "logps/chosen": -250.5673828125, "logps/rejected": -369.44256591796875, "loss": 0.003, "losses/dpo": 4.038125496208522e-07, "losses/sft": 0.8170884251594543, "losses/total": 4.038125496208522e-07, "ref_logps/chosen": -232.2329864501953, "ref_logps/rejected": -198.95260620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.833439826965332, "rewards/margins": 15.215557098388672, "rewards/rejected": -17.04899787902832, "step": 3263 }, { "epoch": 0.78, "learning_rate": 4.8159999999999996e-08, "logps/chosen": -229.25845336914062, "logps/rejected": -375.1342468261719, "loss": 0.0003, "losses/dpo": 4.2255301480054186e-08, "losses/sft": 0.510793924331665, "losses/total": 4.2255301480054186e-08, "ref_logps/chosen": -216.09486389160156, "ref_logps/rejected": -216.79144287109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.316359519958496, "rewards/margins": 14.517921447753906, "rewards/rejected": -15.834280014038086, "step": 3264 }, { "epoch": 0.78, "learning_rate": 4.810666666666666e-08, "logps/chosen": -239.99261474609375, "logps/rejected": -371.9588623046875, "loss": 0.0022, "losses/dpo": 6.927634160769358e-11, "losses/sft": 0.5258353352546692, "losses/total": 6.927634160769358e-11, "ref_logps/chosen": -228.14303588867188, "ref_logps/rejected": -217.94471740722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.1849578619003296, "rewards/margins": 14.216458320617676, "rewards/rejected": -15.401416778564453, "step": 3265 }, { "epoch": 0.78, "learning_rate": 4.805333333333333e-08, "logps/chosen": -236.7779541015625, "logps/rejected": -413.0569763183594, "loss": 0.0001, "losses/dpo": 5.441316375254246e-08, "losses/sft": 0.5971960425376892, "losses/total": 5.441316375254246e-08, "ref_logps/chosen": -219.9798583984375, "ref_logps/rejected": -231.85580444335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6798104047775269, "rewards/margins": 16.4403076171875, "rewards/rejected": -18.1201171875, "step": 3266 }, { "epoch": 0.78, "learning_rate": 4.799999999999999e-08, "logps/chosen": -270.37030029296875, "logps/rejected": -399.6461181640625, "loss": 0.001, "losses/dpo": 6.4280625267088e-09, "losses/sft": 0.5962169170379639, "losses/total": 6.4280625267088e-09, "ref_logps/chosen": -251.20697021484375, "ref_logps/rejected": -224.8450927734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.916332721710205, "rewards/margins": 15.563770294189453, "rewards/rejected": -17.4801025390625, "step": 3267 }, { "epoch": 0.78, "learning_rate": 4.794666666666666e-08, "logps/chosen": -255.03744506835938, "logps/rejected": -428.296875, "loss": 0.0, "losses/dpo": 6.967771071231255e-08, "losses/sft": 0.36941418051719666, "losses/total": 6.967771071231255e-08, "ref_logps/chosen": -233.76458740234375, "ref_logps/rejected": -245.52822875976562, "rewards/accuracies": 1.0, "rewards/chosen": -2.1272857189178467, "rewards/margins": 16.149578094482422, "rewards/rejected": -18.27686309814453, "step": 3268 }, { "epoch": 0.78, "learning_rate": 4.789333333333333e-08, "logps/chosen": -279.0051574707031, "logps/rejected": -406.64642333984375, "loss": 0.0002, "losses/dpo": 2.2423209600219707e-07, "losses/sft": 1.0318970680236816, "losses/total": 2.2423209600219707e-07, "ref_logps/chosen": -263.0692138671875, "ref_logps/rejected": -228.8373260498047, "rewards/accuracies": 1.0, "rewards/chosen": -1.5935978889465332, "rewards/margins": 16.187313079833984, "rewards/rejected": -17.780908584594727, "step": 3269 }, { "epoch": 0.78, "learning_rate": 4.784e-08, "logps/chosen": -245.96517944335938, "logps/rejected": -413.5926513671875, "loss": 0.0008, "losses/dpo": 1.950146572005096e-10, "losses/sft": 0.6856206059455872, "losses/total": 1.950146572005096e-10, "ref_logps/chosen": -225.32945251464844, "ref_logps/rejected": -231.9227294921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.063572883605957, "rewards/margins": 16.103418350219727, "rewards/rejected": -18.1669921875, "step": 3270 }, { "epoch": 0.78, "learning_rate": 4.778666666666667e-08, "logps/chosen": -208.08421325683594, "logps/rejected": -352.14825439453125, "loss": 0.0012, "losses/dpo": 9.880821494334668e-08, "losses/sft": 0.7593176364898682, "losses/total": 9.880821494334668e-08, "ref_logps/chosen": -193.36505126953125, "ref_logps/rejected": -199.0950164794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.4719158411026, "rewards/margins": 13.833410263061523, "rewards/rejected": -15.305325508117676, "step": 3271 }, { "epoch": 0.79, "learning_rate": 4.773333333333333e-08, "logps/chosen": -267.01397705078125, "logps/rejected": -380.61114501953125, "loss": 0.0015, "losses/dpo": 7.454190598110699e-10, "losses/sft": 0.46991798281669617, "losses/total": 7.454190598110699e-10, "ref_logps/chosen": -248.417236328125, "ref_logps/rejected": -219.9506072998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.859675645828247, "rewards/margins": 14.206377029418945, "rewards/rejected": -16.066051483154297, "step": 3272 }, { "epoch": 0.79, "learning_rate": 4.768e-08, "logps/chosen": -221.55894470214844, "logps/rejected": -377.5357360839844, "loss": 0.0002, "losses/dpo": 1.6417450751760043e-05, "losses/sft": 0.5244477987289429, "losses/total": 1.6417450751760043e-05, "ref_logps/chosen": -207.38082885742188, "ref_logps/rejected": -206.42880249023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4178109169006348, "rewards/margins": 15.69288158416748, "rewards/rejected": -17.110692977905273, "step": 3273 }, { "epoch": 0.79, "learning_rate": 4.7626666666666664e-08, "logps/chosen": -242.68394470214844, "logps/rejected": -350.11322021484375, "loss": 0.0007, "losses/dpo": 2.9605201845583906e-08, "losses/sft": 0.4535723626613617, "losses/total": 2.9605201845583906e-08, "ref_logps/chosen": -227.63687133789062, "ref_logps/rejected": -202.20571899414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.5047074556350708, "rewards/margins": 13.286044120788574, "rewards/rejected": -14.790751457214355, "step": 3274 }, { "epoch": 0.79, "learning_rate": 4.7573333333333334e-08, "logps/chosen": -198.63079833984375, "logps/rejected": -353.9704895019531, "loss": 0.0052, "losses/dpo": 1.147154421232699e-06, "losses/sft": 0.6699510216712952, "losses/total": 1.147154421232699e-06, "ref_logps/chosen": -185.974853515625, "ref_logps/rejected": -200.32005310058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2655936479568481, "rewards/margins": 14.099451065063477, "rewards/rejected": -15.365044593811035, "step": 3275 }, { "epoch": 0.79, "learning_rate": 4.752e-08, "logps/chosen": -214.93533325195312, "logps/rejected": -365.88922119140625, "loss": 0.0, "losses/dpo": 3.5437435208507395e-09, "losses/sft": 0.6833578944206238, "losses/total": 3.5437435208507395e-09, "ref_logps/chosen": -200.27088928222656, "ref_logps/rejected": -208.73739624023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4664454460144043, "rewards/margins": 14.248741149902344, "rewards/rejected": -15.71518611907959, "step": 3276 }, { "epoch": 0.79, "learning_rate": 4.746666666666667e-08, "logps/chosen": -258.5803527832031, "logps/rejected": -386.55438232421875, "loss": 0.0056, "losses/dpo": 3.339044665762181e-11, "losses/sft": 0.6107370257377625, "losses/total": 3.339044665762181e-11, "ref_logps/chosen": -242.73565673828125, "ref_logps/rejected": -234.27029418945312, "rewards/accuracies": 1.0, "rewards/chosen": -1.584467887878418, "rewards/margins": 13.643940925598145, "rewards/rejected": -15.228409767150879, "step": 3277 }, { "epoch": 0.79, "learning_rate": 4.741333333333333e-08, "logps/chosen": -254.40362548828125, "logps/rejected": -393.78228759765625, "loss": 0.0028, "losses/dpo": 5.149231014911493e-07, "losses/sft": 1.1683297157287598, "losses/total": 5.149231014911493e-07, "ref_logps/chosen": -236.28045654296875, "ref_logps/rejected": -229.60488891601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8123164176940918, "rewards/margins": 14.605424880981445, "rewards/rejected": -16.417739868164062, "step": 3278 }, { "epoch": 0.79, "learning_rate": 4.736e-08, "logps/chosen": -231.41708374023438, "logps/rejected": -367.49560546875, "loss": 0.0004, "losses/dpo": 0.007273162715137005, "losses/sft": 0.6895356774330139, "losses/total": 0.007273162715137005, "ref_logps/chosen": -215.28018188476562, "ref_logps/rejected": -202.03125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6136903762817383, "rewards/margins": 14.932748794555664, "rewards/rejected": -16.54644012451172, "step": 3279 }, { "epoch": 0.79, "learning_rate": 4.7306666666666665e-08, "logps/chosen": -239.3235321044922, "logps/rejected": -414.754150390625, "loss": 0.0003, "losses/dpo": 1.908746771750458e-10, "losses/sft": 0.6442579030990601, "losses/total": 1.908746771750458e-10, "ref_logps/chosen": -219.35568237304688, "ref_logps/rejected": -231.84725952148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9967851638793945, "rewards/margins": 16.293903350830078, "rewards/rejected": -18.290687561035156, "step": 3280 }, { "epoch": 0.79, "learning_rate": 4.7253333333333336e-08, "logps/chosen": -241.35426330566406, "logps/rejected": -400.60791015625, "loss": 0.0002, "losses/dpo": 2.0268773326392875e-09, "losses/sft": 0.4157184958457947, "losses/total": 2.0268773326392875e-09, "ref_logps/chosen": -222.96932983398438, "ref_logps/rejected": -229.0825653076172, "rewards/accuracies": 1.0, "rewards/chosen": -1.8384922742843628, "rewards/margins": 15.314044952392578, "rewards/rejected": -17.152538299560547, "step": 3281 }, { "epoch": 0.79, "learning_rate": 4.719999999999999e-08, "logps/chosen": -244.37347412109375, "logps/rejected": -406.9388427734375, "loss": 0.0011, "losses/dpo": 1.0519936495256843e-06, "losses/sft": 0.8024805188179016, "losses/total": 1.0519936495256843e-06, "ref_logps/chosen": -231.1705322265625, "ref_logps/rejected": -235.1580810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3202940225601196, "rewards/margins": 15.857784271240234, "rewards/rejected": -17.178077697753906, "step": 3282 }, { "epoch": 0.79, "learning_rate": 4.714666666666666e-08, "logps/chosen": -256.5478515625, "logps/rejected": -341.6549072265625, "loss": 0.0015, "losses/dpo": 0.00010998419020324945, "losses/sft": 0.4884607791900635, "losses/total": 0.00010998419020324945, "ref_logps/chosen": -239.3114013671875, "ref_logps/rejected": -201.41555786132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7236464023590088, "rewards/margins": 12.300289154052734, "rewards/rejected": -14.023935317993164, "step": 3283 }, { "epoch": 0.79, "learning_rate": 4.7093333333333326e-08, "logps/chosen": -235.84022521972656, "logps/rejected": -375.8227844238281, "loss": 0.004, "losses/dpo": 1.8504586023482261e-06, "losses/sft": 0.5419425368309021, "losses/total": 1.8504586023482261e-06, "ref_logps/chosen": -219.02679443359375, "ref_logps/rejected": -210.5956268310547, "rewards/accuracies": 1.0, "rewards/chosen": -1.6813414096832275, "rewards/margins": 14.8413724899292, "rewards/rejected": -16.52271270751953, "step": 3284 }, { "epoch": 0.79, "learning_rate": 4.7039999999999996e-08, "logps/chosen": -221.3141632080078, "logps/rejected": -389.80780029296875, "loss": 0.0006, "losses/dpo": 2.2230938157008495e-07, "losses/sft": 0.5904562473297119, "losses/total": 2.2230938157008495e-07, "ref_logps/chosen": -203.19749450683594, "ref_logps/rejected": -222.6644744873047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8116670846939087, "rewards/margins": 14.902663230895996, "rewards/rejected": -16.714330673217773, "step": 3285 }, { "epoch": 0.79, "learning_rate": 4.6986666666666667e-08, "logps/chosen": -239.835205078125, "logps/rejected": -356.3360290527344, "loss": 0.0802, "losses/dpo": 2.515007972717285, "losses/sft": 0.9280216693878174, "losses/total": 2.515007972717285, "ref_logps/chosen": -221.7664794921875, "ref_logps/rejected": -197.5342254638672, "rewards/accuracies": 0.96875, "rewards/chosen": -1.8068711757659912, "rewards/margins": 14.073308944702148, "rewards/rejected": -15.880181312561035, "step": 3286 }, { "epoch": 0.79, "learning_rate": 4.693333333333333e-08, "logps/chosen": -210.49839782714844, "logps/rejected": -369.329345703125, "loss": 0.0003, "losses/dpo": 4.2820838643820025e-06, "losses/sft": 0.6275277137756348, "losses/total": 4.2820838643820025e-06, "ref_logps/chosen": -195.29641723632812, "ref_logps/rejected": -210.97225952148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5201988220214844, "rewards/margins": 14.315505981445312, "rewards/rejected": -15.83570384979248, "step": 3287 }, { "epoch": 0.79, "learning_rate": 4.688e-08, "logps/chosen": -240.02735900878906, "logps/rejected": -392.5122985839844, "loss": 0.0001, "losses/dpo": 6.01383902676389e-08, "losses/sft": 0.5935646295547485, "losses/total": 6.01383902676389e-08, "ref_logps/chosen": -218.99147033691406, "ref_logps/rejected": -220.34681701660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.1035900115966797, "rewards/margins": 15.112958908081055, "rewards/rejected": -17.216548919677734, "step": 3288 }, { "epoch": 0.79, "learning_rate": 4.6826666666666664e-08, "logps/chosen": -280.90093994140625, "logps/rejected": -413.28765869140625, "loss": 0.0007, "losses/dpo": 8.255684136315722e-09, "losses/sft": 0.6344828605651855, "losses/total": 8.255684136315722e-09, "ref_logps/chosen": -260.0768127441406, "ref_logps/rejected": -241.03619384765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0824124813079834, "rewards/margins": 15.142732620239258, "rewards/rejected": -17.225143432617188, "step": 3289 }, { "epoch": 0.79, "learning_rate": 4.6773333333333334e-08, "logps/chosen": -251.777587890625, "logps/rejected": -393.3864440917969, "loss": 0.0011, "losses/dpo": 3.2774983083072584e-07, "losses/sft": 1.1932443380355835, "losses/total": 3.2774983083072584e-07, "ref_logps/chosen": -237.17088317871094, "ref_logps/rejected": -215.89083862304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4606691598892212, "rewards/margins": 16.288890838623047, "rewards/rejected": -17.749561309814453, "step": 3290 }, { "epoch": 0.79, "learning_rate": 4.672e-08, "logps/chosen": -269.91619873046875, "logps/rejected": -422.01904296875, "loss": 0.0003, "losses/dpo": 3.6778030789719196e-06, "losses/sft": 0.8117847442626953, "losses/total": 3.6778030789719196e-06, "ref_logps/chosen": -248.58718872070312, "ref_logps/rejected": -235.12210083007812, "rewards/accuracies": 1.0, "rewards/chosen": -2.132904291152954, "rewards/margins": 16.55678939819336, "rewards/rejected": -18.689693450927734, "step": 3291 }, { "epoch": 0.79, "learning_rate": 4.666666666666667e-08, "logps/chosen": -272.6922302246094, "logps/rejected": -440.15283203125, "loss": 0.0002, "losses/dpo": 5.259970410620429e-12, "losses/sft": 0.6107496619224548, "losses/total": 5.259970410620429e-12, "ref_logps/chosen": -254.10313415527344, "ref_logps/rejected": -240.5048370361328, "rewards/accuracies": 1.0, "rewards/chosen": -1.8589098453521729, "rewards/margins": 18.10589027404785, "rewards/rejected": -19.964799880981445, "step": 3292 }, { "epoch": 0.79, "learning_rate": 4.661333333333333e-08, "logps/chosen": -243.33432006835938, "logps/rejected": -393.9316711425781, "loss": 0.003, "losses/dpo": 8.772492776643048e-08, "losses/sft": 1.036154866218567, "losses/total": 8.772492776643048e-08, "ref_logps/chosen": -225.1326141357422, "ref_logps/rejected": -229.4443359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.82016921043396, "rewards/margins": 14.628562927246094, "rewards/rejected": -16.44873046875, "step": 3293 }, { "epoch": 0.79, "learning_rate": 4.656e-08, "logps/chosen": -237.32693481445312, "logps/rejected": -387.8042907714844, "loss": 0.0023, "losses/dpo": 3.0798108952012626e-08, "losses/sft": 1.1872642040252686, "losses/total": 3.0798108952012626e-08, "ref_logps/chosen": -220.17617797851562, "ref_logps/rejected": -209.73321533203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7150757312774658, "rewards/margins": 16.092031478881836, "rewards/rejected": -17.807106018066406, "step": 3294 }, { "epoch": 0.79, "learning_rate": 4.6506666666666665e-08, "logps/chosen": -277.52764892578125, "logps/rejected": -399.37542724609375, "loss": 0.0098, "losses/dpo": 1.757351859518752e-12, "losses/sft": 0.5994074940681458, "losses/total": 1.757351859518752e-12, "ref_logps/chosen": -255.475830078125, "ref_logps/rejected": -223.55953979492188, "rewards/accuracies": 1.0, "rewards/chosen": -2.2051849365234375, "rewards/margins": 15.376405715942383, "rewards/rejected": -17.58159065246582, "step": 3295 }, { "epoch": 0.79, "learning_rate": 4.6453333333333335e-08, "logps/chosen": -228.258056640625, "logps/rejected": -356.76123046875, "loss": 0.0042, "losses/dpo": 4.143930345890112e-05, "losses/sft": 0.7334782481193542, "losses/total": 4.143930345890112e-05, "ref_logps/chosen": -210.58370971679688, "ref_logps/rejected": -205.2898406982422, "rewards/accuracies": 1.0, "rewards/chosen": -1.7674345970153809, "rewards/margins": 13.379701614379883, "rewards/rejected": -15.147136688232422, "step": 3296 }, { "epoch": 0.79, "learning_rate": 4.64e-08, "logps/chosen": -277.5027160644531, "logps/rejected": -403.19195556640625, "loss": 0.0003, "losses/dpo": 4.944832653563935e-06, "losses/sft": 0.42455804347991943, "losses/total": 4.944832653563935e-06, "ref_logps/chosen": -253.88955688476562, "ref_logps/rejected": -220.51174926757812, "rewards/accuracies": 1.0, "rewards/chosen": -2.361316680908203, "rewards/margins": 15.906705856323242, "rewards/rejected": -18.268020629882812, "step": 3297 }, { "epoch": 0.79, "learning_rate": 4.634666666666667e-08, "logps/chosen": -257.6005554199219, "logps/rejected": -390.213134765625, "loss": 0.0008, "losses/dpo": 2.785501829194459e-10, "losses/sft": 0.5626360177993774, "losses/total": 2.785501829194459e-10, "ref_logps/chosen": -236.54074096679688, "ref_logps/rejected": -215.84140014648438, "rewards/accuracies": 1.0, "rewards/chosen": -2.105982780456543, "rewards/margins": 15.331188201904297, "rewards/rejected": -17.437171936035156, "step": 3298 }, { "epoch": 0.79, "learning_rate": 4.6293333333333326e-08, "logps/chosen": -236.06979370117188, "logps/rejected": -365.53118896484375, "loss": 0.0011, "losses/dpo": 8.297792533085158e-07, "losses/sft": 0.6396390795707703, "losses/total": 8.297792533085158e-07, "ref_logps/chosen": -218.79730224609375, "ref_logps/rejected": -209.45941162109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7272465229034424, "rewards/margins": 13.879931449890137, "rewards/rejected": -15.607177734375, "step": 3299 }, { "epoch": 0.79, "learning_rate": 4.6239999999999996e-08, "logps/chosen": -256.4405517578125, "logps/rejected": -413.2972412109375, "loss": 0.0002, "losses/dpo": 1.816198675896885e-07, "losses/sft": 0.6640303730964661, "losses/total": 1.816198675896885e-07, "ref_logps/chosen": -236.85208129882812, "ref_logps/rejected": -245.61801147460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.9588487148284912, "rewards/margins": 14.809077262878418, "rewards/rejected": -16.767925262451172, "step": 3300 }, { "epoch": 0.79, "learning_rate": 4.618666666666666e-08, "logps/chosen": -228.59063720703125, "logps/rejected": -378.1210632324219, "loss": 0.0001, "losses/dpo": 3.2167541519356746e-08, "losses/sft": 0.827538788318634, "losses/total": 3.2167541519356746e-08, "ref_logps/chosen": -214.42637634277344, "ref_logps/rejected": -221.34783935546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.416425108909607, "rewards/margins": 14.26089859008789, "rewards/rejected": -15.677324295043945, "step": 3301 }, { "epoch": 0.79, "learning_rate": 4.613333333333333e-08, "logps/chosen": -251.53843688964844, "logps/rejected": -391.4119873046875, "loss": 0.0001, "losses/dpo": 2.0849426352054223e-11, "losses/sft": 0.4548198878765106, "losses/total": 2.0849426352054223e-11, "ref_logps/chosen": -235.40960693359375, "ref_logps/rejected": -224.74758911132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6128838062286377, "rewards/margins": 15.053552627563477, "rewards/rejected": -16.66643714904785, "step": 3302 }, { "epoch": 0.79, "learning_rate": 4.608e-08, "logps/chosen": -235.0435791015625, "logps/rejected": -382.65740966796875, "loss": 0.0009, "losses/dpo": 6.172066946419363e-08, "losses/sft": 0.5535605549812317, "losses/total": 6.172066946419363e-08, "ref_logps/chosen": -214.66018676757812, "ref_logps/rejected": -213.369384765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0383388996124268, "rewards/margins": 14.890466690063477, "rewards/rejected": -16.92880630493164, "step": 3303 }, { "epoch": 0.79, "learning_rate": 4.602666666666666e-08, "logps/chosen": -206.20248413085938, "logps/rejected": -335.28204345703125, "loss": 0.0014, "losses/dpo": 2.682228217310012e-09, "losses/sft": 0.7424405813217163, "losses/total": 2.682228217310012e-09, "ref_logps/chosen": -192.25515747070312, "ref_logps/rejected": -183.82321166992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.3947319984436035, "rewards/margins": 13.751151084899902, "rewards/rejected": -15.145883560180664, "step": 3304 }, { "epoch": 0.79, "learning_rate": 4.5973333333333333e-08, "logps/chosen": -254.8831787109375, "logps/rejected": -387.07177734375, "loss": 0.0004, "losses/dpo": 5.920115064839138e-09, "losses/sft": 0.5074403882026672, "losses/total": 5.920115064839138e-09, "ref_logps/chosen": -238.48818969726562, "ref_logps/rejected": -225.30111694335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.63949716091156, "rewards/margins": 14.53757095336914, "rewards/rejected": -16.177066802978516, "step": 3305 }, { "epoch": 0.79, "learning_rate": 4.592e-08, "logps/chosen": -231.87374877929688, "logps/rejected": -366.18475341796875, "loss": 0.0005, "losses/dpo": 5.000682108402543e-07, "losses/sft": 0.5161131620407104, "losses/total": 5.000682108402543e-07, "ref_logps/chosen": -218.6345672607422, "ref_logps/rejected": -205.14419555664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3239208459854126, "rewards/margins": 14.780134201049805, "rewards/rejected": -16.104053497314453, "step": 3306 }, { "epoch": 0.79, "learning_rate": 4.586666666666667e-08, "logps/chosen": -243.49478149414062, "logps/rejected": -378.96636962890625, "loss": 0.0002, "losses/dpo": 6.395034870365635e-05, "losses/sft": 0.6459547877311707, "losses/total": 6.395034870365635e-05, "ref_logps/chosen": -226.89874267578125, "ref_logps/rejected": -222.2399139404297, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596040725708008, "rewards/margins": 14.013044357299805, "rewards/rejected": -15.672647476196289, "step": 3307 }, { "epoch": 0.79, "learning_rate": 4.581333333333333e-08, "logps/chosen": -248.88018798828125, "logps/rejected": -420.6234130859375, "loss": 0.0079, "losses/dpo": 1.0396031591142219e-08, "losses/sft": 0.6977841854095459, "losses/total": 1.0396031591142219e-08, "ref_logps/chosen": -229.97079467773438, "ref_logps/rejected": -239.19053649902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.8909385204315186, "rewards/margins": 16.252349853515625, "rewards/rejected": -18.14328956604004, "step": 3308 }, { "epoch": 0.79, "learning_rate": 4.576e-08, "logps/chosen": -218.88668823242188, "logps/rejected": -354.1050720214844, "loss": 0.001, "losses/dpo": 4.2172021430531e-09, "losses/sft": 0.6545208692550659, "losses/total": 4.2172021430531e-09, "ref_logps/chosen": -204.02462768554688, "ref_logps/rejected": -197.6123504638672, "rewards/accuracies": 1.0, "rewards/chosen": -1.486205816268921, "rewards/margins": 14.163066864013672, "rewards/rejected": -15.649271965026855, "step": 3309 }, { "epoch": 0.79, "learning_rate": 4.5706666666666664e-08, "logps/chosen": -260.33087158203125, "logps/rejected": -435.4863586425781, "loss": 0.0003, "losses/dpo": 4.327931080183589e-08, "losses/sft": 0.7554563283920288, "losses/total": 4.327931080183589e-08, "ref_logps/chosen": -243.7517852783203, "ref_logps/rejected": -242.98167419433594, "rewards/accuracies": 1.0, "rewards/chosen": -1.657909870147705, "rewards/margins": 17.592559814453125, "rewards/rejected": -19.250471115112305, "step": 3310 }, { "epoch": 0.79, "learning_rate": 4.5653333333333335e-08, "logps/chosen": -252.5723419189453, "logps/rejected": -431.7727966308594, "loss": 0.0, "losses/dpo": 4.592240216566568e-12, "losses/sft": 0.5746619701385498, "losses/total": 4.592240216566568e-12, "ref_logps/chosen": -235.2672119140625, "ref_logps/rejected": -230.15203857421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7305126190185547, "rewards/margins": 18.431560516357422, "rewards/rejected": -20.16207504272461, "step": 3311 }, { "epoch": 0.79, "learning_rate": 4.56e-08, "logps/chosen": -240.51022338867188, "logps/rejected": -391.61334228515625, "loss": 0.0001, "losses/dpo": 6.887282966161479e-10, "losses/sft": 0.8443474769592285, "losses/total": 6.887282966161479e-10, "ref_logps/chosen": -225.65341186523438, "ref_logps/rejected": -216.6966552734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4856809377670288, "rewards/margins": 16.00598907470703, "rewards/rejected": -17.491668701171875, "step": 3312 }, { "epoch": 0.8, "learning_rate": 4.554666666666667e-08, "logps/chosen": -239.80740356445312, "logps/rejected": -391.44061279296875, "loss": 0.0058, "losses/dpo": 2.2275091396295466e-05, "losses/sft": 0.7474870681762695, "losses/total": 2.2275091396295466e-05, "ref_logps/chosen": -223.66162109375, "ref_logps/rejected": -231.57485961914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.614578366279602, "rewards/margins": 14.371994018554688, "rewards/rejected": -15.986571311950684, "step": 3313 }, { "epoch": 0.8, "learning_rate": 4.549333333333333e-08, "logps/chosen": -198.5300750732422, "logps/rejected": -353.7901306152344, "loss": 0.005, "losses/dpo": 9.787375176983915e-08, "losses/sft": 0.7926058769226074, "losses/total": 9.787375176983915e-08, "ref_logps/chosen": -187.78060913085938, "ref_logps/rejected": -203.89413452148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.074946641921997, "rewards/margins": 13.914653778076172, "rewards/rejected": -14.98960018157959, "step": 3314 }, { "epoch": 0.8, "learning_rate": 4.544e-08, "logps/chosen": -230.1015625, "logps/rejected": -380.55267333984375, "loss": 0.0014, "losses/dpo": 5.073041986491944e-09, "losses/sft": 0.4830130934715271, "losses/total": 5.073041986491944e-09, "ref_logps/chosen": -212.35108947753906, "ref_logps/rejected": -205.86349487304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.775048851966858, "rewards/margins": 15.693865776062012, "rewards/rejected": -17.468915939331055, "step": 3315 }, { "epoch": 0.8, "learning_rate": 4.538666666666666e-08, "logps/chosen": -226.88858032226562, "logps/rejected": -410.2669677734375, "loss": 0.0001, "losses/dpo": 4.431999212073379e-09, "losses/sft": 0.670490562915802, "losses/total": 4.431999212073379e-09, "ref_logps/chosen": -213.21884155273438, "ref_logps/rejected": -233.82305908203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3669712543487549, "rewards/margins": 16.277423858642578, "rewards/rejected": -17.644393920898438, "step": 3316 }, { "epoch": 0.8, "learning_rate": 4.533333333333333e-08, "logps/chosen": -184.33486938476562, "logps/rejected": -367.492919921875, "loss": 0.0026, "losses/dpo": 6.205768295330927e-05, "losses/sft": 0.7409111857414246, "losses/total": 6.205768295330927e-05, "ref_logps/chosen": -170.33334350585938, "ref_logps/rejected": -208.70486450195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.4001526832580566, "rewards/margins": 14.478653907775879, "rewards/rejected": -15.878806114196777, "step": 3317 }, { "epoch": 0.8, "learning_rate": 4.528e-08, "logps/chosen": -252.734375, "logps/rejected": -392.7340393066406, "loss": 0.0007, "losses/dpo": 3.782453905643024e-09, "losses/sft": 0.5702399611473083, "losses/total": 3.782453905643024e-09, "ref_logps/chosen": -234.31100463867188, "ref_logps/rejected": -230.18585205078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8423391580581665, "rewards/margins": 14.412477493286133, "rewards/rejected": -16.254817962646484, "step": 3318 }, { "epoch": 0.8, "learning_rate": 4.522666666666666e-08, "logps/chosen": -215.19496154785156, "logps/rejected": -391.03326416015625, "loss": 0.0005, "losses/dpo": 1.0807384501210215e-11, "losses/sft": 0.4619083106517792, "losses/total": 1.0807384501210215e-11, "ref_logps/chosen": -196.95748901367188, "ref_logps/rejected": -224.8911895751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.8237481117248535, "rewards/margins": 14.790461540222168, "rewards/rejected": -16.614208221435547, "step": 3319 }, { "epoch": 0.8, "learning_rate": 4.517333333333333e-08, "logps/chosen": -241.7330780029297, "logps/rejected": -384.2864685058594, "loss": 0.0039, "losses/dpo": 3.0357835356653595e-08, "losses/sft": 0.6279031038284302, "losses/total": 3.0357835356653595e-08, "ref_logps/chosen": -220.90090942382812, "ref_logps/rejected": -218.758056640625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0832173824310303, "rewards/margins": 14.469623565673828, "rewards/rejected": -16.552841186523438, "step": 3320 }, { "epoch": 0.8, "learning_rate": 4.5119999999999996e-08, "logps/chosen": -202.44216918945312, "logps/rejected": -393.550048828125, "loss": 0.0011, "losses/dpo": 5.5143267729818035e-08, "losses/sft": 0.5044786930084229, "losses/total": 5.5143267729818035e-08, "ref_logps/chosen": -186.30703735351562, "ref_logps/rejected": -220.11607360839844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6135140657424927, "rewards/margins": 15.729887008666992, "rewards/rejected": -17.343400955200195, "step": 3321 }, { "epoch": 0.8, "learning_rate": 4.5066666666666667e-08, "logps/chosen": -212.15330505371094, "logps/rejected": -353.02044677734375, "loss": 0.0004, "losses/dpo": 5.985498763294572e-10, "losses/sft": 0.650227963924408, "losses/total": 5.985498763294572e-10, "ref_logps/chosen": -195.02261352539062, "ref_logps/rejected": -197.920654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7130690813064575, "rewards/margins": 13.796911239624023, "rewards/rejected": -15.509979248046875, "step": 3322 }, { "epoch": 0.8, "learning_rate": 4.501333333333333e-08, "logps/chosen": -289.4401550292969, "logps/rejected": -450.3935546875, "loss": 0.0, "losses/dpo": 6.15864905739727e-08, "losses/sft": 0.6438305377960205, "losses/total": 6.15864905739727e-08, "ref_logps/chosen": -267.3931884765625, "ref_logps/rejected": -259.5750732421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.204695463180542, "rewards/margins": 16.877155303955078, "rewards/rejected": -19.081851959228516, "step": 3323 }, { "epoch": 0.8, "learning_rate": 4.496e-08, "logps/chosen": -251.03085327148438, "logps/rejected": -407.7049560546875, "loss": 0.0003, "losses/dpo": 4.801939956777801e-10, "losses/sft": 0.7818458080291748, "losses/total": 4.801939956777801e-10, "ref_logps/chosen": -232.1060791015625, "ref_logps/rejected": -231.91500854492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8924777507781982, "rewards/margins": 15.686515808105469, "rewards/rejected": -17.57899284362793, "step": 3324 }, { "epoch": 0.8, "learning_rate": 4.4906666666666664e-08, "logps/chosen": -311.179931640625, "logps/rejected": -432.44415283203125, "loss": 0.0, "losses/dpo": 5.293545549817091e-12, "losses/sft": 0.482901394367218, "losses/total": 5.293545549817091e-12, "ref_logps/chosen": -291.85546875, "ref_logps/rejected": -246.91317749023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9324456453323364, "rewards/margins": 16.620651245117188, "rewards/rejected": -18.553096771240234, "step": 3325 }, { "epoch": 0.8, "learning_rate": 4.4853333333333334e-08, "logps/chosen": -264.29534912109375, "logps/rejected": -432.3149719238281, "loss": 0.0001, "losses/dpo": 2.1639435035525345e-12, "losses/sft": 0.5212429761886597, "losses/total": 2.1639435035525345e-12, "ref_logps/chosen": -241.2550048828125, "ref_logps/rejected": -239.64877319335938, "rewards/accuracies": 1.0, "rewards/chosen": -2.3040356636047363, "rewards/margins": 16.96258544921875, "rewards/rejected": -19.266620635986328, "step": 3326 }, { "epoch": 0.8, "learning_rate": 4.48e-08, "logps/chosen": -234.56504821777344, "logps/rejected": -414.9588623046875, "loss": 0.0005, "losses/dpo": 1.5148772902762175e-08, "losses/sft": 0.8934925198554993, "losses/total": 1.5148772902762175e-08, "ref_logps/chosen": -217.12078857421875, "ref_logps/rejected": -236.3918914794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.744424819946289, "rewards/margins": 16.112268447875977, "rewards/rejected": -17.856693267822266, "step": 3327 }, { "epoch": 0.8, "learning_rate": 4.474666666666667e-08, "logps/chosen": -237.38819885253906, "logps/rejected": -399.24786376953125, "loss": 0.0009, "losses/dpo": 2.076361838021512e-08, "losses/sft": 0.8243150115013123, "losses/total": 2.076361838021512e-08, "ref_logps/chosen": -221.1696014404297, "ref_logps/rejected": -233.6533203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.621858835220337, "rewards/margins": 14.937592506408691, "rewards/rejected": -16.559452056884766, "step": 3328 }, { "epoch": 0.8, "learning_rate": 4.469333333333333e-08, "logps/chosen": -257.8652038574219, "logps/rejected": -441.9067077636719, "loss": 0.0024, "losses/dpo": 5.186077678409617e-10, "losses/sft": 0.717088520526886, "losses/total": 5.186077678409617e-10, "ref_logps/chosen": -241.4433135986328, "ref_logps/rejected": -247.285400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6421910524368286, "rewards/margins": 17.819942474365234, "rewards/rejected": -19.46213150024414, "step": 3329 }, { "epoch": 0.8, "learning_rate": 4.464e-08, "logps/chosen": -284.0701904296875, "logps/rejected": -402.30206298828125, "loss": 0.0256, "losses/dpo": 1.6927081958328927e-10, "losses/sft": 0.6558920741081238, "losses/total": 1.6927081958328927e-10, "ref_logps/chosen": -266.5164794921875, "ref_logps/rejected": -230.89442443847656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7553691864013672, "rewards/margins": 15.385393142700195, "rewards/rejected": -17.140762329101562, "step": 3330 }, { "epoch": 0.8, "learning_rate": 4.4586666666666665e-08, "logps/chosen": -237.76190185546875, "logps/rejected": -377.01165771484375, "loss": 0.0023, "losses/dpo": 5.712847084282657e-08, "losses/sft": 0.6394541263580322, "losses/total": 5.712847084282657e-08, "ref_logps/chosen": -218.75205993652344, "ref_logps/rejected": -217.28173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9009839296340942, "rewards/margins": 14.072011947631836, "rewards/rejected": -15.972996711730957, "step": 3331 }, { "epoch": 0.8, "learning_rate": 4.4533333333333335e-08, "logps/chosen": -227.30027770996094, "logps/rejected": -361.07781982421875, "loss": 0.0027, "losses/dpo": 1.9497300995929834e-10, "losses/sft": 0.5790054798126221, "losses/total": 1.9497300995929834e-10, "ref_logps/chosen": -208.48519897460938, "ref_logps/rejected": -214.4589385986328, "rewards/accuracies": 1.0, "rewards/chosen": -1.8815085887908936, "rewards/margins": 12.780378341674805, "rewards/rejected": -14.661887168884277, "step": 3332 }, { "epoch": 0.8, "learning_rate": 4.448e-08, "logps/chosen": -283.24127197265625, "logps/rejected": -422.6075134277344, "loss": 0.0002, "losses/dpo": 3.476964938187166e-09, "losses/sft": 0.5649646520614624, "losses/total": 3.476964938187166e-09, "ref_logps/chosen": -265.966064453125, "ref_logps/rejected": -244.23934936523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7275217771530151, "rewards/margins": 16.109294891357422, "rewards/rejected": -17.836816787719727, "step": 3333 }, { "epoch": 0.8, "learning_rate": 4.442666666666666e-08, "logps/chosen": -265.69439697265625, "logps/rejected": -416.86627197265625, "loss": 0.0, "losses/dpo": 2.7748290332141323e-09, "losses/sft": 0.6564506888389587, "losses/total": 2.7748290332141323e-09, "ref_logps/chosen": -247.72564697265625, "ref_logps/rejected": -243.23556518554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7968730926513672, "rewards/margins": 15.56619644165039, "rewards/rejected": -17.363069534301758, "step": 3334 }, { "epoch": 0.8, "learning_rate": 4.437333333333333e-08, "logps/chosen": -235.26025390625, "logps/rejected": -378.9438781738281, "loss": 0.0006, "losses/dpo": 1.0138684558569366e-07, "losses/sft": 0.46310317516326904, "losses/total": 1.0138684558569366e-07, "ref_logps/chosen": -218.31602478027344, "ref_logps/rejected": -205.36474609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.694422721862793, "rewards/margins": 15.663488388061523, "rewards/rejected": -17.357912063598633, "step": 3335 }, { "epoch": 0.8, "learning_rate": 4.4319999999999996e-08, "logps/chosen": -237.47254943847656, "logps/rejected": -417.30548095703125, "loss": 0.0001, "losses/dpo": 2.6890867843576416e-07, "losses/sft": 0.9880436062812805, "losses/total": 2.6890867843576416e-07, "ref_logps/chosen": -221.32632446289062, "ref_logps/rejected": -239.02175903320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.614620327949524, "rewards/margins": 16.21375274658203, "rewards/rejected": -17.828372955322266, "step": 3336 }, { "epoch": 0.8, "learning_rate": 4.4266666666666666e-08, "logps/chosen": -226.54502868652344, "logps/rejected": -405.78546142578125, "loss": 0.0005, "losses/dpo": 6.980876605666708e-06, "losses/sft": 0.8764118552207947, "losses/total": 6.980876605666708e-06, "ref_logps/chosen": -212.1939697265625, "ref_logps/rejected": -240.58755493164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4351050853729248, "rewards/margins": 15.084684371948242, "rewards/rejected": -16.519790649414062, "step": 3337 }, { "epoch": 0.8, "learning_rate": 4.421333333333333e-08, "logps/chosen": -211.66050720214844, "logps/rejected": -360.839111328125, "loss": 0.0004, "losses/dpo": 4.357843599867017e-11, "losses/sft": 0.6117756962776184, "losses/total": 4.357843599867017e-11, "ref_logps/chosen": -196.28826904296875, "ref_logps/rejected": -202.86227416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5372250080108643, "rewards/margins": 14.260457992553711, "rewards/rejected": -15.797683715820312, "step": 3338 }, { "epoch": 0.8, "learning_rate": 4.416e-08, "logps/chosen": -222.28038024902344, "logps/rejected": -378.9281921386719, "loss": 0.0001, "losses/dpo": 2.365941782045411e-06, "losses/sft": 1.0834002494812012, "losses/total": 2.365941782045411e-06, "ref_logps/chosen": -208.00979614257812, "ref_logps/rejected": -221.88882446289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.427059531211853, "rewards/margins": 14.276876449584961, "rewards/rejected": -15.703936576843262, "step": 3339 }, { "epoch": 0.8, "learning_rate": 4.410666666666666e-08, "logps/chosen": -159.51382446289062, "logps/rejected": -318.0439758300781, "loss": 0.002, "losses/dpo": 2.5720354734914963e-09, "losses/sft": 0.398809552192688, "losses/total": 2.5720354734914963e-09, "ref_logps/chosen": -147.8475799560547, "ref_logps/rejected": -181.82472229003906, "rewards/accuracies": 1.0, "rewards/chosen": -1.1666240692138672, "rewards/margins": 12.455299377441406, "rewards/rejected": -13.621923446655273, "step": 3340 }, { "epoch": 0.8, "learning_rate": 4.4053333333333333e-08, "logps/chosen": -207.54562377929688, "logps/rejected": -391.78021240234375, "loss": 0.0033, "losses/dpo": 9.579451187846644e-08, "losses/sft": 0.5738125443458557, "losses/total": 9.579451187846644e-08, "ref_logps/chosen": -193.1355743408203, "ref_logps/rejected": -224.8026123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4410059452056885, "rewards/margins": 15.256756782531738, "rewards/rejected": -16.697765350341797, "step": 3341 }, { "epoch": 0.8, "learning_rate": 4.4e-08, "logps/chosen": -261.5381164550781, "logps/rejected": -369.465087890625, "loss": 0.001, "losses/dpo": 3.459164510388746e-09, "losses/sft": 0.5500987768173218, "losses/total": 3.459164510388746e-09, "ref_logps/chosen": -239.42465209960938, "ref_logps/rejected": -207.39535522460938, "rewards/accuracies": 1.0, "rewards/chosen": -2.2113490104675293, "rewards/margins": 13.995624542236328, "rewards/rejected": -16.206974029541016, "step": 3342 }, { "epoch": 0.8, "learning_rate": 4.394666666666667e-08, "logps/chosen": -257.8736572265625, "logps/rejected": -390.97576904296875, "loss": 0.0015, "losses/dpo": 3.215644994725153e-07, "losses/sft": 0.6655091643333435, "losses/total": 3.215644994725153e-07, "ref_logps/chosen": -241.29177856445312, "ref_logps/rejected": -229.41676330566406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6581859588623047, "rewards/margins": 14.497714042663574, "rewards/rejected": -16.155899047851562, "step": 3343 }, { "epoch": 0.8, "learning_rate": 4.389333333333333e-08, "logps/chosen": -242.48370361328125, "logps/rejected": -403.13177490234375, "loss": 0.0002, "losses/dpo": 5.703180830563979e-15, "losses/sft": 0.44373199343681335, "losses/total": 5.703180830563979e-15, "ref_logps/chosen": -225.67642211914062, "ref_logps/rejected": -219.97775268554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6807301044464111, "rewards/margins": 16.634674072265625, "rewards/rejected": -18.315404891967773, "step": 3344 }, { "epoch": 0.8, "learning_rate": 4.384e-08, "logps/chosen": -241.05079650878906, "logps/rejected": -386.6624450683594, "loss": 0.0012, "losses/dpo": 4.382247453804666e-08, "losses/sft": 0.7064669728279114, "losses/total": 4.382247453804666e-08, "ref_logps/chosen": -228.32345581054688, "ref_logps/rejected": -219.6396484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2727323770523071, "rewards/margins": 15.429549217224121, "rewards/rejected": -16.702281951904297, "step": 3345 }, { "epoch": 0.8, "learning_rate": 4.3786666666666664e-08, "logps/chosen": -222.267333984375, "logps/rejected": -370.76422119140625, "loss": 0.0237, "losses/dpo": 4.96772315727867e-07, "losses/sft": 0.5575893521308899, "losses/total": 4.96772315727867e-07, "ref_logps/chosen": -209.6067352294922, "ref_logps/rejected": -205.5413055419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.2660601139068604, "rewards/margins": 15.256229400634766, "rewards/rejected": -16.522289276123047, "step": 3346 }, { "epoch": 0.8, "learning_rate": 4.3733333333333335e-08, "logps/chosen": -228.8555908203125, "logps/rejected": -396.9857482910156, "loss": 0.0014, "losses/dpo": 3.1825402402319014e-07, "losses/sft": 0.5081180930137634, "losses/total": 3.1825402402319014e-07, "ref_logps/chosen": -214.81556701660156, "ref_logps/rejected": -226.45233154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4040026664733887, "rewards/margins": 15.649337768554688, "rewards/rejected": -17.053340911865234, "step": 3347 }, { "epoch": 0.8, "learning_rate": 4.368e-08, "logps/chosen": -252.68212890625, "logps/rejected": -347.2916259765625, "loss": 0.0031, "losses/dpo": 9.546105239621738e-09, "losses/sft": 0.7814364433288574, "losses/total": 9.546105239621738e-09, "ref_logps/chosen": -237.03756713867188, "ref_logps/rejected": -200.0943145751953, "rewards/accuracies": 1.0, "rewards/chosen": -1.5644574165344238, "rewards/margins": 13.155275344848633, "rewards/rejected": -14.719732284545898, "step": 3348 }, { "epoch": 0.8, "learning_rate": 4.362666666666667e-08, "logps/chosen": -256.25140380859375, "logps/rejected": -410.38714599609375, "loss": 0.0003, "losses/dpo": 5.309085171489869e-09, "losses/sft": 0.4672243297100067, "losses/total": 5.309085171489869e-09, "ref_logps/chosen": -237.28237915039062, "ref_logps/rejected": -233.73207092285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8968989849090576, "rewards/margins": 15.768608093261719, "rewards/rejected": -17.66550636291504, "step": 3349 }, { "epoch": 0.8, "learning_rate": 4.357333333333333e-08, "logps/chosen": -236.36953735351562, "logps/rejected": -389.04656982421875, "loss": 0.0012, "losses/dpo": 1.2051542341851018e-07, "losses/sft": 0.47745251655578613, "losses/total": 1.2051542341851018e-07, "ref_logps/chosen": -223.52676391601562, "ref_logps/rejected": -226.2903289794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.284281849861145, "rewards/margins": 14.99134349822998, "rewards/rejected": -16.27562713623047, "step": 3350 }, { "epoch": 0.8, "learning_rate": 4.3519999999999995e-08, "logps/chosen": -218.9580078125, "logps/rejected": -406.5171813964844, "loss": 0.0, "losses/dpo": 2.8927163953085255e-07, "losses/sft": 0.8625361919403076, "losses/total": 2.8927163953085255e-07, "ref_logps/chosen": -207.06625366210938, "ref_logps/rejected": -229.49156188964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.1891732215881348, "rewards/margins": 16.51338768005371, "rewards/rejected": -17.70256233215332, "step": 3351 }, { "epoch": 0.8, "learning_rate": 4.3466666666666665e-08, "logps/chosen": -257.74859619140625, "logps/rejected": -414.6785888671875, "loss": 0.0001, "losses/dpo": 5.780922833764635e-07, "losses/sft": 0.9584203958511353, "losses/total": 5.780922833764635e-07, "ref_logps/chosen": -241.70111083984375, "ref_logps/rejected": -240.1829833984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.604749083518982, "rewards/margins": 15.84481430053711, "rewards/rejected": -17.44956398010254, "step": 3352 }, { "epoch": 0.8, "learning_rate": 4.341333333333333e-08, "logps/chosen": -220.93600463867188, "logps/rejected": -386.8572998046875, "loss": 0.0009, "losses/dpo": 5.070730058065465e-09, "losses/sft": 0.7302241921424866, "losses/total": 5.070730058065465e-09, "ref_logps/chosen": -208.09646606445312, "ref_logps/rejected": -227.20249938964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.2839548587799072, "rewards/margins": 14.681529998779297, "rewards/rejected": -15.965485572814941, "step": 3353 }, { "epoch": 0.8, "learning_rate": 4.336e-08, "logps/chosen": -248.80679321289062, "logps/rejected": -402.94110107421875, "loss": 0.0008, "losses/dpo": 0.01846056990325451, "losses/sft": 0.843866765499115, "losses/total": 0.01846056990325451, "ref_logps/chosen": -233.24246215820312, "ref_logps/rejected": -230.13363647460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5564346313476562, "rewards/margins": 15.72431468963623, "rewards/rejected": -17.280750274658203, "step": 3354 }, { "epoch": 0.81, "learning_rate": 4.330666666666666e-08, "logps/chosen": -246.0716094970703, "logps/rejected": -402.1590576171875, "loss": 0.0058, "losses/dpo": 1.0306500541901187e-07, "losses/sft": 0.6928789019584656, "losses/total": 1.0306500541901187e-07, "ref_logps/chosen": -229.63270568847656, "ref_logps/rejected": -224.61181640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.643890142440796, "rewards/margins": 16.110836029052734, "rewards/rejected": -17.754728317260742, "step": 3355 }, { "epoch": 0.81, "learning_rate": 4.325333333333333e-08, "logps/chosen": -225.30227661132812, "logps/rejected": -390.33001708984375, "loss": 0.0007, "losses/dpo": 3.1114286258571155e-08, "losses/sft": 0.6061773896217346, "losses/total": 3.1114286258571155e-08, "ref_logps/chosen": -208.47274780273438, "ref_logps/rejected": -224.21875, "rewards/accuracies": 1.0, "rewards/chosen": -1.682955026626587, "rewards/margins": 14.928175926208496, "rewards/rejected": -16.611129760742188, "step": 3356 }, { "epoch": 0.81, "learning_rate": 4.3199999999999996e-08, "logps/chosen": -267.73388671875, "logps/rejected": -402.0426025390625, "loss": 0.0019, "losses/dpo": 1.5376770079456037e-08, "losses/sft": 0.6746810078620911, "losses/total": 1.5376770079456037e-08, "ref_logps/chosen": -249.44822692871094, "ref_logps/rejected": -228.23049926757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.828568458557129, "rewards/margins": 15.552644729614258, "rewards/rejected": -17.381214141845703, "step": 3357 }, { "epoch": 0.81, "learning_rate": 4.3146666666666667e-08, "logps/chosen": -225.3671875, "logps/rejected": -376.3995056152344, "loss": 0.0016, "losses/dpo": 6.39101563137956e-06, "losses/sft": 0.7351909875869751, "losses/total": 6.39101563137956e-06, "ref_logps/chosen": -209.13861083984375, "ref_logps/rejected": -209.00750732421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6228580474853516, "rewards/margins": 15.11634349822998, "rewards/rejected": -16.739200592041016, "step": 3358 }, { "epoch": 0.81, "learning_rate": 4.309333333333333e-08, "logps/chosen": -216.07113647460938, "logps/rejected": -369.26531982421875, "loss": 0.0071, "losses/dpo": 9.666206501890429e-09, "losses/sft": 0.6441307067871094, "losses/total": 9.666206501890429e-09, "ref_logps/chosen": -200.07667541503906, "ref_logps/rejected": -207.37721252441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5994471311569214, "rewards/margins": 14.589366912841797, "rewards/rejected": -16.188812255859375, "step": 3359 }, { "epoch": 0.81, "learning_rate": 4.304e-08, "logps/chosen": -206.99346923828125, "logps/rejected": -375.2104797363281, "loss": 0.0099, "losses/dpo": 2.8976323562801554e-08, "losses/sft": 1.128003478050232, "losses/total": 2.8976323562801554e-08, "ref_logps/chosen": -194.87420654296875, "ref_logps/rejected": -218.13275146484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.2119276523590088, "rewards/margins": 14.495845794677734, "rewards/rejected": -15.707773208618164, "step": 3360 }, { "epoch": 0.81, "learning_rate": 4.2986666666666664e-08, "logps/chosen": -265.3935546875, "logps/rejected": -409.73883056640625, "loss": 0.0021, "losses/dpo": 2.3928471648559935e-08, "losses/sft": 0.8834501504898071, "losses/total": 2.3928471648559935e-08, "ref_logps/chosen": -249.64971923828125, "ref_logps/rejected": -239.45260620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5743814706802368, "rewards/margins": 15.454241752624512, "rewards/rejected": -17.028623580932617, "step": 3361 }, { "epoch": 0.81, "learning_rate": 4.2933333333333334e-08, "logps/chosen": -228.90338134765625, "logps/rejected": -409.331298828125, "loss": 0.0004, "losses/dpo": 2.348477323721454e-08, "losses/sft": 0.522367000579834, "losses/total": 2.348477323721454e-08, "ref_logps/chosen": -213.620849609375, "ref_logps/rejected": -244.17904663085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5282565355300903, "rewards/margins": 14.986964225769043, "rewards/rejected": -16.515220642089844, "step": 3362 }, { "epoch": 0.81, "learning_rate": 4.288e-08, "logps/chosen": -218.78189086914062, "logps/rejected": -341.08111572265625, "loss": 0.0015, "losses/dpo": 1.4020977434636706e-10, "losses/sft": 0.7383686900138855, "losses/total": 1.4020977434636706e-10, "ref_logps/chosen": -201.71475219726562, "ref_logps/rejected": -193.83102416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7067148685455322, "rewards/margins": 13.018295288085938, "rewards/rejected": -14.72500991821289, "step": 3363 }, { "epoch": 0.81, "learning_rate": 4.282666666666667e-08, "logps/chosen": -240.9940643310547, "logps/rejected": -409.71685791015625, "loss": 0.0009, "losses/dpo": 5.4242185854036506e-08, "losses/sft": 0.5740759968757629, "losses/total": 5.4242185854036506e-08, "ref_logps/chosen": -222.3536376953125, "ref_logps/rejected": -235.46971130371094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8640446662902832, "rewards/margins": 15.56067180633545, "rewards/rejected": -17.42471694946289, "step": 3364 }, { "epoch": 0.81, "learning_rate": 4.277333333333333e-08, "logps/chosen": -233.54367065429688, "logps/rejected": -389.8848876953125, "loss": 0.0001, "losses/dpo": 2.117010566848876e-09, "losses/sft": 0.6287983059883118, "losses/total": 2.117010566848876e-09, "ref_logps/chosen": -216.4065704345703, "ref_logps/rejected": -212.4896697998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.713711142539978, "rewards/margins": 16.025814056396484, "rewards/rejected": -17.739524841308594, "step": 3365 }, { "epoch": 0.81, "learning_rate": 4.272e-08, "logps/chosen": -219.72341918945312, "logps/rejected": -389.374267578125, "loss": 0.0004, "losses/dpo": 3.6358716037909744e-09, "losses/sft": 0.6111665964126587, "losses/total": 3.6358716037909744e-09, "ref_logps/chosen": -202.53436279296875, "ref_logps/rejected": -214.76454162597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7189056873321533, "rewards/margins": 15.742066383361816, "rewards/rejected": -17.46097183227539, "step": 3366 }, { "epoch": 0.81, "learning_rate": 4.2666666666666665e-08, "logps/chosen": -206.30935668945312, "logps/rejected": -386.61346435546875, "loss": 0.0014, "losses/dpo": 1.2254139925005347e-09, "losses/sft": 0.6758737564086914, "losses/total": 1.2254139925005347e-09, "ref_logps/chosen": -190.60452270507812, "ref_logps/rejected": -218.51641845703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5704845190048218, "rewards/margins": 15.239222526550293, "rewards/rejected": -16.809707641601562, "step": 3367 }, { "epoch": 0.81, "learning_rate": 4.261333333333333e-08, "logps/chosen": -255.66397094726562, "logps/rejected": -474.2336120605469, "loss": 0.0, "losses/dpo": 3.0655269434021193e-09, "losses/sft": 0.6790831089019775, "losses/total": 3.0655269434021193e-09, "ref_logps/chosen": -235.705322265625, "ref_logps/rejected": -267.8850402832031, "rewards/accuracies": 1.0, "rewards/chosen": -1.995863437652588, "rewards/margins": 18.638996124267578, "rewards/rejected": -20.634859085083008, "step": 3368 }, { "epoch": 0.81, "learning_rate": 4.256e-08, "logps/chosen": -245.44505310058594, "logps/rejected": -417.7493896484375, "loss": 0.0001, "losses/dpo": 8.088060354793924e-08, "losses/sft": 0.8907879590988159, "losses/total": 8.088060354793924e-08, "ref_logps/chosen": -228.57420349121094, "ref_logps/rejected": -245.25436401367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6870841979980469, "rewards/margins": 15.562420845031738, "rewards/rejected": -17.24950408935547, "step": 3369 }, { "epoch": 0.81, "learning_rate": 4.250666666666666e-08, "logps/chosen": -200.16000366210938, "logps/rejected": -340.3952941894531, "loss": 0.0025, "losses/dpo": 1.3195496251228178e-07, "losses/sft": 0.7373737692832947, "losses/total": 1.3195496251228178e-07, "ref_logps/chosen": -179.41954040527344, "ref_logps/rejected": -190.48147583007812, "rewards/accuracies": 1.0, "rewards/chosen": -2.074047565460205, "rewards/margins": 12.917335510253906, "rewards/rejected": -14.991381645202637, "step": 3370 }, { "epoch": 0.81, "learning_rate": 4.245333333333333e-08, "logps/chosen": -249.9691619873047, "logps/rejected": -371.99676513671875, "loss": 0.0007, "losses/dpo": 0.0012649152195081115, "losses/sft": 0.4013286232948303, "losses/total": 0.0012649152195081115, "ref_logps/chosen": -232.70724487304688, "ref_logps/rejected": -211.7978057861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.7261924743652344, "rewards/margins": 14.293700218200684, "rewards/rejected": -16.019893646240234, "step": 3371 }, { "epoch": 0.81, "learning_rate": 4.2399999999999996e-08, "logps/chosen": -237.33148193359375, "logps/rejected": -403.32220458984375, "loss": 0.004, "losses/dpo": 1.1589246895482574e-07, "losses/sft": 0.4941027760505676, "losses/total": 1.1589246895482574e-07, "ref_logps/chosen": -218.0215606689453, "ref_logps/rejected": -231.61932373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.930991530418396, "rewards/margins": 15.239295959472656, "rewards/rejected": -17.1702880859375, "step": 3372 }, { "epoch": 0.81, "learning_rate": 4.2346666666666666e-08, "logps/chosen": -227.89498901367188, "logps/rejected": -438.29742431640625, "loss": 0.0085, "losses/dpo": 3.3657968856459775e-08, "losses/sft": 0.57017982006073, "losses/total": 3.3657968856459775e-08, "ref_logps/chosen": -212.555419921875, "ref_logps/rejected": -251.88067626953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.533956527709961, "rewards/margins": 17.107723236083984, "rewards/rejected": -18.641679763793945, "step": 3373 }, { "epoch": 0.81, "learning_rate": 4.229333333333333e-08, "logps/chosen": -217.09982299804688, "logps/rejected": -392.82861328125, "loss": 0.0096, "losses/dpo": 3.5060797039070113e-13, "losses/sft": 0.636249840259552, "losses/total": 3.5060797039070113e-13, "ref_logps/chosen": -200.78384399414062, "ref_logps/rejected": -217.08958435058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.6315977573394775, "rewards/margins": 15.942306518554688, "rewards/rejected": -17.573904037475586, "step": 3374 }, { "epoch": 0.81, "learning_rate": 4.224e-08, "logps/chosen": -189.51171875, "logps/rejected": -330.57318115234375, "loss": 0.0043, "losses/dpo": 7.418371410494728e-07, "losses/sft": 0.6931958794593811, "losses/total": 7.418371410494728e-07, "ref_logps/chosen": -176.88525390625, "ref_logps/rejected": -192.4356689453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2626465559005737, "rewards/margins": 12.551107406616211, "rewards/rejected": -13.81375503540039, "step": 3375 }, { "epoch": 0.81, "learning_rate": 4.218666666666666e-08, "logps/chosen": -256.68572998046875, "logps/rejected": -406.48431396484375, "loss": 0.0006, "losses/dpo": 3.8641840838238295e-09, "losses/sft": 0.5062181353569031, "losses/total": 3.8641840838238295e-09, "ref_logps/chosen": -235.95318603515625, "ref_logps/rejected": -221.85516357421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.073253870010376, "rewards/margins": 16.38966178894043, "rewards/rejected": -18.462915420532227, "step": 3376 }, { "epoch": 0.81, "learning_rate": 4.2133333333333333e-08, "logps/chosen": -234.27731323242188, "logps/rejected": -358.64495849609375, "loss": 0.006, "losses/dpo": 2.0537445077906114e-09, "losses/sft": 0.7740013599395752, "losses/total": 2.0537445077906114e-09, "ref_logps/chosen": -210.31167602539062, "ref_logps/rejected": -194.76242065429688, "rewards/accuracies": 1.0, "rewards/chosen": -2.3965647220611572, "rewards/margins": 13.991691589355469, "rewards/rejected": -16.388256072998047, "step": 3377 }, { "epoch": 0.81, "learning_rate": 4.208e-08, "logps/chosen": -202.37330627441406, "logps/rejected": -382.2041015625, "loss": 0.0004, "losses/dpo": 3.479736221390084e-10, "losses/sft": 0.37552231550216675, "losses/total": 3.479736221390084e-10, "ref_logps/chosen": -191.60691833496094, "ref_logps/rejected": -221.35707092285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.0766394138336182, "rewards/margins": 15.008064270019531, "rewards/rejected": -16.084701538085938, "step": 3378 }, { "epoch": 0.81, "learning_rate": 4.202666666666667e-08, "logps/chosen": -211.15972900390625, "logps/rejected": -356.1524963378906, "loss": 0.0036, "losses/dpo": 7.44874661950945e-10, "losses/sft": 0.7096612453460693, "losses/total": 7.44874661950945e-10, "ref_logps/chosen": -196.14002990722656, "ref_logps/rejected": -200.912109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.501969814300537, "rewards/margins": 14.022068977355957, "rewards/rejected": -15.52403736114502, "step": 3379 }, { "epoch": 0.81, "learning_rate": 4.197333333333333e-08, "logps/chosen": -237.53067016601562, "logps/rejected": -400.1259460449219, "loss": 0.002, "losses/dpo": 1.0411421499156859e-05, "losses/sft": 0.9905813932418823, "losses/total": 1.0411421499156859e-05, "ref_logps/chosen": -221.26898193359375, "ref_logps/rejected": -231.90264892578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6261703968048096, "rewards/margins": 15.196159362792969, "rewards/rejected": -16.822328567504883, "step": 3380 }, { "epoch": 0.81, "learning_rate": 4.192e-08, "logps/chosen": -243.8478240966797, "logps/rejected": -407.82073974609375, "loss": 0.0004, "losses/dpo": 1.0379730497334094e-07, "losses/sft": 0.9317437410354614, "losses/total": 1.0379730497334094e-07, "ref_logps/chosen": -227.08384704589844, "ref_logps/rejected": -235.96621704101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.676397442817688, "rewards/margins": 15.509054183959961, "rewards/rejected": -17.18545150756836, "step": 3381 }, { "epoch": 0.81, "learning_rate": 4.1866666666666664e-08, "logps/chosen": -266.3857421875, "logps/rejected": -412.7493896484375, "loss": 0.0002, "losses/dpo": 1.9602211409619485e-08, "losses/sft": 0.587567150592804, "losses/total": 1.9602211409619485e-08, "ref_logps/chosen": -248.7305145263672, "ref_logps/rejected": -224.23348999023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7655267715454102, "rewards/margins": 17.086063385009766, "rewards/rejected": -18.851593017578125, "step": 3382 }, { "epoch": 0.81, "learning_rate": 4.1813333333333335e-08, "logps/chosen": -255.51683044433594, "logps/rejected": -385.021484375, "loss": 0.0051, "losses/dpo": 2.469113553615898e-07, "losses/sft": 0.6285591721534729, "losses/total": 2.469113553615898e-07, "ref_logps/chosen": -237.58949279785156, "ref_logps/rejected": -221.103271484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7927354574203491, "rewards/margins": 14.599088668823242, "rewards/rejected": -16.391822814941406, "step": 3383 }, { "epoch": 0.81, "learning_rate": 4.176e-08, "logps/chosen": -212.91525268554688, "logps/rejected": -372.2430419921875, "loss": 0.0029, "losses/dpo": 2.9173369853197073e-07, "losses/sft": 0.7244524359703064, "losses/total": 2.9173369853197073e-07, "ref_logps/chosen": -196.990234375, "ref_logps/rejected": -211.82957458496094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5925014019012451, "rewards/margins": 14.448847770690918, "rewards/rejected": -16.041349411010742, "step": 3384 }, { "epoch": 0.81, "learning_rate": 4.170666666666666e-08, "logps/chosen": -264.5030517578125, "logps/rejected": -410.83843994140625, "loss": 0.0003, "losses/dpo": 7.729842321779756e-10, "losses/sft": 0.7268298268318176, "losses/total": 7.729842321779756e-10, "ref_logps/chosen": -243.9127197265625, "ref_logps/rejected": -227.24832153320312, "rewards/accuracies": 1.0, "rewards/chosen": -2.059030294418335, "rewards/margins": 16.299983978271484, "rewards/rejected": -18.35901641845703, "step": 3385 }, { "epoch": 0.81, "learning_rate": 4.165333333333333e-08, "logps/chosen": -249.81741333007812, "logps/rejected": -408.0929870605469, "loss": 0.0009, "losses/dpo": 1.2410819749675284e-07, "losses/sft": 0.7846064567565918, "losses/total": 1.2410819749675284e-07, "ref_logps/chosen": -233.2797393798828, "ref_logps/rejected": -233.65057373046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.653768539428711, "rewards/margins": 15.790473937988281, "rewards/rejected": -17.444242477416992, "step": 3386 }, { "epoch": 0.81, "learning_rate": 4.1599999999999995e-08, "logps/chosen": -220.83975219726562, "logps/rejected": -389.3464660644531, "loss": 0.0008, "losses/dpo": 3.5388420656090602e-06, "losses/sft": 0.5729036331176758, "losses/total": 3.5388420656090602e-06, "ref_logps/chosen": -201.48733520507812, "ref_logps/rejected": -218.63690185546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.935239315032959, "rewards/margins": 15.135717391967773, "rewards/rejected": -17.070955276489258, "step": 3387 }, { "epoch": 0.81, "learning_rate": 4.1546666666666665e-08, "logps/chosen": -261.02301025390625, "logps/rejected": -367.65264892578125, "loss": 0.0009, "losses/dpo": 1.0556293091212865e-05, "losses/sft": 0.5961645841598511, "losses/total": 1.0556293091212865e-05, "ref_logps/chosen": -246.28176879882812, "ref_logps/rejected": -219.89401245117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4741243124008179, "rewards/margins": 13.301739692687988, "rewards/rejected": -14.775864601135254, "step": 3388 }, { "epoch": 0.81, "learning_rate": 4.149333333333333e-08, "logps/chosen": -282.27972412109375, "logps/rejected": -421.86846923828125, "loss": 0.0002, "losses/dpo": 6.7387201374558e-10, "losses/sft": 0.585343062877655, "losses/total": 6.7387201374558e-10, "ref_logps/chosen": -263.1717529296875, "ref_logps/rejected": -245.12840270996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.9107983112335205, "rewards/margins": 15.76321029663086, "rewards/rejected": -17.674007415771484, "step": 3389 }, { "epoch": 0.81, "learning_rate": 4.144e-08, "logps/chosen": -242.60198974609375, "logps/rejected": -385.3513488769531, "loss": 0.0013, "losses/dpo": 4.108864231966436e-06, "losses/sft": 0.6502060294151306, "losses/total": 4.108864231966436e-06, "ref_logps/chosen": -227.88613891601562, "ref_logps/rejected": -229.01219177246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4715858697891235, "rewards/margins": 14.162330627441406, "rewards/rejected": -15.633916854858398, "step": 3390 }, { "epoch": 0.81, "learning_rate": 4.138666666666666e-08, "logps/chosen": -249.92076110839844, "logps/rejected": -398.10406494140625, "loss": 0.0003, "losses/dpo": 3.1660178922976456e-09, "losses/sft": 0.494380384683609, "losses/total": 3.1660178922976456e-09, "ref_logps/chosen": -230.53515625, "ref_logps/rejected": -216.92050170898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9385595321655273, "rewards/margins": 16.17979621887207, "rewards/rejected": -18.11835479736328, "step": 3391 }, { "epoch": 0.81, "learning_rate": 4.133333333333333e-08, "logps/chosen": -205.5152587890625, "logps/rejected": -394.7110595703125, "loss": 0.0018, "losses/dpo": 4.007524239568738e-06, "losses/sft": 0.5668821334838867, "losses/total": 4.007524239568738e-06, "ref_logps/chosen": -188.77761840820312, "ref_logps/rejected": -225.9619903564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6737663745880127, "rewards/margins": 15.201139450073242, "rewards/rejected": -16.87490463256836, "step": 3392 }, { "epoch": 0.81, "learning_rate": 4.1279999999999996e-08, "logps/chosen": -210.26890563964844, "logps/rejected": -373.640380859375, "loss": 0.002, "losses/dpo": 1.242096573150775e-06, "losses/sft": 0.9378220438957214, "losses/total": 1.242096573150775e-06, "ref_logps/chosen": -195.6708984375, "ref_logps/rejected": -213.66127014160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4598009586334229, "rewards/margins": 14.538110733032227, "rewards/rejected": -15.99791145324707, "step": 3393 }, { "epoch": 0.81, "learning_rate": 4.1226666666666667e-08, "logps/chosen": -269.05364990234375, "logps/rejected": -359.7716979980469, "loss": 0.0011, "losses/dpo": 3.33470282498638e-08, "losses/sft": 0.6947476863861084, "losses/total": 3.33470282498638e-08, "ref_logps/chosen": -252.12136840820312, "ref_logps/rejected": -212.89056396484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6932293176651, "rewards/margins": 12.994884490966797, "rewards/rejected": -14.688114166259766, "step": 3394 }, { "epoch": 0.81, "learning_rate": 4.117333333333333e-08, "logps/chosen": -296.3728942871094, "logps/rejected": -395.401123046875, "loss": 0.0002, "losses/dpo": 1.3735665049807722e-07, "losses/sft": 0.5219188928604126, "losses/total": 1.3735665049807722e-07, "ref_logps/chosen": -276.0104675292969, "ref_logps/rejected": -227.27769470214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.036243200302124, "rewards/margins": 14.776098251342773, "rewards/rejected": -16.812339782714844, "step": 3395 }, { "epoch": 0.81, "learning_rate": 4.112e-08, "logps/chosen": -241.173095703125, "logps/rejected": -378.2516784667969, "loss": 0.0004, "losses/dpo": 4.759589344871529e-08, "losses/sft": 0.5661069750785828, "losses/total": 4.759589344871529e-08, "ref_logps/chosen": -226.78927612304688, "ref_logps/rejected": -219.2713165283203, "rewards/accuracies": 1.0, "rewards/chosen": -1.4383823871612549, "rewards/margins": 14.459653854370117, "rewards/rejected": -15.898037910461426, "step": 3396 }, { "epoch": 0.82, "learning_rate": 4.1066666666666664e-08, "logps/chosen": -243.0471954345703, "logps/rejected": -381.99664306640625, "loss": 0.0027, "losses/dpo": 1.617161979083903e-05, "losses/sft": 0.631109356880188, "losses/total": 1.617161979083903e-05, "ref_logps/chosen": -225.11883544921875, "ref_logps/rejected": -219.96017456054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7928352355957031, "rewards/margins": 14.410809516906738, "rewards/rejected": -16.203643798828125, "step": 3397 }, { "epoch": 0.82, "learning_rate": 4.1013333333333334e-08, "logps/chosen": -188.78244018554688, "logps/rejected": -362.1385498046875, "loss": 0.0001, "losses/dpo": 1.4365477696287599e-09, "losses/sft": 0.9199944734573364, "losses/total": 1.4365477696287599e-09, "ref_logps/chosen": -175.35299682617188, "ref_logps/rejected": -197.28305053710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.342942714691162, "rewards/margins": 15.142608642578125, "rewards/rejected": -16.485551834106445, "step": 3398 }, { "epoch": 0.82, "learning_rate": 4.096e-08, "logps/chosen": -241.960693359375, "logps/rejected": -404.47637939453125, "loss": 0.0009, "losses/dpo": 1.4997388461779337e-05, "losses/sft": 0.5151885747909546, "losses/total": 1.4997388461779337e-05, "ref_logps/chosen": -224.29336547851562, "ref_logps/rejected": -227.36904907226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.766733169555664, "rewards/margins": 15.944003105163574, "rewards/rejected": -17.710737228393555, "step": 3399 }, { "epoch": 0.82, "learning_rate": 4.090666666666667e-08, "logps/chosen": -245.57171630859375, "logps/rejected": -397.9075622558594, "loss": 0.0006, "losses/dpo": 1.6233353505867854e-07, "losses/sft": 0.43618956208229065, "losses/total": 1.6233353505867854e-07, "ref_logps/chosen": -226.12571716308594, "ref_logps/rejected": -223.16812133789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9445998668670654, "rewards/margins": 15.52934741973877, "rewards/rejected": -17.47394561767578, "step": 3400 }, { "epoch": 0.82, "learning_rate": 4.085333333333333e-08, "logps/chosen": -266.4376220703125, "logps/rejected": -438.4355773925781, "loss": 0.0, "losses/dpo": 3.0093578970991075e-06, "losses/sft": 1.44698166847229, "losses/total": 3.0093578970991075e-06, "ref_logps/chosen": -250.76132202148438, "ref_logps/rejected": -248.6719207763672, "rewards/accuracies": 1.0, "rewards/chosen": -1.5676276683807373, "rewards/margins": 17.408737182617188, "rewards/rejected": -18.976367950439453, "step": 3401 }, { "epoch": 0.82, "learning_rate": 4.0799999999999995e-08, "logps/chosen": -202.10084533691406, "logps/rejected": -350.40509033203125, "loss": 0.0062, "losses/dpo": 3.569394024793837e-08, "losses/sft": 0.8081244230270386, "losses/total": 3.569394024793837e-08, "ref_logps/chosen": -190.95492553710938, "ref_logps/rejected": -198.19198608398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.1145933866500854, "rewards/margins": 14.106714248657227, "rewards/rejected": -15.221307754516602, "step": 3402 }, { "epoch": 0.82, "learning_rate": 4.0746666666666665e-08, "logps/chosen": -256.372314453125, "logps/rejected": -418.89263916015625, "loss": 0.0, "losses/dpo": 7.578699889876361e-10, "losses/sft": 0.4439549148082733, "losses/total": 7.578699889876361e-10, "ref_logps/chosen": -238.60397338867188, "ref_logps/rejected": -232.02590942382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7768350839614868, "rewards/margins": 16.909839630126953, "rewards/rejected": -18.686674118041992, "step": 3403 }, { "epoch": 0.82, "learning_rate": 4.069333333333333e-08, "logps/chosen": -237.90785217285156, "logps/rejected": -387.18621826171875, "loss": 0.0005, "losses/dpo": 5.69898304547678e-07, "losses/sft": 0.5866971015930176, "losses/total": 5.69898304547678e-07, "ref_logps/chosen": -219.63101196289062, "ref_logps/rejected": -216.13934326171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8276851177215576, "rewards/margins": 15.27700424194336, "rewards/rejected": -17.10468864440918, "step": 3404 }, { "epoch": 0.82, "learning_rate": 4.064e-08, "logps/chosen": -228.49246215820312, "logps/rejected": -395.53448486328125, "loss": 0.0007, "losses/dpo": 8.447130994682084e-08, "losses/sft": 0.6806240677833557, "losses/total": 8.447130994682084e-08, "ref_logps/chosen": -212.30966186523438, "ref_logps/rejected": -230.77627563476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.618280291557312, "rewards/margins": 14.857540130615234, "rewards/rejected": -16.475820541381836, "step": 3405 }, { "epoch": 0.82, "learning_rate": 4.058666666666666e-08, "logps/chosen": -242.60818481445312, "logps/rejected": -357.6401062011719, "loss": 0.0008, "losses/dpo": 7.046738748073267e-10, "losses/sft": 0.8837845325469971, "losses/total": 7.046738748073267e-10, "ref_logps/chosen": -220.14697265625, "ref_logps/rejected": -200.50930786132812, "rewards/accuracies": 1.0, "rewards/chosen": -2.246121883392334, "rewards/margins": 13.466958045959473, "rewards/rejected": -15.713080406188965, "step": 3406 }, { "epoch": 0.82, "learning_rate": 4.053333333333333e-08, "logps/chosen": -260.26806640625, "logps/rejected": -413.2633361816406, "loss": 0.0002, "losses/dpo": 4.0762984099274036e-08, "losses/sft": 0.550690233707428, "losses/total": 4.0762984099274036e-08, "ref_logps/chosen": -241.2746124267578, "ref_logps/rejected": -240.66909790039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.8993468284606934, "rewards/margins": 15.360078811645508, "rewards/rejected": -17.25942611694336, "step": 3407 }, { "epoch": 0.82, "learning_rate": 4.0479999999999996e-08, "logps/chosen": -242.5592041015625, "logps/rejected": -387.728271484375, "loss": 0.0008, "losses/dpo": 2.4028219414162777e-08, "losses/sft": 0.632672131061554, "losses/total": 2.4028219414162777e-08, "ref_logps/chosen": -228.59967041015625, "ref_logps/rejected": -220.3228759765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.3959543704986572, "rewards/margins": 15.344589233398438, "rewards/rejected": -16.740543365478516, "step": 3408 }, { "epoch": 0.82, "learning_rate": 4.0426666666666666e-08, "logps/chosen": -228.14205932617188, "logps/rejected": -394.23284912109375, "loss": 0.0005, "losses/dpo": 5.4941775573524865e-09, "losses/sft": 0.5770009160041809, "losses/total": 5.4941775573524865e-09, "ref_logps/chosen": -214.7584228515625, "ref_logps/rejected": -226.41879272460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.3383630514144897, "rewards/margins": 15.443042755126953, "rewards/rejected": -16.78140640258789, "step": 3409 }, { "epoch": 0.82, "learning_rate": 4.037333333333333e-08, "logps/chosen": -270.63092041015625, "logps/rejected": -401.1911315917969, "loss": 0.0, "losses/dpo": 1.938384599498022e-07, "losses/sft": 0.6652435064315796, "losses/total": 1.938384599498022e-07, "ref_logps/chosen": -252.06484985351562, "ref_logps/rejected": -230.1015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8566067218780518, "rewards/margins": 15.252349853515625, "rewards/rejected": -17.108957290649414, "step": 3410 }, { "epoch": 0.82, "learning_rate": 4.032e-08, "logps/chosen": -256.7047119140625, "logps/rejected": -405.30035400390625, "loss": 0.0001, "losses/dpo": 2.2541465916070536e-10, "losses/sft": 0.6037826538085938, "losses/total": 2.2541465916070536e-10, "ref_logps/chosen": -239.4854736328125, "ref_logps/rejected": -241.6102294921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7219223976135254, "rewards/margins": 14.647093772888184, "rewards/rejected": -16.369014739990234, "step": 3411 }, { "epoch": 0.82, "learning_rate": 4.026666666666666e-08, "logps/chosen": -207.4177703857422, "logps/rejected": -397.25897216796875, "loss": 0.0026, "losses/dpo": 2.3042650010296484e-09, "losses/sft": 0.8455365896224976, "losses/total": 2.3042650010296484e-09, "ref_logps/chosen": -190.36746215820312, "ref_logps/rejected": -223.74661254882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.705030918121338, "rewards/margins": 15.646206855773926, "rewards/rejected": -17.351238250732422, "step": 3412 }, { "epoch": 0.82, "learning_rate": 4.0213333333333333e-08, "logps/chosen": -257.03057861328125, "logps/rejected": -386.38702392578125, "loss": 0.0057, "losses/dpo": 2.5338917453154863e-07, "losses/sft": 0.6649165153503418, "losses/total": 2.5338917453154863e-07, "ref_logps/chosen": -239.75399780273438, "ref_logps/rejected": -231.85873413085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.7276581525802612, "rewards/margins": 13.725168228149414, "rewards/rejected": -15.452826499938965, "step": 3413 }, { "epoch": 0.82, "learning_rate": 4.016e-08, "logps/chosen": -254.43019104003906, "logps/rejected": -375.0400390625, "loss": 0.0004, "losses/dpo": 2.469124183335225e-06, "losses/sft": 0.497700035572052, "losses/total": 2.469124183335225e-06, "ref_logps/chosen": -235.01861572265625, "ref_logps/rejected": -213.59182739257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9411565065383911, "rewards/margins": 14.203664779663086, "rewards/rejected": -16.144821166992188, "step": 3414 }, { "epoch": 0.82, "learning_rate": 4.010666666666667e-08, "logps/chosen": -251.50540161132812, "logps/rejected": -391.7640380859375, "loss": 0.001, "losses/dpo": 2.823795242079541e-08, "losses/sft": 0.8818312287330627, "losses/total": 2.823795242079541e-08, "ref_logps/chosen": -231.98556518554688, "ref_logps/rejected": -226.84315490722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.9519834518432617, "rewards/margins": 14.540103912353516, "rewards/rejected": -16.492088317871094, "step": 3415 }, { "epoch": 0.82, "learning_rate": 4.005333333333333e-08, "logps/chosen": -228.89468383789062, "logps/rejected": -415.7366027832031, "loss": 0.0, "losses/dpo": 3.386796976201367e-08, "losses/sft": 1.0123536586761475, "losses/total": 3.386796976201367e-08, "ref_logps/chosen": -209.4799346923828, "ref_logps/rejected": -231.92689514160156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9414747953414917, "rewards/margins": 16.439495086669922, "rewards/rejected": -18.380970001220703, "step": 3416 }, { "epoch": 0.82, "learning_rate": 4e-08, "logps/chosen": -229.94183349609375, "logps/rejected": -385.15777587890625, "loss": 0.0004, "losses/dpo": 7.1813388785813e-05, "losses/sft": 0.7131250500679016, "losses/total": 7.1813388785813e-05, "ref_logps/chosen": -213.80551147460938, "ref_logps/rejected": -215.2949981689453, "rewards/accuracies": 1.0, "rewards/chosen": -1.6136335134506226, "rewards/margins": 15.372642517089844, "rewards/rejected": -16.986276626586914, "step": 3417 }, { "epoch": 0.82, "learning_rate": 3.9946666666666664e-08, "logps/chosen": -218.84683227539062, "logps/rejected": -332.1837158203125, "loss": 0.006, "losses/dpo": 4.7731258945304944e-08, "losses/sft": 0.7579538226127625, "losses/total": 4.7731258945304944e-08, "ref_logps/chosen": -204.7733917236328, "ref_logps/rejected": -188.24232482910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4073426723480225, "rewards/margins": 12.986797332763672, "rewards/rejected": -14.394140243530273, "step": 3418 }, { "epoch": 0.82, "learning_rate": 3.989333333333333e-08, "logps/chosen": -254.8389129638672, "logps/rejected": -382.465087890625, "loss": 0.0002, "losses/dpo": 1.6763591403901046e-08, "losses/sft": 0.6484614610671997, "losses/total": 1.6763591403901046e-08, "ref_logps/chosen": -238.4400634765625, "ref_logps/rejected": -222.76148986816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.639885425567627, "rewards/margins": 14.330472946166992, "rewards/rejected": -15.970359802246094, "step": 3419 }, { "epoch": 0.82, "learning_rate": 3.984e-08, "logps/chosen": -256.0015563964844, "logps/rejected": -393.0314636230469, "loss": 0.0, "losses/dpo": 1.7442408628554062e-09, "losses/sft": 0.626613974571228, "losses/total": 1.7442408628554062e-09, "ref_logps/chosen": -239.7365264892578, "ref_logps/rejected": -222.1123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6265027523040771, "rewards/margins": 15.465414047241211, "rewards/rejected": -17.091915130615234, "step": 3420 }, { "epoch": 0.82, "learning_rate": 3.978666666666666e-08, "logps/chosen": -209.0359344482422, "logps/rejected": -397.7913818359375, "loss": 0.0017, "losses/dpo": 3.652275148979811e-09, "losses/sft": 0.5353580117225647, "losses/total": 3.652275148979811e-09, "ref_logps/chosen": -193.33978271484375, "ref_logps/rejected": -226.8620147705078, "rewards/accuracies": 1.0, "rewards/chosen": -1.5696157217025757, "rewards/margins": 15.523321151733398, "rewards/rejected": -17.092937469482422, "step": 3421 }, { "epoch": 0.82, "learning_rate": 3.973333333333333e-08, "logps/chosen": -257.0330505371094, "logps/rejected": -394.9528503417969, "loss": 0.0011, "losses/dpo": 8.567447966345565e-12, "losses/sft": 0.6875512003898621, "losses/total": 8.567447966345565e-12, "ref_logps/chosen": -236.9842529296875, "ref_logps/rejected": -223.04864501953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0048813819885254, "rewards/margins": 15.185543060302734, "rewards/rejected": -17.1904239654541, "step": 3422 }, { "epoch": 0.82, "learning_rate": 3.9679999999999995e-08, "logps/chosen": -255.85824584960938, "logps/rejected": -398.60516357421875, "loss": 0.0016, "losses/dpo": 2.8764871373709866e-08, "losses/sft": 0.6530109643936157, "losses/total": 2.8764871373709866e-08, "ref_logps/chosen": -238.83872985839844, "ref_logps/rejected": -225.46929931640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7019518613815308, "rewards/margins": 15.611635208129883, "rewards/rejected": -17.313587188720703, "step": 3423 }, { "epoch": 0.82, "learning_rate": 3.9626666666666665e-08, "logps/chosen": -284.1251220703125, "logps/rejected": -421.78369140625, "loss": 0.0007, "losses/dpo": 1.1146906730985506e-09, "losses/sft": 0.6418647170066833, "losses/total": 1.1146906730985506e-09, "ref_logps/chosen": -265.4248962402344, "ref_logps/rejected": -251.32907104492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8700189590454102, "rewards/margins": 15.17544174194336, "rewards/rejected": -17.045459747314453, "step": 3424 }, { "epoch": 0.82, "learning_rate": 3.957333333333333e-08, "logps/chosen": -248.0122528076172, "logps/rejected": -365.97064208984375, "loss": 0.0009, "losses/dpo": 0.0009044882026501, "losses/sft": 0.47130173444747925, "losses/total": 0.0009044882026501, "ref_logps/chosen": -233.59317016601562, "ref_logps/rejected": -208.1092529296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.441907286643982, "rewards/margins": 14.34422779083252, "rewards/rejected": -15.786136627197266, "step": 3425 }, { "epoch": 0.82, "learning_rate": 3.952e-08, "logps/chosen": -264.29034423828125, "logps/rejected": -375.0930480957031, "loss": 0.0295, "losses/dpo": 4.512177298465758e-09, "losses/sft": 0.7364522814750671, "losses/total": 4.512177298465758e-09, "ref_logps/chosen": -246.44189453125, "ref_logps/rejected": -214.40931701660156, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7848432064056396, "rewards/margins": 14.283531188964844, "rewards/rejected": -16.068374633789062, "step": 3426 }, { "epoch": 0.82, "learning_rate": 3.946666666666666e-08, "logps/chosen": -246.99307250976562, "logps/rejected": -385.8580322265625, "loss": 0.0036, "losses/dpo": 6.930938462046399e-11, "losses/sft": 0.5595699548721313, "losses/total": 6.930938462046399e-11, "ref_logps/chosen": -225.84799194335938, "ref_logps/rejected": -218.64617919921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.114509105682373, "rewards/margins": 14.606679916381836, "rewards/rejected": -16.721187591552734, "step": 3427 }, { "epoch": 0.82, "learning_rate": 3.941333333333333e-08, "logps/chosen": -232.29393005371094, "logps/rejected": -356.8094482421875, "loss": 0.0012, "losses/dpo": 1.5474028259632178e-05, "losses/sft": 0.8117448687553406, "losses/total": 1.5474028259632178e-05, "ref_logps/chosen": -218.09860229492188, "ref_logps/rejected": -199.40560913085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.4195330142974854, "rewards/margins": 14.320849418640137, "rewards/rejected": -15.74038314819336, "step": 3428 }, { "epoch": 0.82, "learning_rate": 3.9359999999999996e-08, "logps/chosen": -287.91796875, "logps/rejected": -445.7091369628906, "loss": 0.0007, "losses/dpo": 5.924239831339717e-12, "losses/sft": 0.4545619785785675, "losses/total": 5.924239831339717e-12, "ref_logps/chosen": -269.9953308105469, "ref_logps/rejected": -264.5155029296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.79226553440094, "rewards/margins": 16.327096939086914, "rewards/rejected": -18.119361877441406, "step": 3429 }, { "epoch": 0.82, "learning_rate": 3.9306666666666667e-08, "logps/chosen": -219.12570190429688, "logps/rejected": -345.21484375, "loss": 0.001, "losses/dpo": 3.724208441635568e-10, "losses/sft": 0.5720812678337097, "losses/total": 3.724208441635568e-10, "ref_logps/chosen": -205.51318359375, "ref_logps/rejected": -192.63681030273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.3612525463104248, "rewards/margins": 13.896551132202148, "rewards/rejected": -15.257802963256836, "step": 3430 }, { "epoch": 0.82, "learning_rate": 3.925333333333333e-08, "logps/chosen": -192.68582153320312, "logps/rejected": -404.48553466796875, "loss": 0.0002, "losses/dpo": 3.907489372068085e-07, "losses/sft": 0.4375676214694977, "losses/total": 3.907489372068085e-07, "ref_logps/chosen": -179.4320831298828, "ref_logps/rejected": -228.83416748046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3253743648529053, "rewards/margins": 16.239761352539062, "rewards/rejected": -17.56513786315918, "step": 3431 }, { "epoch": 0.82, "learning_rate": 3.92e-08, "logps/chosen": -238.10458374023438, "logps/rejected": -364.74957275390625, "loss": 0.0026, "losses/dpo": 6.993824630541212e-09, "losses/sft": 0.5927883982658386, "losses/total": 6.993824630541212e-09, "ref_logps/chosen": -223.78753662109375, "ref_logps/rejected": -214.0388641357422, "rewards/accuracies": 1.0, "rewards/chosen": -1.431703805923462, "rewards/margins": 13.639369010925293, "rewards/rejected": -15.071073532104492, "step": 3432 }, { "epoch": 0.82, "learning_rate": 3.9146666666666664e-08, "logps/chosen": -267.660888671875, "logps/rejected": -422.7109069824219, "loss": 0.0001, "losses/dpo": 1.899829294416122e-05, "losses/sft": 0.7420224547386169, "losses/total": 1.899829294416122e-05, "ref_logps/chosen": -248.46575927734375, "ref_logps/rejected": -236.97744750976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.919514775276184, "rewards/margins": 16.653831481933594, "rewards/rejected": -18.573345184326172, "step": 3433 }, { "epoch": 0.82, "learning_rate": 3.9093333333333334e-08, "logps/chosen": -256.67022705078125, "logps/rejected": -441.9606628417969, "loss": 0.0, "losses/dpo": 6.913929695429033e-08, "losses/sft": 0.6736648678779602, "losses/total": 6.913929695429033e-08, "ref_logps/chosen": -241.4442138671875, "ref_logps/rejected": -252.5431671142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.5225982666015625, "rewards/margins": 17.419153213500977, "rewards/rejected": -18.941749572753906, "step": 3434 }, { "epoch": 0.82, "learning_rate": 3.904e-08, "logps/chosen": -241.5613250732422, "logps/rejected": -390.6103820800781, "loss": 0.0002, "losses/dpo": 2.0431846436963497e-08, "losses/sft": 0.6342547535896301, "losses/total": 2.0431846436963497e-08, "ref_logps/chosen": -223.99911499023438, "ref_logps/rejected": -227.93600463867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7562196254730225, "rewards/margins": 14.511218070983887, "rewards/rejected": -16.267436981201172, "step": 3435 }, { "epoch": 0.82, "learning_rate": 3.898666666666666e-08, "logps/chosen": -224.66683959960938, "logps/rejected": -369.8081970214844, "loss": 0.0003, "losses/dpo": 2.0866565364485723e-08, "losses/sft": 0.5811311602592468, "losses/total": 2.0866565364485723e-08, "ref_logps/chosen": -209.86746215820312, "ref_logps/rejected": -205.52096557617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4799370765686035, "rewards/margins": 14.948787689208984, "rewards/rejected": -16.42872428894043, "step": 3436 }, { "epoch": 0.82, "learning_rate": 3.893333333333333e-08, "logps/chosen": -255.8284912109375, "logps/rejected": -417.0464782714844, "loss": 0.0006, "losses/dpo": 2.2928377946263012e-11, "losses/sft": 0.5948590040206909, "losses/total": 2.2928377946263012e-11, "ref_logps/chosen": -240.10166931152344, "ref_logps/rejected": -239.81378173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5726819038391113, "rewards/margins": 16.150590896606445, "rewards/rejected": -17.72327423095703, "step": 3437 }, { "epoch": 0.83, "learning_rate": 3.8879999999999995e-08, "logps/chosen": -211.39117431640625, "logps/rejected": -361.1182861328125, "loss": 0.0068, "losses/dpo": 1.243791757588042e-06, "losses/sft": 0.5880813598632812, "losses/total": 1.243791757588042e-06, "ref_logps/chosen": -195.8642120361328, "ref_logps/rejected": -206.2902069091797, "rewards/accuracies": 1.0, "rewards/chosen": -1.5526938438415527, "rewards/margins": 13.93011474609375, "rewards/rejected": -15.482809066772461, "step": 3438 }, { "epoch": 0.83, "learning_rate": 3.8826666666666665e-08, "logps/chosen": -258.9477844238281, "logps/rejected": -410.28424072265625, "loss": 0.0006, "losses/dpo": 8.271681144833565e-05, "losses/sft": 1.1630901098251343, "losses/total": 8.271681144833565e-05, "ref_logps/chosen": -236.91590881347656, "ref_logps/rejected": -233.8134002685547, "rewards/accuracies": 1.0, "rewards/chosen": -2.2031891345977783, "rewards/margins": 15.44389533996582, "rewards/rejected": -17.647085189819336, "step": 3439 }, { "epoch": 0.83, "learning_rate": 3.877333333333333e-08, "logps/chosen": -208.56771850585938, "logps/rejected": -388.6226501464844, "loss": 0.0031, "losses/dpo": 1.739742145900891e-07, "losses/sft": 0.5556692481040955, "losses/total": 1.739742145900891e-07, "ref_logps/chosen": -192.7618865966797, "ref_logps/rejected": -229.33834838867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.580582618713379, "rewards/margins": 14.347850799560547, "rewards/rejected": -15.92843246459961, "step": 3440 }, { "epoch": 0.83, "learning_rate": 3.872e-08, "logps/chosen": -220.19900512695312, "logps/rejected": -367.9686584472656, "loss": 0.0024, "losses/dpo": 5.244082511984827e-12, "losses/sft": 0.5213950276374817, "losses/total": 5.244082511984827e-12, "ref_logps/chosen": -203.8770751953125, "ref_logps/rejected": -215.44105529785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6321945190429688, "rewards/margins": 13.620565414428711, "rewards/rejected": -15.25275993347168, "step": 3441 }, { "epoch": 0.83, "learning_rate": 3.866666666666666e-08, "logps/chosen": -197.83172607421875, "logps/rejected": -352.22052001953125, "loss": 0.0017, "losses/dpo": 1.483953298020424e-07, "losses/sft": 0.6889654994010925, "losses/total": 1.483953298020424e-07, "ref_logps/chosen": -182.62875366210938, "ref_logps/rejected": -195.58729553222656, "rewards/accuracies": 1.0, "rewards/chosen": -1.5202981233596802, "rewards/margins": 14.143025398254395, "rewards/rejected": -15.663323402404785, "step": 3442 }, { "epoch": 0.83, "learning_rate": 3.861333333333333e-08, "logps/chosen": -268.4698791503906, "logps/rejected": -389.00537109375, "loss": 0.0006, "losses/dpo": 1.4524399478271777e-12, "losses/sft": 0.36038991808891296, "losses/total": 1.4524399478271777e-12, "ref_logps/chosen": -254.00543212890625, "ref_logps/rejected": -218.94468688964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4464449882507324, "rewards/margins": 15.559621810913086, "rewards/rejected": -17.006067276000977, "step": 3443 }, { "epoch": 0.83, "learning_rate": 3.8559999999999996e-08, "logps/chosen": -229.95370483398438, "logps/rejected": -357.8396301269531, "loss": 0.0002, "losses/dpo": 5.1984624604983765e-08, "losses/sft": 0.41988831758499146, "losses/total": 5.1984624604983765e-08, "ref_logps/chosen": -213.880859375, "ref_logps/rejected": -200.33721923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6072845458984375, "rewards/margins": 14.14295768737793, "rewards/rejected": -15.750242233276367, "step": 3444 }, { "epoch": 0.83, "learning_rate": 3.8506666666666666e-08, "logps/chosen": -227.91998291015625, "logps/rejected": -386.2181396484375, "loss": 0.0008, "losses/dpo": 3.309301277454324e-08, "losses/sft": 0.5442681908607483, "losses/total": 3.309301277454324e-08, "ref_logps/chosen": -210.0767822265625, "ref_logps/rejected": -219.96627807617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7843208312988281, "rewards/margins": 14.840866088867188, "rewards/rejected": -16.625186920166016, "step": 3445 }, { "epoch": 0.83, "learning_rate": 3.845333333333333e-08, "logps/chosen": -249.2884521484375, "logps/rejected": -397.9482421875, "loss": 0.0007, "losses/dpo": 3.545690390183154e-07, "losses/sft": 0.6948882937431335, "losses/total": 3.545690390183154e-07, "ref_logps/chosen": -230.5282745361328, "ref_logps/rejected": -231.09115600585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8760161399841309, "rewards/margins": 14.809691429138184, "rewards/rejected": -16.685707092285156, "step": 3446 }, { "epoch": 0.83, "learning_rate": 3.84e-08, "logps/chosen": -199.11563110351562, "logps/rejected": -343.59417724609375, "loss": 0.0, "losses/dpo": 1.5153483801100265e-08, "losses/sft": 1.0876245498657227, "losses/total": 1.5153483801100265e-08, "ref_logps/chosen": -185.53475952148438, "ref_logps/rejected": -188.9998321533203, "rewards/accuracies": 1.0, "rewards/chosen": -1.3580873012542725, "rewards/margins": 14.101346015930176, "rewards/rejected": -15.459433555603027, "step": 3447 }, { "epoch": 0.83, "learning_rate": 3.834666666666666e-08, "logps/chosen": -232.70809936523438, "logps/rejected": -345.51922607421875, "loss": 0.0016, "losses/dpo": 9.981779847123562e-09, "losses/sft": 0.6103488802909851, "losses/total": 9.981779847123562e-09, "ref_logps/chosen": -216.66912841796875, "ref_logps/rejected": -191.64694213867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6038953065872192, "rewards/margins": 13.783334732055664, "rewards/rejected": -15.387228965759277, "step": 3448 }, { "epoch": 0.83, "learning_rate": 3.8293333333333333e-08, "logps/chosen": -259.4471130371094, "logps/rejected": -420.0103454589844, "loss": 0.0019, "losses/dpo": 4.865492009287209e-09, "losses/sft": 0.6471907496452332, "losses/total": 4.865492009287209e-09, "ref_logps/chosen": -238.90618896484375, "ref_logps/rejected": -224.26162719726562, "rewards/accuracies": 1.0, "rewards/chosen": -2.0540928840637207, "rewards/margins": 17.520780563354492, "rewards/rejected": -19.574874877929688, "step": 3449 }, { "epoch": 0.83, "learning_rate": 3.824e-08, "logps/chosen": -241.88296508789062, "logps/rejected": -377.724609375, "loss": 0.0001, "losses/dpo": 1.8581897165859118e-05, "losses/sft": 0.6250836253166199, "losses/total": 1.8581897165859118e-05, "ref_logps/chosen": -222.35145568847656, "ref_logps/rejected": -207.51722717285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9531497955322266, "rewards/margins": 15.067588806152344, "rewards/rejected": -17.02073860168457, "step": 3450 }, { "epoch": 0.83, "learning_rate": 3.818666666666667e-08, "logps/chosen": -282.306396484375, "logps/rejected": -440.7412109375, "loss": 0.0, "losses/dpo": 1.9248624916201607e-10, "losses/sft": 0.4773026406764984, "losses/total": 1.9248624916201607e-10, "ref_logps/chosen": -265.2956237792969, "ref_logps/rejected": -253.1302032470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7010761499404907, "rewards/margins": 17.06002426147461, "rewards/rejected": -18.76110076904297, "step": 3451 }, { "epoch": 0.83, "learning_rate": 3.813333333333334e-08, "logps/chosen": -228.19546508789062, "logps/rejected": -446.3401184082031, "loss": 0.0009, "losses/dpo": 1.9757685549848247e-06, "losses/sft": 0.4786781668663025, "losses/total": 1.9757685549848247e-06, "ref_logps/chosen": -213.88571166992188, "ref_logps/rejected": -254.24533081054688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4309744834899902, "rewards/margins": 17.778507232666016, "rewards/rejected": -19.20948028564453, "step": 3452 }, { "epoch": 0.83, "learning_rate": 3.808e-08, "logps/chosen": -229.1826629638672, "logps/rejected": -363.947509765625, "loss": 0.001, "losses/dpo": 2.684926769802587e-08, "losses/sft": 0.6156149506568909, "losses/total": 2.684926769802587e-08, "ref_logps/chosen": -208.08383178710938, "ref_logps/rejected": -201.61289978027344, "rewards/accuracies": 1.0, "rewards/chosen": -2.1098837852478027, "rewards/margins": 14.123576164245605, "rewards/rejected": -16.23345947265625, "step": 3453 }, { "epoch": 0.83, "learning_rate": 3.8026666666666664e-08, "logps/chosen": -243.79920959472656, "logps/rejected": -385.590576171875, "loss": 0.0033, "losses/dpo": 9.071752771205865e-08, "losses/sft": 0.8414705395698547, "losses/total": 9.071752771205865e-08, "ref_logps/chosen": -227.13052368164062, "ref_logps/rejected": -210.95428466796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.666866660118103, "rewards/margins": 15.796760559082031, "rewards/rejected": -17.463626861572266, "step": 3454 }, { "epoch": 0.83, "learning_rate": 3.797333333333333e-08, "logps/chosen": -249.73826599121094, "logps/rejected": -417.3148193359375, "loss": 0.001, "losses/dpo": 2.696459722173472e-11, "losses/sft": 0.4757257103919983, "losses/total": 2.696459722173472e-11, "ref_logps/chosen": -231.63677978515625, "ref_logps/rejected": -237.46453857421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8101484775543213, "rewards/margins": 16.174880981445312, "rewards/rejected": -17.985031127929688, "step": 3455 }, { "epoch": 0.83, "learning_rate": 3.792e-08, "logps/chosen": -303.6557922363281, "logps/rejected": -422.20587158203125, "loss": 0.0003, "losses/dpo": 5.227333677026991e-09, "losses/sft": 0.5004663467407227, "losses/total": 5.227333677026991e-09, "ref_logps/chosen": -283.56451416015625, "ref_logps/rejected": -242.41064453125, "rewards/accuracies": 1.0, "rewards/chosen": -2.009127616882324, "rewards/margins": 15.970396041870117, "rewards/rejected": -17.979522705078125, "step": 3456 }, { "epoch": 0.83, "learning_rate": 3.786666666666666e-08, "logps/chosen": -240.58499145507812, "logps/rejected": -439.19976806640625, "loss": 0.0, "losses/dpo": 2.587315464097628e-07, "losses/sft": 1.0057711601257324, "losses/total": 2.587315464097628e-07, "ref_logps/chosen": -227.57362365722656, "ref_logps/rejected": -249.56019592285156, "rewards/accuracies": 1.0, "rewards/chosen": -1.301137924194336, "rewards/margins": 17.662822723388672, "rewards/rejected": -18.96396255493164, "step": 3457 }, { "epoch": 0.83, "learning_rate": 3.781333333333333e-08, "logps/chosen": -253.98568725585938, "logps/rejected": -408.7612609863281, "loss": 0.0008, "losses/dpo": 1.7808816921149884e-11, "losses/sft": 0.5396802425384521, "losses/total": 1.7808816921149884e-11, "ref_logps/chosen": -235.3536376953125, "ref_logps/rejected": -228.39808654785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8632049560546875, "rewards/margins": 16.173112869262695, "rewards/rejected": -18.036317825317383, "step": 3458 }, { "epoch": 0.83, "learning_rate": 3.7759999999999995e-08, "logps/chosen": -262.1323547363281, "logps/rejected": -404.8121032714844, "loss": 0.0001, "losses/dpo": 3.8408924041277714e-08, "losses/sft": 0.6566250324249268, "losses/total": 3.8408924041277714e-08, "ref_logps/chosen": -244.60348510742188, "ref_logps/rejected": -235.19960021972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7528839111328125, "rewards/margins": 15.208365440368652, "rewards/rejected": -16.96125030517578, "step": 3459 }, { "epoch": 0.83, "learning_rate": 3.7706666666666665e-08, "logps/chosen": -226.4087677001953, "logps/rejected": -434.95526123046875, "loss": 0.001, "losses/dpo": 4.693751805007196e-07, "losses/sft": 0.41556286811828613, "losses/total": 4.693751805007196e-07, "ref_logps/chosen": -212.42642211914062, "ref_logps/rejected": -242.56076049804688, "rewards/accuracies": 1.0, "rewards/chosen": -1.3982361555099487, "rewards/margins": 17.841215133666992, "rewards/rejected": -19.239450454711914, "step": 3460 }, { "epoch": 0.83, "learning_rate": 3.765333333333333e-08, "logps/chosen": -249.75704956054688, "logps/rejected": -390.2835693359375, "loss": 0.0021, "losses/dpo": 1.06858969672885e-07, "losses/sft": 0.7176781296730042, "losses/total": 1.06858969672885e-07, "ref_logps/chosen": -230.479736328125, "ref_logps/rejected": -223.90089416503906, "rewards/accuracies": 1.0, "rewards/chosen": -1.927730917930603, "rewards/margins": 14.710535049438477, "rewards/rejected": -16.63826560974121, "step": 3461 }, { "epoch": 0.83, "learning_rate": 3.76e-08, "logps/chosen": -237.07301330566406, "logps/rejected": -408.78973388671875, "loss": 0.0032, "losses/dpo": 1.2431655704858713e-06, "losses/sft": 0.5137558579444885, "losses/total": 1.2431655704858713e-06, "ref_logps/chosen": -217.93238830566406, "ref_logps/rejected": -231.53500366210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.9140617847442627, "rewards/margins": 15.81141185760498, "rewards/rejected": -17.725473403930664, "step": 3462 }, { "epoch": 0.83, "learning_rate": 3.754666666666666e-08, "logps/chosen": -228.54800415039062, "logps/rejected": -393.240234375, "loss": 0.0024, "losses/dpo": 1.543707668361094e-07, "losses/sft": 0.8409584164619446, "losses/total": 1.543707668361094e-07, "ref_logps/chosen": -212.5019073486328, "ref_logps/rejected": -220.4994354248047, "rewards/accuracies": 1.0, "rewards/chosen": -1.6046099662780762, "rewards/margins": 15.66946792602539, "rewards/rejected": -17.274078369140625, "step": 3463 }, { "epoch": 0.83, "learning_rate": 3.749333333333333e-08, "logps/chosen": -242.39752197265625, "logps/rejected": -379.99395751953125, "loss": 0.004, "losses/dpo": 9.856159444154855e-09, "losses/sft": 0.46443086862564087, "losses/total": 9.856159444154855e-09, "ref_logps/chosen": -221.42721557617188, "ref_logps/rejected": -220.34237670898438, "rewards/accuracies": 1.0, "rewards/chosen": -2.0970306396484375, "rewards/margins": 13.868124961853027, "rewards/rejected": -15.965155601501465, "step": 3464 }, { "epoch": 0.83, "learning_rate": 3.7439999999999996e-08, "logps/chosen": -240.52740478515625, "logps/rejected": -412.24456787109375, "loss": 0.0002, "losses/dpo": 2.785309855548769e-18, "losses/sft": 0.8344330191612244, "losses/total": 2.785309855548769e-18, "ref_logps/chosen": -218.6568603515625, "ref_logps/rejected": -226.27674865722656, "rewards/accuracies": 1.0, "rewards/chosen": -2.1870522499084473, "rewards/margins": 16.40972900390625, "rewards/rejected": -18.59678077697754, "step": 3465 }, { "epoch": 0.83, "learning_rate": 3.7386666666666667e-08, "logps/chosen": -233.62051391601562, "logps/rejected": -373.47979736328125, "loss": 0.0013, "losses/dpo": 1.1434800626375363e-06, "losses/sft": 0.40132322907447815, "losses/total": 1.1434800626375363e-06, "ref_logps/chosen": -219.2690887451172, "ref_logps/rejected": -208.35104370117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4351422786712646, "rewards/margins": 15.07773208618164, "rewards/rejected": -16.512874603271484, "step": 3466 }, { "epoch": 0.83, "learning_rate": 3.733333333333333e-08, "logps/chosen": -243.7736358642578, "logps/rejected": -394.77886962890625, "loss": 0.0003, "losses/dpo": 2.4350288452268387e-09, "losses/sft": 0.6634224653244019, "losses/total": 2.4350288452268387e-09, "ref_logps/chosen": -225.53076171875, "ref_logps/rejected": -219.50711059570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.824288249015808, "rewards/margins": 15.702889442443848, "rewards/rejected": -17.527177810668945, "step": 3467 }, { "epoch": 0.83, "learning_rate": 3.728e-08, "logps/chosen": -250.81878662109375, "logps/rejected": -367.96923828125, "loss": 0.0187, "losses/dpo": 1.740161934549178e-07, "losses/sft": 0.6218432188034058, "losses/total": 1.740161934549178e-07, "ref_logps/chosen": -232.5075225830078, "ref_logps/rejected": -215.4524688720703, "rewards/accuracies": 1.0, "rewards/chosen": -1.8311272859573364, "rewards/margins": 13.420549392700195, "rewards/rejected": -15.251676559448242, "step": 3468 }, { "epoch": 0.83, "learning_rate": 3.722666666666667e-08, "logps/chosen": -217.38583374023438, "logps/rejected": -377.77978515625, "loss": 0.0129, "losses/dpo": 2.87996604342311e-09, "losses/sft": 0.7289098501205444, "losses/total": 2.87996604342311e-09, "ref_logps/chosen": -202.71717834472656, "ref_logps/rejected": -216.771484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4668643474578857, "rewards/margins": 14.633964538574219, "rewards/rejected": -16.100830078125, "step": 3469 }, { "epoch": 0.83, "learning_rate": 3.7173333333333334e-08, "logps/chosen": -209.61529541015625, "logps/rejected": -381.2676086425781, "loss": 0.001, "losses/dpo": 2.47563065158829e-07, "losses/sft": 0.5769534707069397, "losses/total": 2.47563065158829e-07, "ref_logps/chosen": -196.7666778564453, "ref_logps/rejected": -201.5494384765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.2848612070083618, "rewards/margins": 16.686954498291016, "rewards/rejected": -17.97181510925293, "step": 3470 }, { "epoch": 0.83, "learning_rate": 3.712e-08, "logps/chosen": -234.21163940429688, "logps/rejected": -406.40350341796875, "loss": 0.0, "losses/dpo": 6.474566589531605e-07, "losses/sft": 0.5763986110687256, "losses/total": 6.474566589531605e-07, "ref_logps/chosen": -218.27455139160156, "ref_logps/rejected": -231.2834930419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.5937098264694214, "rewards/margins": 15.918288230895996, "rewards/rejected": -17.512001037597656, "step": 3471 }, { "epoch": 0.83, "learning_rate": 3.706666666666666e-08, "logps/chosen": -219.31761169433594, "logps/rejected": -386.5748291015625, "loss": 0.0006, "losses/dpo": 3.9268204687914476e-08, "losses/sft": 0.5454466342926025, "losses/total": 3.9268204687914476e-08, "ref_logps/chosen": -202.80950927734375, "ref_logps/rejected": -219.83139038085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6508097648620605, "rewards/margins": 15.023531913757324, "rewards/rejected": -16.674341201782227, "step": 3472 }, { "epoch": 0.83, "learning_rate": 3.701333333333333e-08, "logps/chosen": -236.38784790039062, "logps/rejected": -368.0623779296875, "loss": 0.0006, "losses/dpo": 7.582639938163993e-08, "losses/sft": 0.2914814352989197, "losses/total": 7.582639938163993e-08, "ref_logps/chosen": -223.2770538330078, "ref_logps/rejected": -209.31417846679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.3110803365707397, "rewards/margins": 14.563741683959961, "rewards/rejected": -15.874822616577148, "step": 3473 }, { "epoch": 0.83, "learning_rate": 3.6959999999999995e-08, "logps/chosen": -262.61138916015625, "logps/rejected": -424.9372253417969, "loss": 0.003, "losses/dpo": 1.1275458344783829e-10, "losses/sft": 0.4940972328186035, "losses/total": 1.1275458344783829e-10, "ref_logps/chosen": -244.8855743408203, "ref_logps/rejected": -252.69912719726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7725807428359985, "rewards/margins": 15.451229095458984, "rewards/rejected": -17.22381019592285, "step": 3474 }, { "epoch": 0.83, "learning_rate": 3.6906666666666665e-08, "logps/chosen": -240.00640869140625, "logps/rejected": -383.7481689453125, "loss": 0.0071, "losses/dpo": 8.630683390187244e-10, "losses/sft": 0.8314814567565918, "losses/total": 8.630683390187244e-10, "ref_logps/chosen": -224.96742248535156, "ref_logps/rejected": -230.3497772216797, "rewards/accuracies": 1.0, "rewards/chosen": -1.503899335861206, "rewards/margins": 13.835939407348633, "rewards/rejected": -15.339838981628418, "step": 3475 }, { "epoch": 0.83, "learning_rate": 3.685333333333333e-08, "logps/chosen": -264.2842712402344, "logps/rejected": -409.13348388671875, "loss": 0.0001, "losses/dpo": 8.8768212615567e-14, "losses/sft": 0.5682778358459473, "losses/total": 8.8768212615567e-14, "ref_logps/chosen": -248.1104736328125, "ref_logps/rejected": -231.0116729736328, "rewards/accuracies": 1.0, "rewards/chosen": -1.6173778772354126, "rewards/margins": 16.19480323791504, "rewards/rejected": -17.812183380126953, "step": 3476 }, { "epoch": 0.83, "learning_rate": 3.68e-08, "logps/chosen": -241.23687744140625, "logps/rejected": -397.83917236328125, "loss": 0.0003, "losses/dpo": 9.986222266888944e-07, "losses/sft": 0.6323083639144897, "losses/total": 9.986222266888944e-07, "ref_logps/chosen": -223.0948486328125, "ref_logps/rejected": -230.29678344726562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8142046928405762, "rewards/margins": 14.94003677368164, "rewards/rejected": -16.754241943359375, "step": 3477 }, { "epoch": 0.83, "learning_rate": 3.674666666666666e-08, "logps/chosen": -244.12042236328125, "logps/rejected": -413.2991943359375, "loss": 0.0013, "losses/dpo": 4.697785493945617e-10, "losses/sft": 0.555651068687439, "losses/total": 4.697785493945617e-10, "ref_logps/chosen": -225.28590393066406, "ref_logps/rejected": -233.11062622070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.883451223373413, "rewards/margins": 16.135406494140625, "rewards/rejected": -18.018857955932617, "step": 3478 }, { "epoch": 0.83, "learning_rate": 3.669333333333333e-08, "logps/chosen": -244.07858276367188, "logps/rejected": -370.122314453125, "loss": 0.0003, "losses/dpo": 8.610347990156697e-09, "losses/sft": 0.40547966957092285, "losses/total": 8.610347990156697e-09, "ref_logps/chosen": -226.82395935058594, "ref_logps/rejected": -204.7906036376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.7254608869552612, "rewards/margins": 14.807708740234375, "rewards/rejected": -16.53316879272461, "step": 3479 }, { "epoch": 0.84, "learning_rate": 3.6639999999999996e-08, "logps/chosen": -239.15072631835938, "logps/rejected": -401.819580078125, "loss": 0.0057, "losses/dpo": 2.157528738999659e-10, "losses/sft": 0.5145022869110107, "losses/total": 2.157528738999659e-10, "ref_logps/chosen": -219.89462280273438, "ref_logps/rejected": -227.9174346923828, "rewards/accuracies": 1.0, "rewards/chosen": -1.9256106615066528, "rewards/margins": 15.46460247039795, "rewards/rejected": -17.390213012695312, "step": 3480 }, { "epoch": 0.84, "learning_rate": 3.6586666666666666e-08, "logps/chosen": -269.61724853515625, "logps/rejected": -389.0726623535156, "loss": 0.0003, "losses/dpo": 2.039357468675007e-06, "losses/sft": 1.3079017400741577, "losses/total": 2.039357468675007e-06, "ref_logps/chosen": -255.5648956298828, "ref_logps/rejected": -227.6753387451172, "rewards/accuracies": 1.0, "rewards/chosen": -1.4052331447601318, "rewards/margins": 14.7344970703125, "rewards/rejected": -16.13973045349121, "step": 3481 }, { "epoch": 0.84, "learning_rate": 3.653333333333333e-08, "logps/chosen": -216.60305786132812, "logps/rejected": -345.7255859375, "loss": 0.0114, "losses/dpo": 3.3536884380680476e-10, "losses/sft": 0.6523191332817078, "losses/total": 3.3536884380680476e-10, "ref_logps/chosen": -203.98507690429688, "ref_logps/rejected": -196.49095153808594, "rewards/accuracies": 1.0, "rewards/chosen": -1.2617970705032349, "rewards/margins": 13.66166877746582, "rewards/rejected": -14.923466682434082, "step": 3482 }, { "epoch": 0.84, "learning_rate": 3.648e-08, "logps/chosen": -194.10079956054688, "logps/rejected": -341.01373291015625, "loss": 0.0005, "losses/dpo": 8.122120337406002e-11, "losses/sft": 0.614406168460846, "losses/total": 8.122120337406002e-11, "ref_logps/chosen": -177.52520751953125, "ref_logps/rejected": -187.1639404296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.657559871673584, "rewards/margins": 13.727423667907715, "rewards/rejected": -15.38498306274414, "step": 3483 }, { "epoch": 0.84, "learning_rate": 3.642666666666667e-08, "logps/chosen": -256.1679992675781, "logps/rejected": -451.0995788574219, "loss": 0.0001, "losses/dpo": 5.578334560767118e-13, "losses/sft": 0.5291992425918579, "losses/total": 5.578334560767118e-13, "ref_logps/chosen": -237.8959197998047, "ref_logps/rejected": -260.82940673828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.827209234237671, "rewards/margins": 17.19980812072754, "rewards/rejected": -19.027019500732422, "step": 3484 }, { "epoch": 0.84, "learning_rate": 3.6373333333333333e-08, "logps/chosen": -220.14361572265625, "logps/rejected": -402.41845703125, "loss": 0.0031, "losses/dpo": 2.1337669478538324e-11, "losses/sft": 0.4342617392539978, "losses/total": 2.1337669478538324e-11, "ref_logps/chosen": -205.7154998779297, "ref_logps/rejected": -241.7459259033203, "rewards/accuracies": 1.0, "rewards/chosen": -1.44281005859375, "rewards/margins": 14.624444961547852, "rewards/rejected": -16.0672550201416, "step": 3485 }, { "epoch": 0.84, "learning_rate": 3.6320000000000004e-08, "logps/chosen": -220.11273193359375, "logps/rejected": -399.6531982421875, "loss": 0.0009, "losses/dpo": 9.966252036974765e-07, "losses/sft": 0.5539066195487976, "losses/total": 9.966252036974765e-07, "ref_logps/chosen": -207.68307495117188, "ref_logps/rejected": -236.272705078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.2429641485214233, "rewards/margins": 15.095083236694336, "rewards/rejected": -16.33804702758789, "step": 3486 }, { "epoch": 0.84, "learning_rate": 3.626666666666667e-08, "logps/chosen": -277.1725769042969, "logps/rejected": -413.3858642578125, "loss": 0.001, "losses/dpo": 2.9997288208960526e-08, "losses/sft": 0.6674343347549438, "losses/total": 2.9997288208960526e-08, "ref_logps/chosen": -259.5249938964844, "ref_logps/rejected": -235.39236450195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.764758825302124, "rewards/margins": 16.03459358215332, "rewards/rejected": -17.799352645874023, "step": 3487 }, { "epoch": 0.84, "learning_rate": 3.621333333333333e-08, "logps/chosen": -219.19927978515625, "logps/rejected": -366.9703674316406, "loss": 0.0019, "losses/dpo": 2.979682056647448e-10, "losses/sft": 0.826702892780304, "losses/total": 2.979682056647448e-10, "ref_logps/chosen": -204.12136840820312, "ref_logps/rejected": -216.820068359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5077908039093018, "rewards/margins": 13.507238388061523, "rewards/rejected": -15.015030860900879, "step": 3488 }, { "epoch": 0.84, "learning_rate": 3.6159999999999994e-08, "logps/chosen": -254.60870361328125, "logps/rejected": -374.0615539550781, "loss": 0.0016, "losses/dpo": 6.544139008468619e-09, "losses/sft": 0.803033709526062, "losses/total": 6.544139008468619e-09, "ref_logps/chosen": -235.25332641601562, "ref_logps/rejected": -206.80874633789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9355391263961792, "rewards/margins": 14.789743423461914, "rewards/rejected": -16.725282669067383, "step": 3489 }, { "epoch": 0.84, "learning_rate": 3.6106666666666664e-08, "logps/chosen": -257.6954040527344, "logps/rejected": -411.2228088378906, "loss": 0.0006, "losses/dpo": 2.781618313463241e-08, "losses/sft": 0.6874625086784363, "losses/total": 2.781618313463241e-08, "ref_logps/chosen": -239.55609130859375, "ref_logps/rejected": -229.66954040527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.8139294385910034, "rewards/margins": 16.341400146484375, "rewards/rejected": -18.15532875061035, "step": 3490 }, { "epoch": 0.84, "learning_rate": 3.605333333333333e-08, "logps/chosen": -240.5438232421875, "logps/rejected": -396.400634765625, "loss": 0.0003, "losses/dpo": 2.5517542212583066e-07, "losses/sft": 0.8082801103591919, "losses/total": 2.5517542212583066e-07, "ref_logps/chosen": -220.4438934326172, "ref_logps/rejected": -218.35714721679688, "rewards/accuracies": 1.0, "rewards/chosen": -2.009993553161621, "rewards/margins": 15.794354438781738, "rewards/rejected": -17.80434799194336, "step": 3491 }, { "epoch": 0.84, "learning_rate": 3.6e-08, "logps/chosen": -256.510498046875, "logps/rejected": -391.7917175292969, "loss": 0.0002, "losses/dpo": 8.579489076510072e-06, "losses/sft": 1.0248233079910278, "losses/total": 8.579489076510072e-06, "ref_logps/chosen": -238.44363403320312, "ref_logps/rejected": -229.03765869140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8066872358322144, "rewards/margins": 14.468717575073242, "rewards/rejected": -16.275405883789062, "step": 3492 }, { "epoch": 0.84, "learning_rate": 3.594666666666666e-08, "logps/chosen": -260.5509033203125, "logps/rejected": -392.0555419921875, "loss": 0.0074, "losses/dpo": 2.211333116619585e-09, "losses/sft": 0.48798802495002747, "losses/total": 2.211333116619585e-09, "ref_logps/chosen": -240.73556518554688, "ref_logps/rejected": -225.3555450439453, "rewards/accuracies": 1.0, "rewards/chosen": -1.9815318584442139, "rewards/margins": 14.688465118408203, "rewards/rejected": -16.669998168945312, "step": 3493 }, { "epoch": 0.84, "learning_rate": 3.589333333333333e-08, "logps/chosen": -227.92105102539062, "logps/rejected": -368.0168151855469, "loss": 0.0034, "losses/dpo": 6.767118065909017e-06, "losses/sft": 0.6371367573738098, "losses/total": 6.767118065909017e-06, "ref_logps/chosen": -209.11083984375, "ref_logps/rejected": -197.90232849121094, "rewards/accuracies": 1.0, "rewards/chosen": -1.8810206651687622, "rewards/margins": 15.130428314208984, "rewards/rejected": -17.01144790649414, "step": 3494 }, { "epoch": 0.84, "learning_rate": 3.5839999999999995e-08, "logps/chosen": -249.46713256835938, "logps/rejected": -358.4319152832031, "loss": 0.0015, "losses/dpo": 1.6079533452284522e-05, "losses/sft": 0.7099920511245728, "losses/total": 1.6079533452284522e-05, "ref_logps/chosen": -229.94000244140625, "ref_logps/rejected": -206.6671142578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9527146816253662, "rewards/margins": 13.223766326904297, "rewards/rejected": -15.176481246948242, "step": 3495 }, { "epoch": 0.84, "learning_rate": 3.5786666666666666e-08, "logps/chosen": -219.32489013671875, "logps/rejected": -361.8182067871094, "loss": 0.0, "losses/dpo": 9.765408748307891e-08, "losses/sft": 0.4821123480796814, "losses/total": 9.765408748307891e-08, "ref_logps/chosen": -206.10763549804688, "ref_logps/rejected": -206.3490447998047, "rewards/accuracies": 1.0, "rewards/chosen": -1.3217262029647827, "rewards/margins": 14.225192070007324, "rewards/rejected": -15.546918869018555, "step": 3496 }, { "epoch": 0.84, "learning_rate": 3.573333333333333e-08, "logps/chosen": -226.80477905273438, "logps/rejected": -342.51568603515625, "loss": 0.0025, "losses/dpo": 9.029473169164248e-09, "losses/sft": 0.8249130249023438, "losses/total": 9.029473169164248e-09, "ref_logps/chosen": -212.75753784179688, "ref_logps/rejected": -195.37890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4047250747680664, "rewards/margins": 13.308952331542969, "rewards/rejected": -14.713678359985352, "step": 3497 }, { "epoch": 0.84, "learning_rate": 3.568e-08, "logps/chosen": -226.2117462158203, "logps/rejected": -363.97113037109375, "loss": 0.0002, "losses/dpo": 9.301490133672985e-12, "losses/sft": 0.6914704442024231, "losses/total": 9.301490133672985e-12, "ref_logps/chosen": -212.10659790039062, "ref_logps/rejected": -208.38450622558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.4105126857757568, "rewards/margins": 14.148147583007812, "rewards/rejected": -15.558659553527832, "step": 3498 }, { "epoch": 0.84, "learning_rate": 3.562666666666666e-08, "logps/chosen": -244.26858520507812, "logps/rejected": -410.0091552734375, "loss": 0.0002, "losses/dpo": 9.530936750934416e-08, "losses/sft": 0.6538116335868835, "losses/total": 9.530936750934416e-08, "ref_logps/chosen": -227.77586364746094, "ref_logps/rejected": -231.17996215820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6492717266082764, "rewards/margins": 16.2336483001709, "rewards/rejected": -17.882919311523438, "step": 3499 }, { "epoch": 0.84, "learning_rate": 3.557333333333333e-08, "logps/chosen": -252.97637939453125, "logps/rejected": -395.991455078125, "loss": 0.0009, "losses/dpo": 1.0402457562008749e-08, "losses/sft": 0.6031456589698792, "losses/total": 1.0402457562008749e-08, "ref_logps/chosen": -233.92239379882812, "ref_logps/rejected": -220.87416076660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9053993225097656, "rewards/margins": 15.606330871582031, "rewards/rejected": -17.511730194091797, "step": 3500 }, { "epoch": 0.84, "learning_rate": 3.552e-08, "logps/chosen": -246.9171905517578, "logps/rejected": -391.3860168457031, "loss": 0.0016, "losses/dpo": 1.465057852811924e-08, "losses/sft": 0.7773659825325012, "losses/total": 1.465057852811924e-08, "ref_logps/chosen": -228.37771606445312, "ref_logps/rejected": -227.2049102783203, "rewards/accuracies": 1.0, "rewards/chosen": -1.8539471626281738, "rewards/margins": 14.564160346984863, "rewards/rejected": -16.418107986450195, "step": 3501 }, { "epoch": 0.84, "learning_rate": 3.5466666666666667e-08, "logps/chosen": -263.59832763671875, "logps/rejected": -407.20928955078125, "loss": 0.0018, "losses/dpo": 2.093914417855558e-06, "losses/sft": 0.7186557650566101, "losses/total": 2.093914417855558e-06, "ref_logps/chosen": -242.30856323242188, "ref_logps/rejected": -228.63845825195312, "rewards/accuracies": 1.0, "rewards/chosen": -2.1289758682250977, "rewards/margins": 15.728104591369629, "rewards/rejected": -17.85708236694336, "step": 3502 }, { "epoch": 0.84, "learning_rate": 3.541333333333334e-08, "logps/chosen": -257.1495056152344, "logps/rejected": -376.8805847167969, "loss": 0.0004, "losses/dpo": 1.81348469574516e-09, "losses/sft": 0.5780560374259949, "losses/total": 1.81348469574516e-09, "ref_logps/chosen": -240.77102661132812, "ref_logps/rejected": -211.8377227783203, "rewards/accuracies": 1.0, "rewards/chosen": -1.6378456354141235, "rewards/margins": 14.86644172668457, "rewards/rejected": -16.504287719726562, "step": 3503 }, { "epoch": 0.84, "learning_rate": 3.536e-08, "logps/chosen": -220.02047729492188, "logps/rejected": -392.7891540527344, "loss": 0.0003, "losses/dpo": 2.6572490696707973e-06, "losses/sft": 0.5442613363265991, "losses/total": 2.6572490696707973e-06, "ref_logps/chosen": -199.9412841796875, "ref_logps/rejected": -218.009521484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0079197883605957, "rewards/margins": 15.47004508972168, "rewards/rejected": -17.477964401245117, "step": 3504 }, { "epoch": 0.84, "learning_rate": 3.5306666666666664e-08, "logps/chosen": -249.05230712890625, "logps/rejected": -415.3822021484375, "loss": 0.0013, "losses/dpo": 5.4578745078970314e-08, "losses/sft": 0.5259263515472412, "losses/total": 5.4578745078970314e-08, "ref_logps/chosen": -229.89678955078125, "ref_logps/rejected": -230.60977172851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.9155499935150146, "rewards/margins": 16.56169319152832, "rewards/rejected": -18.47724151611328, "step": 3505 }, { "epoch": 0.84, "learning_rate": 3.525333333333333e-08, "logps/chosen": -232.43533325195312, "logps/rejected": -397.95916748046875, "loss": 0.0007, "losses/dpo": 1.201024280028662e-10, "losses/sft": 0.5600816011428833, "losses/total": 1.201024280028662e-10, "ref_logps/chosen": -216.5499267578125, "ref_logps/rejected": -217.02593994140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5885412693023682, "rewards/margins": 16.504779815673828, "rewards/rejected": -18.093318939208984, "step": 3506 }, { "epoch": 0.84, "learning_rate": 3.52e-08, "logps/chosen": -242.3543243408203, "logps/rejected": -414.0500183105469, "loss": 0.002, "losses/dpo": 1.6138982511515465e-12, "losses/sft": 0.5628548860549927, "losses/total": 1.6138982511515465e-12, "ref_logps/chosen": -225.66253662109375, "ref_logps/rejected": -237.2750701904297, "rewards/accuracies": 1.0, "rewards/chosen": -1.6691784858703613, "rewards/margins": 16.008316040039062, "rewards/rejected": -17.677494049072266, "step": 3507 }, { "epoch": 0.84, "learning_rate": 3.514666666666666e-08, "logps/chosen": -182.9257049560547, "logps/rejected": -351.4978332519531, "loss": 0.0007, "losses/dpo": 7.339423291341518e-07, "losses/sft": 0.5239161252975464, "losses/total": 7.339423291341518e-07, "ref_logps/chosen": -171.82867431640625, "ref_logps/rejected": -205.77145385742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.1097033023834229, "rewards/margins": 13.462935447692871, "rewards/rejected": -14.572638511657715, "step": 3508 }, { "epoch": 0.84, "learning_rate": 3.509333333333333e-08, "logps/chosen": -255.42059326171875, "logps/rejected": -386.1260681152344, "loss": 0.0024, "losses/dpo": 3.904496992390705e-08, "losses/sft": 0.9141832590103149, "losses/total": 3.904496992390705e-08, "ref_logps/chosen": -235.67034912109375, "ref_logps/rejected": -217.89608764648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9750237464904785, "rewards/margins": 14.84797477722168, "rewards/rejected": -16.822998046875, "step": 3509 }, { "epoch": 0.84, "learning_rate": 3.5039999999999995e-08, "logps/chosen": -218.60104370117188, "logps/rejected": -369.20074462890625, "loss": 0.0004, "losses/dpo": 3.197815970357176e-11, "losses/sft": 0.7079757452011108, "losses/total": 3.197815970357176e-11, "ref_logps/chosen": -204.1607666015625, "ref_logps/rejected": -214.39987182617188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4440257549285889, "rewards/margins": 14.036062240600586, "rewards/rejected": -15.48008918762207, "step": 3510 }, { "epoch": 0.84, "learning_rate": 3.4986666666666665e-08, "logps/chosen": -224.07943725585938, "logps/rejected": -380.1175231933594, "loss": 0.0001, "losses/dpo": 4.725226858681708e-07, "losses/sft": 1.3257875442504883, "losses/total": 4.725226858681708e-07, "ref_logps/chosen": -208.64723205566406, "ref_logps/rejected": -221.32131958007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.5432202816009521, "rewards/margins": 14.336400985717773, "rewards/rejected": -15.879621505737305, "step": 3511 }, { "epoch": 0.84, "learning_rate": 3.493333333333333e-08, "logps/chosen": -288.2403869628906, "logps/rejected": -427.16754150390625, "loss": 0.0014, "losses/dpo": 2.3124346625991166e-05, "losses/sft": 0.9864531755447388, "losses/total": 2.3124346625991166e-05, "ref_logps/chosen": -265.5554504394531, "ref_logps/rejected": -248.2528076171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.268493890762329, "rewards/margins": 15.622980117797852, "rewards/rejected": -17.8914737701416, "step": 3512 }, { "epoch": 0.84, "learning_rate": 3.488e-08, "logps/chosen": -243.3345184326172, "logps/rejected": -385.7650146484375, "loss": 0.0007, "losses/dpo": 3.299587048033459e-09, "losses/sft": 1.3418012857437134, "losses/total": 3.299587048033459e-09, "ref_logps/chosen": -223.28128051757812, "ref_logps/rejected": -213.13006591796875, "rewards/accuracies": 1.0, "rewards/chosen": -2.005323648452759, "rewards/margins": 15.258171081542969, "rewards/rejected": -17.26349449157715, "step": 3513 }, { "epoch": 0.84, "learning_rate": 3.482666666666666e-08, "logps/chosen": -204.12159729003906, "logps/rejected": -385.91632080078125, "loss": 0.0015, "losses/dpo": 3.5693396682745515e-08, "losses/sft": 0.5893149375915527, "losses/total": 3.5693396682745515e-08, "ref_logps/chosen": -185.0216522216797, "ref_logps/rejected": -218.62005615234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.90999436378479, "rewards/margins": 14.819633483886719, "rewards/rejected": -16.72962760925293, "step": 3514 }, { "epoch": 0.84, "learning_rate": 3.477333333333333e-08, "logps/chosen": -221.75082397460938, "logps/rejected": -389.1119384765625, "loss": 0.0025, "losses/dpo": 7.268271929206094e-06, "losses/sft": 0.5160261392593384, "losses/total": 7.268271929206094e-06, "ref_logps/chosen": -206.95242309570312, "ref_logps/rejected": -215.29690551757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.4798393249511719, "rewards/margins": 15.901667594909668, "rewards/rejected": -17.381505966186523, "step": 3515 }, { "epoch": 0.84, "learning_rate": 3.472e-08, "logps/chosen": -238.44972229003906, "logps/rejected": -402.27178955078125, "loss": 0.0, "losses/dpo": 1.5964815247571096e-07, "losses/sft": 0.4079902768135071, "losses/total": 1.5964815247571096e-07, "ref_logps/chosen": -222.47219848632812, "ref_logps/rejected": -233.81515502929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.5977530479431152, "rewards/margins": 15.247913360595703, "rewards/rejected": -16.845664978027344, "step": 3516 }, { "epoch": 0.84, "learning_rate": 3.4666666666666666e-08, "logps/chosen": -209.51844787597656, "logps/rejected": -355.3787536621094, "loss": 0.0033, "losses/dpo": 1.3131704057478544e-10, "losses/sft": 0.6258292198181152, "losses/total": 1.3131704057478544e-10, "ref_logps/chosen": -189.235595703125, "ref_logps/rejected": -203.9417266845703, "rewards/accuracies": 1.0, "rewards/chosen": -2.0282857418060303, "rewards/margins": 13.115416526794434, "rewards/rejected": -15.143702507019043, "step": 3517 }, { "epoch": 0.84, "learning_rate": 3.4613333333333336e-08, "logps/chosen": -202.70138549804688, "logps/rejected": -382.8450927734375, "loss": 0.0059, "losses/dpo": 6.393793133829817e-11, "losses/sft": 0.5586240291595459, "losses/total": 6.393793133829817e-11, "ref_logps/chosen": -185.3890380859375, "ref_logps/rejected": -218.58216857910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7312343120574951, "rewards/margins": 14.69505786895752, "rewards/rejected": -16.426292419433594, "step": 3518 }, { "epoch": 0.84, "learning_rate": 3.456e-08, "logps/chosen": -284.0338439941406, "logps/rejected": -402.0493469238281, "loss": 0.0046, "losses/dpo": 2.374239471691908e-08, "losses/sft": 0.6657730340957642, "losses/total": 2.374239471691908e-08, "ref_logps/chosen": -264.3291015625, "ref_logps/rejected": -225.8184814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9704713821411133, "rewards/margins": 15.652616500854492, "rewards/rejected": -17.62308692932129, "step": 3519 }, { "epoch": 0.84, "learning_rate": 3.450666666666667e-08, "logps/chosen": -255.11105346679688, "logps/rejected": -408.93572998046875, "loss": 0.0002, "losses/dpo": 9.017184332549277e-09, "losses/sft": 0.4646639823913574, "losses/total": 9.017184332549277e-09, "ref_logps/chosen": -240.53941345214844, "ref_logps/rejected": -246.16429138183594, "rewards/accuracies": 1.0, "rewards/chosen": -1.45716392993927, "rewards/margins": 14.81998062133789, "rewards/rejected": -16.277145385742188, "step": 3520 }, { "epoch": 0.84, "learning_rate": 3.4453333333333333e-08, "logps/chosen": -251.96261596679688, "logps/rejected": -397.2294616699219, "loss": 0.0122, "losses/dpo": 1.4220392641206558e-09, "losses/sft": 0.6534008979797363, "losses/total": 1.4220392641206558e-09, "ref_logps/chosen": -237.2543487548828, "ref_logps/rejected": -229.0914306640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4708261489868164, "rewards/margins": 15.342979431152344, "rewards/rejected": -16.813806533813477, "step": 3521 }, { "epoch": 0.85, "learning_rate": 3.44e-08, "logps/chosen": -271.24444580078125, "logps/rejected": -412.65728759765625, "loss": 0.0001, "losses/dpo": 2.8265538276173174e-05, "losses/sft": 0.7607402801513672, "losses/total": 2.8265538276173174e-05, "ref_logps/chosen": -253.9573211669922, "ref_logps/rejected": -234.13888549804688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7287136316299438, "rewards/margins": 16.123130798339844, "rewards/rejected": -17.851842880249023, "step": 3522 }, { "epoch": 0.85, "learning_rate": 3.434666666666666e-08, "logps/chosen": -253.07907104492188, "logps/rejected": -411.0743713378906, "loss": 0.0002, "losses/dpo": 9.212085955168092e-11, "losses/sft": 0.5622449517250061, "losses/total": 9.212085955168092e-11, "ref_logps/chosen": -228.4963836669922, "ref_logps/rejected": -229.1236572265625, "rewards/accuracies": 1.0, "rewards/chosen": -2.4582695960998535, "rewards/margins": 15.73680305480957, "rewards/rejected": -18.195072174072266, "step": 3523 }, { "epoch": 0.85, "learning_rate": 3.429333333333333e-08, "logps/chosen": -234.35006713867188, "logps/rejected": -369.5406494140625, "loss": 0.0019, "losses/dpo": 1.8615244068431736e-10, "losses/sft": 0.6873133182525635, "losses/total": 1.8615244068431736e-10, "ref_logps/chosen": -217.51419067382812, "ref_logps/rejected": -204.8365478515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6835875511169434, "rewards/margins": 14.786821365356445, "rewards/rejected": -16.470409393310547, "step": 3524 }, { "epoch": 0.85, "learning_rate": 3.4239999999999994e-08, "logps/chosen": -257.12701416015625, "logps/rejected": -430.30133056640625, "loss": 0.0012, "losses/dpo": 1.3098484796802978e-10, "losses/sft": 0.6149841547012329, "losses/total": 1.3098484796802978e-10, "ref_logps/chosen": -240.04205322265625, "ref_logps/rejected": -242.5135498046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.708493709564209, "rewards/margins": 17.07028579711914, "rewards/rejected": -18.778779983520508, "step": 3525 }, { "epoch": 0.85, "learning_rate": 3.4186666666666664e-08, "logps/chosen": -220.04000854492188, "logps/rejected": -352.7270812988281, "loss": 0.0015, "losses/dpo": 2.879103885788936e-05, "losses/sft": 0.7445474863052368, "losses/total": 2.879103885788936e-05, "ref_logps/chosen": -201.33978271484375, "ref_logps/rejected": -195.66146850585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8700236082077026, "rewards/margins": 13.836536407470703, "rewards/rejected": -15.706561088562012, "step": 3526 }, { "epoch": 0.85, "learning_rate": 3.413333333333333e-08, "logps/chosen": -191.32740783691406, "logps/rejected": -395.3095397949219, "loss": 0.0005, "losses/dpo": 3.4872410736852544e-08, "losses/sft": 0.509668231010437, "losses/total": 3.4872410736852544e-08, "ref_logps/chosen": -174.10165405273438, "ref_logps/rejected": -218.84408569335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.7225755453109741, "rewards/margins": 15.923973083496094, "rewards/rejected": -17.646549224853516, "step": 3527 }, { "epoch": 0.85, "learning_rate": 3.408e-08, "logps/chosen": -254.49215698242188, "logps/rejected": -397.60906982421875, "loss": 0.0007, "losses/dpo": 6.249873507613302e-08, "losses/sft": 0.38465872406959534, "losses/total": 6.249873507613302e-08, "ref_logps/chosen": -238.7845001220703, "ref_logps/rejected": -229.41224670410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.5707660913467407, "rewards/margins": 15.248918533325195, "rewards/rejected": -16.819684982299805, "step": 3528 }, { "epoch": 0.85, "learning_rate": 3.402666666666666e-08, "logps/chosen": -229.8876190185547, "logps/rejected": -354.0863037109375, "loss": 0.0006, "losses/dpo": 5.3550275325164876e-11, "losses/sft": 0.6379976272583008, "losses/total": 5.3550275325164876e-11, "ref_logps/chosen": -215.70245361328125, "ref_logps/rejected": -203.11016845703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4185163974761963, "rewards/margins": 13.679096221923828, "rewards/rejected": -15.097612380981445, "step": 3529 }, { "epoch": 0.85, "learning_rate": 3.397333333333333e-08, "logps/chosen": -241.33914184570312, "logps/rejected": -393.95147705078125, "loss": 0.0002, "losses/dpo": 4.863395020038297e-09, "losses/sft": 0.5725070238113403, "losses/total": 4.863395020038297e-09, "ref_logps/chosen": -218.33297729492188, "ref_logps/rejected": -225.30160522460938, "rewards/accuracies": 1.0, "rewards/chosen": -2.3006176948547363, "rewards/margins": 14.564370155334473, "rewards/rejected": -16.864986419677734, "step": 3530 }, { "epoch": 0.85, "learning_rate": 3.3919999999999995e-08, "logps/chosen": -230.5433349609375, "logps/rejected": -408.5633544921875, "loss": 0.0011, "losses/dpo": 2.3279211891491514e-09, "losses/sft": 0.48739302158355713, "losses/total": 2.3279211891491514e-09, "ref_logps/chosen": -210.64962768554688, "ref_logps/rejected": -232.90663146972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.989367961883545, "rewards/margins": 15.576301574707031, "rewards/rejected": -17.565670013427734, "step": 3531 }, { "epoch": 0.85, "learning_rate": 3.3866666666666666e-08, "logps/chosen": -283.71282958984375, "logps/rejected": -386.703125, "loss": 0.0102, "losses/dpo": 1.1397044090699637e-06, "losses/sft": 0.5233665108680725, "losses/total": 1.1397044090699637e-06, "ref_logps/chosen": -266.25103759765625, "ref_logps/rejected": -214.32781982421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7461786270141602, "rewards/margins": 15.491352081298828, "rewards/rejected": -17.237529754638672, "step": 3532 }, { "epoch": 0.85, "learning_rate": 3.3813333333333336e-08, "logps/chosen": -249.0691680908203, "logps/rejected": -421.2889709472656, "loss": 0.0031, "losses/dpo": 1.996396797876443e-10, "losses/sft": 0.5994346141815186, "losses/total": 1.996396797876443e-10, "ref_logps/chosen": -232.25601196289062, "ref_logps/rejected": -246.21876525878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.681315541267395, "rewards/margins": 15.825703620910645, "rewards/rejected": -17.50701904296875, "step": 3533 }, { "epoch": 0.85, "learning_rate": 3.376e-08, "logps/chosen": -220.8009490966797, "logps/rejected": -423.34844970703125, "loss": 0.0001, "losses/dpo": 2.0132352673840614e-08, "losses/sft": 0.7148193717002869, "losses/total": 2.0132352673840614e-08, "ref_logps/chosen": -202.63034057617188, "ref_logps/rejected": -238.688720703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.817061424255371, "rewards/margins": 16.648910522460938, "rewards/rejected": -18.465972900390625, "step": 3534 }, { "epoch": 0.85, "learning_rate": 3.370666666666667e-08, "logps/chosen": -254.0139923095703, "logps/rejected": -360.44879150390625, "loss": 0.003, "losses/dpo": 1.532690987460228e-07, "losses/sft": 0.6823781728744507, "losses/total": 1.532690987460228e-07, "ref_logps/chosen": -235.4047088623047, "ref_logps/rejected": -204.57879638671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8609281778335571, "rewards/margins": 13.72607421875, "rewards/rejected": -15.58700180053711, "step": 3535 }, { "epoch": 0.85, "learning_rate": 3.365333333333333e-08, "logps/chosen": -234.43075561523438, "logps/rejected": -388.912109375, "loss": 0.0004, "losses/dpo": 3.4623755251761423e-11, "losses/sft": 0.9444961547851562, "losses/total": 3.4623755251761423e-11, "ref_logps/chosen": -219.04824829101562, "ref_logps/rejected": -217.9520263671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5382510423660278, "rewards/margins": 15.557756423950195, "rewards/rejected": -17.09600830078125, "step": 3536 }, { "epoch": 0.85, "learning_rate": 3.36e-08, "logps/chosen": -262.2781982421875, "logps/rejected": -440.8731689453125, "loss": 0.0011, "losses/dpo": 3.2658699637977406e-06, "losses/sft": 0.6346547603607178, "losses/total": 3.2658699637977406e-06, "ref_logps/chosen": -245.19259643554688, "ref_logps/rejected": -246.66448974609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.708556890487671, "rewards/margins": 17.712310791015625, "rewards/rejected": -19.420866012573242, "step": 3537 }, { "epoch": 0.85, "learning_rate": 3.3546666666666667e-08, "logps/chosen": -268.65771484375, "logps/rejected": -404.2822265625, "loss": 0.0004, "losses/dpo": 5.493603438821992e-08, "losses/sft": 0.9203938841819763, "losses/total": 5.493603438821992e-08, "ref_logps/chosen": -246.2924041748047, "ref_logps/rejected": -229.80752563476562, "rewards/accuracies": 1.0, "rewards/chosen": -2.236530303955078, "rewards/margins": 15.21094036102295, "rewards/rejected": -17.447471618652344, "step": 3538 }, { "epoch": 0.85, "learning_rate": 3.349333333333334e-08, "logps/chosen": -243.64308166503906, "logps/rejected": -368.8466796875, "loss": 0.001, "losses/dpo": 1.34569040355359e-08, "losses/sft": 0.4139108657836914, "losses/total": 1.34569040355359e-08, "ref_logps/chosen": -227.1138153076172, "ref_logps/rejected": -202.37103271484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6529253721237183, "rewards/margins": 14.994641304016113, "rewards/rejected": -16.647565841674805, "step": 3539 }, { "epoch": 0.85, "learning_rate": 3.3439999999999994e-08, "logps/chosen": -247.52940368652344, "logps/rejected": -399.7054443359375, "loss": 0.0009, "losses/dpo": 2.2466032589818496e-08, "losses/sft": 0.6064976453781128, "losses/total": 2.2466032589818496e-08, "ref_logps/chosen": -230.1561279296875, "ref_logps/rejected": -234.5889892578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.737327218055725, "rewards/margins": 14.77431869506836, "rewards/rejected": -16.51164436340332, "step": 3540 }, { "epoch": 0.85, "learning_rate": 3.3386666666666664e-08, "logps/chosen": -257.3576965332031, "logps/rejected": -405.507080078125, "loss": 0.0001, "losses/dpo": 6.774433458645035e-09, "losses/sft": 1.0260611772537231, "losses/total": 6.774433458645035e-09, "ref_logps/chosen": -240.20138549804688, "ref_logps/rejected": -232.1593475341797, "rewards/accuracies": 1.0, "rewards/chosen": -1.7156319618225098, "rewards/margins": 15.619142532348633, "rewards/rejected": -17.334774017333984, "step": 3541 }, { "epoch": 0.85, "learning_rate": 3.333333333333333e-08, "logps/chosen": -281.2325134277344, "logps/rejected": -419.2036437988281, "loss": 0.0002, "losses/dpo": 3.5095976125276707e-10, "losses/sft": 1.2707045078277588, "losses/total": 3.5095976125276707e-10, "ref_logps/chosen": -253.7202911376953, "ref_logps/rejected": -226.30740356445312, "rewards/accuracies": 1.0, "rewards/chosen": -2.7512216567993164, "rewards/margins": 16.538402557373047, "rewards/rejected": -19.289623260498047, "step": 3542 }, { "epoch": 0.85, "learning_rate": 3.328e-08, "logps/chosen": -235.0710906982422, "logps/rejected": -386.6124267578125, "loss": 0.0002, "losses/dpo": 2.782980956794745e-09, "losses/sft": 0.877125084400177, "losses/total": 2.782980956794745e-09, "ref_logps/chosen": -213.79759216308594, "ref_logps/rejected": -202.25650024414062, "rewards/accuracies": 1.0, "rewards/chosen": -2.127349853515625, "rewards/margins": 16.308242797851562, "rewards/rejected": -18.435592651367188, "step": 3543 }, { "epoch": 0.85, "learning_rate": 3.322666666666666e-08, "logps/chosen": -220.48565673828125, "logps/rejected": -406.3170471191406, "loss": 0.0005, "losses/dpo": 2.7457747364678653e-07, "losses/sft": 1.014707088470459, "losses/total": 2.7457747364678653e-07, "ref_logps/chosen": -202.90130615234375, "ref_logps/rejected": -230.42271423339844, "rewards/accuracies": 1.0, "rewards/chosen": -1.7584357261657715, "rewards/margins": 15.830995559692383, "rewards/rejected": -17.589431762695312, "step": 3544 }, { "epoch": 0.85, "learning_rate": 3.317333333333333e-08, "logps/chosen": -247.49966430664062, "logps/rejected": -408.5079345703125, "loss": 0.0002, "losses/dpo": 3.511715584991748e-09, "losses/sft": 0.5866685509681702, "losses/total": 3.511715584991748e-09, "ref_logps/chosen": -228.8787841796875, "ref_logps/rejected": -242.82321166992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8620890378952026, "rewards/margins": 14.70638370513916, "rewards/rejected": -16.56847381591797, "step": 3545 }, { "epoch": 0.85, "learning_rate": 3.3119999999999995e-08, "logps/chosen": -212.09130859375, "logps/rejected": -370.91656494140625, "loss": 0.0005, "losses/dpo": 2.5884560272970702e-06, "losses/sft": 0.625331699848175, "losses/total": 2.5884560272970702e-06, "ref_logps/chosen": -195.2513427734375, "ref_logps/rejected": -215.71365356445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6839998960494995, "rewards/margins": 13.836292266845703, "rewards/rejected": -15.520292282104492, "step": 3546 }, { "epoch": 0.85, "learning_rate": 3.3066666666666665e-08, "logps/chosen": -232.74652099609375, "logps/rejected": -392.2551574707031, "loss": 0.0007, "losses/dpo": 7.041793814721586e-10, "losses/sft": 0.5945828557014465, "losses/total": 7.041793814721586e-10, "ref_logps/chosen": -211.1112823486328, "ref_logps/rejected": -214.31558227539062, "rewards/accuracies": 1.0, "rewards/chosen": -2.163525104522705, "rewards/margins": 15.630433082580566, "rewards/rejected": -17.79395866394043, "step": 3547 }, { "epoch": 0.85, "learning_rate": 3.3013333333333335e-08, "logps/chosen": -246.827880859375, "logps/rejected": -344.3707580566406, "loss": 0.0002, "losses/dpo": 1.035632066215264e-10, "losses/sft": 0.5444138646125793, "losses/total": 1.035632066215264e-10, "ref_logps/chosen": -229.4784698486328, "ref_logps/rejected": -189.11456298828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7349402904510498, "rewards/margins": 13.790681838989258, "rewards/rejected": -15.525622367858887, "step": 3548 }, { "epoch": 0.85, "learning_rate": 3.296e-08, "logps/chosen": -268.2496337890625, "logps/rejected": -392.8817138671875, "loss": 0.0003, "losses/dpo": 4.833487668269576e-13, "losses/sft": 0.5689648985862732, "losses/total": 4.833487668269576e-13, "ref_logps/chosen": -250.78341674804688, "ref_logps/rejected": -229.07745361328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7466211318969727, "rewards/margins": 14.633804321289062, "rewards/rejected": -16.38042449951172, "step": 3549 }, { "epoch": 0.85, "learning_rate": 3.290666666666667e-08, "logps/chosen": -282.127685546875, "logps/rejected": -447.404541015625, "loss": 0.0002, "losses/dpo": 1.069363486649877e-09, "losses/sft": 0.613892138004303, "losses/total": 1.069363486649877e-09, "ref_logps/chosen": -260.11138916015625, "ref_logps/rejected": -248.35989379882812, "rewards/accuracies": 1.0, "rewards/chosen": -2.201629638671875, "rewards/margins": 17.70283317565918, "rewards/rejected": -19.904462814331055, "step": 3550 }, { "epoch": 0.85, "learning_rate": 3.285333333333333e-08, "logps/chosen": -250.58425903320312, "logps/rejected": -400.1993713378906, "loss": 0.0005, "losses/dpo": 1.1139669048310097e-11, "losses/sft": 0.47983720898628235, "losses/total": 1.1139669048310097e-11, "ref_logps/chosen": -233.69699096679688, "ref_logps/rejected": -223.66604614257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6887264251708984, "rewards/margins": 15.964607238769531, "rewards/rejected": -17.653331756591797, "step": 3551 }, { "epoch": 0.85, "learning_rate": 3.28e-08, "logps/chosen": -239.85498046875, "logps/rejected": -393.7164306640625, "loss": 0.014, "losses/dpo": 8.834544762414964e-10, "losses/sft": 0.7001093029975891, "losses/total": 8.834544762414964e-10, "ref_logps/chosen": -217.884765625, "ref_logps/rejected": -219.6025390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.1970200538635254, "rewards/margins": 15.21436882019043, "rewards/rejected": -17.411388397216797, "step": 3552 }, { "epoch": 0.85, "learning_rate": 3.2746666666666666e-08, "logps/chosen": -218.0425567626953, "logps/rejected": -399.8367919921875, "loss": 0.0003, "losses/dpo": 6.424464432663868e-11, "losses/sft": 0.7846283316612244, "losses/total": 6.424464432663868e-11, "ref_logps/chosen": -199.5010223388672, "ref_logps/rejected": -221.38128662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8541532754898071, "rewards/margins": 15.991394996643066, "rewards/rejected": -17.845548629760742, "step": 3553 }, { "epoch": 0.85, "learning_rate": 3.2693333333333336e-08, "logps/chosen": -256.1510009765625, "logps/rejected": -426.0811462402344, "loss": 0.0005, "losses/dpo": 1.6711050945805717e-11, "losses/sft": 0.5846764445304871, "losses/total": 1.6711050945805717e-11, "ref_logps/chosen": -238.74644470214844, "ref_logps/rejected": -241.4017333984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7404541969299316, "rewards/margins": 16.727487564086914, "rewards/rejected": -18.46794319152832, "step": 3554 }, { "epoch": 0.85, "learning_rate": 3.264e-08, "logps/chosen": -248.50267028808594, "logps/rejected": -390.336669921875, "loss": 0.0001, "losses/dpo": 9.573875914270502e-09, "losses/sft": 0.7108770608901978, "losses/total": 9.573875914270502e-09, "ref_logps/chosen": -232.37039184570312, "ref_logps/rejected": -221.85577392578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.613227367401123, "rewards/margins": 15.234864234924316, "rewards/rejected": -16.84809112548828, "step": 3555 }, { "epoch": 0.85, "learning_rate": 3.258666666666667e-08, "logps/chosen": -260.1044616699219, "logps/rejected": -403.45709228515625, "loss": 0.002, "losses/dpo": 1.5666364039379488e-11, "losses/sft": 0.6007253527641296, "losses/total": 1.5666364039379488e-11, "ref_logps/chosen": -239.34036254882812, "ref_logps/rejected": -229.74459838867188, "rewards/accuracies": 1.0, "rewards/chosen": -2.076411724090576, "rewards/margins": 15.294840812683105, "rewards/rejected": -17.371253967285156, "step": 3556 }, { "epoch": 0.85, "learning_rate": 3.253333333333333e-08, "logps/chosen": -215.86607360839844, "logps/rejected": -360.661865234375, "loss": 0.0024, "losses/dpo": 1.5866334024394746e-08, "losses/sft": 0.6049322485923767, "losses/total": 1.5866334024394746e-08, "ref_logps/chosen": -196.103759765625, "ref_logps/rejected": -198.82931518554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.976233959197998, "rewards/margins": 14.207022666931152, "rewards/rejected": -16.183256149291992, "step": 3557 }, { "epoch": 0.85, "learning_rate": 3.248e-08, "logps/chosen": -235.71278381347656, "logps/rejected": -370.73828125, "loss": 0.0005, "losses/dpo": 4.155060739918781e-09, "losses/sft": 0.6187052130699158, "losses/total": 4.155060739918781e-09, "ref_logps/chosen": -216.0341033935547, "ref_logps/rejected": -214.33331298828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.967867136001587, "rewards/margins": 13.672629356384277, "rewards/rejected": -15.640495300292969, "step": 3558 }, { "epoch": 0.85, "learning_rate": 3.242666666666666e-08, "logps/chosen": -206.86529541015625, "logps/rejected": -351.2576599121094, "loss": 0.0023, "losses/dpo": 9.998876748795738e-08, "losses/sft": 0.6820995807647705, "losses/total": 9.998876748795738e-08, "ref_logps/chosen": -194.0890655517578, "ref_logps/rejected": -202.66619873046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2776257991790771, "rewards/margins": 13.581520080566406, "rewards/rejected": -14.859146118164062, "step": 3559 }, { "epoch": 0.85, "learning_rate": 3.237333333333333e-08, "logps/chosen": -237.13265991210938, "logps/rejected": -394.364990234375, "loss": 0.0008, "losses/dpo": 4.872877923389751e-08, "losses/sft": 1.1328120231628418, "losses/total": 4.872877923389751e-08, "ref_logps/chosen": -223.15484619140625, "ref_logps/rejected": -225.23672485351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.3977808952331543, "rewards/margins": 15.515046119689941, "rewards/rejected": -16.912826538085938, "step": 3560 }, { "epoch": 0.85, "learning_rate": 3.2319999999999994e-08, "logps/chosen": -263.5491638183594, "logps/rejected": -416.099365234375, "loss": 0.0004, "losses/dpo": 1.9133304718366162e-08, "losses/sft": 0.6016334295272827, "losses/total": 1.9133304718366162e-08, "ref_logps/chosen": -247.59353637695312, "ref_logps/rejected": -240.42752075195312, "rewards/accuracies": 1.0, "rewards/chosen": -1.5955636501312256, "rewards/margins": 15.971617698669434, "rewards/rejected": -17.567182540893555, "step": 3561 }, { "epoch": 0.85, "learning_rate": 3.2266666666666664e-08, "logps/chosen": -276.69342041015625, "logps/rejected": -366.90655517578125, "loss": 0.0037, "losses/dpo": 2.5881952137041253e-10, "losses/sft": 0.6511014699935913, "losses/total": 2.5881952137041253e-10, "ref_logps/chosen": -255.18919372558594, "ref_logps/rejected": -212.7111358642578, "rewards/accuracies": 1.0, "rewards/chosen": -2.150425434112549, "rewards/margins": 13.269115447998047, "rewards/rejected": -15.419541358947754, "step": 3562 }, { "epoch": 0.86, "learning_rate": 3.221333333333333e-08, "logps/chosen": -231.62762451171875, "logps/rejected": -382.2327880859375, "loss": 0.0004, "losses/dpo": 3.439606643951265e-08, "losses/sft": 0.6768316626548767, "losses/total": 3.439606643951265e-08, "ref_logps/chosen": -209.82334899902344, "ref_logps/rejected": -207.99227905273438, "rewards/accuracies": 1.0, "rewards/chosen": -2.1804275512695312, "rewards/margins": 15.243622779846191, "rewards/rejected": -17.424049377441406, "step": 3563 }, { "epoch": 0.86, "learning_rate": 3.216e-08, "logps/chosen": -232.43161010742188, "logps/rejected": -370.19927978515625, "loss": 0.0002, "losses/dpo": 2.3485624112140613e-08, "losses/sft": 0.5783900618553162, "losses/total": 2.3485624112140613e-08, "ref_logps/chosen": -214.32217407226562, "ref_logps/rejected": -216.51510620117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8109421730041504, "rewards/margins": 13.557476997375488, "rewards/rejected": -15.36841869354248, "step": 3564 }, { "epoch": 0.86, "learning_rate": 3.210666666666667e-08, "logps/chosen": -246.888671875, "logps/rejected": -410.45184326171875, "loss": 0.0008, "losses/dpo": 2.199509799538646e-05, "losses/sft": 0.4745357632637024, "losses/total": 2.199509799538646e-05, "ref_logps/chosen": -227.24884033203125, "ref_logps/rejected": -225.25448608398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9639836549758911, "rewards/margins": 16.55575180053711, "rewards/rejected": -18.519733428955078, "step": 3565 }, { "epoch": 0.86, "learning_rate": 3.205333333333333e-08, "logps/chosen": -219.80677795410156, "logps/rejected": -375.725341796875, "loss": 0.0019, "losses/dpo": 7.793331951999605e-11, "losses/sft": 0.4811548888683319, "losses/total": 7.793331951999605e-11, "ref_logps/chosen": -208.08486938476562, "ref_logps/rejected": -224.70101928710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.172189474105835, "rewards/margins": 13.930246353149414, "rewards/rejected": -15.102435111999512, "step": 3566 }, { "epoch": 0.86, "learning_rate": 3.2e-08, "logps/chosen": -262.627197265625, "logps/rejected": -405.67010498046875, "loss": 0.0001, "losses/dpo": 1.1370473451677299e-08, "losses/sft": 0.7071686387062073, "losses/total": 1.1370473451677299e-08, "ref_logps/chosen": -241.65975952148438, "ref_logps/rejected": -225.44972229003906, "rewards/accuracies": 1.0, "rewards/chosen": -2.0967445373535156, "rewards/margins": 15.925296783447266, "rewards/rejected": -18.02204132080078, "step": 3567 }, { "epoch": 0.86, "learning_rate": 3.1946666666666666e-08, "logps/chosen": -243.60523986816406, "logps/rejected": -399.9156494140625, "loss": 0.0002, "losses/dpo": 6.180925129356529e-12, "losses/sft": 0.6632869839668274, "losses/total": 6.180925129356529e-12, "ref_logps/chosen": -226.98695373535156, "ref_logps/rejected": -226.3128662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6618294715881348, "rewards/margins": 15.698447227478027, "rewards/rejected": -17.36027717590332, "step": 3568 }, { "epoch": 0.86, "learning_rate": 3.1893333333333336e-08, "logps/chosen": -260.72705078125, "logps/rejected": -400.47296142578125, "loss": 0.0071, "losses/dpo": 1.40203542287054e-06, "losses/sft": 0.3537176847457886, "losses/total": 1.40203542287054e-06, "ref_logps/chosen": -240.56036376953125, "ref_logps/rejected": -224.78741455078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.016669273376465, "rewards/margins": 15.551887512207031, "rewards/rejected": -17.568557739257812, "step": 3569 }, { "epoch": 0.86, "learning_rate": 3.184e-08, "logps/chosen": -222.8614044189453, "logps/rejected": -374.47369384765625, "loss": 0.0013, "losses/dpo": 8.767504944273696e-09, "losses/sft": 0.6516572833061218, "losses/total": 8.767504944273696e-09, "ref_logps/chosen": -205.951171875, "ref_logps/rejected": -211.38990783691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.691023826599121, "rewards/margins": 14.617355346679688, "rewards/rejected": -16.308380126953125, "step": 3570 }, { "epoch": 0.86, "learning_rate": 3.178666666666667e-08, "logps/chosen": -241.53842163085938, "logps/rejected": -406.414794921875, "loss": 0.0004, "losses/dpo": 1.8529882073181625e-09, "losses/sft": 0.7282503247261047, "losses/total": 1.8529882073181625e-09, "ref_logps/chosen": -227.85426330566406, "ref_logps/rejected": -237.09246826171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3684154748916626, "rewards/margins": 15.563817977905273, "rewards/rejected": -16.932231903076172, "step": 3571 }, { "epoch": 0.86, "learning_rate": 3.173333333333333e-08, "logps/chosen": -263.3016357421875, "logps/rejected": -433.1904296875, "loss": 0.0001, "losses/dpo": 1.4127939269859002e-12, "losses/sft": 0.7263510823249817, "losses/total": 1.4127939269859002e-12, "ref_logps/chosen": -245.37109375, "ref_logps/rejected": -236.32675170898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.793054461479187, "rewards/margins": 17.893314361572266, "rewards/rejected": -19.68636703491211, "step": 3572 }, { "epoch": 0.86, "learning_rate": 3.168e-08, "logps/chosen": -226.3240966796875, "logps/rejected": -361.6875, "loss": 0.0054, "losses/dpo": 1.1289769119571247e-10, "losses/sft": 0.5293673872947693, "losses/total": 1.1289769119571247e-10, "ref_logps/chosen": -215.15965270996094, "ref_logps/rejected": -203.68125915527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.1164432764053345, "rewards/margins": 14.684181213378906, "rewards/rejected": -15.80062484741211, "step": 3573 }, { "epoch": 0.86, "learning_rate": 3.162666666666666e-08, "logps/chosen": -243.2640838623047, "logps/rejected": -412.00189208984375, "loss": 0.0041, "losses/dpo": 6.500900512351748e-11, "losses/sft": 0.5633880496025085, "losses/total": 6.500900512351748e-11, "ref_logps/chosen": -226.94027709960938, "ref_logps/rejected": -231.04132080078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.632380723953247, "rewards/margins": 16.46367645263672, "rewards/rejected": -18.096057891845703, "step": 3574 }, { "epoch": 0.86, "learning_rate": 3.157333333333333e-08, "logps/chosen": -249.39303588867188, "logps/rejected": -410.73101806640625, "loss": 0.0011, "losses/dpo": 4.583253954137945e-08, "losses/sft": 0.6792060732841492, "losses/total": 4.583253954137945e-08, "ref_logps/chosen": -231.3390655517578, "ref_logps/rejected": -222.83792114257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.8053996562957764, "rewards/margins": 16.98391342163086, "rewards/rejected": -18.789310455322266, "step": 3575 }, { "epoch": 0.86, "learning_rate": 3.1519999999999994e-08, "logps/chosen": -251.3822479248047, "logps/rejected": -394.03192138671875, "loss": 0.0014, "losses/dpo": 3.568404054021812e-06, "losses/sft": 0.7726696729660034, "losses/total": 3.568404054021812e-06, "ref_logps/chosen": -229.544677734375, "ref_logps/rejected": -210.7364501953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.18375563621521, "rewards/margins": 16.14579200744629, "rewards/rejected": -18.329547882080078, "step": 3576 }, { "epoch": 0.86, "learning_rate": 3.1466666666666664e-08, "logps/chosen": -264.5307922363281, "logps/rejected": -370.3857727050781, "loss": 0.0003, "losses/dpo": 3.675516280221558e-10, "losses/sft": 0.5416945815086365, "losses/total": 3.675516280221558e-10, "ref_logps/chosen": -244.44671630859375, "ref_logps/rejected": -206.32603454589844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0084068775177, "rewards/margins": 14.397565841674805, "rewards/rejected": -16.405973434448242, "step": 3577 }, { "epoch": 0.86, "learning_rate": 3.141333333333333e-08, "logps/chosen": -247.03843688964844, "logps/rejected": -419.3827209472656, "loss": 0.0003, "losses/dpo": 2.0925340038502327e-07, "losses/sft": 0.4083773195743561, "losses/total": 2.0925340038502327e-07, "ref_logps/chosen": -230.89532470703125, "ref_logps/rejected": -238.59255981445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6143121719360352, "rewards/margins": 16.464702606201172, "rewards/rejected": -18.07901382446289, "step": 3578 }, { "epoch": 0.86, "learning_rate": 3.136e-08, "logps/chosen": -243.66571044921875, "logps/rejected": -413.41302490234375, "loss": 0.0002, "losses/dpo": 1.1653728648752804e-07, "losses/sft": 0.8341140747070312, "losses/total": 1.1653728648752804e-07, "ref_logps/chosen": -226.31051635742188, "ref_logps/rejected": -232.68426513671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7355189323425293, "rewards/margins": 16.337356567382812, "rewards/rejected": -18.0728759765625, "step": 3579 }, { "epoch": 0.86, "learning_rate": 3.130666666666667e-08, "logps/chosen": -240.015625, "logps/rejected": -395.6864013671875, "loss": 0.0016, "losses/dpo": 1.528412840734461e-09, "losses/sft": 0.6008356809616089, "losses/total": 1.528412840734461e-09, "ref_logps/chosen": -224.5938720703125, "ref_logps/rejected": -235.19004821777344, "rewards/accuracies": 1.0, "rewards/chosen": -1.5421736240386963, "rewards/margins": 14.507463455200195, "rewards/rejected": -16.049636840820312, "step": 3580 }, { "epoch": 0.86, "learning_rate": 3.125333333333333e-08, "logps/chosen": -235.52330017089844, "logps/rejected": -392.34381103515625, "loss": 0.0001, "losses/dpo": 1.8146504316973733e-06, "losses/sft": 1.054822325706482, "losses/total": 1.8146504316973733e-06, "ref_logps/chosen": -219.87692260742188, "ref_logps/rejected": -218.68453979492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5646371841430664, "rewards/margins": 15.801290512084961, "rewards/rejected": -17.365928649902344, "step": 3581 }, { "epoch": 0.86, "learning_rate": 3.12e-08, "logps/chosen": -239.48281860351562, "logps/rejected": -390.05474853515625, "loss": 0.0049, "losses/dpo": 1.0143979878307618e-08, "losses/sft": 0.3656216561794281, "losses/total": 1.0143979878307618e-08, "ref_logps/chosen": -221.3616180419922, "ref_logps/rejected": -219.00473022460938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8121206760406494, "rewards/margins": 15.292879104614258, "rewards/rejected": -17.104999542236328, "step": 3582 }, { "epoch": 0.86, "learning_rate": 3.1146666666666665e-08, "logps/chosen": -269.66802978515625, "logps/rejected": -398.8312072753906, "loss": 0.0002, "losses/dpo": 5.16333020783577e-09, "losses/sft": 0.5680778622627258, "losses/total": 5.16333020783577e-09, "ref_logps/chosen": -254.9864501953125, "ref_logps/rejected": -229.85130310058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.4681564569473267, "rewards/margins": 15.429834365844727, "rewards/rejected": -16.897991180419922, "step": 3583 }, { "epoch": 0.86, "learning_rate": 3.1093333333333335e-08, "logps/chosen": -272.0920715332031, "logps/rejected": -461.24609375, "loss": 0.0004, "losses/dpo": 1.507524834132301e-11, "losses/sft": 0.4493255913257599, "losses/total": 1.507524834132301e-11, "ref_logps/chosen": -250.8470001220703, "ref_logps/rejected": -258.48162841796875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1245083808898926, "rewards/margins": 18.151939392089844, "rewards/rejected": -20.276445388793945, "step": 3584 }, { "epoch": 0.86, "learning_rate": 3.104e-08, "logps/chosen": -280.4059753417969, "logps/rejected": -412.1748046875, "loss": 0.0002, "losses/dpo": 7.038571392392612e-10, "losses/sft": 0.6634511947631836, "losses/total": 7.038571392392612e-10, "ref_logps/chosen": -260.02984619140625, "ref_logps/rejected": -227.01031494140625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0376150608062744, "rewards/margins": 16.478836059570312, "rewards/rejected": -18.51645278930664, "step": 3585 }, { "epoch": 0.86, "learning_rate": 3.098666666666667e-08, "logps/chosen": -213.61138916015625, "logps/rejected": -417.0435791015625, "loss": 0.0022, "losses/dpo": 5.840692756464705e-05, "losses/sft": 1.0864592790603638, "losses/total": 5.840692756464705e-05, "ref_logps/chosen": -195.6796112060547, "ref_logps/rejected": -232.71669006347656, "rewards/accuracies": 1.0, "rewards/chosen": -1.793177604675293, "rewards/margins": 16.639511108398438, "rewards/rejected": -18.432689666748047, "step": 3586 }, { "epoch": 0.86, "learning_rate": 3.093333333333333e-08, "logps/chosen": -232.3272247314453, "logps/rejected": -375.69268798828125, "loss": 0.0001, "losses/dpo": 5.7258916058344766e-05, "losses/sft": 1.0481303930282593, "losses/total": 5.7258916058344766e-05, "ref_logps/chosen": -212.6110382080078, "ref_logps/rejected": -204.4952392578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9716187715530396, "rewards/margins": 15.148127555847168, "rewards/rejected": -17.119747161865234, "step": 3587 }, { "epoch": 0.86, "learning_rate": 3.088e-08, "logps/chosen": -260.53790283203125, "logps/rejected": -403.7127990722656, "loss": 0.001, "losses/dpo": 5.289960469667676e-09, "losses/sft": 0.44295525550842285, "losses/total": 5.289960469667676e-09, "ref_logps/chosen": -236.22433471679688, "ref_logps/rejected": -227.27178955078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.4313549995422363, "rewards/margins": 15.212747573852539, "rewards/rejected": -17.64410400390625, "step": 3588 }, { "epoch": 0.86, "learning_rate": 3.0826666666666666e-08, "logps/chosen": -225.82626342773438, "logps/rejected": -405.615234375, "loss": 0.0009, "losses/dpo": 2.1516368686747e-09, "losses/sft": 0.7161441445350647, "losses/total": 2.1516368686747e-09, "ref_logps/chosen": -205.9419403076172, "ref_logps/rejected": -223.85536193847656, "rewards/accuracies": 1.0, "rewards/chosen": -1.9884321689605713, "rewards/margins": 16.187557220458984, "rewards/rejected": -18.17599105834961, "step": 3589 }, { "epoch": 0.86, "learning_rate": 3.0773333333333336e-08, "logps/chosen": -217.36923217773438, "logps/rejected": -412.5235290527344, "loss": 0.0009, "losses/dpo": 2.1253144666388835e-07, "losses/sft": 0.7759647369384766, "losses/total": 2.1253144666388835e-07, "ref_logps/chosen": -199.79763793945312, "ref_logps/rejected": -219.03158569335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.757157325744629, "rewards/margins": 17.592039108276367, "rewards/rejected": -19.34919548034668, "step": 3590 }, { "epoch": 0.86, "learning_rate": 3.071999999999999e-08, "logps/chosen": -224.09027099609375, "logps/rejected": -354.374755859375, "loss": 0.0005, "losses/dpo": 1.8432444903737633e-06, "losses/sft": 0.48474985361099243, "losses/total": 1.8432444903737633e-06, "ref_logps/chosen": -210.76101684570312, "ref_logps/rejected": -199.1832275390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.332925796508789, "rewards/margins": 14.186227798461914, "rewards/rejected": -15.519152641296387, "step": 3591 }, { "epoch": 0.86, "learning_rate": 3.0666666666666663e-08, "logps/chosen": -266.5694580078125, "logps/rejected": -390.8829345703125, "loss": 0.0004, "losses/dpo": 1.336592134748571e-07, "losses/sft": 0.6352093815803528, "losses/total": 1.336592134748571e-07, "ref_logps/chosen": -247.86390686035156, "ref_logps/rejected": -218.09661865234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8705525398254395, "rewards/margins": 15.408079147338867, "rewards/rejected": -17.27863121032715, "step": 3592 }, { "epoch": 0.86, "learning_rate": 3.061333333333333e-08, "logps/chosen": -229.19863891601562, "logps/rejected": -418.3837890625, "loss": 0.0, "losses/dpo": 5.160828361883141e-13, "losses/sft": 0.642166793346405, "losses/total": 5.160828361883141e-13, "ref_logps/chosen": -207.95033264160156, "ref_logps/rejected": -229.92239379882812, "rewards/accuracies": 1.0, "rewards/chosen": -2.1248319149017334, "rewards/margins": 16.721309661865234, "rewards/rejected": -18.846141815185547, "step": 3593 }, { "epoch": 0.86, "learning_rate": 3.056e-08, "logps/chosen": -239.7631378173828, "logps/rejected": -409.79034423828125, "loss": 0.0018, "losses/dpo": 6.581905687141898e-09, "losses/sft": 0.6389709711074829, "losses/total": 6.581905687141898e-09, "ref_logps/chosen": -222.89279174804688, "ref_logps/rejected": -230.59014892578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6870352029800415, "rewards/margins": 16.232982635498047, "rewards/rejected": -17.92001724243164, "step": 3594 }, { "epoch": 0.86, "learning_rate": 3.050666666666666e-08, "logps/chosen": -247.84811401367188, "logps/rejected": -364.42388916015625, "loss": 0.0005, "losses/dpo": 1.069555732868821e-08, "losses/sft": 0.8848466873168945, "losses/total": 1.069555732868821e-08, "ref_logps/chosen": -230.59286499023438, "ref_logps/rejected": -203.1236572265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.72552490234375, "rewards/margins": 14.404495239257812, "rewards/rejected": -16.130020141601562, "step": 3595 }, { "epoch": 0.86, "learning_rate": 3.045333333333333e-08, "logps/chosen": -251.3134765625, "logps/rejected": -406.650146484375, "loss": 0.0003, "losses/dpo": 5.848378205330906e-12, "losses/sft": 0.6117949485778809, "losses/total": 5.848378205330906e-12, "ref_logps/chosen": -234.1460418701172, "ref_logps/rejected": -226.7630157470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7167420387268066, "rewards/margins": 16.271968841552734, "rewards/rejected": -17.98870849609375, "step": 3596 }, { "epoch": 0.86, "learning_rate": 3.04e-08, "logps/chosen": -260.5860595703125, "logps/rejected": -418.38616943359375, "loss": 0.0013, "losses/dpo": 2.4744729216763517e-06, "losses/sft": 0.647689163684845, "losses/total": 2.4744729216763517e-06, "ref_logps/chosen": -244.20315551757812, "ref_logps/rejected": -239.8389129638672, "rewards/accuracies": 1.0, "rewards/chosen": -1.6382901668548584, "rewards/margins": 16.2164363861084, "rewards/rejected": -17.854726791381836, "step": 3597 }, { "epoch": 0.86, "learning_rate": 3.0346666666666664e-08, "logps/chosen": -222.18817138671875, "logps/rejected": -363.6072082519531, "loss": 0.0029, "losses/dpo": 3.242382196066984e-10, "losses/sft": 0.8117702007293701, "losses/total": 3.242382196066984e-10, "ref_logps/chosen": -206.55484008789062, "ref_logps/rejected": -199.24020385742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5633341073989868, "rewards/margins": 14.873366355895996, "rewards/rejected": -16.43670082092285, "step": 3598 }, { "epoch": 0.86, "learning_rate": 3.0293333333333335e-08, "logps/chosen": -276.4104919433594, "logps/rejected": -401.2332763671875, "loss": 0.0003, "losses/dpo": 1.0192392352337265e-07, "losses/sft": 0.5056437849998474, "losses/total": 1.0192392352337265e-07, "ref_logps/chosen": -255.63401794433594, "ref_logps/rejected": -228.8806915283203, "rewards/accuracies": 1.0, "rewards/chosen": -2.077646255493164, "rewards/margins": 15.157609939575195, "rewards/rejected": -17.235254287719727, "step": 3599 }, { "epoch": 0.86, "learning_rate": 3.024e-08, "logps/chosen": -248.993896484375, "logps/rejected": -428.99365234375, "loss": 0.0001, "losses/dpo": 5.374605205332728e-10, "losses/sft": 0.6763838529586792, "losses/total": 5.374605205332728e-10, "ref_logps/chosen": -230.50035095214844, "ref_logps/rejected": -241.03909301757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.84935462474823, "rewards/margins": 16.94610023498535, "rewards/rejected": -18.795454025268555, "step": 3600 }, { "epoch": 0.86, "learning_rate": 3.018666666666667e-08, "logps/chosen": -191.42068481445312, "logps/rejected": -369.7908630371094, "loss": 0.0014, "losses/dpo": 9.250041704822465e-12, "losses/sft": 0.5459564328193665, "losses/total": 9.250041704822465e-12, "ref_logps/chosen": -179.42611694335938, "ref_logps/rejected": -208.76625061035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.199455738067627, "rewards/margins": 14.903005599975586, "rewards/rejected": -16.102462768554688, "step": 3601 }, { "epoch": 0.86, "learning_rate": 3.013333333333333e-08, "logps/chosen": -256.2115173339844, "logps/rejected": -401.6620178222656, "loss": 0.0002, "losses/dpo": 3.620248278934923e-08, "losses/sft": 0.5534083247184753, "losses/total": 3.620248278934923e-08, "ref_logps/chosen": -238.295166015625, "ref_logps/rejected": -228.39830017089844, "rewards/accuracies": 1.0, "rewards/chosen": -1.7916358709335327, "rewards/margins": 15.534737586975098, "rewards/rejected": -17.326374053955078, "step": 3602 }, { "epoch": 0.86, "learning_rate": 3.008e-08, "logps/chosen": -260.3416748046875, "logps/rejected": -449.04608154296875, "loss": 0.0011, "losses/dpo": 1.6808382241606523e-08, "losses/sft": 0.6887344717979431, "losses/total": 1.6808382241606523e-08, "ref_logps/chosen": -241.29025268554688, "ref_logps/rejected": -250.29458618164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9051411151885986, "rewards/margins": 17.97001075744629, "rewards/rejected": -19.875150680541992, "step": 3603 }, { "epoch": 0.86, "learning_rate": 3.0026666666666666e-08, "logps/chosen": -234.55882263183594, "logps/rejected": -367.6237487792969, "loss": 0.0054, "losses/dpo": 2.657327229371731e-07, "losses/sft": 0.5775154232978821, "losses/total": 2.657327229371731e-07, "ref_logps/chosen": -217.67236328125, "ref_logps/rejected": -216.45571899414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.688645362854004, "rewards/margins": 13.4281587600708, "rewards/rejected": -15.116804122924805, "step": 3604 }, { "epoch": 0.87, "learning_rate": 2.9973333333333336e-08, "logps/chosen": -225.9337158203125, "logps/rejected": -358.2294921875, "loss": 0.0008, "losses/dpo": 2.854263811968849e-07, "losses/sft": 0.5911316275596619, "losses/total": 2.854263811968849e-07, "ref_logps/chosen": -209.351806640625, "ref_logps/rejected": -204.2652130126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.658189296722412, "rewards/margins": 13.738237380981445, "rewards/rejected": -15.396427154541016, "step": 3605 }, { "epoch": 0.87, "learning_rate": 2.992e-08, "logps/chosen": -192.08055114746094, "logps/rejected": -384.93621826171875, "loss": 0.0, "losses/dpo": 2.736939841430086e-10, "losses/sft": 0.624064564704895, "losses/total": 2.736939841430086e-10, "ref_logps/chosen": -177.0498046875, "ref_logps/rejected": -210.7570037841797, "rewards/accuracies": 1.0, "rewards/chosen": -1.5030765533447266, "rewards/margins": 15.91484546661377, "rewards/rejected": -17.41792106628418, "step": 3606 }, { "epoch": 0.87, "learning_rate": 2.986666666666667e-08, "logps/chosen": -217.23268127441406, "logps/rejected": -395.02691650390625, "loss": 0.0015, "losses/dpo": 1.9513994686803926e-07, "losses/sft": 0.8297291994094849, "losses/total": 1.9513994686803926e-07, "ref_logps/chosen": -201.01953125, "ref_logps/rejected": -228.53074645996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.621315598487854, "rewards/margins": 15.028300285339355, "rewards/rejected": -16.649616241455078, "step": 3607 }, { "epoch": 0.87, "learning_rate": 2.9813333333333326e-08, "logps/chosen": -249.72528076171875, "logps/rejected": -456.04815673828125, "loss": 0.0, "losses/dpo": 4.707568166395504e-07, "losses/sft": 0.7265700101852417, "losses/total": 4.707568166395504e-07, "ref_logps/chosen": -230.2642822265625, "ref_logps/rejected": -252.0795135498047, "rewards/accuracies": 1.0, "rewards/chosen": -1.9461020231246948, "rewards/margins": 18.450759887695312, "rewards/rejected": -20.396862030029297, "step": 3608 }, { "epoch": 0.87, "learning_rate": 2.9759999999999996e-08, "logps/chosen": -262.78375244140625, "logps/rejected": -426.1679382324219, "loss": 0.0, "losses/dpo": 7.15858259354718e-05, "losses/sft": 0.5066314935684204, "losses/total": 7.15858259354718e-05, "ref_logps/chosen": -242.57034301757812, "ref_logps/rejected": -232.07000732421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0213418006896973, "rewards/margins": 17.388450622558594, "rewards/rejected": -19.409791946411133, "step": 3609 }, { "epoch": 0.87, "learning_rate": 2.9706666666666663e-08, "logps/chosen": -262.1693115234375, "logps/rejected": -422.58526611328125, "loss": 0.0001, "losses/dpo": 1.196851599161164e-06, "losses/sft": 1.157825231552124, "losses/total": 1.196851599161164e-06, "ref_logps/chosen": -239.14698791503906, "ref_logps/rejected": -236.7505645751953, "rewards/accuracies": 1.0, "rewards/chosen": -2.302234172821045, "rewards/margins": 16.28123664855957, "rewards/rejected": -18.583471298217773, "step": 3610 }, { "epoch": 0.87, "learning_rate": 2.965333333333333e-08, "logps/chosen": -214.36526489257812, "logps/rejected": -349.7568054199219, "loss": 0.0058, "losses/dpo": 3.9325204204132547e-10, "losses/sft": 0.5318859219551086, "losses/total": 3.9325204204132547e-10, "ref_logps/chosen": -201.15309143066406, "ref_logps/rejected": -193.48074340820312, "rewards/accuracies": 1.0, "rewards/chosen": -1.3212167024612427, "rewards/margins": 14.306391716003418, "rewards/rejected": -15.627609252929688, "step": 3611 }, { "epoch": 0.87, "learning_rate": 2.9599999999999997e-08, "logps/chosen": -192.2559814453125, "logps/rejected": -352.5411376953125, "loss": 0.0001, "losses/dpo": 2.3630261694052024e-06, "losses/sft": 0.7495097517967224, "losses/total": 2.3630261694052024e-06, "ref_logps/chosen": -174.24197387695312, "ref_logps/rejected": -195.79183959960938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8013999462127686, "rewards/margins": 13.873531341552734, "rewards/rejected": -15.674930572509766, "step": 3612 }, { "epoch": 0.87, "learning_rate": 2.9546666666666664e-08, "logps/chosen": -213.11831665039062, "logps/rejected": -386.06390380859375, "loss": 0.0029, "losses/dpo": 1.358734778023063e-07, "losses/sft": 0.7198582291603088, "losses/total": 1.358734778023063e-07, "ref_logps/chosen": -193.06967163085938, "ref_logps/rejected": -224.07601928710938, "rewards/accuracies": 1.0, "rewards/chosen": -2.004866123199463, "rewards/margins": 14.193922996520996, "rewards/rejected": -16.198787689208984, "step": 3613 }, { "epoch": 0.87, "learning_rate": 2.949333333333333e-08, "logps/chosen": -228.05397033691406, "logps/rejected": -361.6081237792969, "loss": 0.0011, "losses/dpo": 6.544074858538806e-05, "losses/sft": 1.327064871788025, "losses/total": 6.544074858538806e-05, "ref_logps/chosen": -210.50889587402344, "ref_logps/rejected": -208.40103149414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.754509449005127, "rewards/margins": 13.566201210021973, "rewards/rejected": -15.320711135864258, "step": 3614 }, { "epoch": 0.87, "learning_rate": 2.9439999999999998e-08, "logps/chosen": -245.9979705810547, "logps/rejected": -384.5631408691406, "loss": 0.0006, "losses/dpo": 1.5615723159001504e-09, "losses/sft": 0.7570298314094543, "losses/total": 1.5615723159001504e-09, "ref_logps/chosen": -229.08389282226562, "ref_logps/rejected": -219.43743896484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6914082765579224, "rewards/margins": 14.821162223815918, "rewards/rejected": -16.512571334838867, "step": 3615 }, { "epoch": 0.87, "learning_rate": 2.9386666666666664e-08, "logps/chosen": -234.19752502441406, "logps/rejected": -374.3274230957031, "loss": 0.0005, "losses/dpo": 2.584903298352703e-11, "losses/sft": 0.559882402420044, "losses/total": 2.584903298352703e-11, "ref_logps/chosen": -222.8182373046875, "ref_logps/rejected": -224.587158203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.1379289627075195, "rewards/margins": 13.836099624633789, "rewards/rejected": -14.974027633666992, "step": 3616 }, { "epoch": 0.87, "learning_rate": 2.933333333333333e-08, "logps/chosen": -243.09120178222656, "logps/rejected": -368.9202880859375, "loss": 0.0015, "losses/dpo": 3.333253334858455e-05, "losses/sft": 0.9023441076278687, "losses/total": 3.333253334858455e-05, "ref_logps/chosen": -226.79006958007812, "ref_logps/rejected": -209.97691345214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6301120519638062, "rewards/margins": 14.264223098754883, "rewards/rejected": -15.894336700439453, "step": 3617 }, { "epoch": 0.87, "learning_rate": 2.9279999999999998e-08, "logps/chosen": -206.72702026367188, "logps/rejected": -368.5037841796875, "loss": 0.0002, "losses/dpo": 1.1978317893124313e-08, "losses/sft": 0.6631916761398315, "losses/total": 1.1978317893124313e-08, "ref_logps/chosen": -189.50010681152344, "ref_logps/rejected": -202.54644775390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7226921319961548, "rewards/margins": 14.87304401397705, "rewards/rejected": -16.595735549926758, "step": 3618 }, { "epoch": 0.87, "learning_rate": 2.9226666666666665e-08, "logps/chosen": -230.01193237304688, "logps/rejected": -380.0704345703125, "loss": 0.0005, "losses/dpo": 2.910924001842119e-10, "losses/sft": 0.9683566093444824, "losses/total": 2.910924001842119e-10, "ref_logps/chosen": -218.64859008789062, "ref_logps/rejected": -213.9961700439453, "rewards/accuracies": 1.0, "rewards/chosen": -1.1363351345062256, "rewards/margins": 15.471092224121094, "rewards/rejected": -16.6074275970459, "step": 3619 }, { "epoch": 0.87, "learning_rate": 2.9173333333333332e-08, "logps/chosen": -283.2509460449219, "logps/rejected": -425.0730895996094, "loss": 0.0007, "losses/dpo": 5.618666865103705e-09, "losses/sft": 0.44911661744117737, "losses/total": 5.618666865103705e-09, "ref_logps/chosen": -260.0130310058594, "ref_logps/rejected": -236.16668701171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.323791265487671, "rewards/margins": 16.566850662231445, "rewards/rejected": -18.890642166137695, "step": 3620 }, { "epoch": 0.87, "learning_rate": 2.912e-08, "logps/chosen": -242.45889282226562, "logps/rejected": -423.87890625, "loss": 0.0, "losses/dpo": 5.030844629771991e-10, "losses/sft": 0.5429989099502563, "losses/total": 5.030844629771991e-10, "ref_logps/chosen": -224.82728576660156, "ref_logps/rejected": -238.03762817382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7631596326828003, "rewards/margins": 16.820968627929688, "rewards/rejected": -18.584129333496094, "step": 3621 }, { "epoch": 0.87, "learning_rate": 2.9066666666666666e-08, "logps/chosen": -245.38182067871094, "logps/rejected": -371.75323486328125, "loss": 0.013, "losses/dpo": 3.780443691425717e-08, "losses/sft": 0.6966020464897156, "losses/total": 3.780443691425717e-08, "ref_logps/chosen": -226.89462280273438, "ref_logps/rejected": -208.6649169921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8487187623977661, "rewards/margins": 14.460114479064941, "rewards/rejected": -16.308834075927734, "step": 3622 }, { "epoch": 0.87, "learning_rate": 2.9013333333333336e-08, "logps/chosen": -233.9517822265625, "logps/rejected": -415.33978271484375, "loss": 0.0014, "losses/dpo": 6.84734047240454e-08, "losses/sft": 0.7343202829360962, "losses/total": 6.84734047240454e-08, "ref_logps/chosen": -214.48492431640625, "ref_logps/rejected": -228.53529357910156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9466835260391235, "rewards/margins": 16.7337646484375, "rewards/rejected": -18.680450439453125, "step": 3623 }, { "epoch": 0.87, "learning_rate": 2.8960000000000003e-08, "logps/chosen": -308.19989013671875, "logps/rejected": -416.87890625, "loss": 0.0039, "losses/dpo": 1.2343055288965843e-07, "losses/sft": 1.1804276704788208, "losses/total": 1.2343055288965843e-07, "ref_logps/chosen": -286.5150146484375, "ref_logps/rejected": -232.47952270507812, "rewards/accuracies": 1.0, "rewards/chosen": -2.168487548828125, "rewards/margins": 16.27145004272461, "rewards/rejected": -18.439939498901367, "step": 3624 }, { "epoch": 0.87, "learning_rate": 2.890666666666667e-08, "logps/chosen": -261.4571838378906, "logps/rejected": -457.92767333984375, "loss": 0.0004, "losses/dpo": 9.395512279297691e-08, "losses/sft": 0.6417413949966431, "losses/total": 9.395512279297691e-08, "ref_logps/chosen": -243.64202880859375, "ref_logps/rejected": -259.0824890136719, "rewards/accuracies": 1.0, "rewards/chosen": -1.7815165519714355, "rewards/margins": 18.103002548217773, "rewards/rejected": -19.884517669677734, "step": 3625 }, { "epoch": 0.87, "learning_rate": 2.885333333333333e-08, "logps/chosen": -239.28622436523438, "logps/rejected": -414.56842041015625, "loss": 0.0002, "losses/dpo": 1.577739161540137e-09, "losses/sft": 0.4515572190284729, "losses/total": 1.577739161540137e-09, "ref_logps/chosen": -223.8220977783203, "ref_logps/rejected": -244.73626708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5464110374450684, "rewards/margins": 15.43680191040039, "rewards/rejected": -16.983213424682617, "step": 3626 }, { "epoch": 0.87, "learning_rate": 2.8799999999999996e-08, "logps/chosen": -204.9417724609375, "logps/rejected": -342.038818359375, "loss": 0.0012, "losses/dpo": 1.5604256304868613e-06, "losses/sft": 0.6718031167984009, "losses/total": 1.5604256304868613e-06, "ref_logps/chosen": -190.92808532714844, "ref_logps/rejected": -191.87350463867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4013688564300537, "rewards/margins": 13.615161895751953, "rewards/rejected": -15.016530990600586, "step": 3627 }, { "epoch": 0.87, "learning_rate": 2.8746666666666663e-08, "logps/chosen": -226.60916137695312, "logps/rejected": -385.8375244140625, "loss": 0.0044, "losses/dpo": 5.582127471370768e-08, "losses/sft": 0.4987608790397644, "losses/total": 5.582127471370768e-08, "ref_logps/chosen": -208.2259063720703, "ref_logps/rejected": -216.1552276611328, "rewards/accuracies": 1.0, "rewards/chosen": -1.8383233547210693, "rewards/margins": 15.129907608032227, "rewards/rejected": -16.968231201171875, "step": 3628 }, { "epoch": 0.87, "learning_rate": 2.869333333333333e-08, "logps/chosen": -236.9516143798828, "logps/rejected": -399.36798095703125, "loss": 0.0001, "losses/dpo": 3.4106510948817004e-09, "losses/sft": 0.7781791090965271, "losses/total": 3.4106510948817004e-09, "ref_logps/chosen": -215.92523193359375, "ref_logps/rejected": -226.05343627929688, "rewards/accuracies": 1.0, "rewards/chosen": -2.10263729095459, "rewards/margins": 15.2288179397583, "rewards/rejected": -17.33145523071289, "step": 3629 }, { "epoch": 0.87, "learning_rate": 2.8639999999999997e-08, "logps/chosen": -228.0322265625, "logps/rejected": -410.6032409667969, "loss": 0.0009, "losses/dpo": 6.630817561692481e-10, "losses/sft": 0.4665514826774597, "losses/total": 6.630817561692481e-10, "ref_logps/chosen": -210.15301513671875, "ref_logps/rejected": -224.07925415039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.787920594215393, "rewards/margins": 16.864477157592773, "rewards/rejected": -18.65239715576172, "step": 3630 }, { "epoch": 0.87, "learning_rate": 2.8586666666666664e-08, "logps/chosen": -260.3717346191406, "logps/rejected": -420.3453674316406, "loss": 0.0037, "losses/dpo": 4.588269486571761e-10, "losses/sft": 0.6792056560516357, "losses/total": 4.588269486571761e-10, "ref_logps/chosen": -245.07408142089844, "ref_logps/rejected": -238.40231323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5297660827636719, "rewards/margins": 16.664541244506836, "rewards/rejected": -18.194307327270508, "step": 3631 }, { "epoch": 0.87, "learning_rate": 2.853333333333333e-08, "logps/chosen": -236.97613525390625, "logps/rejected": -404.987548828125, "loss": 0.0042, "losses/dpo": 6.302990129825048e-08, "losses/sft": 0.6824994087219238, "losses/total": 6.302990129825048e-08, "ref_logps/chosen": -223.57962036132812, "ref_logps/rejected": -232.6931915283203, "rewards/accuracies": 1.0, "rewards/chosen": -1.3396518230438232, "rewards/margins": 15.889785766601562, "rewards/rejected": -17.22943878173828, "step": 3632 }, { "epoch": 0.87, "learning_rate": 2.8479999999999998e-08, "logps/chosen": -263.7333984375, "logps/rejected": -408.5841064453125, "loss": 0.0025, "losses/dpo": 1.3798027964639914e-07, "losses/sft": 0.6625137329101562, "losses/total": 1.3798027964639914e-07, "ref_logps/chosen": -238.15560913085938, "ref_logps/rejected": -225.80552673339844, "rewards/accuracies": 1.0, "rewards/chosen": -2.557776927947998, "rewards/margins": 15.720081329345703, "rewards/rejected": -18.27785873413086, "step": 3633 }, { "epoch": 0.87, "learning_rate": 2.8426666666666664e-08, "logps/chosen": -267.87567138671875, "logps/rejected": -397.48468017578125, "loss": 0.0006, "losses/dpo": 2.7203922670082648e-08, "losses/sft": 1.082109808921814, "losses/total": 2.7203922670082648e-08, "ref_logps/chosen": -247.34027099609375, "ref_logps/rejected": -228.0221710205078, "rewards/accuracies": 1.0, "rewards/chosen": -2.0535411834716797, "rewards/margins": 14.892709732055664, "rewards/rejected": -16.946250915527344, "step": 3634 }, { "epoch": 0.87, "learning_rate": 2.837333333333333e-08, "logps/chosen": -251.2017364501953, "logps/rejected": -394.5249328613281, "loss": 0.0049, "losses/dpo": 9.844792536739533e-09, "losses/sft": 0.4945908784866333, "losses/total": 9.844792536739533e-09, "ref_logps/chosen": -234.14166259765625, "ref_logps/rejected": -226.63812255859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7060068845748901, "rewards/margins": 15.082674980163574, "rewards/rejected": -16.78868293762207, "step": 3635 }, { "epoch": 0.87, "learning_rate": 2.8319999999999998e-08, "logps/chosen": -254.514404296875, "logps/rejected": -388.9753723144531, "loss": 0.0015, "losses/dpo": 1.816541583821163e-09, "losses/sft": 0.5549130439758301, "losses/total": 1.816541583821163e-09, "ref_logps/chosen": -231.63169860839844, "ref_logps/rejected": -222.29122924804688, "rewards/accuracies": 1.0, "rewards/chosen": -2.288269519805908, "rewards/margins": 14.380146026611328, "rewards/rejected": -16.668415069580078, "step": 3636 }, { "epoch": 0.87, "learning_rate": 2.8266666666666665e-08, "logps/chosen": -248.8708038330078, "logps/rejected": -403.8787536621094, "loss": 0.0055, "losses/dpo": 1.779814517988143e-08, "losses/sft": 0.6093987822532654, "losses/total": 1.779814517988143e-08, "ref_logps/chosen": -231.55679321289062, "ref_logps/rejected": -221.24827575683594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7314003705978394, "rewards/margins": 16.531646728515625, "rewards/rejected": -18.26304817199707, "step": 3637 }, { "epoch": 0.87, "learning_rate": 2.8213333333333332e-08, "logps/chosen": -255.86822509765625, "logps/rejected": -463.04193115234375, "loss": 0.0001, "losses/dpo": 1.436858276804287e-08, "losses/sft": 0.7657783031463623, "losses/total": 1.436858276804287e-08, "ref_logps/chosen": -240.7737579345703, "ref_logps/rejected": -278.6312255859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.509448528289795, "rewards/margins": 16.931623458862305, "rewards/rejected": -18.441070556640625, "step": 3638 }, { "epoch": 0.87, "learning_rate": 2.8160000000000002e-08, "logps/chosen": -239.839599609375, "logps/rejected": -399.20440673828125, "loss": 0.0, "losses/dpo": 2.9481517671570145e-08, "losses/sft": 0.5833967924118042, "losses/total": 2.9481517671570145e-08, "ref_logps/chosen": -223.45849609375, "ref_logps/rejected": -230.4593048095703, "rewards/accuracies": 1.0, "rewards/chosen": -1.6381068229675293, "rewards/margins": 15.236404418945312, "rewards/rejected": -16.87451171875, "step": 3639 }, { "epoch": 0.87, "learning_rate": 2.810666666666667e-08, "logps/chosen": -300.84869384765625, "logps/rejected": -401.23681640625, "loss": 0.0, "losses/dpo": 7.436967930374294e-09, "losses/sft": 0.6473709940910339, "losses/total": 7.436967930374294e-09, "ref_logps/chosen": -279.32208251953125, "ref_logps/rejected": -221.6208038330078, "rewards/accuracies": 1.0, "rewards/chosen": -2.152663230895996, "rewards/margins": 15.808937072753906, "rewards/rejected": -17.96160125732422, "step": 3640 }, { "epoch": 0.87, "learning_rate": 2.8053333333333336e-08, "logps/chosen": -229.97760009765625, "logps/rejected": -398.38665771484375, "loss": 0.001, "losses/dpo": 0.0005437837680801749, "losses/sft": 0.5718706846237183, "losses/total": 0.0005437837680801749, "ref_logps/chosen": -217.25376892089844, "ref_logps/rejected": -230.7469482421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2723819017410278, "rewards/margins": 15.491588592529297, "rewards/rejected": -16.76396942138672, "step": 3641 }, { "epoch": 0.87, "learning_rate": 2.8000000000000003e-08, "logps/chosen": -234.1759033203125, "logps/rejected": -399.1357421875, "loss": 0.0056, "losses/dpo": 6.674896746439174e-11, "losses/sft": 0.6003561019897461, "losses/total": 6.674896746439174e-11, "ref_logps/chosen": -218.1623077392578, "ref_logps/rejected": -229.82620239257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6013604402542114, "rewards/margins": 15.329591751098633, "rewards/rejected": -16.930952072143555, "step": 3642 }, { "epoch": 0.87, "learning_rate": 2.7946666666666663e-08, "logps/chosen": -268.9093322753906, "logps/rejected": -396.63134765625, "loss": 0.0001, "losses/dpo": 0.00037388940108940005, "losses/sft": 0.9439244866371155, "losses/total": 0.00037388940108940005, "ref_logps/chosen": -251.9111328125, "ref_logps/rejected": -229.85275268554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6998207569122314, "rewards/margins": 14.978038787841797, "rewards/rejected": -16.677860260009766, "step": 3643 }, { "epoch": 0.87, "learning_rate": 2.789333333333333e-08, "logps/chosen": -241.50299072265625, "logps/rejected": -382.3804931640625, "loss": 0.0171, "losses/dpo": 3.181229146775877e-07, "losses/sft": 0.4065760374069214, "losses/total": 3.181229146775877e-07, "ref_logps/chosen": -221.56698608398438, "ref_logps/rejected": -214.71429443359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.993599772453308, "rewards/margins": 14.773019790649414, "rewards/rejected": -16.766620635986328, "step": 3644 }, { "epoch": 0.87, "learning_rate": 2.7839999999999997e-08, "logps/chosen": -245.29139709472656, "logps/rejected": -347.2174072265625, "loss": 0.0035, "losses/dpo": 9.506064202469133e-08, "losses/sft": 0.8723223805427551, "losses/total": 9.506064202469133e-08, "ref_logps/chosen": -227.79808044433594, "ref_logps/rejected": -190.87588500976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7493312358856201, "rewards/margins": 13.884824752807617, "rewards/rejected": -15.6341552734375, "step": 3645 }, { "epoch": 0.87, "learning_rate": 2.7786666666666663e-08, "logps/chosen": -236.48817443847656, "logps/rejected": -390.36785888671875, "loss": 0.0007, "losses/dpo": 4.1551796183501855e-13, "losses/sft": 0.6089885234832764, "losses/total": 4.1551796183501855e-13, "ref_logps/chosen": -223.17562866210938, "ref_logps/rejected": -216.8330535888672, "rewards/accuracies": 1.0, "rewards/chosen": -1.3312559127807617, "rewards/margins": 16.02222442626953, "rewards/rejected": -17.35348129272461, "step": 3646 }, { "epoch": 0.88, "learning_rate": 2.773333333333333e-08, "logps/chosen": -228.9346923828125, "logps/rejected": -434.4262390136719, "loss": 0.0, "losses/dpo": 9.68785274224615e-10, "losses/sft": 0.4813212752342224, "losses/total": 9.68785274224615e-10, "ref_logps/chosen": -213.88870239257812, "ref_logps/rejected": -242.3738250732422, "rewards/accuracies": 1.0, "rewards/chosen": -1.5045979022979736, "rewards/margins": 17.70064353942871, "rewards/rejected": -19.205242156982422, "step": 3647 }, { "epoch": 0.88, "learning_rate": 2.7679999999999997e-08, "logps/chosen": -287.69354248046875, "logps/rejected": -437.81201171875, "loss": 0.0005, "losses/dpo": 1.0187043208986779e-08, "losses/sft": 0.7012760639190674, "losses/total": 1.0187043208986779e-08, "ref_logps/chosen": -267.025146484375, "ref_logps/rejected": -250.2803955078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0668416023254395, "rewards/margins": 16.686323165893555, "rewards/rejected": -18.75316619873047, "step": 3648 }, { "epoch": 0.88, "learning_rate": 2.7626666666666664e-08, "logps/chosen": -272.18212890625, "logps/rejected": -417.2737121582031, "loss": 0.0062, "losses/dpo": 1.1178183934035246e-09, "losses/sft": 0.5816816687583923, "losses/total": 1.1178183934035246e-09, "ref_logps/chosen": -256.1439514160156, "ref_logps/rejected": -236.50930786132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6038196086883545, "rewards/margins": 16.472618103027344, "rewards/rejected": -18.076438903808594, "step": 3649 }, { "epoch": 0.88, "learning_rate": 2.757333333333333e-08, "logps/chosen": -283.6884765625, "logps/rejected": -413.9590759277344, "loss": 0.0185, "losses/dpo": 7.724775138127915e-13, "losses/sft": 0.6287597417831421, "losses/total": 7.724775138127915e-13, "ref_logps/chosen": -263.04010009765625, "ref_logps/rejected": -229.748779296875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0648345947265625, "rewards/margins": 16.35619354248047, "rewards/rejected": -18.42102813720703, "step": 3650 }, { "epoch": 0.88, "learning_rate": 2.7519999999999998e-08, "logps/chosen": -254.16094970703125, "logps/rejected": -452.0633850097656, "loss": 0.0003, "losses/dpo": 2.983263357236865e-06, "losses/sft": 0.5415748953819275, "losses/total": 2.983263357236865e-06, "ref_logps/chosen": -237.510009765625, "ref_logps/rejected": -265.33648681640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6650935411453247, "rewards/margins": 17.00759506225586, "rewards/rejected": -18.67268943786621, "step": 3651 }, { "epoch": 0.88, "learning_rate": 2.7466666666666664e-08, "logps/chosen": -247.4735107421875, "logps/rejected": -403.0828857421875, "loss": 0.0019, "losses/dpo": 7.245930078170204e-08, "losses/sft": 0.5329044461250305, "losses/total": 7.245930078170204e-08, "ref_logps/chosen": -230.14059448242188, "ref_logps/rejected": -234.06884765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7332900762557983, "rewards/margins": 15.168112754821777, "rewards/rejected": -16.90140151977539, "step": 3652 }, { "epoch": 0.88, "learning_rate": 2.741333333333333e-08, "logps/chosen": -237.45741271972656, "logps/rejected": -409.67376708984375, "loss": 0.0016, "losses/dpo": 2.3686836936320788e-09, "losses/sft": 0.600464403629303, "losses/total": 2.3686836936320788e-09, "ref_logps/chosen": -219.82373046875, "ref_logps/rejected": -228.55819702148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7633684873580933, "rewards/margins": 16.34818458557129, "rewards/rejected": -18.111553192138672, "step": 3653 }, { "epoch": 0.88, "learning_rate": 2.7359999999999998e-08, "logps/chosen": -268.07672119140625, "logps/rejected": -416.19110107421875, "loss": 0.0014, "losses/dpo": 5.7300802991377964e-11, "losses/sft": 0.6865231990814209, "losses/total": 5.7300802991377964e-11, "ref_logps/chosen": -249.5911102294922, "ref_logps/rejected": -236.20286560058594, "rewards/accuracies": 1.0, "rewards/chosen": -1.848557472229004, "rewards/margins": 16.150266647338867, "rewards/rejected": -17.998823165893555, "step": 3654 }, { "epoch": 0.88, "learning_rate": 2.7306666666666668e-08, "logps/chosen": -255.02191162109375, "logps/rejected": -398.3963623046875, "loss": 0.0003, "losses/dpo": 2.1021340224081086e-10, "losses/sft": 0.861552357673645, "losses/total": 2.1021340224081086e-10, "ref_logps/chosen": -232.864990234375, "ref_logps/rejected": -211.3167724609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.215691089630127, "rewards/margins": 16.492267608642578, "rewards/rejected": -18.70796012878418, "step": 3655 }, { "epoch": 0.88, "learning_rate": 2.7253333333333335e-08, "logps/chosen": -249.70603942871094, "logps/rejected": -386.4865417480469, "loss": 0.0001, "losses/dpo": 1.7174641708805893e-09, "losses/sft": 0.41707590222358704, "losses/total": 1.7174641708805893e-09, "ref_logps/chosen": -229.44186401367188, "ref_logps/rejected": -218.68023681640625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0264177322387695, "rewards/margins": 14.754215240478516, "rewards/rejected": -16.78063201904297, "step": 3656 }, { "epoch": 0.88, "learning_rate": 2.7200000000000002e-08, "logps/chosen": -215.3196563720703, "logps/rejected": -412.71160888671875, "loss": 0.0001, "losses/dpo": 3.155875283322729e-10, "losses/sft": 0.6171993017196655, "losses/total": 3.155875283322729e-10, "ref_logps/chosen": -199.4052276611328, "ref_logps/rejected": -241.03355407714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5914421081542969, "rewards/margins": 15.576362609863281, "rewards/rejected": -17.167802810668945, "step": 3657 }, { "epoch": 0.88, "learning_rate": 2.714666666666667e-08, "logps/chosen": -255.09271240234375, "logps/rejected": -406.1666564941406, "loss": 0.0002, "losses/dpo": 1.984474096161648e-08, "losses/sft": 0.927330732345581, "losses/total": 1.984474096161648e-08, "ref_logps/chosen": -234.50186157226562, "ref_logps/rejected": -226.20916748046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.059084892272949, "rewards/margins": 15.936665534973145, "rewards/rejected": -17.995750427246094, "step": 3658 }, { "epoch": 0.88, "learning_rate": 2.7093333333333336e-08, "logps/chosen": -266.89996337890625, "logps/rejected": -403.6756286621094, "loss": 0.0003, "losses/dpo": 2.064541204660486e-09, "losses/sft": 0.514019250869751, "losses/total": 2.064541204660486e-09, "ref_logps/chosen": -246.38079833984375, "ref_logps/rejected": -230.8333282470703, "rewards/accuracies": 1.0, "rewards/chosen": -2.0519142150878906, "rewards/margins": 15.232315063476562, "rewards/rejected": -17.284229278564453, "step": 3659 }, { "epoch": 0.88, "learning_rate": 2.7039999999999996e-08, "logps/chosen": -253.34921264648438, "logps/rejected": -396.769775390625, "loss": 0.0005, "losses/dpo": 9.403216871817222e-09, "losses/sft": 0.9245594143867493, "losses/total": 9.403216871817222e-09, "ref_logps/chosen": -236.61814880371094, "ref_logps/rejected": -217.53375244140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6731071472167969, "rewards/margins": 16.25049591064453, "rewards/rejected": -17.92360496520996, "step": 3660 }, { "epoch": 0.88, "learning_rate": 2.6986666666666663e-08, "logps/chosen": -264.09222412109375, "logps/rejected": -440.4463195800781, "loss": 0.0014, "losses/dpo": 4.3925716397552605e-09, "losses/sft": 0.5334048271179199, "losses/total": 4.3925716397552605e-09, "ref_logps/chosen": -249.3893280029297, "ref_logps/rejected": -262.62066650390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4702917337417603, "rewards/margins": 16.31227684020996, "rewards/rejected": -17.782567977905273, "step": 3661 }, { "epoch": 0.88, "learning_rate": 2.693333333333333e-08, "logps/chosen": -212.6505126953125, "logps/rejected": -380.12689208984375, "loss": 0.005, "losses/dpo": 7.262030976562528e-09, "losses/sft": 0.5304433703422546, "losses/total": 7.262030976562528e-09, "ref_logps/chosen": -195.01162719726562, "ref_logps/rejected": -217.0791015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7638881206512451, "rewards/margins": 14.540889739990234, "rewards/rejected": -16.304779052734375, "step": 3662 }, { "epoch": 0.88, "learning_rate": 2.6879999999999997e-08, "logps/chosen": -243.33018493652344, "logps/rejected": -426.0177001953125, "loss": 0.0002, "losses/dpo": 1.926576054245288e-08, "losses/sft": 0.9834936261177063, "losses/total": 1.926576054245288e-08, "ref_logps/chosen": -226.8988037109375, "ref_logps/rejected": -244.63674926757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.643136739730835, "rewards/margins": 16.49496078491211, "rewards/rejected": -18.138097763061523, "step": 3663 }, { "epoch": 0.88, "learning_rate": 2.6826666666666663e-08, "logps/chosen": -270.46954345703125, "logps/rejected": -400.93243408203125, "loss": 0.0007, "losses/dpo": 0.00016413380217272788, "losses/sft": 0.4833965003490448, "losses/total": 0.00016413380217272788, "ref_logps/chosen": -252.74200439453125, "ref_logps/rejected": -225.63319396972656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7727534770965576, "rewards/margins": 15.757170677185059, "rewards/rejected": -17.529922485351562, "step": 3664 }, { "epoch": 0.88, "learning_rate": 2.677333333333333e-08, "logps/chosen": -210.613525390625, "logps/rejected": -361.1585693359375, "loss": 0.0028, "losses/dpo": 4.1232565972659785e-11, "losses/sft": 0.547805666923523, "losses/total": 4.1232565972659785e-11, "ref_logps/chosen": -196.00509643554688, "ref_logps/rejected": -195.00299072265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4608420133590698, "rewards/margins": 15.154719352722168, "rewards/rejected": -16.61556053161621, "step": 3665 }, { "epoch": 0.88, "learning_rate": 2.6719999999999997e-08, "logps/chosen": -224.0459442138672, "logps/rejected": -420.5580749511719, "loss": 0.0001, "losses/dpo": 1.0249355852920417e-07, "losses/sft": 1.1756938695907593, "losses/total": 1.0249355852920417e-07, "ref_logps/chosen": -211.949951171875, "ref_logps/rejected": -230.31808471679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2095987796783447, "rewards/margins": 17.81439971923828, "rewards/rejected": -19.023998260498047, "step": 3666 }, { "epoch": 0.88, "learning_rate": 2.6666666666666664e-08, "logps/chosen": -241.00979614257812, "logps/rejected": -407.50689697265625, "loss": 0.0062, "losses/dpo": 1.082371703375884e-08, "losses/sft": 0.5912405848503113, "losses/total": 1.082371703375884e-08, "ref_logps/chosen": -220.0244140625, "ref_logps/rejected": -226.4270477294922, "rewards/accuracies": 1.0, "rewards/chosen": -2.098538875579834, "rewards/margins": 16.00944709777832, "rewards/rejected": -18.107986450195312, "step": 3667 }, { "epoch": 0.88, "learning_rate": 2.661333333333333e-08, "logps/chosen": -301.06927490234375, "logps/rejected": -421.15728759765625, "loss": 0.0003, "losses/dpo": 3.6038873274524263e-11, "losses/sft": 0.58551025390625, "losses/total": 3.6038873274524263e-11, "ref_logps/chosen": -279.0912780761719, "ref_logps/rejected": -235.15786743164062, "rewards/accuracies": 1.0, "rewards/chosen": -2.1977999210357666, "rewards/margins": 16.402145385742188, "rewards/rejected": -18.599945068359375, "step": 3668 }, { "epoch": 0.88, "learning_rate": 2.6559999999999998e-08, "logps/chosen": -195.88937377929688, "logps/rejected": -335.59033203125, "loss": 0.0002, "losses/dpo": 3.6328455621514877e-07, "losses/sft": 0.6282731890678406, "losses/total": 3.6328455621514877e-07, "ref_logps/chosen": -178.64028930664062, "ref_logps/rejected": -188.35568237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7249109745025635, "rewards/margins": 12.998552322387695, "rewards/rejected": -14.72346305847168, "step": 3669 }, { "epoch": 0.88, "learning_rate": 2.6506666666666664e-08, "logps/chosen": -302.2594299316406, "logps/rejected": -437.45281982421875, "loss": 0.0009, "losses/dpo": 3.2559557894273894e-07, "losses/sft": 0.5861861705780029, "losses/total": 3.2559557894273894e-07, "ref_logps/chosen": -279.98876953125, "ref_logps/rejected": -243.11660766601562, "rewards/accuracies": 1.0, "rewards/chosen": -2.2270655632019043, "rewards/margins": 17.206554412841797, "rewards/rejected": -19.43362045288086, "step": 3670 }, { "epoch": 0.88, "learning_rate": 2.6453333333333335e-08, "logps/chosen": -247.61973571777344, "logps/rejected": -391.69305419921875, "loss": 0.0001, "losses/dpo": 1.0066847977441284e-07, "losses/sft": 0.6731512546539307, "losses/total": 1.0066847977441284e-07, "ref_logps/chosen": -230.0639190673828, "ref_logps/rejected": -227.43206787109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7555831670761108, "rewards/margins": 14.670516967773438, "rewards/rejected": -16.42609977722168, "step": 3671 }, { "epoch": 0.88, "learning_rate": 2.64e-08, "logps/chosen": -242.01254272460938, "logps/rejected": -406.07391357421875, "loss": 0.0011, "losses/dpo": 3.742646192961274e-09, "losses/sft": 0.8342538475990295, "losses/total": 3.742646192961274e-09, "ref_logps/chosen": -223.50318908691406, "ref_logps/rejected": -227.59703063964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.8509342670440674, "rewards/margins": 15.996756553649902, "rewards/rejected": -17.84769058227539, "step": 3672 }, { "epoch": 0.88, "learning_rate": 2.6346666666666668e-08, "logps/chosen": -197.0836944580078, "logps/rejected": -396.0023193359375, "loss": 0.0008, "losses/dpo": 3.3066873683651465e-09, "losses/sft": 0.47015801072120667, "losses/total": 3.3066873683651465e-09, "ref_logps/chosen": -179.57962036132812, "ref_logps/rejected": -225.89712524414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7504072189331055, "rewards/margins": 15.260116577148438, "rewards/rejected": -17.01052474975586, "step": 3673 }, { "epoch": 0.88, "learning_rate": 2.6293333333333335e-08, "logps/chosen": -250.99078369140625, "logps/rejected": -422.3114013671875, "loss": 0.0001, "losses/dpo": 1.2694002862190246e-06, "losses/sft": 0.5852209329605103, "losses/total": 1.2694002862190246e-06, "ref_logps/chosen": -228.56137084960938, "ref_logps/rejected": -231.8749542236328, "rewards/accuracies": 1.0, "rewards/chosen": -2.2429404258728027, "rewards/margins": 16.80070686340332, "rewards/rejected": -19.04364776611328, "step": 3674 }, { "epoch": 0.88, "learning_rate": 2.6240000000000002e-08, "logps/chosen": -262.40753173828125, "logps/rejected": -400.31768798828125, "loss": 0.0011, "losses/dpo": 1.088816006244997e-07, "losses/sft": 0.8756791353225708, "losses/total": 1.088816006244997e-07, "ref_logps/chosen": -242.02589416503906, "ref_logps/rejected": -233.78102111816406, "rewards/accuracies": 1.0, "rewards/chosen": -2.038165807723999, "rewards/margins": 14.615501403808594, "rewards/rejected": -16.653667449951172, "step": 3675 }, { "epoch": 0.88, "learning_rate": 2.618666666666667e-08, "logps/chosen": -240.218017578125, "logps/rejected": -385.72088623046875, "loss": 0.0003, "losses/dpo": 5.073738762462199e-09, "losses/sft": 0.688209056854248, "losses/total": 5.073738762462199e-09, "ref_logps/chosen": -223.86627197265625, "ref_logps/rejected": -210.70181274414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6351754665374756, "rewards/margins": 15.866731643676758, "rewards/rejected": -17.501907348632812, "step": 3676 }, { "epoch": 0.88, "learning_rate": 2.613333333333333e-08, "logps/chosen": -204.7032928466797, "logps/rejected": -362.2520751953125, "loss": 0.0008, "losses/dpo": 2.404420627044601e-07, "losses/sft": 0.3990132212638855, "losses/total": 2.404420627044601e-07, "ref_logps/chosen": -190.36172485351562, "ref_logps/rejected": -198.224853515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4341557025909424, "rewards/margins": 14.968564987182617, "rewards/rejected": -16.402719497680664, "step": 3677 }, { "epoch": 0.88, "learning_rate": 2.6079999999999996e-08, "logps/chosen": -247.27911376953125, "logps/rejected": -383.6025390625, "loss": 0.0002, "losses/dpo": 1.091883944370764e-10, "losses/sft": 0.6665881276130676, "losses/total": 1.091883944370764e-10, "ref_logps/chosen": -233.73312377929688, "ref_logps/rejected": -221.23626708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.3545982837677002, "rewards/margins": 14.882026672363281, "rewards/rejected": -16.23662567138672, "step": 3678 }, { "epoch": 0.88, "learning_rate": 2.6026666666666663e-08, "logps/chosen": -244.01046752929688, "logps/rejected": -425.0478210449219, "loss": 0.0017, "losses/dpo": 2.126991471840256e-09, "losses/sft": 0.5893638134002686, "losses/total": 2.126991471840256e-09, "ref_logps/chosen": -231.51358032226562, "ref_logps/rejected": -243.71530151367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.2496892213821411, "rewards/margins": 16.883563995361328, "rewards/rejected": -18.13325309753418, "step": 3679 }, { "epoch": 0.88, "learning_rate": 2.597333333333333e-08, "logps/chosen": -191.9501495361328, "logps/rejected": -363.2196044921875, "loss": 0.0015, "losses/dpo": 5.170082677946608e-12, "losses/sft": 0.6415192484855652, "losses/total": 5.170082677946608e-12, "ref_logps/chosen": -177.50994873046875, "ref_logps/rejected": -199.9307403564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.444019079208374, "rewards/margins": 14.884869575500488, "rewards/rejected": -16.328887939453125, "step": 3680 }, { "epoch": 0.88, "learning_rate": 2.5919999999999997e-08, "logps/chosen": -253.69561767578125, "logps/rejected": -393.47113037109375, "loss": 0.0002, "losses/dpo": 3.413670679464076e-09, "losses/sft": 0.5726863741874695, "losses/total": 3.413670679464076e-09, "ref_logps/chosen": -233.96139526367188, "ref_logps/rejected": -215.3896484375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9734230041503906, "rewards/margins": 15.834726333618164, "rewards/rejected": -17.808149337768555, "step": 3681 }, { "epoch": 0.88, "learning_rate": 2.5866666666666663e-08, "logps/chosen": -251.70751953125, "logps/rejected": -408.32403564453125, "loss": 0.0007, "losses/dpo": 2.0343775275488696e-11, "losses/sft": 0.7067961096763611, "losses/total": 2.0343775275488696e-11, "ref_logps/chosen": -234.01303100585938, "ref_logps/rejected": -235.92800903320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7694497108459473, "rewards/margins": 15.470155715942383, "rewards/rejected": -17.239604949951172, "step": 3682 }, { "epoch": 0.88, "learning_rate": 2.581333333333333e-08, "logps/chosen": -286.4735412597656, "logps/rejected": -384.62030029296875, "loss": 0.0011, "losses/dpo": 2.2268218913268356e-07, "losses/sft": 0.5917987823486328, "losses/total": 2.2268218913268356e-07, "ref_logps/chosen": -266.2637939453125, "ref_logps/rejected": -225.5047607421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0209734439849854, "rewards/margins": 13.890582084655762, "rewards/rejected": -15.911556243896484, "step": 3683 }, { "epoch": 0.88, "learning_rate": 2.5759999999999997e-08, "logps/chosen": -259.5982360839844, "logps/rejected": -388.7176208496094, "loss": 0.0032, "losses/dpo": 5.1070335757685825e-05, "losses/sft": 0.7248877286911011, "losses/total": 5.1070335757685825e-05, "ref_logps/chosen": -239.6307830810547, "ref_logps/rejected": -216.07388305664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.996744990348816, "rewards/margins": 15.267627716064453, "rewards/rejected": -17.264373779296875, "step": 3684 }, { "epoch": 0.88, "learning_rate": 2.5706666666666664e-08, "logps/chosen": -276.05859375, "logps/rejected": -416.8518371582031, "loss": 0.0001, "losses/dpo": 1.6676578979968326e-06, "losses/sft": 1.1084778308868408, "losses/total": 1.6676578979968326e-06, "ref_logps/chosen": -258.3780517578125, "ref_logps/rejected": -235.6951904296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7680548429489136, "rewards/margins": 16.347606658935547, "rewards/rejected": -18.115665435791016, "step": 3685 }, { "epoch": 0.88, "learning_rate": 2.565333333333333e-08, "logps/chosen": -226.8055419921875, "logps/rejected": -386.9837951660156, "loss": 0.0056, "losses/dpo": 1.5240907202951348e-08, "losses/sft": 0.8375710844993591, "losses/total": 1.5240907202951348e-08, "ref_logps/chosen": -208.69345092773438, "ref_logps/rejected": -214.68057250976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8112084865570068, "rewards/margins": 15.419118881225586, "rewards/rejected": -17.230327606201172, "step": 3686 }, { "epoch": 0.88, "learning_rate": 2.56e-08, "logps/chosen": -231.6618194580078, "logps/rejected": -373.64935302734375, "loss": 0.0006, "losses/dpo": 1.0311180176358903e-07, "losses/sft": 0.5602023005485535, "losses/total": 1.0311180176358903e-07, "ref_logps/chosen": -215.07684326171875, "ref_logps/rejected": -209.93228149414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6584961414337158, "rewards/margins": 14.71320915222168, "rewards/rejected": -16.371707916259766, "step": 3687 }, { "epoch": 0.89, "learning_rate": 2.5546666666666668e-08, "logps/chosen": -198.351806640625, "logps/rejected": -375.2523498535156, "loss": 0.0003, "losses/dpo": 4.713236023690115e-08, "losses/sft": 1.0000089406967163, "losses/total": 4.713236023690115e-08, "ref_logps/chosen": -183.99267578125, "ref_logps/rejected": -218.29685974121094, "rewards/accuracies": 1.0, "rewards/chosen": -1.4359138011932373, "rewards/margins": 14.25963306427002, "rewards/rejected": -15.695547103881836, "step": 3688 }, { "epoch": 0.89, "learning_rate": 2.5493333333333335e-08, "logps/chosen": -259.96710205078125, "logps/rejected": -418.54632568359375, "loss": 0.0005, "losses/dpo": 2.7109738898190017e-09, "losses/sft": 0.5778470635414124, "losses/total": 2.7109738898190017e-09, "ref_logps/chosen": -237.20449829101562, "ref_logps/rejected": -225.52862548828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.276261806488037, "rewards/margins": 17.025510787963867, "rewards/rejected": -19.301773071289062, "step": 3689 }, { "epoch": 0.89, "learning_rate": 2.544e-08, "logps/chosen": -256.5694580078125, "logps/rejected": -409.07000732421875, "loss": 0.0008, "losses/dpo": 6.943635000311588e-09, "losses/sft": 0.47969889640808105, "losses/total": 6.943635000311588e-09, "ref_logps/chosen": -239.8297119140625, "ref_logps/rejected": -234.88632202148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6739739179611206, "rewards/margins": 15.744394302368164, "rewards/rejected": -17.418367385864258, "step": 3690 }, { "epoch": 0.89, "learning_rate": 2.5386666666666668e-08, "logps/chosen": -229.2489471435547, "logps/rejected": -376.7342834472656, "loss": 0.0003, "losses/dpo": 3.0877467906975653e-06, "losses/sft": 0.6081684827804565, "losses/total": 3.0877467906975653e-06, "ref_logps/chosen": -213.0404052734375, "ref_logps/rejected": -207.14410400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6208558082580566, "rewards/margins": 15.33816146850586, "rewards/rejected": -16.959016799926758, "step": 3691 }, { "epoch": 0.89, "learning_rate": 2.5333333333333335e-08, "logps/chosen": -206.7733612060547, "logps/rejected": -345.30364990234375, "loss": 0.0078, "losses/dpo": 2.9213821107987314e-05, "losses/sft": 0.9378081560134888, "losses/total": 2.9213821107987314e-05, "ref_logps/chosen": -190.46774291992188, "ref_logps/rejected": -191.83599853515625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6305612325668335, "rewards/margins": 13.716205596923828, "rewards/rejected": -15.34676742553711, "step": 3692 }, { "epoch": 0.89, "learning_rate": 2.5280000000000002e-08, "logps/chosen": -252.65744018554688, "logps/rejected": -394.02581787109375, "loss": 0.0015, "losses/dpo": 8.164122711207256e-10, "losses/sft": 0.4170289933681488, "losses/total": 8.164122711207256e-10, "ref_logps/chosen": -236.79757690429688, "ref_logps/rejected": -225.7217254638672, "rewards/accuracies": 1.0, "rewards/chosen": -1.5859862565994263, "rewards/margins": 15.24442195892334, "rewards/rejected": -16.830406188964844, "step": 3693 }, { "epoch": 0.89, "learning_rate": 2.5226666666666662e-08, "logps/chosen": -238.22015380859375, "logps/rejected": -372.75738525390625, "loss": 0.0016, "losses/dpo": 3.626089153385692e-07, "losses/sft": 0.6332409381866455, "losses/total": 3.626089153385692e-07, "ref_logps/chosen": -217.09410095214844, "ref_logps/rejected": -205.2547607421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1126060485839844, "rewards/margins": 14.637657165527344, "rewards/rejected": -16.750263214111328, "step": 3694 }, { "epoch": 0.89, "learning_rate": 2.517333333333333e-08, "logps/chosen": -252.62033081054688, "logps/rejected": -429.1644287109375, "loss": 0.0005, "losses/dpo": 1.0919912973861301e-08, "losses/sft": 0.5996206402778625, "losses/total": 1.0919912973861301e-08, "ref_logps/chosen": -233.2778778076172, "ref_logps/rejected": -237.4490966796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.934244990348816, "rewards/margins": 17.237289428710938, "rewards/rejected": -19.17153549194336, "step": 3695 }, { "epoch": 0.89, "learning_rate": 2.5119999999999996e-08, "logps/chosen": -238.74111938476562, "logps/rejected": -393.78497314453125, "loss": 0.0004, "losses/dpo": 1.882593907154284e-11, "losses/sft": 0.7973847389221191, "losses/total": 1.882593907154284e-11, "ref_logps/chosen": -224.09963989257812, "ref_logps/rejected": -215.4690704345703, "rewards/accuracies": 1.0, "rewards/chosen": -1.464146375656128, "rewards/margins": 16.367443084716797, "rewards/rejected": -17.831588745117188, "step": 3696 }, { "epoch": 0.89, "learning_rate": 2.5066666666666663e-08, "logps/chosen": -248.97755432128906, "logps/rejected": -378.06915283203125, "loss": 0.0008, "losses/dpo": 1.5587771351466273e-10, "losses/sft": 0.7631184458732605, "losses/total": 1.5587771351466273e-10, "ref_logps/chosen": -232.8209686279297, "ref_logps/rejected": -203.5380859375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6156595945358276, "rewards/margins": 15.837448120117188, "rewards/rejected": -17.453107833862305, "step": 3697 }, { "epoch": 0.89, "learning_rate": 2.501333333333333e-08, "logps/chosen": -250.28358459472656, "logps/rejected": -380.79742431640625, "loss": 0.0019, "losses/dpo": 7.674319846273647e-08, "losses/sft": 0.6042404770851135, "losses/total": 7.674319846273647e-08, "ref_logps/chosen": -234.3715362548828, "ref_logps/rejected": -218.51197814941406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5912059545516968, "rewards/margins": 14.637340545654297, "rewards/rejected": -16.228546142578125, "step": 3698 }, { "epoch": 0.89, "learning_rate": 2.4959999999999997e-08, "logps/chosen": -232.5196075439453, "logps/rejected": -418.94482421875, "loss": 0.0002, "losses/dpo": 8.478559010427489e-08, "losses/sft": 0.7743543386459351, "losses/total": 8.478559010427489e-08, "ref_logps/chosen": -211.70077514648438, "ref_logps/rejected": -235.4895477294922, "rewards/accuracies": 1.0, "rewards/chosen": -2.081881046295166, "rewards/margins": 16.263648986816406, "rewards/rejected": -18.345531463623047, "step": 3699 }, { "epoch": 0.89, "learning_rate": 2.4906666666666663e-08, "logps/chosen": -239.07630920410156, "logps/rejected": -386.55419921875, "loss": 0.0017, "losses/dpo": 6.991469945205608e-06, "losses/sft": 0.6427625417709351, "losses/total": 6.991469945205608e-06, "ref_logps/chosen": -221.45938110351562, "ref_logps/rejected": -217.4383544921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7616922855377197, "rewards/margins": 15.149895668029785, "rewards/rejected": -16.91158676147461, "step": 3700 }, { "epoch": 0.89, "learning_rate": 2.485333333333333e-08, "logps/chosen": -243.6747283935547, "logps/rejected": -397.29248046875, "loss": 0.0001, "losses/dpo": 4.561455102525258e-10, "losses/sft": 0.6143447160720825, "losses/total": 4.561455102525258e-10, "ref_logps/chosen": -224.17564392089844, "ref_logps/rejected": -213.9617156982422, "rewards/accuracies": 1.0, "rewards/chosen": -1.9499077796936035, "rewards/margins": 16.383169174194336, "rewards/rejected": -18.33307647705078, "step": 3701 }, { "epoch": 0.89, "learning_rate": 2.4799999999999997e-08, "logps/chosen": -216.625244140625, "logps/rejected": -353.22552490234375, "loss": 0.0004, "losses/dpo": 8.595511644671205e-07, "losses/sft": 0.5983812212944031, "losses/total": 8.595511644671205e-07, "ref_logps/chosen": -198.7817840576172, "ref_logps/rejected": -190.62332153320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7843457460403442, "rewards/margins": 14.475872993469238, "rewards/rejected": -16.26021957397461, "step": 3702 }, { "epoch": 0.89, "learning_rate": 2.4746666666666667e-08, "logps/chosen": -189.6753692626953, "logps/rejected": -364.94189453125, "loss": 0.0083, "losses/dpo": 1.254600618949553e-07, "losses/sft": 0.56587815284729, "losses/total": 1.254600618949553e-07, "ref_logps/chosen": -173.0568084716797, "ref_logps/rejected": -212.9866943359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6618568897247314, "rewards/margins": 13.533663749694824, "rewards/rejected": -15.195521354675293, "step": 3703 }, { "epoch": 0.89, "learning_rate": 2.4693333333333334e-08, "logps/chosen": -233.44659423828125, "logps/rejected": -387.20440673828125, "loss": 0.0018, "losses/dpo": 9.023326974499923e-09, "losses/sft": 0.7294031381607056, "losses/total": 9.023326974499923e-09, "ref_logps/chosen": -218.76742553710938, "ref_logps/rejected": -220.11703491210938, "rewards/accuracies": 1.0, "rewards/chosen": -1.467915654182434, "rewards/margins": 15.240821838378906, "rewards/rejected": -16.708736419677734, "step": 3704 }, { "epoch": 0.89, "learning_rate": 2.464e-08, "logps/chosen": -192.45773315429688, "logps/rejected": -306.4441223144531, "loss": 0.0093, "losses/dpo": 1.8393818690398689e-09, "losses/sft": 1.1136223077774048, "losses/total": 1.8393818690398689e-09, "ref_logps/chosen": -178.51515197753906, "ref_logps/rejected": -174.3299560546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3942567110061646, "rewards/margins": 11.817159652709961, "rewards/rejected": -13.211417198181152, "step": 3705 }, { "epoch": 0.89, "learning_rate": 2.4586666666666668e-08, "logps/chosen": -249.03912353515625, "logps/rejected": -408.128173828125, "loss": 0.002, "losses/dpo": 1.1516069209349666e-09, "losses/sft": 0.521929919719696, "losses/total": 1.1516069209349666e-09, "ref_logps/chosen": -228.64988708496094, "ref_logps/rejected": -216.50799560546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.0389246940612793, "rewards/margins": 17.123092651367188, "rewards/rejected": -19.162017822265625, "step": 3706 }, { "epoch": 0.89, "learning_rate": 2.453333333333333e-08, "logps/chosen": -242.22503662109375, "logps/rejected": -394.7735595703125, "loss": 0.0001, "losses/dpo": 8.386489298572997e-07, "losses/sft": 0.32262325286865234, "losses/total": 8.386489298572997e-07, "ref_logps/chosen": -224.48501586914062, "ref_logps/rejected": -223.32003784179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.774003028869629, "rewards/margins": 15.37135124206543, "rewards/rejected": -17.14535140991211, "step": 3707 }, { "epoch": 0.89, "learning_rate": 2.4479999999999998e-08, "logps/chosen": -232.92776489257812, "logps/rejected": -393.8958435058594, "loss": 0.0003, "losses/dpo": 3.900773037912586e-09, "losses/sft": 0.3537389934062958, "losses/total": 3.900773037912586e-09, "ref_logps/chosen": -217.82437133789062, "ref_logps/rejected": -214.2590789794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.5103397369384766, "rewards/margins": 16.45333480834961, "rewards/rejected": -17.963674545288086, "step": 3708 }, { "epoch": 0.89, "learning_rate": 2.4426666666666665e-08, "logps/chosen": -287.0394287109375, "logps/rejected": -421.956298828125, "loss": 0.0001, "losses/dpo": 2.0233277098213875e-07, "losses/sft": 0.6465886235237122, "losses/total": 2.0233277098213875e-07, "ref_logps/chosen": -264.4531555175781, "ref_logps/rejected": -245.10479736328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2586276531219482, "rewards/margins": 15.426525115966797, "rewards/rejected": -17.68515396118164, "step": 3709 }, { "epoch": 0.89, "learning_rate": 2.4373333333333332e-08, "logps/chosen": -288.97686767578125, "logps/rejected": -417.6397705078125, "loss": 0.0002, "losses/dpo": 1.231043256666453e-06, "losses/sft": 0.421171635389328, "losses/total": 1.231043256666453e-06, "ref_logps/chosen": -270.0931701660156, "ref_logps/rejected": -245.611083984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.888370394706726, "rewards/margins": 15.314501762390137, "rewards/rejected": -17.20287322998047, "step": 3710 }, { "epoch": 0.89, "learning_rate": 2.432e-08, "logps/chosen": -242.77508544921875, "logps/rejected": -372.6719970703125, "loss": 0.0004, "losses/dpo": 5.5268647436435e-09, "losses/sft": 0.7814462184906006, "losses/total": 5.5268647436435e-09, "ref_logps/chosen": -223.32373046875, "ref_logps/rejected": -205.50088500976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.9451361894607544, "rewards/margins": 14.771973609924316, "rewards/rejected": -16.71710968017578, "step": 3711 }, { "epoch": 0.89, "learning_rate": 2.4266666666666666e-08, "logps/chosen": -268.71392822265625, "logps/rejected": -406.65899658203125, "loss": 0.0013, "losses/dpo": 1.8267889371342827e-12, "losses/sft": 0.681682825088501, "losses/total": 1.8267889371342827e-12, "ref_logps/chosen": -247.21542358398438, "ref_logps/rejected": -217.87474060058594, "rewards/accuracies": 1.0, "rewards/chosen": -2.149853229522705, "rewards/margins": 16.728572845458984, "rewards/rejected": -18.87842559814453, "step": 3712 }, { "epoch": 0.89, "learning_rate": 2.4213333333333332e-08, "logps/chosen": -265.2888488769531, "logps/rejected": -444.53369140625, "loss": 0.0001, "losses/dpo": 2.3044616215273095e-10, "losses/sft": 0.4839933514595032, "losses/total": 2.3044616215273095e-10, "ref_logps/chosen": -247.56137084960938, "ref_logps/rejected": -251.97653198242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7727460861206055, "rewards/margins": 17.48297119140625, "rewards/rejected": -19.255718231201172, "step": 3713 }, { "epoch": 0.89, "learning_rate": 2.416e-08, "logps/chosen": -223.82281494140625, "logps/rejected": -375.0459289550781, "loss": 0.0003, "losses/dpo": 1.8052247197086757e-11, "losses/sft": 0.8436630368232727, "losses/total": 1.8052247197086757e-11, "ref_logps/chosen": -204.23780822753906, "ref_logps/rejected": -205.1890411376953, "rewards/accuracies": 1.0, "rewards/chosen": -1.9585020542144775, "rewards/margins": 15.02718734741211, "rewards/rejected": -16.985687255859375, "step": 3714 }, { "epoch": 0.89, "learning_rate": 2.4106666666666666e-08, "logps/chosen": -216.55087280273438, "logps/rejected": -389.1957702636719, "loss": 0.0009, "losses/dpo": 5.054198936704779e-06, "losses/sft": 0.518429696559906, "losses/total": 5.054198936704779e-06, "ref_logps/chosen": -200.70640563964844, "ref_logps/rejected": -220.39019775390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5844473838806152, "rewards/margins": 15.296110153198242, "rewards/rejected": -16.880556106567383, "step": 3715 }, { "epoch": 0.89, "learning_rate": 2.405333333333333e-08, "logps/chosen": -214.96408081054688, "logps/rejected": -366.7930908203125, "loss": 0.0013, "losses/dpo": 2.550423232605681e-07, "losses/sft": 0.5691127181053162, "losses/total": 2.550423232605681e-07, "ref_logps/chosen": -200.6026153564453, "ref_logps/rejected": -211.5560302734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4361474514007568, "rewards/margins": 14.087557792663574, "rewards/rejected": -15.523704528808594, "step": 3716 }, { "epoch": 0.89, "learning_rate": 2.3999999999999997e-08, "logps/chosen": -283.7730407714844, "logps/rejected": -393.0208435058594, "loss": 0.0009, "losses/dpo": 2.3727274367502105e-08, "losses/sft": 0.5865347981452942, "losses/total": 2.3727274367502105e-08, "ref_logps/chosen": -265.3337097167969, "ref_logps/rejected": -227.80191040039062, "rewards/accuracies": 1.0, "rewards/chosen": -1.8439342975616455, "rewards/margins": 14.677959442138672, "rewards/rejected": -16.521894454956055, "step": 3717 }, { "epoch": 0.89, "learning_rate": 2.3946666666666663e-08, "logps/chosen": -275.38885498046875, "logps/rejected": -437.7896423339844, "loss": 0.0004, "losses/dpo": 1.788129755375678e-09, "losses/sft": 0.4993670880794525, "losses/total": 1.788129755375678e-09, "ref_logps/chosen": -252.67445373535156, "ref_logps/rejected": -244.2246856689453, "rewards/accuracies": 1.0, "rewards/chosen": -2.2714385986328125, "rewards/margins": 17.08505630493164, "rewards/rejected": -19.356494903564453, "step": 3718 }, { "epoch": 0.89, "learning_rate": 2.3893333333333334e-08, "logps/chosen": -226.41693115234375, "logps/rejected": -378.3265686035156, "loss": 0.0006, "losses/dpo": 8.244984250893594e-09, "losses/sft": 0.5682721734046936, "losses/total": 8.244984250893594e-09, "ref_logps/chosen": -211.38719177246094, "ref_logps/rejected": -209.8422393798828, "rewards/accuracies": 1.0, "rewards/chosen": -1.502974271774292, "rewards/margins": 15.345460891723633, "rewards/rejected": -16.848438262939453, "step": 3719 }, { "epoch": 0.89, "learning_rate": 2.384e-08, "logps/chosen": -216.1782684326172, "logps/rejected": -386.43310546875, "loss": 0.0001, "losses/dpo": 1.750206166661883e-07, "losses/sft": 0.4320047199726105, "losses/total": 1.750206166661883e-07, "ref_logps/chosen": -197.8841552734375, "ref_logps/rejected": -214.7725830078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8294110298156738, "rewards/margins": 15.336644172668457, "rewards/rejected": -17.166053771972656, "step": 3720 }, { "epoch": 0.89, "learning_rate": 2.3786666666666667e-08, "logps/chosen": -234.1111602783203, "logps/rejected": -384.32012939453125, "loss": 0.0026, "losses/dpo": 2.373589658155595e-10, "losses/sft": 0.4288100302219391, "losses/total": 2.373589658155595e-10, "ref_logps/chosen": -213.76596069335938, "ref_logps/rejected": -212.59024047851562, "rewards/accuracies": 1.0, "rewards/chosen": -2.0345206260681152, "rewards/margins": 15.138465881347656, "rewards/rejected": -17.17298698425293, "step": 3721 }, { "epoch": 0.89, "learning_rate": 2.3733333333333334e-08, "logps/chosen": -263.3399353027344, "logps/rejected": -401.60650634765625, "loss": 0.0001, "losses/dpo": 3.535853238645359e-06, "losses/sft": 0.6319032311439514, "losses/total": 3.535853238645359e-06, "ref_logps/chosen": -240.38192749023438, "ref_logps/rejected": -223.3114013671875, "rewards/accuracies": 1.0, "rewards/chosen": -2.295802593231201, "rewards/margins": 15.533708572387695, "rewards/rejected": -17.829509735107422, "step": 3722 }, { "epoch": 0.89, "learning_rate": 2.368e-08, "logps/chosen": -225.50119018554688, "logps/rejected": -395.58740234375, "loss": 0.0033, "losses/dpo": 2.8471584898742686e-12, "losses/sft": 0.5732672810554504, "losses/total": 2.8471584898742686e-12, "ref_logps/chosen": -205.89834594726562, "ref_logps/rejected": -216.86904907226562, "rewards/accuracies": 1.0, "rewards/chosen": -1.9602844715118408, "rewards/margins": 15.91154956817627, "rewards/rejected": -17.87183380126953, "step": 3723 }, { "epoch": 0.89, "learning_rate": 2.3626666666666668e-08, "logps/chosen": -226.6016845703125, "logps/rejected": -376.0015869140625, "loss": 0.0001, "losses/dpo": 1.6865154975675978e-05, "losses/sft": 0.6023347973823547, "losses/total": 1.6865154975675978e-05, "ref_logps/chosen": -210.08010864257812, "ref_logps/rejected": -211.2696990966797, "rewards/accuracies": 1.0, "rewards/chosen": -1.6521573066711426, "rewards/margins": 14.821033477783203, "rewards/rejected": -16.473190307617188, "step": 3724 }, { "epoch": 0.89, "learning_rate": 2.357333333333333e-08, "logps/chosen": -275.3019104003906, "logps/rejected": -393.42974853515625, "loss": 0.0032, "losses/dpo": 1.2384217598082614e-06, "losses/sft": 0.5104913115501404, "losses/total": 1.2384217598082614e-06, "ref_logps/chosen": -251.56741333007812, "ref_logps/rejected": -212.77206420898438, "rewards/accuracies": 1.0, "rewards/chosen": -2.3734498023986816, "rewards/margins": 15.692319869995117, "rewards/rejected": -18.06576919555664, "step": 3725 }, { "epoch": 0.89, "learning_rate": 2.3519999999999998e-08, "logps/chosen": -217.19532775878906, "logps/rejected": -403.6120910644531, "loss": 0.0002, "losses/dpo": 1.1865424909274225e-07, "losses/sft": 0.6361284852027893, "losses/total": 1.1865424909274225e-07, "ref_logps/chosen": -197.1881866455078, "ref_logps/rejected": -225.56649780273438, "rewards/accuracies": 1.0, "rewards/chosen": -2.0007145404815674, "rewards/margins": 15.80384349822998, "rewards/rejected": -17.8045597076416, "step": 3726 }, { "epoch": 0.89, "learning_rate": 2.3466666666666665e-08, "logps/chosen": -239.90335083007812, "logps/rejected": -421.27569580078125, "loss": 0.0019, "losses/dpo": 2.407533372661419e-08, "losses/sft": 0.8269432187080383, "losses/total": 2.407533372661419e-08, "ref_logps/chosen": -223.7411651611328, "ref_logps/rejected": -236.05026245117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6162211894989014, "rewards/margins": 16.906322479248047, "rewards/rejected": -18.52254295349121, "step": 3727 }, { "epoch": 0.89, "learning_rate": 2.3413333333333332e-08, "logps/chosen": -267.386962890625, "logps/rejected": -402.83416748046875, "loss": 0.0003, "losses/dpo": 9.270452210330404e-07, "losses/sft": 1.1947089433670044, "losses/total": 9.270452210330404e-07, "ref_logps/chosen": -245.10659790039062, "ref_logps/rejected": -218.51376342773438, "rewards/accuracies": 1.0, "rewards/chosen": -2.228036403656006, "rewards/margins": 16.20400619506836, "rewards/rejected": -18.43204116821289, "step": 3728 }, { "epoch": 0.89, "learning_rate": 2.336e-08, "logps/chosen": -253.6126251220703, "logps/rejected": -378.0623474121094, "loss": 0.0022, "losses/dpo": 1.297567719937831e-09, "losses/sft": 0.5579265356063843, "losses/total": 1.297567719937831e-09, "ref_logps/chosen": -236.38558959960938, "ref_logps/rejected": -216.00430297851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7227028608322144, "rewards/margins": 14.483100891113281, "rewards/rejected": -16.2058048248291, "step": 3729 }, { "epoch": 0.9, "learning_rate": 2.3306666666666666e-08, "logps/chosen": -231.88320922851562, "logps/rejected": -391.89398193359375, "loss": 0.0002, "losses/dpo": 4.5125499781306644e-08, "losses/sft": 0.5200058221817017, "losses/total": 4.5125499781306644e-08, "ref_logps/chosen": -218.2751922607422, "ref_logps/rejected": -220.5941619873047, "rewards/accuracies": 1.0, "rewards/chosen": -1.3608026504516602, "rewards/margins": 15.769181251525879, "rewards/rejected": -17.12998390197754, "step": 3730 }, { "epoch": 0.9, "learning_rate": 2.3253333333333332e-08, "logps/chosen": -279.20703125, "logps/rejected": -406.35552978515625, "loss": 0.0006, "losses/dpo": 2.7357213383538692e-08, "losses/sft": 0.3962417542934418, "losses/total": 2.7357213383538692e-08, "ref_logps/chosen": -260.86212158203125, "ref_logps/rejected": -225.93402099609375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8344942331314087, "rewards/margins": 16.207656860351562, "rewards/rejected": -18.04214859008789, "step": 3731 }, { "epoch": 0.9, "learning_rate": 2.32e-08, "logps/chosen": -255.44503784179688, "logps/rejected": -438.29937744140625, "loss": 0.0024, "losses/dpo": 1.0765353018749035e-11, "losses/sft": 0.6042219400405884, "losses/total": 1.0765353018749035e-11, "ref_logps/chosen": -240.42108154296875, "ref_logps/rejected": -245.0372314453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5023953914642334, "rewards/margins": 17.823822021484375, "rewards/rejected": -19.32621955871582, "step": 3732 }, { "epoch": 0.9, "learning_rate": 2.3146666666666663e-08, "logps/chosen": -194.48880004882812, "logps/rejected": -333.1907653808594, "loss": 0.0022, "losses/dpo": 7.303359552679467e-07, "losses/sft": 0.4876336455345154, "losses/total": 7.303359552679467e-07, "ref_logps/chosen": -176.6278533935547, "ref_logps/rejected": -180.96768188476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.7860946655273438, "rewards/margins": 13.436212539672852, "rewards/rejected": -15.222307205200195, "step": 3733 }, { "epoch": 0.9, "learning_rate": 2.309333333333333e-08, "logps/chosen": -245.5653839111328, "logps/rejected": -423.87060546875, "loss": 0.0007, "losses/dpo": 2.2365802543333757e-09, "losses/sft": 0.7985429167747498, "losses/total": 2.2365802543333757e-09, "ref_logps/chosen": -228.1720733642578, "ref_logps/rejected": -233.2899932861328, "rewards/accuracies": 1.0, "rewards/chosen": -1.7393317222595215, "rewards/margins": 17.318729400634766, "rewards/rejected": -19.058059692382812, "step": 3734 }, { "epoch": 0.9, "learning_rate": 2.304e-08, "logps/chosen": -239.79434204101562, "logps/rejected": -411.93621826171875, "loss": 0.002, "losses/dpo": 1.3741482396412152e-09, "losses/sft": 0.5477540493011475, "losses/total": 1.3741482396412152e-09, "ref_logps/chosen": -219.8505096435547, "ref_logps/rejected": -230.16860961914062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9943844079971313, "rewards/margins": 16.182376861572266, "rewards/rejected": -18.176761627197266, "step": 3735 }, { "epoch": 0.9, "learning_rate": 2.2986666666666667e-08, "logps/chosen": -267.1722717285156, "logps/rejected": -399.173095703125, "loss": 0.0017, "losses/dpo": 1.1882918443006929e-05, "losses/sft": 1.2112538814544678, "losses/total": 1.1882918443006929e-05, "ref_logps/chosen": -254.39547729492188, "ref_logps/rejected": -239.62911987304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.2776784896850586, "rewards/margins": 14.67672061920166, "rewards/rejected": -15.954397201538086, "step": 3736 }, { "epoch": 0.9, "learning_rate": 2.2933333333333334e-08, "logps/chosen": -245.41159057617188, "logps/rejected": -436.4619140625, "loss": 0.0003, "losses/dpo": 3.827927211319526e-11, "losses/sft": 0.7377029061317444, "losses/total": 3.827927211319526e-11, "ref_logps/chosen": -231.36331176757812, "ref_logps/rejected": -232.96397399902344, "rewards/accuracies": 1.0, "rewards/chosen": -1.4048271179199219, "rewards/margins": 18.944965362548828, "rewards/rejected": -20.34979248046875, "step": 3737 }, { "epoch": 0.9, "learning_rate": 2.288e-08, "logps/chosen": -231.82174682617188, "logps/rejected": -435.2711181640625, "loss": 0.0, "losses/dpo": 4.155355746993106e-12, "losses/sft": 0.5367100238800049, "losses/total": 4.155355746993106e-12, "ref_logps/chosen": -209.47848510742188, "ref_logps/rejected": -225.42942810058594, "rewards/accuracies": 1.0, "rewards/chosen": -2.234327793121338, "rewards/margins": 18.749839782714844, "rewards/rejected": -20.984167098999023, "step": 3738 }, { "epoch": 0.9, "learning_rate": 2.2826666666666667e-08, "logps/chosen": -260.6542053222656, "logps/rejected": -424.5653381347656, "loss": 0.0, "losses/dpo": 6.827123311126115e-09, "losses/sft": 0.6696721315383911, "losses/total": 6.827123311126115e-09, "ref_logps/chosen": -247.5327911376953, "ref_logps/rejected": -243.92848205566406, "rewards/accuracies": 1.0, "rewards/chosen": -1.3121426105499268, "rewards/margins": 16.751544952392578, "rewards/rejected": -18.06368637084961, "step": 3739 }, { "epoch": 0.9, "learning_rate": 2.2773333333333334e-08, "logps/chosen": -242.0482940673828, "logps/rejected": -396.17803955078125, "loss": 0.0004, "losses/dpo": 5.371753286453895e-05, "losses/sft": 0.8936092853546143, "losses/total": 5.371753286453895e-05, "ref_logps/chosen": -223.61648559570312, "ref_logps/rejected": -219.6964111328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.843179702758789, "rewards/margins": 15.80498218536377, "rewards/rejected": -17.648160934448242, "step": 3740 }, { "epoch": 0.9, "learning_rate": 2.272e-08, "logps/chosen": -229.63641357421875, "logps/rejected": -392.5065612792969, "loss": 0.0026, "losses/dpo": 1.0976964226472319e-08, "losses/sft": 0.4847905933856964, "losses/total": 1.0976964226472319e-08, "ref_logps/chosen": -208.63870239257812, "ref_logps/rejected": -219.67730712890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0997729301452637, "rewards/margins": 15.183151245117188, "rewards/rejected": -17.28292465209961, "step": 3741 }, { "epoch": 0.9, "learning_rate": 2.2666666666666664e-08, "logps/chosen": -290.51861572265625, "logps/rejected": -433.2344970703125, "loss": 0.0002, "losses/dpo": 5.711308315170527e-09, "losses/sft": 0.5734527111053467, "losses/total": 5.711308315170527e-09, "ref_logps/chosen": -268.77392578125, "ref_logps/rejected": -248.43777465820312, "rewards/accuracies": 1.0, "rewards/chosen": -2.174468517303467, "rewards/margins": 16.30520248413086, "rewards/rejected": -18.479671478271484, "step": 3742 }, { "epoch": 0.9, "learning_rate": 2.261333333333333e-08, "logps/chosen": -260.2244567871094, "logps/rejected": -420.7149658203125, "loss": 0.0022, "losses/dpo": 5.452599907584954e-06, "losses/sft": 0.5725855827331543, "losses/total": 5.452599907584954e-06, "ref_logps/chosen": -240.13772583007812, "ref_logps/rejected": -234.11207580566406, "rewards/accuracies": 1.0, "rewards/chosen": -2.0086746215820312, "rewards/margins": 16.651613235473633, "rewards/rejected": -18.660287857055664, "step": 3743 }, { "epoch": 0.9, "learning_rate": 2.2559999999999998e-08, "logps/chosen": -200.36117553710938, "logps/rejected": -369.8291015625, "loss": 0.0008, "losses/dpo": 9.985110611607229e-13, "losses/sft": 0.6950711011886597, "losses/total": 9.985110611607229e-13, "ref_logps/chosen": -183.05487060546875, "ref_logps/rejected": -201.79129028320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.7306315898895264, "rewards/margins": 15.073150634765625, "rewards/rejected": -16.803783416748047, "step": 3744 }, { "epoch": 0.9, "learning_rate": 2.2506666666666665e-08, "logps/chosen": -245.9432373046875, "logps/rejected": -393.9757995605469, "loss": 0.0014, "losses/dpo": 1.9277364149417053e-09, "losses/sft": 0.600185215473175, "losses/total": 1.9277364149417053e-09, "ref_logps/chosen": -229.06849670410156, "ref_logps/rejected": -218.87393188476562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6874756813049316, "rewards/margins": 15.822711944580078, "rewards/rejected": -17.510189056396484, "step": 3745 }, { "epoch": 0.9, "learning_rate": 2.2453333333333332e-08, "logps/chosen": -209.18763732910156, "logps/rejected": -387.0277099609375, "loss": 0.0008, "losses/dpo": 2.0481145384110278e-06, "losses/sft": 0.5778229832649231, "losses/total": 2.0481145384110278e-06, "ref_logps/chosen": -197.48243713378906, "ref_logps/rejected": -209.78610229492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.170522689819336, "rewards/margins": 16.553638458251953, "rewards/rejected": -17.724159240722656, "step": 3746 }, { "epoch": 0.9, "learning_rate": 2.24e-08, "logps/chosen": -234.5477294921875, "logps/rejected": -402.78790283203125, "loss": 0.0018, "losses/dpo": 1.6323030038734032e-08, "losses/sft": 0.7304539084434509, "losses/total": 1.6323030038734032e-08, "ref_logps/chosen": -218.2755126953125, "ref_logps/rejected": -229.25286865234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6272214651107788, "rewards/margins": 15.726283073425293, "rewards/rejected": -17.353504180908203, "step": 3747 }, { "epoch": 0.9, "learning_rate": 2.2346666666666666e-08, "logps/chosen": -211.26351928710938, "logps/rejected": -365.98504638671875, "loss": 0.0004, "losses/dpo": 2.320035275005239e-09, "losses/sft": 0.5157288908958435, "losses/total": 2.320035275005239e-09, "ref_logps/chosen": -192.71484375, "ref_logps/rejected": -205.76724243164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.8548681735992432, "rewards/margins": 14.166914939880371, "rewards/rejected": -16.02178192138672, "step": 3748 }, { "epoch": 0.9, "learning_rate": 2.2293333333333332e-08, "logps/chosen": -237.65489196777344, "logps/rejected": -417.13330078125, "loss": 0.0002, "losses/dpo": 6.95624059656591e-13, "losses/sft": 0.5420120358467102, "losses/total": 6.95624059656591e-13, "ref_logps/chosen": -218.93505859375, "ref_logps/rejected": -233.98768615722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8719812631607056, "rewards/margins": 16.442581176757812, "rewards/rejected": -18.314563751220703, "step": 3749 }, { "epoch": 0.9, "learning_rate": 2.224e-08, "logps/chosen": -223.80691528320312, "logps/rejected": -374.37786865234375, "loss": 0.0001, "losses/dpo": 7.53193729607915e-10, "losses/sft": 0.5610617995262146, "losses/total": 7.53193729607915e-10, "ref_logps/chosen": -207.684326171875, "ref_logps/rejected": -205.68356323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.612259864807129, "rewards/margins": 15.257169723510742, "rewards/rejected": -16.869430541992188, "step": 3750 }, { "epoch": 0.9, "learning_rate": 2.2186666666666666e-08, "logps/chosen": -290.04498291015625, "logps/rejected": -423.9140625, "loss": 0.0027, "losses/dpo": 0.037996623665094376, "losses/sft": 0.7910142540931702, "losses/total": 0.037996623665094376, "ref_logps/chosen": -274.5217590332031, "ref_logps/rejected": -253.9554901123047, "rewards/accuracies": 1.0, "rewards/chosen": -1.5523223876953125, "rewards/margins": 15.443536758422852, "rewards/rejected": -16.995859146118164, "step": 3751 }, { "epoch": 0.9, "learning_rate": 2.2133333333333333e-08, "logps/chosen": -254.07803344726562, "logps/rejected": -408.5376281738281, "loss": 0.003, "losses/dpo": 3.707340545666682e-10, "losses/sft": 0.5516742467880249, "losses/total": 3.707340545666682e-10, "ref_logps/chosen": -239.34939575195312, "ref_logps/rejected": -235.012451171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4728641510009766, "rewards/margins": 15.879653930664062, "rewards/rejected": -17.35251808166504, "step": 3752 }, { "epoch": 0.9, "learning_rate": 2.208e-08, "logps/chosen": -248.6609344482422, "logps/rejected": -382.2943115234375, "loss": 0.0004, "losses/dpo": 1.609193077456439e-06, "losses/sft": 1.025246024131775, "losses/total": 1.609193077456439e-06, "ref_logps/chosen": -231.44873046875, "ref_logps/rejected": -219.57630920410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7212226390838623, "rewards/margins": 14.550577163696289, "rewards/rejected": -16.271799087524414, "step": 3753 }, { "epoch": 0.9, "learning_rate": 2.2026666666666667e-08, "logps/chosen": -218.6431121826172, "logps/rejected": -360.4427490234375, "loss": 0.0005, "losses/dpo": 6.6131633502664044e-09, "losses/sft": 0.7407734394073486, "losses/total": 6.6131633502664044e-09, "ref_logps/chosen": -204.59371948242188, "ref_logps/rejected": -202.30978393554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4049395322799683, "rewards/margins": 14.408357620239258, "rewards/rejected": -15.813297271728516, "step": 3754 }, { "epoch": 0.9, "learning_rate": 2.1973333333333334e-08, "logps/chosen": -229.61224365234375, "logps/rejected": -410.9991149902344, "loss": 0.0001, "losses/dpo": 1.869216612249147e-05, "losses/sft": 0.3285192847251892, "losses/total": 1.869216612249147e-05, "ref_logps/chosen": -211.8035430908203, "ref_logps/rejected": -231.1472930908203, "rewards/accuracies": 1.0, "rewards/chosen": -1.7808687686920166, "rewards/margins": 16.204313278198242, "rewards/rejected": -17.98518180847168, "step": 3755 }, { "epoch": 0.9, "learning_rate": 2.192e-08, "logps/chosen": -234.749267578125, "logps/rejected": -398.49603271484375, "loss": 0.0004, "losses/dpo": 8.929278010327835e-07, "losses/sft": 0.8782901763916016, "losses/total": 8.929278010327835e-07, "ref_logps/chosen": -217.71551513671875, "ref_logps/rejected": -223.46340942382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7033743858337402, "rewards/margins": 15.799890518188477, "rewards/rejected": -17.503265380859375, "step": 3756 }, { "epoch": 0.9, "learning_rate": 2.1866666666666667e-08, "logps/chosen": -244.8828125, "logps/rejected": -399.81500244140625, "loss": 0.0446, "losses/dpo": 6.284227478670079e-12, "losses/sft": 0.5907434225082397, "losses/total": 6.284227478670079e-12, "ref_logps/chosen": -227.39247131347656, "ref_logps/rejected": -222.10601806640625, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7490336894989014, "rewards/margins": 16.02186393737793, "rewards/rejected": -17.770896911621094, "step": 3757 }, { "epoch": 0.9, "learning_rate": 2.1813333333333334e-08, "logps/chosen": -242.44393920898438, "logps/rejected": -380.7296142578125, "loss": 0.0001, "losses/dpo": 6.027533163432963e-05, "losses/sft": 0.652176022529602, "losses/total": 6.027533163432963e-05, "ref_logps/chosen": -222.77549743652344, "ref_logps/rejected": -203.2142791748047, "rewards/accuracies": 1.0, "rewards/chosen": -1.9668439626693726, "rewards/margins": 15.784690856933594, "rewards/rejected": -17.75153350830078, "step": 3758 }, { "epoch": 0.9, "learning_rate": 2.1759999999999998e-08, "logps/chosen": -213.6448974609375, "logps/rejected": -388.1295166015625, "loss": 0.0003, "losses/dpo": 1.1092701868165022e-08, "losses/sft": 0.5026243925094604, "losses/total": 1.1092701868165022e-08, "ref_logps/chosen": -195.25096130371094, "ref_logps/rejected": -211.7208709716797, "rewards/accuracies": 1.0, "rewards/chosen": -1.839394450187683, "rewards/margins": 15.801467895507812, "rewards/rejected": -17.64086151123047, "step": 3759 }, { "epoch": 0.9, "learning_rate": 2.1706666666666665e-08, "logps/chosen": -292.1706848144531, "logps/rejected": -427.642822265625, "loss": 0.0009, "losses/dpo": 3.063178866113958e-08, "losses/sft": 0.8901575207710266, "losses/total": 3.063178866113958e-08, "ref_logps/chosen": -273.7933654785156, "ref_logps/rejected": -251.36422729492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8377310037612915, "rewards/margins": 15.79012393951416, "rewards/rejected": -17.62785530090332, "step": 3760 }, { "epoch": 0.9, "learning_rate": 2.165333333333333e-08, "logps/chosen": -195.81320190429688, "logps/rejected": -383.613525390625, "loss": 0.0063, "losses/dpo": 0.00027517450507730246, "losses/sft": 0.7069242000579834, "losses/total": 0.00027517450507730246, "ref_logps/chosen": -180.2264404296875, "ref_logps/rejected": -214.7794952392578, "rewards/accuracies": 1.0, "rewards/chosen": -1.5586764812469482, "rewards/margins": 15.324728012084961, "rewards/rejected": -16.883403778076172, "step": 3761 }, { "epoch": 0.9, "learning_rate": 2.1599999999999998e-08, "logps/chosen": -197.9652862548828, "logps/rejected": -381.18328857421875, "loss": 0.0016, "losses/dpo": 4.1284603513602747e-10, "losses/sft": 0.8596184253692627, "losses/total": 4.1284603513602747e-10, "ref_logps/chosen": -175.8367919921875, "ref_logps/rejected": -217.0174560546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.212848663330078, "rewards/margins": 14.203737258911133, "rewards/rejected": -16.41658592224121, "step": 3762 }, { "epoch": 0.9, "learning_rate": 2.1546666666666665e-08, "logps/chosen": -313.6567687988281, "logps/rejected": -425.9591064453125, "loss": 0.0001, "losses/dpo": 1.0878040690442958e-09, "losses/sft": 0.7783616185188293, "losses/total": 1.0878040690442958e-09, "ref_logps/chosen": -294.8978271484375, "ref_logps/rejected": -244.74508666992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8758959770202637, "rewards/margins": 16.245506286621094, "rewards/rejected": -18.121402740478516, "step": 3763 }, { "epoch": 0.9, "learning_rate": 2.1493333333333332e-08, "logps/chosen": -229.93408203125, "logps/rejected": -399.07958984375, "loss": 0.0032, "losses/dpo": 0.0004156643117312342, "losses/sft": 0.4576323330402374, "losses/total": 0.0004156643117312342, "ref_logps/chosen": -211.57086181640625, "ref_logps/rejected": -227.6781768798828, "rewards/accuracies": 1.0, "rewards/chosen": -1.8363217115402222, "rewards/margins": 15.303821563720703, "rewards/rejected": -17.14014434814453, "step": 3764 }, { "epoch": 0.9, "learning_rate": 2.144e-08, "logps/chosen": -232.87722778320312, "logps/rejected": -413.11395263671875, "loss": 0.0, "losses/dpo": 2.2219079909291395e-08, "losses/sft": 0.5778127312660217, "losses/total": 2.2219079909291395e-08, "ref_logps/chosen": -215.89401245117188, "ref_logps/rejected": -231.77276611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.69832181930542, "rewards/margins": 16.435794830322266, "rewards/rejected": -18.134117126464844, "step": 3765 }, { "epoch": 0.9, "learning_rate": 2.1386666666666666e-08, "logps/chosen": -218.82272338867188, "logps/rejected": -352.3677978515625, "loss": 0.0005, "losses/dpo": 1.203067756527787e-09, "losses/sft": 0.41492363810539246, "losses/total": 1.203067756527787e-09, "ref_logps/chosen": -200.95993041992188, "ref_logps/rejected": -195.69688415527344, "rewards/accuracies": 1.0, "rewards/chosen": -1.786277174949646, "rewards/margins": 13.880814552307129, "rewards/rejected": -15.667091369628906, "step": 3766 }, { "epoch": 0.9, "learning_rate": 2.1333333333333332e-08, "logps/chosen": -228.97413635253906, "logps/rejected": -398.3974609375, "loss": 0.0006, "losses/dpo": 4.407480283519227e-11, "losses/sft": 0.5654482841491699, "losses/total": 4.407480283519227e-11, "ref_logps/chosen": -213.9930877685547, "ref_logps/rejected": -215.06613159179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4981046915054321, "rewards/margins": 16.835025787353516, "rewards/rejected": -18.333131790161133, "step": 3767 }, { "epoch": 0.9, "learning_rate": 2.128e-08, "logps/chosen": -260.1068115234375, "logps/rejected": -422.135009765625, "loss": 0.0003, "losses/dpo": 2.3558717998639622e-07, "losses/sft": 1.072520136833191, "losses/total": 2.3558717998639622e-07, "ref_logps/chosen": -238.41651916503906, "ref_logps/rejected": -234.8510284423828, "rewards/accuracies": 1.0, "rewards/chosen": -2.16903018951416, "rewards/margins": 16.559368133544922, "rewards/rejected": -18.728397369384766, "step": 3768 }, { "epoch": 0.9, "learning_rate": 2.1226666666666666e-08, "logps/chosen": -276.2039794921875, "logps/rejected": -420.4068603515625, "loss": 0.0002, "losses/dpo": 7.960919250205034e-12, "losses/sft": 0.557000458240509, "losses/total": 7.960919250205034e-12, "ref_logps/chosen": -258.2908020019531, "ref_logps/rejected": -232.01806640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7913180589675903, "rewards/margins": 17.047557830810547, "rewards/rejected": -18.83887481689453, "step": 3769 }, { "epoch": 0.9, "learning_rate": 2.1173333333333333e-08, "logps/chosen": -215.09231567382812, "logps/rejected": -364.2362976074219, "loss": 0.0004, "losses/dpo": 4.137929961785941e-13, "losses/sft": 0.5263490080833435, "losses/total": 4.137929961785941e-13, "ref_logps/chosen": -198.95950317382812, "ref_logps/rejected": -210.11827087402344, "rewards/accuracies": 1.0, "rewards/chosen": -1.6132793426513672, "rewards/margins": 13.79852294921875, "rewards/rejected": -15.411802291870117, "step": 3770 }, { "epoch": 0.9, "learning_rate": 2.112e-08, "logps/chosen": -219.00933837890625, "logps/rejected": -394.295654296875, "loss": 0.0043, "losses/dpo": 1.5480345894047787e-09, "losses/sft": 0.5501626133918762, "losses/total": 1.5480345894047787e-09, "ref_logps/chosen": -201.378173828125, "ref_logps/rejected": -227.5267333984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7631148099899292, "rewards/margins": 14.913776397705078, "rewards/rejected": -16.676891326904297, "step": 3771 }, { "epoch": 0.91, "learning_rate": 2.1066666666666667e-08, "logps/chosen": -236.06227111816406, "logps/rejected": -418.3992919921875, "loss": 0.0009, "losses/dpo": 6.333952806691912e-11, "losses/sft": 0.6109185218811035, "losses/total": 6.333952806691912e-11, "ref_logps/chosen": -218.67381286621094, "ref_logps/rejected": -225.12924194335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.7388471364974976, "rewards/margins": 17.58815574645996, "rewards/rejected": -19.327003479003906, "step": 3772 }, { "epoch": 0.91, "learning_rate": 2.1013333333333334e-08, "logps/chosen": -260.5230712890625, "logps/rejected": -375.87261962890625, "loss": 0.0017, "losses/dpo": 3.4862950304415108e-09, "losses/sft": 0.9748979210853577, "losses/total": 3.4862950304415108e-09, "ref_logps/chosen": -239.64712524414062, "ref_logps/rejected": -213.9682159423828, "rewards/accuracies": 1.0, "rewards/chosen": -2.087594509124756, "rewards/margins": 14.1028470993042, "rewards/rejected": -16.190441131591797, "step": 3773 }, { "epoch": 0.91, "learning_rate": 2.096e-08, "logps/chosen": -282.71136474609375, "logps/rejected": -440.9368896484375, "loss": 0.0, "losses/dpo": 5.291373561533419e-09, "losses/sft": 0.48451295495033264, "losses/total": 5.291373561533419e-09, "ref_logps/chosen": -266.97711181640625, "ref_logps/rejected": -246.89410400390625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5734248161315918, "rewards/margins": 17.830852508544922, "rewards/rejected": -19.404277801513672, "step": 3774 }, { "epoch": 0.91, "learning_rate": 2.0906666666666667e-08, "logps/chosen": -245.6754150390625, "logps/rejected": -385.70404052734375, "loss": 0.0017, "losses/dpo": 2.332356530132529e-09, "losses/sft": 0.508062481880188, "losses/total": 2.332356530132529e-09, "ref_logps/chosen": -226.35781860351562, "ref_logps/rejected": -218.35888671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9317598342895508, "rewards/margins": 14.802756309509277, "rewards/rejected": -16.734516143798828, "step": 3775 }, { "epoch": 0.91, "learning_rate": 2.085333333333333e-08, "logps/chosen": -260.2088623046875, "logps/rejected": -402.3781433105469, "loss": 0.0013, "losses/dpo": 6.132015550974756e-05, "losses/sft": 0.5977312922477722, "losses/total": 6.132015550974756e-05, "ref_logps/chosen": -240.9259490966797, "ref_logps/rejected": -226.68902587890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.9282913208007812, "rewards/margins": 15.640621185302734, "rewards/rejected": -17.568912506103516, "step": 3776 }, { "epoch": 0.91, "learning_rate": 2.0799999999999998e-08, "logps/chosen": -232.8482666015625, "logps/rejected": -393.5521240234375, "loss": 0.005, "losses/dpo": 4.466023710847367e-06, "losses/sft": 0.5993209481239319, "losses/total": 4.466023710847367e-06, "ref_logps/chosen": -213.8255157470703, "ref_logps/rejected": -217.35414123535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.9022724628448486, "rewards/margins": 15.717529296875, "rewards/rejected": -17.619800567626953, "step": 3777 }, { "epoch": 0.91, "learning_rate": 2.0746666666666665e-08, "logps/chosen": -230.09597778320312, "logps/rejected": -406.0697326660156, "loss": 0.0001, "losses/dpo": 3.6332911679259894e-10, "losses/sft": 0.48607680201530457, "losses/total": 3.6332911679259894e-10, "ref_logps/chosen": -212.68130493164062, "ref_logps/rejected": -220.5536346435547, "rewards/accuracies": 1.0, "rewards/chosen": -1.7414681911468506, "rewards/margins": 16.810142517089844, "rewards/rejected": -18.551610946655273, "step": 3778 }, { "epoch": 0.91, "learning_rate": 2.069333333333333e-08, "logps/chosen": -236.36233520507812, "logps/rejected": -390.620361328125, "loss": 0.0001, "losses/dpo": 1.3366124562708137e-07, "losses/sft": 0.6720870733261108, "losses/total": 1.3366124562708137e-07, "ref_logps/chosen": -219.6363067626953, "ref_logps/rejected": -213.632080078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6726036071777344, "rewards/margins": 16.02622413635254, "rewards/rejected": -17.698829650878906, "step": 3779 }, { "epoch": 0.91, "learning_rate": 2.0639999999999998e-08, "logps/chosen": -256.59222412109375, "logps/rejected": -370.8186950683594, "loss": 0.0078, "losses/dpo": 3.699716728533531e-07, "losses/sft": 0.4170961081981659, "losses/total": 3.699716728533531e-07, "ref_logps/chosen": -241.866943359375, "ref_logps/rejected": -206.15231323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4725278615951538, "rewards/margins": 14.994112014770508, "rewards/rejected": -16.46664047241211, "step": 3780 }, { "epoch": 0.91, "learning_rate": 2.0586666666666665e-08, "logps/chosen": -247.69961547851562, "logps/rejected": -401.98980712890625, "loss": 0.0004, "losses/dpo": 3.798506043040106e-08, "losses/sft": 0.8318906426429749, "losses/total": 3.798506043040106e-08, "ref_logps/chosen": -233.75498962402344, "ref_logps/rejected": -231.3670654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.3944599628448486, "rewards/margins": 15.667816162109375, "rewards/rejected": -17.062274932861328, "step": 3781 }, { "epoch": 0.91, "learning_rate": 2.0533333333333332e-08, "logps/chosen": -235.1898193359375, "logps/rejected": -398.277099609375, "loss": 0.0007, "losses/dpo": 2.8895183277199976e-06, "losses/sft": 1.0844008922576904, "losses/total": 2.8895183277199976e-06, "ref_logps/chosen": -217.781494140625, "ref_logps/rejected": -220.9561767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7408332824707031, "rewards/margins": 15.991259574890137, "rewards/rejected": -17.732093811035156, "step": 3782 }, { "epoch": 0.91, "learning_rate": 2.048e-08, "logps/chosen": -280.3187255859375, "logps/rejected": -456.9914245605469, "loss": 0.0, "losses/dpo": 1.1597384030892499e-07, "losses/sft": 1.0613709688186646, "losses/total": 1.1597384030892499e-07, "ref_logps/chosen": -259.1434326171875, "ref_logps/rejected": -257.6491394042969, "rewards/accuracies": 1.0, "rewards/chosen": -2.117527484893799, "rewards/margins": 17.816699981689453, "rewards/rejected": -19.934226989746094, "step": 3783 }, { "epoch": 0.91, "learning_rate": 2.0426666666666666e-08, "logps/chosen": -211.2926025390625, "logps/rejected": -384.9212646484375, "loss": 0.0005, "losses/dpo": 7.893924339441583e-05, "losses/sft": 1.1228939294815063, "losses/total": 7.893924339441583e-05, "ref_logps/chosen": -194.07496643066406, "ref_logps/rejected": -212.50257873535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7217633724212646, "rewards/margins": 15.520106315612793, "rewards/rejected": -17.24186897277832, "step": 3784 }, { "epoch": 0.91, "learning_rate": 2.0373333333333332e-08, "logps/chosen": -237.1407470703125, "logps/rejected": -437.946533203125, "loss": 0.0012, "losses/dpo": 7.546155367244012e-11, "losses/sft": 0.7307890057563782, "losses/total": 7.546155367244012e-11, "ref_logps/chosen": -221.74874877929688, "ref_logps/rejected": -240.43226623535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.539198875427246, "rewards/margins": 18.21222686767578, "rewards/rejected": -19.751426696777344, "step": 3785 }, { "epoch": 0.91, "learning_rate": 2.032e-08, "logps/chosen": -261.30877685546875, "logps/rejected": -411.9874267578125, "loss": 0.0009, "losses/dpo": 3.3232590794796124e-06, "losses/sft": 0.7682793140411377, "losses/total": 3.3232590794796124e-06, "ref_logps/chosen": -243.18319702148438, "ref_logps/rejected": -240.06890869140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8125576972961426, "rewards/margins": 15.379292488098145, "rewards/rejected": -17.191850662231445, "step": 3786 }, { "epoch": 0.91, "learning_rate": 2.0266666666666666e-08, "logps/chosen": -206.80438232421875, "logps/rejected": -338.6502380371094, "loss": 0.0132, "losses/dpo": 2.8953411401744233e-07, "losses/sft": 0.6916514039039612, "losses/total": 2.8953411401744233e-07, "ref_logps/chosen": -188.46923828125, "ref_logps/rejected": -184.039306640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8335130214691162, "rewards/margins": 13.627581596374512, "rewards/rejected": -15.46109390258789, "step": 3787 }, { "epoch": 0.91, "learning_rate": 2.0213333333333333e-08, "logps/chosen": -240.1892547607422, "logps/rejected": -395.0584716796875, "loss": 0.0017, "losses/dpo": 5.242724657184006e-11, "losses/sft": 0.7648760080337524, "losses/total": 5.242724657184006e-11, "ref_logps/chosen": -223.89683532714844, "ref_logps/rejected": -208.08071899414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6292402744293213, "rewards/margins": 17.06853485107422, "rewards/rejected": -18.697776794433594, "step": 3788 }, { "epoch": 0.91, "learning_rate": 2.016e-08, "logps/chosen": -220.52728271484375, "logps/rejected": -350.1308898925781, "loss": 0.0025, "losses/dpo": 1.5505967621010086e-09, "losses/sft": 0.5542906522750854, "losses/total": 1.5505967621010086e-09, "ref_logps/chosen": -207.56771850585938, "ref_logps/rejected": -193.63650512695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2959567308425903, "rewards/margins": 14.35348129272461, "rewards/rejected": -15.64943790435791, "step": 3789 }, { "epoch": 0.91, "learning_rate": 2.0106666666666667e-08, "logps/chosen": -325.2825927734375, "logps/rejected": -479.26708984375, "loss": 0.0002, "losses/dpo": 2.1750665268882585e-08, "losses/sft": 0.660327672958374, "losses/total": 2.1750665268882585e-08, "ref_logps/chosen": -304.353759765625, "ref_logps/rejected": -271.91510009765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0928850173950195, "rewards/margins": 18.642314910888672, "rewards/rejected": -20.735198974609375, "step": 3790 }, { "epoch": 0.91, "learning_rate": 2.0053333333333334e-08, "logps/chosen": -239.626220703125, "logps/rejected": -421.84747314453125, "loss": 0.0017, "losses/dpo": 1.4401155112864217e-07, "losses/sft": 0.4995509386062622, "losses/total": 1.4401155112864217e-07, "ref_logps/chosen": -221.10995483398438, "ref_logps/rejected": -227.84823608398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8516273498535156, "rewards/margins": 17.548294067382812, "rewards/rejected": -19.399921417236328, "step": 3791 }, { "epoch": 0.91, "learning_rate": 2e-08, "logps/chosen": -259.9513244628906, "logps/rejected": -415.0953063964844, "loss": 0.0003, "losses/dpo": 0.0006214437307789922, "losses/sft": 1.0600529909133911, "losses/total": 0.0006214437307789922, "ref_logps/chosen": -243.94970703125, "ref_logps/rejected": -234.82443237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.600161075592041, "rewards/margins": 16.426929473876953, "rewards/rejected": -18.027090072631836, "step": 3792 }, { "epoch": 0.91, "learning_rate": 1.9946666666666664e-08, "logps/chosen": -244.79132080078125, "logps/rejected": -370.354248046875, "loss": 0.0003, "losses/dpo": 5.843104645963937e-12, "losses/sft": 0.4504377543926239, "losses/total": 5.843104645963937e-12, "ref_logps/chosen": -225.2268524169922, "ref_logps/rejected": -210.8045654296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9564478397369385, "rewards/margins": 13.99852180480957, "rewards/rejected": -15.95496940612793, "step": 3793 }, { "epoch": 0.91, "learning_rate": 1.989333333333333e-08, "logps/chosen": -230.22732543945312, "logps/rejected": -368.99114990234375, "loss": 0.0005, "losses/dpo": 6.586228806781946e-08, "losses/sft": 1.005302906036377, "losses/total": 6.586228806781946e-08, "ref_logps/chosen": -214.01889038085938, "ref_logps/rejected": -204.38192749023438, "rewards/accuracies": 1.0, "rewards/chosen": -1.620842695236206, "rewards/margins": 14.840078353881836, "rewards/rejected": -16.460922241210938, "step": 3794 }, { "epoch": 0.91, "learning_rate": 1.9839999999999998e-08, "logps/chosen": -236.93438720703125, "logps/rejected": -421.7174072265625, "loss": 0.002, "losses/dpo": 9.342884105256744e-12, "losses/sft": 0.7133516073226929, "losses/total": 9.342884105256744e-12, "ref_logps/chosen": -222.4695281982422, "ref_logps/rejected": -247.3862762451172, "rewards/accuracies": 1.0, "rewards/chosen": -1.446487307548523, "rewards/margins": 15.986628532409668, "rewards/rejected": -17.433115005493164, "step": 3795 }, { "epoch": 0.91, "learning_rate": 1.9786666666666665e-08, "logps/chosen": -238.86819458007812, "logps/rejected": -367.7608642578125, "loss": 0.0043, "losses/dpo": 8.922164852265269e-06, "losses/sft": 0.6735367774963379, "losses/total": 8.922164852265269e-06, "ref_logps/chosen": -219.53573608398438, "ref_logps/rejected": -195.72596740722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.9332444667816162, "rewards/margins": 15.270242691040039, "rewards/rejected": -17.203487396240234, "step": 3796 }, { "epoch": 0.91, "learning_rate": 1.973333333333333e-08, "logps/chosen": -212.2131805419922, "logps/rejected": -364.07769775390625, "loss": 0.002, "losses/dpo": 1.6644389688735828e-06, "losses/sft": 0.545703649520874, "losses/total": 1.6644389688735828e-06, "ref_logps/chosen": -194.31593322753906, "ref_logps/rejected": -206.26171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7897238731384277, "rewards/margins": 13.991872787475586, "rewards/rejected": -15.781597137451172, "step": 3797 }, { "epoch": 0.91, "learning_rate": 1.9679999999999998e-08, "logps/chosen": -287.72418212890625, "logps/rejected": -417.4272155761719, "loss": 0.0001, "losses/dpo": 2.494219586424151e-07, "losses/sft": 0.6873765587806702, "losses/total": 2.494219586424151e-07, "ref_logps/chosen": -265.5947265625, "ref_logps/rejected": -232.58004760742188, "rewards/accuracies": 1.0, "rewards/chosen": -2.2129456996917725, "rewards/margins": 16.271770477294922, "rewards/rejected": -18.48471450805664, "step": 3798 }, { "epoch": 0.91, "learning_rate": 1.9626666666666665e-08, "logps/chosen": -249.23773193359375, "logps/rejected": -409.4415588378906, "loss": 0.0013, "losses/dpo": 6.388585660488388e-08, "losses/sft": 0.8664387464523315, "losses/total": 6.388585660488388e-08, "ref_logps/chosen": -229.14015197753906, "ref_logps/rejected": -227.10604858398438, "rewards/accuracies": 1.0, "rewards/chosen": -2.009758472442627, "rewards/margins": 16.223793029785156, "rewards/rejected": -18.233551025390625, "step": 3799 }, { "epoch": 0.91, "learning_rate": 1.9573333333333332e-08, "logps/chosen": -236.66571044921875, "logps/rejected": -395.07696533203125, "loss": 0.0095, "losses/dpo": 6.606119740126815e-08, "losses/sft": 0.5629479289054871, "losses/total": 6.606119740126815e-08, "ref_logps/chosen": -221.86619567871094, "ref_logps/rejected": -227.8912811279297, "rewards/accuracies": 1.0, "rewards/chosen": -1.4799511432647705, "rewards/margins": 15.238616943359375, "rewards/rejected": -16.71856689453125, "step": 3800 }, { "epoch": 0.91, "learning_rate": 1.952e-08, "logps/chosen": -241.42684936523438, "logps/rejected": -367.38525390625, "loss": 0.002, "losses/dpo": 1.830644151823435e-07, "losses/sft": 0.6340959668159485, "losses/total": 1.830644151823435e-07, "ref_logps/chosen": -220.758056640625, "ref_logps/rejected": -208.12994384765625, "rewards/accuracies": 1.0, "rewards/chosen": -2.066880226135254, "rewards/margins": 13.858650207519531, "rewards/rejected": -15.925529479980469, "step": 3801 }, { "epoch": 0.91, "learning_rate": 1.9466666666666666e-08, "logps/chosen": -230.99673461914062, "logps/rejected": -407.24169921875, "loss": 0.0006, "losses/dpo": 9.231073683224622e-09, "losses/sft": 0.3440210819244385, "losses/total": 9.231073683224622e-09, "ref_logps/chosen": -212.96484375, "ref_logps/rejected": -228.57931518554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.803189992904663, "rewards/margins": 16.06304931640625, "rewards/rejected": -17.866241455078125, "step": 3802 }, { "epoch": 0.91, "learning_rate": 1.9413333333333332e-08, "logps/chosen": -237.0802001953125, "logps/rejected": -442.51141357421875, "loss": 0.0041, "losses/dpo": 7.177220595622202e-09, "losses/sft": 0.4588356614112854, "losses/total": 7.177220595622202e-09, "ref_logps/chosen": -218.3365936279297, "ref_logps/rejected": -254.61569213867188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8743613958358765, "rewards/margins": 16.91520881652832, "rewards/rejected": -18.789569854736328, "step": 3803 }, { "epoch": 0.91, "learning_rate": 1.936e-08, "logps/chosen": -221.2161865234375, "logps/rejected": -370.87017822265625, "loss": 0.0038, "losses/dpo": 4.327988054747545e-12, "losses/sft": 0.5068731307983398, "losses/total": 4.327988054747545e-12, "ref_logps/chosen": -205.4581298828125, "ref_logps/rejected": -211.14283752441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.575805902481079, "rewards/margins": 14.396928787231445, "rewards/rejected": -15.972733497619629, "step": 3804 }, { "epoch": 0.91, "learning_rate": 1.9306666666666666e-08, "logps/chosen": -249.6611328125, "logps/rejected": -390.7527160644531, "loss": 0.0003, "losses/dpo": 1.9209443063994414e-11, "losses/sft": 0.6696926951408386, "losses/total": 1.9209443063994414e-11, "ref_logps/chosen": -234.3326416015625, "ref_logps/rejected": -225.0832977294922, "rewards/accuracies": 1.0, "rewards/chosen": -1.5328466892242432, "rewards/margins": 15.034095764160156, "rewards/rejected": -16.56694221496582, "step": 3805 }, { "epoch": 0.91, "learning_rate": 1.9253333333333333e-08, "logps/chosen": -310.4659118652344, "logps/rejected": -451.62139892578125, "loss": 0.0004, "losses/dpo": 6.985911155110713e-11, "losses/sft": 0.4877564311027527, "losses/total": 6.985911155110713e-11, "ref_logps/chosen": -292.5226745605469, "ref_logps/rejected": -271.23260498046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7943236827850342, "rewards/margins": 16.244556427001953, "rewards/rejected": -18.03887939453125, "step": 3806 }, { "epoch": 0.91, "learning_rate": 1.92e-08, "logps/chosen": -244.8016357421875, "logps/rejected": -411.1217041015625, "loss": 0.0002, "losses/dpo": 3.5849689883349356e-09, "losses/sft": 0.49520954489707947, "losses/total": 3.5849689883349356e-09, "ref_logps/chosen": -227.33370971679688, "ref_logps/rejected": -227.77938842773438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7467924356460571, "rewards/margins": 16.587440490722656, "rewards/rejected": -18.3342342376709, "step": 3807 }, { "epoch": 0.91, "learning_rate": 1.9146666666666667e-08, "logps/chosen": -218.14071655273438, "logps/rejected": -417.1044921875, "loss": 0.0001, "losses/dpo": 3.2540900529731687e-11, "losses/sft": 0.5003753304481506, "losses/total": 3.2540900529731687e-11, "ref_logps/chosen": -199.88836669921875, "ref_logps/rejected": -229.4866943359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.825234293937683, "rewards/margins": 16.936546325683594, "rewards/rejected": -18.76177978515625, "step": 3808 }, { "epoch": 0.91, "learning_rate": 1.9093333333333334e-08, "logps/chosen": -241.9988250732422, "logps/rejected": -379.6051940917969, "loss": 0.0003, "losses/dpo": 0.0008458362426608801, "losses/sft": 0.5440453886985779, "losses/total": 0.0008458362426608801, "ref_logps/chosen": -226.3711395263672, "ref_logps/rejected": -214.18466186523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.5627682209014893, "rewards/margins": 14.97928237915039, "rewards/rejected": -16.542051315307617, "step": 3809 }, { "epoch": 0.91, "learning_rate": 1.904e-08, "logps/chosen": -278.19500732421875, "logps/rejected": -407.3627624511719, "loss": 0.0002, "losses/dpo": 2.4247306384950207e-09, "losses/sft": 0.6580193042755127, "losses/total": 2.4247306384950207e-09, "ref_logps/chosen": -256.66357421875, "ref_logps/rejected": -228.03216552734375, "rewards/accuracies": 1.0, "rewards/chosen": -2.153144359588623, "rewards/margins": 15.779916763305664, "rewards/rejected": -17.933059692382812, "step": 3810 }, { "epoch": 0.91, "learning_rate": 1.8986666666666664e-08, "logps/chosen": -243.82980346679688, "logps/rejected": -418.0602111816406, "loss": 0.0012, "losses/dpo": 6.097856594666773e-12, "losses/sft": 0.6142714619636536, "losses/total": 6.097856594666773e-12, "ref_logps/chosen": -229.11767578125, "ref_logps/rejected": -240.88258361816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.4712117910385132, "rewards/margins": 16.246551513671875, "rewards/rejected": -17.717761993408203, "step": 3811 }, { "epoch": 0.91, "learning_rate": 1.893333333333333e-08, "logps/chosen": -247.4757080078125, "logps/rejected": -422.93194580078125, "loss": 0.0001, "losses/dpo": 1.9456493305369804e-08, "losses/sft": 0.5946910977363586, "losses/total": 1.9456493305369804e-08, "ref_logps/chosen": -229.6649169921875, "ref_logps/rejected": -239.1409149169922, "rewards/accuracies": 1.0, "rewards/chosen": -1.7810816764831543, "rewards/margins": 16.598024368286133, "rewards/rejected": -18.379104614257812, "step": 3812 }, { "epoch": 0.92, "learning_rate": 1.8879999999999998e-08, "logps/chosen": -213.0133819580078, "logps/rejected": -440.8214111328125, "loss": 0.0004, "losses/dpo": 2.1567245767073473e-08, "losses/sft": 0.4421674311161041, "losses/total": 2.1567245767073473e-08, "ref_logps/chosen": -194.93020629882812, "ref_logps/rejected": -245.59352111816406, "rewards/accuracies": 1.0, "rewards/chosen": -1.8083164691925049, "rewards/margins": 17.714473724365234, "rewards/rejected": -19.522790908813477, "step": 3813 }, { "epoch": 0.92, "learning_rate": 1.8826666666666665e-08, "logps/chosen": -242.82012939453125, "logps/rejected": -379.93310546875, "loss": 0.0008, "losses/dpo": 6.479890402033561e-08, "losses/sft": 0.567035436630249, "losses/total": 6.479890402033561e-08, "ref_logps/chosen": -225.6392364501953, "ref_logps/rejected": -213.69158935546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7180874347686768, "rewards/margins": 14.9060640335083, "rewards/rejected": -16.62415313720703, "step": 3814 }, { "epoch": 0.92, "learning_rate": 1.877333333333333e-08, "logps/chosen": -236.6341552734375, "logps/rejected": -422.157958984375, "loss": 0.0005, "losses/dpo": 1.2550996700277928e-11, "losses/sft": 0.6996431946754456, "losses/total": 1.2550996700277928e-11, "ref_logps/chosen": -218.21185302734375, "ref_logps/rejected": -241.89085388183594, "rewards/accuracies": 1.0, "rewards/chosen": -1.842231273651123, "rewards/margins": 16.184478759765625, "rewards/rejected": -18.026710510253906, "step": 3815 }, { "epoch": 0.92, "learning_rate": 1.8719999999999998e-08, "logps/chosen": -270.15521240234375, "logps/rejected": -395.12725830078125, "loss": 0.0013, "losses/dpo": 8.66115790199018e-10, "losses/sft": 0.4430348873138428, "losses/total": 8.66115790199018e-10, "ref_logps/chosen": -248.6165313720703, "ref_logps/rejected": -210.53863525390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.153869152069092, "rewards/margins": 16.30499267578125, "rewards/rejected": -18.458860397338867, "step": 3816 }, { "epoch": 0.92, "learning_rate": 1.8666666666666665e-08, "logps/chosen": -238.64503479003906, "logps/rejected": -410.1456298828125, "loss": 0.0011, "losses/dpo": 1.037047892005205e-09, "losses/sft": 0.6191880106925964, "losses/total": 1.037047892005205e-09, "ref_logps/chosen": -214.15292358398438, "ref_logps/rejected": -212.85491943359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.4492135047912598, "rewards/margins": 17.279857635498047, "rewards/rejected": -19.72907257080078, "step": 3817 }, { "epoch": 0.92, "learning_rate": 1.8613333333333335e-08, "logps/chosen": -261.75732421875, "logps/rejected": -386.9908447265625, "loss": 0.0019, "losses/dpo": 3.984391878475435e-05, "losses/sft": 0.9440168738365173, "losses/total": 3.984391878475435e-05, "ref_logps/chosen": -239.8596954345703, "ref_logps/rejected": -220.690673828125, "rewards/accuracies": 1.0, "rewards/chosen": -2.1897616386413574, "rewards/margins": 14.440255165100098, "rewards/rejected": -16.630016326904297, "step": 3818 }, { "epoch": 0.92, "learning_rate": 1.856e-08, "logps/chosen": -207.39584350585938, "logps/rejected": -383.0236511230469, "loss": 0.0007, "losses/dpo": 5.700862857338507e-06, "losses/sft": 0.9258834719657898, "losses/total": 5.700862857338507e-06, "ref_logps/chosen": -193.14208984375, "ref_logps/rejected": -218.5430908203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.4253764152526855, "rewards/margins": 15.022680282592773, "rewards/rejected": -16.448057174682617, "step": 3819 }, { "epoch": 0.92, "learning_rate": 1.8506666666666666e-08, "logps/chosen": -292.35888671875, "logps/rejected": -389.6763916015625, "loss": 0.0002, "losses/dpo": 2.6803766672856e-07, "losses/sft": 1.1573843955993652, "losses/total": 2.6803766672856e-07, "ref_logps/chosen": -277.0946350097656, "ref_logps/rejected": -226.151611328125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5264270305633545, "rewards/margins": 14.826053619384766, "rewards/rejected": -16.352479934692383, "step": 3820 }, { "epoch": 0.92, "learning_rate": 1.8453333333333332e-08, "logps/chosen": -284.7768249511719, "logps/rejected": -395.575439453125, "loss": 0.003, "losses/dpo": 2.0159288396826014e-05, "losses/sft": 1.090450406074524, "losses/total": 2.0159288396826014e-05, "ref_logps/chosen": -264.1981201171875, "ref_logps/rejected": -219.953857421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.057868242263794, "rewards/margins": 15.504291534423828, "rewards/rejected": -17.562158584594727, "step": 3821 }, { "epoch": 0.92, "learning_rate": 1.84e-08, "logps/chosen": -265.8082275390625, "logps/rejected": -410.91693115234375, "loss": 0.0013, "losses/dpo": 1.3999499515193747e-07, "losses/sft": 1.0690789222717285, "losses/total": 1.3999499515193747e-07, "ref_logps/chosen": -244.49905395507812, "ref_logps/rejected": -221.84432983398438, "rewards/accuracies": 1.0, "rewards/chosen": -2.1309168338775635, "rewards/margins": 16.776344299316406, "rewards/rejected": -18.90726089477539, "step": 3822 }, { "epoch": 0.92, "learning_rate": 1.8346666666666666e-08, "logps/chosen": -209.67726135253906, "logps/rejected": -371.830810546875, "loss": 0.0011, "losses/dpo": 2.989874126058112e-09, "losses/sft": 0.5390514731407166, "losses/total": 2.989874126058112e-09, "ref_logps/chosen": -192.54458618164062, "ref_logps/rejected": -210.3057403564453, "rewards/accuracies": 1.0, "rewards/chosen": -1.7132666110992432, "rewards/margins": 14.439239501953125, "rewards/rejected": -16.15250587463379, "step": 3823 }, { "epoch": 0.92, "learning_rate": 1.8293333333333333e-08, "logps/chosen": -275.8739013671875, "logps/rejected": -435.4978942871094, "loss": 0.0015, "losses/dpo": 3.3650266573204135e-08, "losses/sft": 0.5253980159759521, "losses/total": 3.3650266573204135e-08, "ref_logps/chosen": -255.0016326904297, "ref_logps/rejected": -238.3614501953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0872247219085693, "rewards/margins": 17.626419067382812, "rewards/rejected": -19.71364402770996, "step": 3824 }, { "epoch": 0.92, "learning_rate": 1.824e-08, "logps/chosen": -277.58642578125, "logps/rejected": -392.7834167480469, "loss": 0.0005, "losses/dpo": 2.3195235954176496e-08, "losses/sft": 0.7038754820823669, "losses/total": 2.3195235954176496e-08, "ref_logps/chosen": -257.84210205078125, "ref_logps/rejected": -224.55911254882812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9744338989257812, "rewards/margins": 14.84799575805664, "rewards/rejected": -16.822429656982422, "step": 3825 }, { "epoch": 0.92, "learning_rate": 1.8186666666666667e-08, "logps/chosen": -248.71458435058594, "logps/rejected": -396.159423828125, "loss": 0.0027, "losses/dpo": 1.261243019745617e-11, "losses/sft": 0.4264800548553467, "losses/total": 1.261243019745617e-11, "ref_logps/chosen": -230.4616241455078, "ref_logps/rejected": -230.63818359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.825295329093933, "rewards/margins": 14.726829528808594, "rewards/rejected": -16.5521240234375, "step": 3826 }, { "epoch": 0.92, "learning_rate": 1.8133333333333334e-08, "logps/chosen": -242.925537109375, "logps/rejected": -396.52880859375, "loss": 0.0002, "losses/dpo": 5.021145625505596e-06, "losses/sft": 0.6431589126586914, "losses/total": 5.021145625505596e-06, "ref_logps/chosen": -222.459228515625, "ref_logps/rejected": -211.2147674560547, "rewards/accuracies": 1.0, "rewards/chosen": -2.0466294288635254, "rewards/margins": 16.484773635864258, "rewards/rejected": -18.531402587890625, "step": 3827 }, { "epoch": 0.92, "learning_rate": 1.8079999999999997e-08, "logps/chosen": -263.52618408203125, "logps/rejected": -425.7220458984375, "loss": 0.0037, "losses/dpo": 2.4168167467308876e-09, "losses/sft": 1.2212719917297363, "losses/total": 2.4168167467308876e-09, "ref_logps/chosen": -238.92044067382812, "ref_logps/rejected": -229.0576171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.460576295852661, "rewards/margins": 17.205867767333984, "rewards/rejected": -19.66644287109375, "step": 3828 }, { "epoch": 0.92, "learning_rate": 1.8026666666666664e-08, "logps/chosen": -255.51419067382812, "logps/rejected": -372.1767578125, "loss": 0.0004, "losses/dpo": 6.5221024669881444e-06, "losses/sft": 0.6972573399543762, "losses/total": 6.5221024669881444e-06, "ref_logps/chosen": -236.38853454589844, "ref_logps/rejected": -210.47927856445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.9125652313232422, "rewards/margins": 14.257183074951172, "rewards/rejected": -16.16974639892578, "step": 3829 }, { "epoch": 0.92, "learning_rate": 1.797333333333333e-08, "logps/chosen": -298.0645751953125, "logps/rejected": -421.87823486328125, "loss": 0.0005, "losses/dpo": 1.6739323815961882e-10, "losses/sft": 0.9903249144554138, "losses/total": 1.6739323815961882e-10, "ref_logps/chosen": -278.69580078125, "ref_logps/rejected": -243.5255889892578, "rewards/accuracies": 1.0, "rewards/chosen": -1.9368802309036255, "rewards/margins": 15.898386001586914, "rewards/rejected": -17.83526611328125, "step": 3830 }, { "epoch": 0.92, "learning_rate": 1.7919999999999998e-08, "logps/chosen": -283.2005615234375, "logps/rejected": -408.68914794921875, "loss": 0.0003, "losses/dpo": 4.531343833491519e-08, "losses/sft": 0.5173176527023315, "losses/total": 4.531343833491519e-08, "ref_logps/chosen": -259.5220031738281, "ref_logps/rejected": -212.3597869873047, "rewards/accuracies": 1.0, "rewards/chosen": -2.367856740951538, "rewards/margins": 17.26508331298828, "rewards/rejected": -19.632938385009766, "step": 3831 }, { "epoch": 0.92, "learning_rate": 1.7866666666666665e-08, "logps/chosen": -318.21844482421875, "logps/rejected": -434.10675048828125, "loss": 0.0013, "losses/dpo": 1.1396463328594564e-08, "losses/sft": 0.6144658923149109, "losses/total": 1.1396463328594564e-08, "ref_logps/chosen": -295.6028747558594, "ref_logps/rejected": -249.12008666992188, "rewards/accuracies": 1.0, "rewards/chosen": -2.2615575790405273, "rewards/margins": 16.23711395263672, "rewards/rejected": -18.49867057800293, "step": 3832 }, { "epoch": 0.92, "learning_rate": 1.781333333333333e-08, "logps/chosen": -283.3173828125, "logps/rejected": -448.49261474609375, "loss": 0.0, "losses/dpo": 5.2430704222672375e-09, "losses/sft": 0.5761614441871643, "losses/total": 5.2430704222672375e-09, "ref_logps/chosen": -262.5477294921875, "ref_logps/rejected": -252.97300720214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.076962947845459, "rewards/margins": 17.474998474121094, "rewards/rejected": -19.55196189880371, "step": 3833 }, { "epoch": 0.92, "learning_rate": 1.776e-08, "logps/chosen": -244.52685546875, "logps/rejected": -411.55718994140625, "loss": 0.0002, "losses/dpo": 3.868302206333496e-11, "losses/sft": 0.5080909132957458, "losses/total": 3.868302206333496e-11, "ref_logps/chosen": -226.31900024414062, "ref_logps/rejected": -229.31854248046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8207859992980957, "rewards/margins": 16.403078079223633, "rewards/rejected": -18.223865509033203, "step": 3834 }, { "epoch": 0.92, "learning_rate": 1.770666666666667e-08, "logps/chosen": -197.17919921875, "logps/rejected": -394.9938659667969, "loss": 0.0, "losses/dpo": 1.0219236656894282e-07, "losses/sft": 0.4631594717502594, "losses/total": 1.0219236656894282e-07, "ref_logps/chosen": -182.11956787109375, "ref_logps/rejected": -216.09986877441406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5059654712677002, "rewards/margins": 16.383434295654297, "rewards/rejected": -17.889402389526367, "step": 3835 }, { "epoch": 0.92, "learning_rate": 1.7653333333333332e-08, "logps/chosen": -255.67396545410156, "logps/rejected": -397.2637634277344, "loss": 0.0004, "losses/dpo": 0.00851516705006361, "losses/sft": 0.5839986801147461, "losses/total": 0.00851516705006361, "ref_logps/chosen": -232.50222778320312, "ref_logps/rejected": -223.71743774414062, "rewards/accuracies": 1.0, "rewards/chosen": -2.317172050476074, "rewards/margins": 15.037461280822754, "rewards/rejected": -17.354633331298828, "step": 3836 }, { "epoch": 0.92, "learning_rate": 1.76e-08, "logps/chosen": -248.65602111816406, "logps/rejected": -399.22918701171875, "loss": 0.0006, "losses/dpo": 3.292920935926702e-10, "losses/sft": 0.9201087355613708, "losses/total": 3.292920935926702e-10, "ref_logps/chosen": -230.3704833984375, "ref_logps/rejected": -216.8460693359375, "rewards/accuracies": 1.0, "rewards/chosen": -1.828555703163147, "rewards/margins": 16.40975570678711, "rewards/rejected": -18.238311767578125, "step": 3837 }, { "epoch": 0.92, "learning_rate": 1.7546666666666666e-08, "logps/chosen": -203.23385620117188, "logps/rejected": -397.380615234375, "loss": 0.0006, "losses/dpo": 9.656620250098058e-07, "losses/sft": 0.6774213314056396, "losses/total": 9.656620250098058e-07, "ref_logps/chosen": -185.93704223632812, "ref_logps/rejected": -219.17398071289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.729679822921753, "rewards/margins": 16.090984344482422, "rewards/rejected": -17.82066535949707, "step": 3838 }, { "epoch": 0.92, "learning_rate": 1.7493333333333332e-08, "logps/chosen": -225.7560272216797, "logps/rejected": -358.7620849609375, "loss": 0.0014, "losses/dpo": 6.0904330894118175e-05, "losses/sft": 0.5824629068374634, "losses/total": 6.0904330894118175e-05, "ref_logps/chosen": -207.7476043701172, "ref_logps/rejected": -188.95013427734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8008426427841187, "rewards/margins": 15.180351257324219, "rewards/rejected": -16.98119354248047, "step": 3839 }, { "epoch": 0.92, "learning_rate": 1.744e-08, "logps/chosen": -252.17820739746094, "logps/rejected": -391.510009765625, "loss": 0.0026, "losses/dpo": 4.089532879003599e-10, "losses/sft": 0.9148564338684082, "losses/total": 4.089532879003599e-10, "ref_logps/chosen": -236.30441284179688, "ref_logps/rejected": -211.80368041992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5873780250549316, "rewards/margins": 16.38325309753418, "rewards/rejected": -17.970632553100586, "step": 3840 }, { "epoch": 0.92, "learning_rate": 1.7386666666666666e-08, "logps/chosen": -270.23101806640625, "logps/rejected": -417.93756103515625, "loss": 0.0003, "losses/dpo": 1.183183107045238e-09, "losses/sft": 0.5285957455635071, "losses/total": 1.183183107045238e-09, "ref_logps/chosen": -253.660400390625, "ref_logps/rejected": -234.1483154296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6570613384246826, "rewards/margins": 16.72186279296875, "rewards/rejected": -18.378925323486328, "step": 3841 }, { "epoch": 0.92, "learning_rate": 1.7333333333333333e-08, "logps/chosen": -268.3681945800781, "logps/rejected": -430.3772888183594, "loss": 0.0005, "losses/dpo": 5.27934629346305e-09, "losses/sft": 0.5509322881698608, "losses/total": 5.27934629346305e-09, "ref_logps/chosen": -245.9060516357422, "ref_logps/rejected": -235.3247833251953, "rewards/accuracies": 1.0, "rewards/chosen": -2.246215343475342, "rewards/margins": 17.259033203125, "rewards/rejected": -19.5052490234375, "step": 3842 }, { "epoch": 0.92, "learning_rate": 1.728e-08, "logps/chosen": -234.83078002929688, "logps/rejected": -400.91802978515625, "loss": 0.0002, "losses/dpo": 1.5488187443679635e-07, "losses/sft": 0.7448835968971252, "losses/total": 1.5488187443679635e-07, "ref_logps/chosen": -219.82632446289062, "ref_logps/rejected": -227.3523712158203, "rewards/accuracies": 1.0, "rewards/chosen": -1.500443458557129, "rewards/margins": 15.856121063232422, "rewards/rejected": -17.356563568115234, "step": 3843 }, { "epoch": 0.92, "learning_rate": 1.7226666666666667e-08, "logps/chosen": -217.94430541992188, "logps/rejected": -375.58184814453125, "loss": 0.0005, "losses/dpo": 1.0954209095359602e-08, "losses/sft": 0.7270205616950989, "losses/total": 1.0954209095359602e-08, "ref_logps/chosen": -202.28329467773438, "ref_logps/rejected": -201.14878845214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.56610107421875, "rewards/margins": 15.877205848693848, "rewards/rejected": -17.44330596923828, "step": 3844 }, { "epoch": 0.92, "learning_rate": 1.717333333333333e-08, "logps/chosen": -244.27249145507812, "logps/rejected": -392.71923828125, "loss": 0.0065, "losses/dpo": 2.804688703506031e-10, "losses/sft": 0.5313940644264221, "losses/total": 2.804688703506031e-10, "ref_logps/chosen": -226.39157104492188, "ref_logps/rejected": -236.12091064453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.788091778755188, "rewards/margins": 13.871740341186523, "rewards/rejected": -15.659832954406738, "step": 3845 }, { "epoch": 0.92, "learning_rate": 1.7119999999999997e-08, "logps/chosen": -213.30934143066406, "logps/rejected": -390.1732177734375, "loss": 0.0001, "losses/dpo": 5.650532042267287e-09, "losses/sft": 0.9393013715744019, "losses/total": 5.650532042267287e-09, "ref_logps/chosen": -198.5399627685547, "ref_logps/rejected": -219.799560546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4769396781921387, "rewards/margins": 15.5604248046875, "rewards/rejected": -17.037364959716797, "step": 3846 }, { "epoch": 0.92, "learning_rate": 1.7066666666666664e-08, "logps/chosen": -271.6703186035156, "logps/rejected": -387.18450927734375, "loss": 0.0109, "losses/dpo": 5.881286124775897e-09, "losses/sft": 0.706911563873291, "losses/total": 5.881286124775897e-09, "ref_logps/chosen": -250.83522033691406, "ref_logps/rejected": -224.65902709960938, "rewards/accuracies": 1.0, "rewards/chosen": -2.083508014678955, "rewards/margins": 14.169038772583008, "rewards/rejected": -16.252546310424805, "step": 3847 }, { "epoch": 0.92, "learning_rate": 1.701333333333333e-08, "logps/chosen": -244.76922607421875, "logps/rejected": -421.93743896484375, "loss": 0.0008, "losses/dpo": 7.246937858834679e-12, "losses/sft": 0.5324277877807617, "losses/total": 7.246937858834679e-12, "ref_logps/chosen": -229.2904815673828, "ref_logps/rejected": -237.90101623535156, "rewards/accuracies": 1.0, "rewards/chosen": -1.547873854637146, "rewards/margins": 16.85576820373535, "rewards/rejected": -18.403640747070312, "step": 3848 }, { "epoch": 0.92, "learning_rate": 1.6959999999999998e-08, "logps/chosen": -245.7977294921875, "logps/rejected": -440.508056640625, "loss": 0.0, "losses/dpo": 1.1905441910098347e-10, "losses/sft": 0.6853585243225098, "losses/total": 1.1905441910098347e-10, "ref_logps/chosen": -229.56509399414062, "ref_logps/rejected": -242.1751708984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6232646703720093, "rewards/margins": 18.210025787353516, "rewards/rejected": -19.833290100097656, "step": 3849 }, { "epoch": 0.92, "learning_rate": 1.6906666666666668e-08, "logps/chosen": -256.39678955078125, "logps/rejected": -432.2364807128906, "loss": 0.0, "losses/dpo": 3.638348005097214e-07, "losses/sft": 0.5745282769203186, "losses/total": 3.638348005097214e-07, "ref_logps/chosen": -239.4713134765625, "ref_logps/rejected": -239.4312744140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.6925487518310547, "rewards/margins": 17.587974548339844, "rewards/rejected": -19.280521392822266, "step": 3850 }, { "epoch": 0.92, "learning_rate": 1.6853333333333335e-08, "logps/chosen": -248.3145751953125, "logps/rejected": -388.42041015625, "loss": 0.0004, "losses/dpo": 2.6193896474069334e-07, "losses/sft": 0.26682421565055847, "losses/total": 2.6193896474069334e-07, "ref_logps/chosen": -228.28900146484375, "ref_logps/rejected": -215.29603576660156, "rewards/accuracies": 1.0, "rewards/chosen": -2.0025577545166016, "rewards/margins": 15.309881210327148, "rewards/rejected": -17.31243896484375, "step": 3851 }, { "epoch": 0.92, "learning_rate": 1.68e-08, "logps/chosen": -243.38722229003906, "logps/rejected": -373.3887939453125, "loss": 0.0093, "losses/dpo": 1.503547153447471e-08, "losses/sft": 0.47308725118637085, "losses/total": 1.503547153447471e-08, "ref_logps/chosen": -222.34495544433594, "ref_logps/rejected": -210.5679931640625, "rewards/accuracies": 1.0, "rewards/chosen": -2.104227066040039, "rewards/margins": 14.1778564453125, "rewards/rejected": -16.28208351135254, "step": 3852 }, { "epoch": 0.92, "learning_rate": 1.674666666666667e-08, "logps/chosen": -206.21533203125, "logps/rejected": -394.24774169921875, "loss": 0.0001, "losses/dpo": 1.4822145288917454e-08, "losses/sft": 0.6055667996406555, "losses/total": 1.4822145288917454e-08, "ref_logps/chosen": -191.0457000732422, "ref_logps/rejected": -207.72894287109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.516960859298706, "rewards/margins": 17.134920120239258, "rewards/rejected": -18.651880264282227, "step": 3853 }, { "epoch": 0.92, "learning_rate": 1.6693333333333332e-08, "logps/chosen": -188.476318359375, "logps/rejected": -360.2158203125, "loss": 0.0008, "losses/dpo": 2.0975262415845464e-08, "losses/sft": 0.7260560393333435, "losses/total": 2.0975262415845464e-08, "ref_logps/chosen": -170.99143981933594, "ref_logps/rejected": -197.86227416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7484865188598633, "rewards/margins": 14.486869812011719, "rewards/rejected": -16.235355377197266, "step": 3854 }, { "epoch": 0.93, "learning_rate": 1.664e-08, "logps/chosen": -207.59140014648438, "logps/rejected": -380.8188781738281, "loss": 0.0018, "losses/dpo": 2.112784081020891e-08, "losses/sft": 0.4383145868778229, "losses/total": 2.112784081020891e-08, "ref_logps/chosen": -189.52481079101562, "ref_logps/rejected": -210.91336059570312, "rewards/accuracies": 1.0, "rewards/chosen": -1.8066601753234863, "rewards/margins": 15.183892250061035, "rewards/rejected": -16.99055290222168, "step": 3855 }, { "epoch": 0.93, "learning_rate": 1.6586666666666666e-08, "logps/chosen": -227.68289184570312, "logps/rejected": -406.8834228515625, "loss": 0.0019, "losses/dpo": 2.2215944639469853e-08, "losses/sft": 0.4299636781215668, "losses/total": 2.2215944639469853e-08, "ref_logps/chosen": -213.72708129882812, "ref_logps/rejected": -236.0402374267578, "rewards/accuracies": 1.0, "rewards/chosen": -1.3955798149108887, "rewards/margins": 15.688740730285645, "rewards/rejected": -17.084320068359375, "step": 3856 }, { "epoch": 0.93, "learning_rate": 1.6533333333333332e-08, "logps/chosen": -264.97479248046875, "logps/rejected": -389.97900390625, "loss": 0.0005, "losses/dpo": 1.1163146851345118e-08, "losses/sft": 0.6855988502502441, "losses/total": 1.1163146851345118e-08, "ref_logps/chosen": -250.23248291015625, "ref_logps/rejected": -224.3874053955078, "rewards/accuracies": 1.0, "rewards/chosen": -1.4742305278778076, "rewards/margins": 15.084930419921875, "rewards/rejected": -16.559160232543945, "step": 3857 }, { "epoch": 0.93, "learning_rate": 1.648e-08, "logps/chosen": -252.22970581054688, "logps/rejected": -389.99609375, "loss": 0.0003, "losses/dpo": 6.814308232266919e-13, "losses/sft": 0.43924862146377563, "losses/total": 6.814308232266919e-13, "ref_logps/chosen": -235.7526092529297, "ref_logps/rejected": -225.53738403320312, "rewards/accuracies": 1.0, "rewards/chosen": -1.6477100849151611, "rewards/margins": 14.798160552978516, "rewards/rejected": -16.44586944580078, "step": 3858 }, { "epoch": 0.93, "learning_rate": 1.6426666666666666e-08, "logps/chosen": -234.04977416992188, "logps/rejected": -402.745361328125, "loss": 0.0004, "losses/dpo": 2.4787050278973766e-06, "losses/sft": 1.125657320022583, "losses/total": 2.4787050278973766e-06, "ref_logps/chosen": -213.78765869140625, "ref_logps/rejected": -225.31527709960938, "rewards/accuracies": 1.0, "rewards/chosen": -2.0262129306793213, "rewards/margins": 15.716796875, "rewards/rejected": -17.743009567260742, "step": 3859 }, { "epoch": 0.93, "learning_rate": 1.6373333333333333e-08, "logps/chosen": -204.91104125976562, "logps/rejected": -353.46630859375, "loss": 0.0002, "losses/dpo": 3.202829113035932e-10, "losses/sft": 0.4682486653327942, "losses/total": 3.202829113035932e-10, "ref_logps/chosen": -188.5502166748047, "ref_logps/rejected": -199.22691345214844, "rewards/accuracies": 1.0, "rewards/chosen": -1.6360833644866943, "rewards/margins": 13.787856101989746, "rewards/rejected": -15.423938751220703, "step": 3860 }, { "epoch": 0.93, "learning_rate": 1.632e-08, "logps/chosen": -249.66262817382812, "logps/rejected": -388.3138427734375, "loss": 0.0044, "losses/dpo": 4.573645462357945e-09, "losses/sft": 0.5608002543449402, "losses/total": 4.573645462357945e-09, "ref_logps/chosen": -234.38076782226562, "ref_logps/rejected": -225.16842651367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5281836986541748, "rewards/margins": 14.786355972290039, "rewards/rejected": -16.314538955688477, "step": 3861 }, { "epoch": 0.93, "learning_rate": 1.6266666666666663e-08, "logps/chosen": -269.25732421875, "logps/rejected": -409.3602600097656, "loss": 0.0, "losses/dpo": 4.819262766631027e-10, "losses/sft": 0.8100470900535583, "losses/total": 4.819262766631027e-10, "ref_logps/chosen": -250.85421752929688, "ref_logps/rejected": -223.14938354492188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8403077125549316, "rewards/margins": 16.780780792236328, "rewards/rejected": -18.6210880279541, "step": 3862 }, { "epoch": 0.93, "learning_rate": 1.621333333333333e-08, "logps/chosen": -257.0401611328125, "logps/rejected": -392.2645263671875, "loss": 0.0021, "losses/dpo": 1.1838722002721624e-08, "losses/sft": 0.6602902412414551, "losses/total": 1.1838722002721624e-08, "ref_logps/chosen": -240.02452087402344, "ref_logps/rejected": -226.09405517578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.7015657424926758, "rewards/margins": 14.915481567382812, "rewards/rejected": -16.617046356201172, "step": 3863 }, { "epoch": 0.93, "learning_rate": 1.6159999999999997e-08, "logps/chosen": -215.00274658203125, "logps/rejected": -365.4122314453125, "loss": 0.0004, "losses/dpo": 5.031228766938511e-10, "losses/sft": 0.5358996987342834, "losses/total": 5.031228766938511e-10, "ref_logps/chosen": -200.37384033203125, "ref_logps/rejected": -203.23916625976562, "rewards/accuracies": 1.0, "rewards/chosen": -1.4628918170928955, "rewards/margins": 14.754415512084961, "rewards/rejected": -16.217308044433594, "step": 3864 }, { "epoch": 0.93, "learning_rate": 1.6106666666666664e-08, "logps/chosen": -236.9092254638672, "logps/rejected": -418.17041015625, "loss": 0.0003, "losses/dpo": 2.28060473284053e-10, "losses/sft": 0.7241700291633606, "losses/total": 2.28060473284053e-10, "ref_logps/chosen": -223.07315063476562, "ref_logps/rejected": -236.30596923828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3836060762405396, "rewards/margins": 16.802837371826172, "rewards/rejected": -18.186443328857422, "step": 3865 }, { "epoch": 0.93, "learning_rate": 1.6053333333333334e-08, "logps/chosen": -280.1766052246094, "logps/rejected": -443.94384765625, "loss": 0.0002, "losses/dpo": 9.359721531154719e-08, "losses/sft": 1.061734676361084, "losses/total": 9.359721531154719e-08, "ref_logps/chosen": -258.6395263671875, "ref_logps/rejected": -257.09454345703125, "rewards/accuracies": 1.0, "rewards/chosen": -2.153707504272461, "rewards/margins": 16.531221389770508, "rewards/rejected": -18.68492889404297, "step": 3866 }, { "epoch": 0.93, "learning_rate": 1.6e-08, "logps/chosen": -233.09619140625, "logps/rejected": -432.9212646484375, "loss": 0.0007, "losses/dpo": 4.3356682133399715e-10, "losses/sft": 0.5357127785682678, "losses/total": 4.3356682133399715e-10, "ref_logps/chosen": -213.98294067382812, "ref_logps/rejected": -253.29055786132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9113240242004395, "rewards/margins": 16.051748275756836, "rewards/rejected": -17.963071823120117, "step": 3867 }, { "epoch": 0.93, "learning_rate": 1.5946666666666668e-08, "logps/chosen": -186.18124389648438, "logps/rejected": -351.95013427734375, "loss": 0.0025, "losses/dpo": 2.036978408170853e-08, "losses/sft": 0.6822558641433716, "losses/total": 2.036978408170853e-08, "ref_logps/chosen": -171.5815887451172, "ref_logps/rejected": -195.1739044189453, "rewards/accuracies": 1.0, "rewards/chosen": -1.459965705871582, "rewards/margins": 14.217656135559082, "rewards/rejected": -15.677621841430664, "step": 3868 }, { "epoch": 0.93, "learning_rate": 1.5893333333333335e-08, "logps/chosen": -216.89695739746094, "logps/rejected": -362.9158935546875, "loss": 0.0131, "losses/dpo": 2.2203012406407652e-07, "losses/sft": 0.6263641119003296, "losses/total": 2.2203012406407652e-07, "ref_logps/chosen": -197.240478515625, "ref_logps/rejected": -198.61990356445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.9656472206115723, "rewards/margins": 14.463951110839844, "rewards/rejected": -16.42959976196289, "step": 3869 }, { "epoch": 0.93, "learning_rate": 1.584e-08, "logps/chosen": -266.7755432128906, "logps/rejected": -434.18035888671875, "loss": 0.0009, "losses/dpo": 3.2238907770556124e-16, "losses/sft": 0.635734498500824, "losses/total": 3.2238907770556124e-16, "ref_logps/chosen": -247.5306396484375, "ref_logps/rejected": -246.67544555664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9244881868362427, "rewards/margins": 16.82600212097168, "rewards/rejected": -18.750490188598633, "step": 3870 }, { "epoch": 0.93, "learning_rate": 1.5786666666666665e-08, "logps/chosen": -315.954833984375, "logps/rejected": -417.3332214355469, "loss": 0.0002, "losses/dpo": 3.671344117606168e-08, "losses/sft": 0.7858908176422119, "losses/total": 3.671344117606168e-08, "ref_logps/chosen": -294.6055908203125, "ref_logps/rejected": -219.9617919921875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1349222660064697, "rewards/margins": 17.602222442626953, "rewards/rejected": -19.737144470214844, "step": 3871 }, { "epoch": 0.93, "learning_rate": 1.5733333333333332e-08, "logps/chosen": -215.94259643554688, "logps/rejected": -374.85791015625, "loss": 0.0004, "losses/dpo": 2.4012081212276826e-07, "losses/sft": 0.6070113778114319, "losses/total": 2.4012081212276826e-07, "ref_logps/chosen": -200.39022827148438, "ref_logps/rejected": -211.053466796875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5552358627319336, "rewards/margins": 14.825210571289062, "rewards/rejected": -16.380447387695312, "step": 3872 }, { "epoch": 0.93, "learning_rate": 1.568e-08, "logps/chosen": -259.60302734375, "logps/rejected": -378.638671875, "loss": 0.0033, "losses/dpo": 1.37197848568138e-11, "losses/sft": 0.4822825789451599, "losses/total": 1.37197848568138e-11, "ref_logps/chosen": -239.57606506347656, "ref_logps/rejected": -217.6527099609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0026957988739014, "rewards/margins": 14.09589958190918, "rewards/rejected": -16.098594665527344, "step": 3873 }, { "epoch": 0.93, "learning_rate": 1.5626666666666666e-08, "logps/chosen": -224.525390625, "logps/rejected": -420.3277587890625, "loss": 0.0002, "losses/dpo": 0.0006762134726159275, "losses/sft": 0.746062695980072, "losses/total": 0.0006762134726159275, "ref_logps/chosen": -207.33290100097656, "ref_logps/rejected": -241.6116180419922, "rewards/accuracies": 1.0, "rewards/chosen": -1.7192466259002686, "rewards/margins": 16.152366638183594, "rewards/rejected": -17.871612548828125, "step": 3874 }, { "epoch": 0.93, "learning_rate": 1.5573333333333332e-08, "logps/chosen": -229.216796875, "logps/rejected": -412.4072265625, "loss": 0.0, "losses/dpo": 8.554247301617579e-07, "losses/sft": 0.5515131950378418, "losses/total": 8.554247301617579e-07, "ref_logps/chosen": -208.72930908203125, "ref_logps/rejected": -219.36355590820312, "rewards/accuracies": 1.0, "rewards/chosen": -2.048746109008789, "rewards/margins": 17.2556209564209, "rewards/rejected": -19.304367065429688, "step": 3875 }, { "epoch": 0.93, "learning_rate": 1.552e-08, "logps/chosen": -225.80096435546875, "logps/rejected": -374.46478271484375, "loss": 0.0058, "losses/dpo": 2.7074481545596996e-10, "losses/sft": 0.43504998087882996, "losses/total": 2.7074481545596996e-10, "ref_logps/chosen": -210.5623779296875, "ref_logps/rejected": -215.9098358154297, "rewards/accuracies": 1.0, "rewards/chosen": -1.5238573551177979, "rewards/margins": 14.33163833618164, "rewards/rejected": -15.85549545288086, "step": 3876 }, { "epoch": 0.93, "learning_rate": 1.5466666666666666e-08, "logps/chosen": -277.9114074707031, "logps/rejected": -431.6871337890625, "loss": 0.0, "losses/dpo": 3.440767670781497e-08, "losses/sft": 1.0781587362289429, "losses/total": 3.440767670781497e-08, "ref_logps/chosen": -256.67144775390625, "ref_logps/rejected": -238.47213745117188, "rewards/accuracies": 1.0, "rewards/chosen": -2.123994827270508, "rewards/margins": 17.197505950927734, "rewards/rejected": -19.321502685546875, "step": 3877 }, { "epoch": 0.93, "learning_rate": 1.5413333333333333e-08, "logps/chosen": -222.68475341796875, "logps/rejected": -397.95831298828125, "loss": 0.001, "losses/dpo": 3.47353056895372e-06, "losses/sft": 0.7448569536209106, "losses/total": 3.47353056895372e-06, "ref_logps/chosen": -207.32408142089844, "ref_logps/rejected": -226.8904266357422, "rewards/accuracies": 1.0, "rewards/chosen": -1.5360677242279053, "rewards/margins": 15.570723533630371, "rewards/rejected": -17.10679054260254, "step": 3878 }, { "epoch": 0.93, "learning_rate": 1.5359999999999997e-08, "logps/chosen": -256.7521057128906, "logps/rejected": -375.6153564453125, "loss": 0.0007, "losses/dpo": 1.1233331598248242e-08, "losses/sft": 0.7455136775970459, "losses/total": 1.1233331598248242e-08, "ref_logps/chosen": -236.88894653320312, "ref_logps/rejected": -205.79522705078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9863133430480957, "rewards/margins": 14.995701789855957, "rewards/rejected": -16.982013702392578, "step": 3879 }, { "epoch": 0.93, "learning_rate": 1.5306666666666663e-08, "logps/chosen": -251.19261169433594, "logps/rejected": -415.6623229980469, "loss": 0.0011, "losses/dpo": 2.9273243384864145e-08, "losses/sft": 0.6207132935523987, "losses/total": 2.9273243384864145e-08, "ref_logps/chosen": -232.517333984375, "ref_logps/rejected": -233.30343627929688, "rewards/accuracies": 1.0, "rewards/chosen": -1.867526888847351, "rewards/margins": 16.368362426757812, "rewards/rejected": -18.235889434814453, "step": 3880 }, { "epoch": 0.93, "learning_rate": 1.525333333333333e-08, "logps/chosen": -222.07337951660156, "logps/rejected": -384.9991760253906, "loss": 0.0003, "losses/dpo": 4.847400692647996e-13, "losses/sft": 0.6148385405540466, "losses/total": 4.847400692647996e-13, "ref_logps/chosen": -206.952392578125, "ref_logps/rejected": -207.43092346191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.5120978355407715, "rewards/margins": 16.244728088378906, "rewards/rejected": -17.756824493408203, "step": 3881 }, { "epoch": 0.93, "learning_rate": 1.52e-08, "logps/chosen": -212.78501892089844, "logps/rejected": -389.8433532714844, "loss": 0.0001, "losses/dpo": 7.219085063070452e-08, "losses/sft": 1.0535657405853271, "losses/total": 7.219085063070452e-08, "ref_logps/chosen": -199.9623565673828, "ref_logps/rejected": -217.3860321044922, "rewards/accuracies": 1.0, "rewards/chosen": -1.2822659015655518, "rewards/margins": 15.963465690612793, "rewards/rejected": -17.245731353759766, "step": 3882 }, { "epoch": 0.93, "learning_rate": 1.5146666666666667e-08, "logps/chosen": -269.8558349609375, "logps/rejected": -407.62286376953125, "loss": 0.0013, "losses/dpo": 1.574612429377023e-13, "losses/sft": 0.6398006677627563, "losses/total": 1.574612429377023e-13, "ref_logps/chosen": -252.6103515625, "ref_logps/rejected": -229.78585815429688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7245473861694336, "rewards/margins": 16.05915069580078, "rewards/rejected": -17.78369903564453, "step": 3883 }, { "epoch": 0.93, "learning_rate": 1.5093333333333334e-08, "logps/chosen": -250.98565673828125, "logps/rejected": -434.5848083496094, "loss": 0.0003, "losses/dpo": 5.090333488055876e-09, "losses/sft": 0.6558046936988831, "losses/total": 5.090333488055876e-09, "ref_logps/chosen": -231.95001220703125, "ref_logps/rejected": -239.8400421142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.9035667181015015, "rewards/margins": 17.570911407470703, "rewards/rejected": -19.474477767944336, "step": 3884 }, { "epoch": 0.93, "learning_rate": 1.504e-08, "logps/chosen": -261.33172607421875, "logps/rejected": -393.3359375, "loss": 0.0003, "losses/dpo": 6.846457108622417e-05, "losses/sft": 0.6031136512756348, "losses/total": 6.846457108622417e-05, "ref_logps/chosen": -241.5503387451172, "ref_logps/rejected": -228.8873748779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.97813880443573, "rewards/margins": 14.466716766357422, "rewards/rejected": -16.444854736328125, "step": 3885 }, { "epoch": 0.93, "learning_rate": 1.4986666666666668e-08, "logps/chosen": -275.0668640136719, "logps/rejected": -435.9927062988281, "loss": 0.0001, "losses/dpo": 3.847577367821486e-09, "losses/sft": 0.5618091821670532, "losses/total": 3.847577367821486e-09, "ref_logps/chosen": -255.76055908203125, "ref_logps/rejected": -247.236083984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9306306838989258, "rewards/margins": 16.945030212402344, "rewards/rejected": -18.875661849975586, "step": 3886 }, { "epoch": 0.93, "learning_rate": 1.4933333333333335e-08, "logps/chosen": -233.6280517578125, "logps/rejected": -423.90911865234375, "loss": 0.0001, "losses/dpo": 9.206130016536918e-06, "losses/sft": 0.6528658866882324, "losses/total": 9.206130016536918e-06, "ref_logps/chosen": -213.4500732421875, "ref_logps/rejected": -232.85443115234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.017796039581299, "rewards/margins": 17.08767318725586, "rewards/rejected": -19.10546875, "step": 3887 }, { "epoch": 0.93, "learning_rate": 1.4879999999999998e-08, "logps/chosen": -299.2252197265625, "logps/rejected": -418.7109375, "loss": 0.0087, "losses/dpo": 3.739314191619769e-09, "losses/sft": 0.44731852412223816, "losses/total": 3.739314191619769e-09, "ref_logps/chosen": -276.6899108886719, "ref_logps/rejected": -243.93101501464844, "rewards/accuracies": 1.0, "rewards/chosen": -2.2535324096679688, "rewards/margins": 15.224459648132324, "rewards/rejected": -17.47799301147461, "step": 3888 }, { "epoch": 0.93, "learning_rate": 1.4826666666666665e-08, "logps/chosen": -213.5639190673828, "logps/rejected": -368.41339111328125, "loss": 0.0009, "losses/dpo": 2.2256001208642573e-10, "losses/sft": 0.629407525062561, "losses/total": 2.2256001208642573e-10, "ref_logps/chosen": -196.96783447265625, "ref_logps/rejected": -207.01119995117188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6596078872680664, "rewards/margins": 14.480608940124512, "rewards/rejected": -16.140216827392578, "step": 3889 }, { "epoch": 0.93, "learning_rate": 1.4773333333333332e-08, "logps/chosen": -251.85731506347656, "logps/rejected": -429.59832763671875, "loss": 0.0009, "losses/dpo": 2.272419752369359e-10, "losses/sft": 0.5698286890983582, "losses/total": 2.272419752369359e-10, "ref_logps/chosen": -232.6393585205078, "ref_logps/rejected": -234.4271240234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9217958450317383, "rewards/margins": 17.595327377319336, "rewards/rejected": -19.51712417602539, "step": 3890 }, { "epoch": 0.93, "learning_rate": 1.4719999999999999e-08, "logps/chosen": -285.83349609375, "logps/rejected": -388.1370849609375, "loss": 0.0051, "losses/dpo": 1.1152511660839082e-06, "losses/sft": 0.557650625705719, "losses/total": 1.1152511660839082e-06, "ref_logps/chosen": -266.277587890625, "ref_logps/rejected": -222.00999450683594, "rewards/accuracies": 1.0, "rewards/chosen": -1.955588936805725, "rewards/margins": 14.657122611999512, "rewards/rejected": -16.612712860107422, "step": 3891 }, { "epoch": 0.93, "learning_rate": 1.4666666666666666e-08, "logps/chosen": -249.5121612548828, "logps/rejected": -369.2181396484375, "loss": 0.0079, "losses/dpo": 8.138283192238305e-06, "losses/sft": 0.44439980387687683, "losses/total": 8.138283192238305e-06, "ref_logps/chosen": -234.2270965576172, "ref_logps/rejected": -215.45309448242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.5285074710845947, "rewards/margins": 13.847996711730957, "rewards/rejected": -15.376504898071289, "step": 3892 }, { "epoch": 0.93, "learning_rate": 1.4613333333333333e-08, "logps/chosen": -230.55838012695312, "logps/rejected": -414.2666015625, "loss": 0.002, "losses/dpo": 2.906374685189561e-14, "losses/sft": 0.6123220324516296, "losses/total": 2.906374685189561e-14, "ref_logps/chosen": -211.9183349609375, "ref_logps/rejected": -232.2571258544922, "rewards/accuracies": 1.0, "rewards/chosen": -1.8640034198760986, "rewards/margins": 16.33694839477539, "rewards/rejected": -18.200950622558594, "step": 3893 }, { "epoch": 0.93, "learning_rate": 1.456e-08, "logps/chosen": -256.80816650390625, "logps/rejected": -401.3908996582031, "loss": 0.0025, "losses/dpo": 3.2641434000169056e-10, "losses/sft": 0.6907013058662415, "losses/total": 3.2641434000169056e-10, "ref_logps/chosen": -237.35784912109375, "ref_logps/rejected": -219.01663208007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9450322389602661, "rewards/margins": 16.29239273071289, "rewards/rejected": -18.2374267578125, "step": 3894 }, { "epoch": 0.93, "learning_rate": 1.4506666666666668e-08, "logps/chosen": -218.03536987304688, "logps/rejected": -384.4600830078125, "loss": 0.0004, "losses/dpo": 2.3343650568108387e-08, "losses/sft": 0.5446163415908813, "losses/total": 2.3343650568108387e-08, "ref_logps/chosen": -198.46762084960938, "ref_logps/rejected": -212.26954650878906, "rewards/accuracies": 1.0, "rewards/chosen": -1.956775426864624, "rewards/margins": 15.262277603149414, "rewards/rejected": -17.219053268432617, "step": 3895 }, { "epoch": 0.93, "learning_rate": 1.4453333333333335e-08, "logps/chosen": -234.96124267578125, "logps/rejected": -383.88702392578125, "loss": 0.0001, "losses/dpo": 6.6833122218667995e-06, "losses/sft": 0.8204177618026733, "losses/total": 6.6833122218667995e-06, "ref_logps/chosen": -215.26657104492188, "ref_logps/rejected": -220.08413696289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.969468116760254, "rewards/margins": 14.410821914672852, "rewards/rejected": -16.380287170410156, "step": 3896 }, { "epoch": 0.94, "learning_rate": 1.4399999999999998e-08, "logps/chosen": -280.0035400390625, "logps/rejected": -358.13580322265625, "loss": 0.0002, "losses/dpo": 1.7896917281490232e-10, "losses/sft": 0.6487348079681396, "losses/total": 1.7896917281490232e-10, "ref_logps/chosen": -264.0625305175781, "ref_logps/rejected": -207.5232696533203, "rewards/accuracies": 1.0, "rewards/chosen": -1.5940990447998047, "rewards/margins": 13.467151641845703, "rewards/rejected": -15.061249732971191, "step": 3897 }, { "epoch": 0.94, "learning_rate": 1.4346666666666665e-08, "logps/chosen": -265.4331970214844, "logps/rejected": -425.14727783203125, "loss": 0.0002, "losses/dpo": 2.4567049783374983e-11, "losses/sft": 0.6926516890525818, "losses/total": 2.4567049783374983e-11, "ref_logps/chosen": -243.39549255371094, "ref_logps/rejected": -246.4526824951172, "rewards/accuracies": 1.0, "rewards/chosen": -2.2037696838378906, "rewards/margins": 15.665692329406738, "rewards/rejected": -17.869462966918945, "step": 3898 }, { "epoch": 0.94, "learning_rate": 1.4293333333333332e-08, "logps/chosen": -278.4003601074219, "logps/rejected": -423.46337890625, "loss": 0.0002, "losses/dpo": 3.021790373480826e-09, "losses/sft": 0.6600456237792969, "losses/total": 3.021790373480826e-09, "ref_logps/chosen": -256.4481201171875, "ref_logps/rejected": -243.63494873046875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1952264308929443, "rewards/margins": 15.787616729736328, "rewards/rejected": -17.982845306396484, "step": 3899 }, { "epoch": 0.94, "learning_rate": 1.4239999999999999e-08, "logps/chosen": -256.84649658203125, "logps/rejected": -450.9544677734375, "loss": 0.0001, "losses/dpo": 1.121768855227856e-05, "losses/sft": 0.4125065505504608, "losses/total": 1.121768855227856e-05, "ref_logps/chosen": -238.83465576171875, "ref_logps/rejected": -261.5916748046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.8011820316314697, "rewards/margins": 17.13509750366211, "rewards/rejected": -18.936281204223633, "step": 3900 }, { "epoch": 0.94, "learning_rate": 1.4186666666666666e-08, "logps/chosen": -298.4184875488281, "logps/rejected": -435.99652099609375, "loss": 0.0001, "losses/dpo": 1.3622933003887283e-08, "losses/sft": 0.6904114484786987, "losses/total": 1.3622933003887283e-08, "ref_logps/chosen": -276.491455078125, "ref_logps/rejected": -241.80877685546875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1927013397216797, "rewards/margins": 17.226070404052734, "rewards/rejected": -19.418773651123047, "step": 3901 }, { "epoch": 0.94, "learning_rate": 1.4133333333333333e-08, "logps/chosen": -255.389892578125, "logps/rejected": -425.7572937011719, "loss": 0.001, "losses/dpo": 1.8063044393556993e-09, "losses/sft": 0.7310147881507874, "losses/total": 1.8063044393556993e-09, "ref_logps/chosen": -235.69358825683594, "ref_logps/rejected": -234.4757537841797, "rewards/accuracies": 1.0, "rewards/chosen": -1.9696284532546997, "rewards/margins": 17.158523559570312, "rewards/rejected": -19.12815284729004, "step": 3902 }, { "epoch": 0.94, "learning_rate": 1.4080000000000001e-08, "logps/chosen": -255.65306091308594, "logps/rejected": -401.3062438964844, "loss": 0.0009, "losses/dpo": 8.131564865898611e-10, "losses/sft": 0.6797322034835815, "losses/total": 8.131564865898611e-10, "ref_logps/chosen": -237.61761474609375, "ref_logps/rejected": -226.1722412109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.803545355796814, "rewards/margins": 15.709856033325195, "rewards/rejected": -17.51340103149414, "step": 3903 }, { "epoch": 0.94, "learning_rate": 1.4026666666666668e-08, "logps/chosen": -246.26760864257812, "logps/rejected": -391.524658203125, "loss": 0.0013, "losses/dpo": 6.95606861000897e-09, "losses/sft": 0.7646832466125488, "losses/total": 6.95606861000897e-09, "ref_logps/chosen": -228.2941436767578, "ref_logps/rejected": -214.98602294921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7973474264144897, "rewards/margins": 15.856513977050781, "rewards/rejected": -17.65386199951172, "step": 3904 }, { "epoch": 0.94, "learning_rate": 1.3973333333333331e-08, "logps/chosen": -215.80218505859375, "logps/rejected": -420.762939453125, "loss": 0.0021, "losses/dpo": 0.0016599709633737803, "losses/sft": 0.48371991515159607, "losses/total": 0.0016599709633737803, "ref_logps/chosen": -198.13284301757812, "ref_logps/rejected": -240.5481719970703, "rewards/accuracies": 1.0, "rewards/chosen": -1.7669353485107422, "rewards/margins": 16.254541397094727, "rewards/rejected": -18.02147674560547, "step": 3905 }, { "epoch": 0.94, "learning_rate": 1.3919999999999998e-08, "logps/chosen": -256.9071960449219, "logps/rejected": -398.50115966796875, "loss": 0.0002, "losses/dpo": 7.130507295727284e-11, "losses/sft": 0.5281519293785095, "losses/total": 7.130507295727284e-11, "ref_logps/chosen": -239.70675659179688, "ref_logps/rejected": -226.38473510742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7200456857681274, "rewards/margins": 15.491595268249512, "rewards/rejected": -17.211641311645508, "step": 3906 }, { "epoch": 0.94, "learning_rate": 1.3866666666666665e-08, "logps/chosen": -258.830810546875, "logps/rejected": -418.7840576171875, "loss": 0.0003, "losses/dpo": 1.161809038731365e-12, "losses/sft": 0.6367685794830322, "losses/total": 1.161809038731365e-12, "ref_logps/chosen": -239.83380126953125, "ref_logps/rejected": -235.2152557373047, "rewards/accuracies": 1.0, "rewards/chosen": -1.899701714515686, "rewards/margins": 16.457176208496094, "rewards/rejected": -18.35687828063965, "step": 3907 }, { "epoch": 0.94, "learning_rate": 1.3813333333333332e-08, "logps/chosen": -220.66067504882812, "logps/rejected": -367.5107116699219, "loss": 0.0001, "losses/dpo": 1.5482369519759231e-07, "losses/sft": 0.7110960483551025, "losses/total": 1.5482369519759231e-07, "ref_logps/chosen": -203.97756958007812, "ref_logps/rejected": -198.45458984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6683104038238525, "rewards/margins": 15.23730182647705, "rewards/rejected": -16.90561294555664, "step": 3908 }, { "epoch": 0.94, "learning_rate": 1.3759999999999999e-08, "logps/chosen": -223.5950469970703, "logps/rejected": -362.13690185546875, "loss": 0.0006, "losses/dpo": 9.050991955916743e-09, "losses/sft": 0.7213838696479797, "losses/total": 9.050991955916743e-09, "ref_logps/chosen": -207.64419555664062, "ref_logps/rejected": -197.09075927734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5950839519500732, "rewards/margins": 14.90953254699707, "rewards/rejected": -16.504615783691406, "step": 3909 }, { "epoch": 0.94, "learning_rate": 1.3706666666666666e-08, "logps/chosen": -263.21234130859375, "logps/rejected": -429.0710754394531, "loss": 0.0001, "losses/dpo": 2.2829993895356893e-07, "losses/sft": 0.7110694050788879, "losses/total": 2.2829993895356893e-07, "ref_logps/chosen": -241.92123413085938, "ref_logps/rejected": -240.17698669433594, "rewards/accuracies": 1.0, "rewards/chosen": -2.129108428955078, "rewards/margins": 16.760299682617188, "rewards/rejected": -18.889408111572266, "step": 3910 }, { "epoch": 0.94, "learning_rate": 1.3653333333333334e-08, "logps/chosen": -223.06129455566406, "logps/rejected": -374.72015380859375, "loss": 0.0003, "losses/dpo": 8.192730092559941e-06, "losses/sft": 0.9302968382835388, "losses/total": 8.192730092559941e-06, "ref_logps/chosen": -205.82467651367188, "ref_logps/rejected": -209.03244018554688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7236615419387817, "rewards/margins": 14.845109939575195, "rewards/rejected": -16.568771362304688, "step": 3911 }, { "epoch": 0.94, "learning_rate": 1.3600000000000001e-08, "logps/chosen": -219.83778381347656, "logps/rejected": -349.32244873046875, "loss": 0.0006, "losses/dpo": 2.1046252240974894e-10, "losses/sft": 0.5916054844856262, "losses/total": 2.1046252240974894e-10, "ref_logps/chosen": -203.59400939941406, "ref_logps/rejected": -190.51124572753906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6243762969970703, "rewards/margins": 14.256742477416992, "rewards/rejected": -15.881118774414062, "step": 3912 }, { "epoch": 0.94, "learning_rate": 1.3546666666666668e-08, "logps/chosen": -226.66018676757812, "logps/rejected": -388.2191467285156, "loss": 0.0044, "losses/dpo": 1.263115263425152e-12, "losses/sft": 0.5350280404090881, "losses/total": 1.263115263425152e-12, "ref_logps/chosen": -210.58595275878906, "ref_logps/rejected": -227.09774780273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.607424020767212, "rewards/margins": 14.504716873168945, "rewards/rejected": -16.112140655517578, "step": 3913 }, { "epoch": 0.94, "learning_rate": 1.3493333333333331e-08, "logps/chosen": -233.44189453125, "logps/rejected": -367.96697998046875, "loss": 0.0082, "losses/dpo": 1.5750690751659135e-09, "losses/sft": 0.5880371332168579, "losses/total": 1.5750690751659135e-09, "ref_logps/chosen": -217.38372802734375, "ref_logps/rejected": -209.8554229736328, "rewards/accuracies": 1.0, "rewards/chosen": -1.6058189868927002, "rewards/margins": 14.205336570739746, "rewards/rejected": -15.811155319213867, "step": 3914 }, { "epoch": 0.94, "learning_rate": 1.3439999999999998e-08, "logps/chosen": -279.0426025390625, "logps/rejected": -426.7050476074219, "loss": 0.0004, "losses/dpo": 5.061144836560061e-09, "losses/sft": 0.6513614058494568, "losses/total": 5.061144836560061e-09, "ref_logps/chosen": -256.25384521484375, "ref_logps/rejected": -245.2291717529297, "rewards/accuracies": 1.0, "rewards/chosen": -2.2788777351379395, "rewards/margins": 15.868711471557617, "rewards/rejected": -18.14759063720703, "step": 3915 }, { "epoch": 0.94, "learning_rate": 1.3386666666666665e-08, "logps/chosen": -209.6459197998047, "logps/rejected": -390.7103576660156, "loss": 0.0015, "losses/dpo": 1.1455146831096386e-09, "losses/sft": 0.6920732855796814, "losses/total": 1.1455146831096386e-09, "ref_logps/chosen": -195.9204559326172, "ref_logps/rejected": -225.93878173828125, "rewards/accuracies": 1.0, "rewards/chosen": -1.3725481033325195, "rewards/margins": 15.104607582092285, "rewards/rejected": -16.477157592773438, "step": 3916 }, { "epoch": 0.94, "learning_rate": 1.3333333333333332e-08, "logps/chosen": -252.59942626953125, "logps/rejected": -406.94183349609375, "loss": 0.0001, "losses/dpo": 1.327664875816481e-07, "losses/sft": 0.6914975643157959, "losses/total": 1.327664875816481e-07, "ref_logps/chosen": -237.822998046875, "ref_logps/rejected": -230.13290405273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.4776418209075928, "rewards/margins": 16.2032527923584, "rewards/rejected": -17.680892944335938, "step": 3917 }, { "epoch": 0.94, "learning_rate": 1.3279999999999999e-08, "logps/chosen": -267.2297668457031, "logps/rejected": -410.9630432128906, "loss": 0.0001, "losses/dpo": 6.852667322476691e-09, "losses/sft": 0.831565797328949, "losses/total": 6.852667322476691e-09, "ref_logps/chosen": -244.20046997070312, "ref_logps/rejected": -231.65786743164062, "rewards/accuracies": 1.0, "rewards/chosen": -2.3029274940490723, "rewards/margins": 15.627593040466309, "rewards/rejected": -17.930519104003906, "step": 3918 }, { "epoch": 0.94, "learning_rate": 1.3226666666666667e-08, "logps/chosen": -244.31234741210938, "logps/rejected": -427.2274475097656, "loss": 0.0003, "losses/dpo": 1.2246553104944269e-08, "losses/sft": 0.5150918960571289, "losses/total": 1.2246553104944269e-08, "ref_logps/chosen": -229.7902069091797, "ref_logps/rejected": -253.66688537597656, "rewards/accuracies": 1.0, "rewards/chosen": -1.4522130489349365, "rewards/margins": 15.903841018676758, "rewards/rejected": -17.356054306030273, "step": 3919 }, { "epoch": 0.94, "learning_rate": 1.3173333333333334e-08, "logps/chosen": -276.5191345214844, "logps/rejected": -421.1451721191406, "loss": 0.0002, "losses/dpo": 1.2224165013563493e-09, "losses/sft": 0.8300585150718689, "losses/total": 1.2224165013563493e-09, "ref_logps/chosen": -252.47714233398438, "ref_logps/rejected": -247.62594604492188, "rewards/accuracies": 1.0, "rewards/chosen": -2.404197931289673, "rewards/margins": 14.947726249694824, "rewards/rejected": -17.351924896240234, "step": 3920 }, { "epoch": 0.94, "learning_rate": 1.3120000000000001e-08, "logps/chosen": -234.81561279296875, "logps/rejected": -410.9438781738281, "loss": 0.0005, "losses/dpo": 8.731706913067683e-09, "losses/sft": 1.0340439081192017, "losses/total": 8.731706913067683e-09, "ref_logps/chosen": -218.04440307617188, "ref_logps/rejected": -228.77871704101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6771198511123657, "rewards/margins": 16.539398193359375, "rewards/rejected": -18.216516494750977, "step": 3921 }, { "epoch": 0.94, "learning_rate": 1.3066666666666665e-08, "logps/chosen": -316.1492919921875, "logps/rejected": -440.6877136230469, "loss": 0.0002, "losses/dpo": 9.005582057852735e-09, "losses/sft": 0.6875340938568115, "losses/total": 9.005582057852735e-09, "ref_logps/chosen": -291.6170654296875, "ref_logps/rejected": -251.33291625976562, "rewards/accuracies": 1.0, "rewards/chosen": -2.4532206058502197, "rewards/margins": 16.482257843017578, "rewards/rejected": -18.93547821044922, "step": 3922 }, { "epoch": 0.94, "learning_rate": 1.3013333333333331e-08, "logps/chosen": -228.6975555419922, "logps/rejected": -389.48077392578125, "loss": 0.0007, "losses/dpo": 4.95756040663764e-08, "losses/sft": 0.3988170325756073, "losses/total": 4.95756040663764e-08, "ref_logps/chosen": -210.71234130859375, "ref_logps/rejected": -225.09132385253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.7985223531723022, "rewards/margins": 14.640420913696289, "rewards/rejected": -16.43894386291504, "step": 3923 }, { "epoch": 0.94, "learning_rate": 1.2959999999999998e-08, "logps/chosen": -221.34429931640625, "logps/rejected": -369.6143798828125, "loss": 0.0006, "losses/dpo": 2.283530919910959e-09, "losses/sft": 0.48566722869873047, "losses/total": 2.283530919910959e-09, "ref_logps/chosen": -202.7064208984375, "ref_logps/rejected": -206.0668182373047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8637877702713013, "rewards/margins": 14.490964889526367, "rewards/rejected": -16.354753494262695, "step": 3924 }, { "epoch": 0.94, "learning_rate": 1.2906666666666665e-08, "logps/chosen": -230.6833038330078, "logps/rejected": -367.5855712890625, "loss": 0.0007, "losses/dpo": 1.7805606988829936e-09, "losses/sft": 0.6933795809745789, "losses/total": 1.7805606988829936e-09, "ref_logps/chosen": -208.88629150390625, "ref_logps/rejected": -197.73692321777344, "rewards/accuracies": 1.0, "rewards/chosen": -2.179701089859009, "rewards/margins": 14.805164337158203, "rewards/rejected": -16.984867095947266, "step": 3925 }, { "epoch": 0.94, "learning_rate": 1.2853333333333332e-08, "logps/chosen": -280.3707275390625, "logps/rejected": -401.76446533203125, "loss": 0.0007, "losses/dpo": 1.3141088217594188e-08, "losses/sft": 1.1478874683380127, "losses/total": 1.3141088217594188e-08, "ref_logps/chosen": -265.6340026855469, "ref_logps/rejected": -238.6107635498047, "rewards/accuracies": 1.0, "rewards/chosen": -1.473671555519104, "rewards/margins": 14.841702461242676, "rewards/rejected": -16.315372467041016, "step": 3926 }, { "epoch": 0.94, "learning_rate": 1.28e-08, "logps/chosen": -230.46104431152344, "logps/rejected": -399.90679931640625, "loss": 0.0026, "losses/dpo": 0.0804126039147377, "losses/sft": 0.656880795955658, "losses/total": 0.0804126039147377, "ref_logps/chosen": -211.09146118164062, "ref_logps/rejected": -232.38987731933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.9369564056396484, "rewards/margins": 14.814739227294922, "rewards/rejected": -16.75169563293457, "step": 3927 }, { "epoch": 0.94, "learning_rate": 1.2746666666666667e-08, "logps/chosen": -275.8676452636719, "logps/rejected": -420.9954833984375, "loss": 0.0001, "losses/dpo": 1.5456094182297875e-09, "losses/sft": 0.7063719630241394, "losses/total": 1.5456094182297875e-09, "ref_logps/chosen": -251.14108276367188, "ref_logps/rejected": -226.32156372070312, "rewards/accuracies": 1.0, "rewards/chosen": -2.4726572036743164, "rewards/margins": 16.994735717773438, "rewards/rejected": -19.467391967773438, "step": 3928 }, { "epoch": 0.94, "learning_rate": 1.2693333333333334e-08, "logps/chosen": -225.0290985107422, "logps/rejected": -373.34539794921875, "loss": 0.0003, "losses/dpo": 1.099614337363164e-06, "losses/sft": 0.5754179358482361, "losses/total": 1.099614337363164e-06, "ref_logps/chosen": -209.44174194335938, "ref_logps/rejected": -203.44305419921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5587365627288818, "rewards/margins": 15.431500434875488, "rewards/rejected": -16.990236282348633, "step": 3929 }, { "epoch": 0.94, "learning_rate": 1.2640000000000001e-08, "logps/chosen": -269.24151611328125, "logps/rejected": -394.5452880859375, "loss": 0.0015, "losses/dpo": 0.045278895646333694, "losses/sft": 0.6593966484069824, "losses/total": 0.045278895646333694, "ref_logps/chosen": -251.75125122070312, "ref_logps/rejected": -213.38734436035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.749025583267212, "rewards/margins": 16.36676788330078, "rewards/rejected": -18.11579132080078, "step": 3930 }, { "epoch": 0.94, "learning_rate": 1.2586666666666665e-08, "logps/chosen": -239.7828826904297, "logps/rejected": -429.15740966796875, "loss": 0.0, "losses/dpo": 1.9757478986193178e-10, "losses/sft": 0.6542211771011353, "losses/total": 1.9757478986193178e-10, "ref_logps/chosen": -220.8070068359375, "ref_logps/rejected": -243.34164428710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8975872993469238, "rewards/margins": 16.683990478515625, "rewards/rejected": -18.58157730102539, "step": 3931 }, { "epoch": 0.94, "learning_rate": 1.2533333333333331e-08, "logps/chosen": -208.9612274169922, "logps/rejected": -377.8404541015625, "loss": 0.002, "losses/dpo": 6.9701733274030175e-09, "losses/sft": 0.7222435474395752, "losses/total": 6.9701733274030175e-09, "ref_logps/chosen": -190.07225036621094, "ref_logps/rejected": -201.5684814453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.88889741897583, "rewards/margins": 15.738300323486328, "rewards/rejected": -17.627197265625, "step": 3932 }, { "epoch": 0.94, "learning_rate": 1.2479999999999998e-08, "logps/chosen": -300.4797058105469, "logps/rejected": -433.7788391113281, "loss": 0.0001, "losses/dpo": 3.596205555567167e-10, "losses/sft": 0.57146817445755, "losses/total": 3.596205555567167e-10, "ref_logps/chosen": -284.95733642578125, "ref_logps/rejected": -255.43930053710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5522369146347046, "rewards/margins": 16.281719207763672, "rewards/rejected": -17.833955764770508, "step": 3933 }, { "epoch": 0.94, "learning_rate": 1.2426666666666665e-08, "logps/chosen": -234.3928680419922, "logps/rejected": -386.3345031738281, "loss": 0.0002, "losses/dpo": 2.354025491513312e-05, "losses/sft": 0.8145548105239868, "losses/total": 2.354025491513312e-05, "ref_logps/chosen": -214.148193359375, "ref_logps/rejected": -214.7351531982422, "rewards/accuracies": 1.0, "rewards/chosen": -2.0244674682617188, "rewards/margins": 15.135467529296875, "rewards/rejected": -17.159934997558594, "step": 3934 }, { "epoch": 0.94, "learning_rate": 1.2373333333333334e-08, "logps/chosen": -219.39820861816406, "logps/rejected": -404.9963684082031, "loss": 0.0007, "losses/dpo": 3.0855110133565233e-10, "losses/sft": 0.6072419881820679, "losses/total": 3.0855110133565233e-10, "ref_logps/chosen": -202.24171447753906, "ref_logps/rejected": -233.71456909179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7156479358673096, "rewards/margins": 15.412534713745117, "rewards/rejected": -17.12818145751953, "step": 3935 }, { "epoch": 0.94, "learning_rate": 1.232e-08, "logps/chosen": -245.57522583007812, "logps/rejected": -414.8729248046875, "loss": 0.0013, "losses/dpo": 1.808874577902131e-11, "losses/sft": 0.7913713455200195, "losses/total": 1.808874577902131e-11, "ref_logps/chosen": -223.34902954101562, "ref_logps/rejected": -231.22909545898438, "rewards/accuracies": 1.0, "rewards/chosen": -2.2226197719573975, "rewards/margins": 16.141761779785156, "rewards/rejected": -18.364381790161133, "step": 3936 }, { "epoch": 0.94, "learning_rate": 1.2266666666666666e-08, "logps/chosen": -204.29702758789062, "logps/rejected": -386.5519104003906, "loss": 0.0004, "losses/dpo": 6.929860873507165e-13, "losses/sft": 0.8499510288238525, "losses/total": 6.929860873507165e-13, "ref_logps/chosen": -185.82318115234375, "ref_logps/rejected": -208.00350952148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8473833799362183, "rewards/margins": 16.00745391845703, "rewards/rejected": -17.854835510253906, "step": 3937 }, { "epoch": 0.95, "learning_rate": 1.2213333333333333e-08, "logps/chosen": -313.74774169921875, "logps/rejected": -415.26885986328125, "loss": 0.0005, "losses/dpo": 2.5499247158222715e-07, "losses/sft": 0.6953622102737427, "losses/total": 2.5499247158222715e-07, "ref_logps/chosen": -294.57464599609375, "ref_logps/rejected": -239.7904052734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.9173074960708618, "rewards/margins": 15.630535125732422, "rewards/rejected": -17.54784393310547, "step": 3938 }, { "epoch": 0.95, "learning_rate": 1.216e-08, "logps/chosen": -235.4210968017578, "logps/rejected": -396.5267333984375, "loss": 0.0004, "losses/dpo": 3.7015307485788185e-11, "losses/sft": 0.5726625323295593, "losses/total": 3.7015307485788185e-11, "ref_logps/chosen": -216.90594482421875, "ref_logps/rejected": -226.2228546142578, "rewards/accuracies": 1.0, "rewards/chosen": -1.851515531539917, "rewards/margins": 15.178871154785156, "rewards/rejected": -17.03038787841797, "step": 3939 }, { "epoch": 0.95, "learning_rate": 1.2106666666666666e-08, "logps/chosen": -245.71286010742188, "logps/rejected": -392.2578430175781, "loss": 0.0021, "losses/dpo": 9.036359438496788e-10, "losses/sft": 0.47037482261657715, "losses/total": 9.036359438496788e-10, "ref_logps/chosen": -228.96527099609375, "ref_logps/rejected": -226.92384338378906, "rewards/accuracies": 1.0, "rewards/chosen": -1.6747572422027588, "rewards/margins": 14.858643531799316, "rewards/rejected": -16.533401489257812, "step": 3940 }, { "epoch": 0.95, "learning_rate": 1.2053333333333333e-08, "logps/chosen": -224.014892578125, "logps/rejected": -409.86944580078125, "loss": 0.0001, "losses/dpo": 5.724064555678865e-10, "losses/sft": 0.8269378542900085, "losses/total": 5.724064555678865e-10, "ref_logps/chosen": -207.31871032714844, "ref_logps/rejected": -230.23049926757812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6696205139160156, "rewards/margins": 16.29427719116211, "rewards/rejected": -17.963897705078125, "step": 3941 }, { "epoch": 0.95, "learning_rate": 1.1999999999999998e-08, "logps/chosen": -237.04541015625, "logps/rejected": -381.6463623046875, "loss": 0.0003, "losses/dpo": 1.1103120159394742e-11, "losses/sft": 0.7656088471412659, "losses/total": 1.1103120159394742e-11, "ref_logps/chosen": -221.84375, "ref_logps/rejected": -215.4011688232422, "rewards/accuracies": 1.0, "rewards/chosen": -1.520166039466858, "rewards/margins": 15.104352951049805, "rewards/rejected": -16.62451934814453, "step": 3942 }, { "epoch": 0.95, "learning_rate": 1.1946666666666667e-08, "logps/chosen": -258.18603515625, "logps/rejected": -452.1304931640625, "loss": 0.0019, "losses/dpo": 2.118495934233522e-10, "losses/sft": 0.6282627582550049, "losses/total": 2.118495934233522e-10, "ref_logps/chosen": -238.9781494140625, "ref_logps/rejected": -254.50103759765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.9207870960235596, "rewards/margins": 17.8421630859375, "rewards/rejected": -19.762950897216797, "step": 3943 }, { "epoch": 0.95, "learning_rate": 1.1893333333333334e-08, "logps/chosen": -214.7592010498047, "logps/rejected": -335.2662048339844, "loss": 0.0073, "losses/dpo": 1.5595687727909535e-06, "losses/sft": 0.6096947193145752, "losses/total": 1.5595687727909535e-06, "ref_logps/chosen": -198.76678466796875, "ref_logps/rejected": -185.51278686523438, "rewards/accuracies": 1.0, "rewards/chosen": -1.599240779876709, "rewards/margins": 13.376104354858398, "rewards/rejected": -14.975343704223633, "step": 3944 }, { "epoch": 0.95, "learning_rate": 1.184e-08, "logps/chosen": -229.77723693847656, "logps/rejected": -414.7392578125, "loss": 0.0042, "losses/dpo": 3.115267488418283e-10, "losses/sft": 0.8181775212287903, "losses/total": 3.115267488418283e-10, "ref_logps/chosen": -213.40762329101562, "ref_logps/rejected": -238.21563720703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6369620561599731, "rewards/margins": 16.015403747558594, "rewards/rejected": -17.652366638183594, "step": 3945 }, { "epoch": 0.95, "learning_rate": 1.1786666666666666e-08, "logps/chosen": -255.4498748779297, "logps/rejected": -372.09149169921875, "loss": 0.0078, "losses/dpo": 2.3462654041850328e-08, "losses/sft": 0.6297438740730286, "losses/total": 2.3462654041850328e-08, "ref_logps/chosen": -237.229248046875, "ref_logps/rejected": -205.98397827148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8220627307891846, "rewards/margins": 14.788690567016602, "rewards/rejected": -16.610754013061523, "step": 3946 }, { "epoch": 0.95, "learning_rate": 1.1733333333333333e-08, "logps/chosen": -220.6326141357422, "logps/rejected": -409.4736633300781, "loss": 0.0008, "losses/dpo": 2.734496309941825e-11, "losses/sft": 0.6457998752593994, "losses/total": 2.734496309941825e-11, "ref_logps/chosen": -202.23736572265625, "ref_logps/rejected": -231.46337890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8395248651504517, "rewards/margins": 15.96150016784668, "rewards/rejected": -17.801023483276367, "step": 3947 }, { "epoch": 0.95, "learning_rate": 1.168e-08, "logps/chosen": -217.51193237304688, "logps/rejected": -356.0396423339844, "loss": 0.001, "losses/dpo": 7.911975653633607e-13, "losses/sft": 0.8249065279960632, "losses/total": 7.911975653633607e-13, "ref_logps/chosen": -202.43719482421875, "ref_logps/rejected": -202.8189239501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.5074734687805176, "rewards/margins": 13.814598083496094, "rewards/rejected": -15.322072982788086, "step": 3948 }, { "epoch": 0.95, "learning_rate": 1.1626666666666666e-08, "logps/chosen": -232.05322265625, "logps/rejected": -407.932861328125, "loss": 0.0013, "losses/dpo": 2.167309958167607e-06, "losses/sft": 0.622153103351593, "losses/total": 2.167309958167607e-06, "ref_logps/chosen": -213.45703125, "ref_logps/rejected": -237.9452362060547, "rewards/accuracies": 1.0, "rewards/chosen": -1.859619379043579, "rewards/margins": 15.139143943786621, "rewards/rejected": -16.998764038085938, "step": 3949 }, { "epoch": 0.95, "learning_rate": 1.1573333333333331e-08, "logps/chosen": -244.22340393066406, "logps/rejected": -392.8173828125, "loss": 0.0007, "losses/dpo": 3.1286176427336443e-10, "losses/sft": 0.4945047199726105, "losses/total": 3.1286176427336443e-10, "ref_logps/chosen": -226.13148498535156, "ref_logps/rejected": -212.12034606933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.8091918230056763, "rewards/margins": 16.260509490966797, "rewards/rejected": -18.0697021484375, "step": 3950 }, { "epoch": 0.95, "learning_rate": 1.152e-08, "logps/chosen": -215.95901489257812, "logps/rejected": -365.465087890625, "loss": 0.001, "losses/dpo": 7.50699294393975e-11, "losses/sft": 0.4934355318546295, "losses/total": 7.50699294393975e-11, "ref_logps/chosen": -201.98480224609375, "ref_logps/rejected": -214.31558227539062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3974220752716064, "rewards/margins": 13.717530250549316, "rewards/rejected": -15.114952087402344, "step": 3951 }, { "epoch": 0.95, "learning_rate": 1.1466666666666667e-08, "logps/chosen": -241.2933349609375, "logps/rejected": -423.72991943359375, "loss": 0.0003, "losses/dpo": 6.377769068421912e-07, "losses/sft": 0.5335013270378113, "losses/total": 6.377769068421912e-07, "ref_logps/chosen": -225.18185424804688, "ref_logps/rejected": -240.04833984375, "rewards/accuracies": 1.0, "rewards/chosen": -1.6111476421356201, "rewards/margins": 16.75701332092285, "rewards/rejected": -18.368160247802734, "step": 3952 }, { "epoch": 0.95, "learning_rate": 1.1413333333333334e-08, "logps/chosen": -238.53273010253906, "logps/rejected": -423.4848327636719, "loss": 0.0016, "losses/dpo": 4.1192729782757453e-10, "losses/sft": 0.7164088487625122, "losses/total": 4.1192729782757453e-10, "ref_logps/chosen": -216.05294799804688, "ref_logps/rejected": -225.73033142089844, "rewards/accuracies": 1.0, "rewards/chosen": -2.247978448867798, "rewards/margins": 17.527467727661133, "rewards/rejected": -19.775447845458984, "step": 3953 }, { "epoch": 0.95, "learning_rate": 1.136e-08, "logps/chosen": -226.93844604492188, "logps/rejected": -380.0230712890625, "loss": 0.0043, "losses/dpo": 2.4213773031078745e-07, "losses/sft": 0.8907299637794495, "losses/total": 2.4213773031078745e-07, "ref_logps/chosen": -212.03839111328125, "ref_logps/rejected": -212.46185302734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.490004301071167, "rewards/margins": 15.266119956970215, "rewards/rejected": -16.75612449645996, "step": 3954 }, { "epoch": 0.95, "learning_rate": 1.1306666666666666e-08, "logps/chosen": -233.66543579101562, "logps/rejected": -413.4858703613281, "loss": 0.0004, "losses/dpo": 2.1187060439409322e-10, "losses/sft": 0.8951281905174255, "losses/total": 2.1187060439409322e-10, "ref_logps/chosen": -219.64944458007812, "ref_logps/rejected": -230.33755493164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.4015989303588867, "rewards/margins": 16.913230895996094, "rewards/rejected": -18.314830780029297, "step": 3955 }, { "epoch": 0.95, "learning_rate": 1.1253333333333333e-08, "logps/chosen": -261.395263671875, "logps/rejected": -420.0281982421875, "loss": 0.0, "losses/dpo": 2.382407049417168e-10, "losses/sft": 0.5597839951515198, "losses/total": 2.382407049417168e-10, "ref_logps/chosen": -248.66273498535156, "ref_logps/rejected": -233.3680419921875, "rewards/accuracies": 1.0, "rewards/chosen": -1.2732512950897217, "rewards/margins": 17.39276123046875, "rewards/rejected": -18.666011810302734, "step": 3956 }, { "epoch": 0.95, "learning_rate": 1.12e-08, "logps/chosen": -242.70504760742188, "logps/rejected": -391.41937255859375, "loss": 0.0012, "losses/dpo": 8.271229035017313e-08, "losses/sft": 0.6593818068504333, "losses/total": 8.271229035017313e-08, "ref_logps/chosen": -223.77597045898438, "ref_logps/rejected": -207.72225952148438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8929078578948975, "rewards/margins": 16.476804733276367, "rewards/rejected": -18.369712829589844, "step": 3957 }, { "epoch": 0.95, "learning_rate": 1.1146666666666666e-08, "logps/chosen": -260.30035400390625, "logps/rejected": -412.80828857421875, "loss": 0.0011, "losses/dpo": 3.7009750819549936e-10, "losses/sft": 0.5853065848350525, "losses/total": 3.7009750819549936e-10, "ref_logps/chosen": -237.148193359375, "ref_logps/rejected": -236.17701721191406, "rewards/accuracies": 1.0, "rewards/chosen": -2.3152170181274414, "rewards/margins": 15.347908973693848, "rewards/rejected": -17.66312599182129, "step": 3958 }, { "epoch": 0.95, "learning_rate": 1.1093333333333333e-08, "logps/chosen": -254.03823852539062, "logps/rejected": -440.8327941894531, "loss": 0.0, "losses/dpo": 1.7235769755874486e-10, "losses/sft": 0.6249293684959412, "losses/total": 1.7235769755874486e-10, "ref_logps/chosen": -235.5496826171875, "ref_logps/rejected": -242.38246154785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.8488571643829346, "rewards/margins": 17.996177673339844, "rewards/rejected": -19.845035552978516, "step": 3959 }, { "epoch": 0.95, "learning_rate": 1.104e-08, "logps/chosen": -242.72425842285156, "logps/rejected": -431.036865234375, "loss": 0.0016, "losses/dpo": 5.061452839072444e-07, "losses/sft": 0.9819994568824768, "losses/total": 5.061452839072444e-07, "ref_logps/chosen": -224.5469207763672, "ref_logps/rejected": -245.90988159179688, "rewards/accuracies": 1.0, "rewards/chosen": -1.8177340030670166, "rewards/margins": 16.694965362548828, "rewards/rejected": -18.512699127197266, "step": 3960 }, { "epoch": 0.95, "learning_rate": 1.0986666666666667e-08, "logps/chosen": -257.8804931640625, "logps/rejected": -393.9739990234375, "loss": 0.0001, "losses/dpo": 6.529498053353677e-10, "losses/sft": 0.7179024815559387, "losses/total": 6.529498053353677e-10, "ref_logps/chosen": -237.49542236328125, "ref_logps/rejected": -216.5595245361328, "rewards/accuracies": 1.0, "rewards/chosen": -2.0385069847106934, "rewards/margins": 15.7029390335083, "rewards/rejected": -17.741445541381836, "step": 3961 }, { "epoch": 0.95, "learning_rate": 1.0933333333333334e-08, "logps/chosen": -212.237060546875, "logps/rejected": -364.4495849609375, "loss": 0.0005, "losses/dpo": 1.680571126705388e-09, "losses/sft": 0.6017729043960571, "losses/total": 1.680571126705388e-09, "ref_logps/chosen": -199.26536560058594, "ref_logps/rejected": -210.89187622070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.2971718311309814, "rewards/margins": 14.058598518371582, "rewards/rejected": -15.355770111083984, "step": 3962 }, { "epoch": 0.95, "learning_rate": 1.0879999999999999e-08, "logps/chosen": -218.71824645996094, "logps/rejected": -343.975341796875, "loss": 0.0231, "losses/dpo": 0.0006253512110561132, "losses/sft": 0.6517434120178223, "losses/total": 0.0006253512110561132, "ref_logps/chosen": -202.54934692382812, "ref_logps/rejected": -188.74786376953125, "rewards/accuracies": 1.0, "rewards/chosen": -1.616889238357544, "rewards/margins": 13.90585708618164, "rewards/rejected": -15.522746086120605, "step": 3963 }, { "epoch": 0.95, "learning_rate": 1.0826666666666666e-08, "logps/chosen": -231.12332153320312, "logps/rejected": -383.3428649902344, "loss": 0.0014, "losses/dpo": 1.0990344190275891e-09, "losses/sft": 0.9927463531494141, "losses/total": 1.0990344190275891e-09, "ref_logps/chosen": -215.67190551757812, "ref_logps/rejected": -213.22109985351562, "rewards/accuracies": 1.0, "rewards/chosen": -1.5451433658599854, "rewards/margins": 15.467031478881836, "rewards/rejected": -17.01217269897461, "step": 3964 }, { "epoch": 0.95, "learning_rate": 1.0773333333333333e-08, "logps/chosen": -252.25550842285156, "logps/rejected": -436.1524658203125, "loss": 0.0001, "losses/dpo": 1.9677900127135217e-06, "losses/sft": 0.6170535683631897, "losses/total": 1.9677900127135217e-06, "ref_logps/chosen": -230.6851806640625, "ref_logps/rejected": -247.36424255371094, "rewards/accuracies": 1.0, "rewards/chosen": -2.1570310592651367, "rewards/margins": 16.7217960357666, "rewards/rejected": -18.878826141357422, "step": 3965 }, { "epoch": 0.95, "learning_rate": 1.072e-08, "logps/chosen": -278.9070739746094, "logps/rejected": -423.0736999511719, "loss": 0.0005, "losses/dpo": 0.0003034211986232549, "losses/sft": 0.511303722858429, "losses/total": 0.0003034211986232549, "ref_logps/chosen": -258.51123046875, "ref_logps/rejected": -238.95516967773438, "rewards/accuracies": 1.0, "rewards/chosen": -2.0395846366882324, "rewards/margins": 16.37226676940918, "rewards/rejected": -18.41185188293457, "step": 3966 }, { "epoch": 0.95, "learning_rate": 1.0666666666666666e-08, "logps/chosen": -252.66152954101562, "logps/rejected": -405.5616455078125, "loss": 0.0, "losses/dpo": 3.650250235409658e-08, "losses/sft": 0.6486557126045227, "losses/total": 3.650250235409658e-08, "ref_logps/chosen": -234.09703063964844, "ref_logps/rejected": -229.67047119140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8564505577087402, "rewards/margins": 15.732666015625, "rewards/rejected": -17.5891170501709, "step": 3967 }, { "epoch": 0.95, "learning_rate": 1.0613333333333333e-08, "logps/chosen": -266.7974853515625, "logps/rejected": -380.4569396972656, "loss": 0.004, "losses/dpo": 0.0010156725766137242, "losses/sft": 0.6857507824897766, "losses/total": 0.0010156725766137242, "ref_logps/chosen": -247.68849182128906, "ref_logps/rejected": -220.22323608398438, "rewards/accuracies": 1.0, "rewards/chosen": -1.9108980894088745, "rewards/margins": 14.11247444152832, "rewards/rejected": -16.023372650146484, "step": 3968 }, { "epoch": 0.95, "learning_rate": 1.056e-08, "logps/chosen": -225.37916564941406, "logps/rejected": -409.982421875, "loss": 0.0002, "losses/dpo": 4.803115189133678e-06, "losses/sft": 0.848933756351471, "losses/total": 4.803115189133678e-06, "ref_logps/chosen": -206.9420928955078, "ref_logps/rejected": -228.68551635742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8437080383300781, "rewards/margins": 16.285978317260742, "rewards/rejected": -18.129688262939453, "step": 3969 }, { "epoch": 0.95, "learning_rate": 1.0506666666666667e-08, "logps/chosen": -239.19943237304688, "logps/rejected": -386.6351318359375, "loss": 0.004, "losses/dpo": 8.431272568998338e-09, "losses/sft": 0.4260696470737457, "losses/total": 8.431272568998338e-09, "ref_logps/chosen": -220.17446899414062, "ref_logps/rejected": -203.81121826171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9024975299835205, "rewards/margins": 16.379894256591797, "rewards/rejected": -18.282392501831055, "step": 3970 }, { "epoch": 0.95, "learning_rate": 1.0453333333333334e-08, "logps/chosen": -223.79071044921875, "logps/rejected": -387.68389892578125, "loss": 0.0045, "losses/dpo": 8.162402309608296e-08, "losses/sft": 0.5722392201423645, "losses/total": 8.162402309608296e-08, "ref_logps/chosen": -203.11534118652344, "ref_logps/rejected": -211.07154846191406, "rewards/accuracies": 1.0, "rewards/chosen": -2.067534923553467, "rewards/margins": 15.59370231628418, "rewards/rejected": -17.661235809326172, "step": 3971 }, { "epoch": 0.95, "learning_rate": 1.0399999999999999e-08, "logps/chosen": -198.46011352539062, "logps/rejected": -362.11260986328125, "loss": 0.0027, "losses/dpo": 2.8018993791789626e-09, "losses/sft": 1.0620615482330322, "losses/total": 2.8018993791789626e-09, "ref_logps/chosen": -181.65499877929688, "ref_logps/rejected": -203.526123046875, "rewards/accuracies": 1.0, "rewards/chosen": -1.680509328842163, "rewards/margins": 14.178140640258789, "rewards/rejected": -15.858650207519531, "step": 3972 }, { "epoch": 0.95, "learning_rate": 1.0346666666666666e-08, "logps/chosen": -255.46951293945312, "logps/rejected": -397.3450927734375, "loss": 0.0017, "losses/dpo": 3.2607555766617224e-08, "losses/sft": 0.5186235308647156, "losses/total": 3.2607555766617224e-08, "ref_logps/chosen": -236.80979919433594, "ref_logps/rejected": -217.38372802734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.8659725189208984, "rewards/margins": 16.130165100097656, "rewards/rejected": -17.996137619018555, "step": 3973 }, { "epoch": 0.95, "learning_rate": 1.0293333333333333e-08, "logps/chosen": -232.7413330078125, "logps/rejected": -387.0078125, "loss": 0.0005, "losses/dpo": 7.925975751277292e-07, "losses/sft": 1.1976699829101562, "losses/total": 7.925975751277292e-07, "ref_logps/chosen": -213.98910522460938, "ref_logps/rejected": -217.97781372070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.8752213716506958, "rewards/margins": 15.027777671813965, "rewards/rejected": -16.902999877929688, "step": 3974 }, { "epoch": 0.95, "learning_rate": 1.024e-08, "logps/chosen": -161.84274291992188, "logps/rejected": -333.0087890625, "loss": 0.0002, "losses/dpo": 2.5147124382840502e-09, "losses/sft": 0.6320769190788269, "losses/total": 2.5147124382840502e-09, "ref_logps/chosen": -147.75413513183594, "ref_logps/rejected": -189.15646362304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.4088611602783203, "rewards/margins": 12.976371765136719, "rewards/rejected": -14.385231971740723, "step": 3975 }, { "epoch": 0.95, "learning_rate": 1.0186666666666666e-08, "logps/chosen": -265.7200927734375, "logps/rejected": -395.9710388183594, "loss": 0.0002, "losses/dpo": 2.9035948045930127e-07, "losses/sft": 1.0104150772094727, "losses/total": 2.9035948045930127e-07, "ref_logps/chosen": -248.66708374023438, "ref_logps/rejected": -227.83950805664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.7053029537200928, "rewards/margins": 15.107851028442383, "rewards/rejected": -16.813152313232422, "step": 3976 }, { "epoch": 0.95, "learning_rate": 1.0133333333333333e-08, "logps/chosen": -265.41717529296875, "logps/rejected": -412.31976318359375, "loss": 0.0044, "losses/dpo": 6.619312387101672e-08, "losses/sft": 0.5295205116271973, "losses/total": 6.619312387101672e-08, "ref_logps/chosen": -245.1328125, "ref_logps/rejected": -233.96128845214844, "rewards/accuracies": 1.0, "rewards/chosen": -2.0284361839294434, "rewards/margins": 15.807411193847656, "rewards/rejected": -17.835847854614258, "step": 3977 }, { "epoch": 0.95, "learning_rate": 1.008e-08, "logps/chosen": -255.77391052246094, "logps/rejected": -422.387451171875, "loss": 0.0017, "losses/dpo": 2.3226950816024328e-06, "losses/sft": 0.6986252069473267, "losses/total": 2.3226950816024328e-06, "ref_logps/chosen": -235.70762634277344, "ref_logps/rejected": -235.12042236328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.0066277980804443, "rewards/margins": 16.720077514648438, "rewards/rejected": -18.726703643798828, "step": 3978 }, { "epoch": 0.95, "learning_rate": 1.0026666666666667e-08, "logps/chosen": -242.447998046875, "logps/rejected": -395.9696960449219, "loss": 0.0008, "losses/dpo": 3.344639765145985e-08, "losses/sft": 0.5304528474807739, "losses/total": 3.344639765145985e-08, "ref_logps/chosen": -224.68814086914062, "ref_logps/rejected": -223.76602172851562, "rewards/accuracies": 1.0, "rewards/chosen": -1.775986671447754, "rewards/margins": 15.444382667541504, "rewards/rejected": -17.220369338989258, "step": 3979 }, { "epoch": 0.96, "learning_rate": 9.973333333333332e-09, "logps/chosen": -317.41241455078125, "logps/rejected": -422.97772216796875, "loss": 0.003, "losses/dpo": 1.0012689344307546e-08, "losses/sft": 0.9516258239746094, "losses/total": 1.0012689344307546e-08, "ref_logps/chosen": -300.25390625, "ref_logps/rejected": -242.82052612304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.7158528566360474, "rewards/margins": 16.299869537353516, "rewards/rejected": -18.015722274780273, "step": 3980 }, { "epoch": 0.96, "learning_rate": 9.919999999999999e-09, "logps/chosen": -230.10726928710938, "logps/rejected": -389.6788330078125, "loss": 0.0007, "losses/dpo": 2.5988566854096007e-09, "losses/sft": 0.5052311420440674, "losses/total": 2.5988566854096007e-09, "ref_logps/chosen": -213.1094512939453, "ref_logps/rejected": -215.6927032470703, "rewards/accuracies": 1.0, "rewards/chosen": -1.699781894683838, "rewards/margins": 15.698833465576172, "rewards/rejected": -17.398616790771484, "step": 3981 }, { "epoch": 0.96, "learning_rate": 9.866666666666666e-09, "logps/chosen": -271.6716613769531, "logps/rejected": -454.62945556640625, "loss": 0.0007, "losses/dpo": 1.1058537419117442e-09, "losses/sft": 0.5481579899787903, "losses/total": 1.1058537419117442e-09, "ref_logps/chosen": -250.820556640625, "ref_logps/rejected": -254.02334594726562, "rewards/accuracies": 1.0, "rewards/chosen": -2.0851097106933594, "rewards/margins": 17.97550392150879, "rewards/rejected": -20.06061363220215, "step": 3982 }, { "epoch": 0.96, "learning_rate": 9.813333333333333e-09, "logps/chosen": -290.61883544921875, "logps/rejected": -425.65667724609375, "loss": 0.0, "losses/dpo": 4.3795668480672845e-11, "losses/sft": 0.5843073129653931, "losses/total": 4.3795668480672845e-11, "ref_logps/chosen": -266.599609375, "ref_logps/rejected": -238.8538818359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.4019217491149902, "rewards/margins": 16.278356552124023, "rewards/rejected": -18.680278778076172, "step": 3983 }, { "epoch": 0.96, "learning_rate": 9.76e-09, "logps/chosen": -261.8115539550781, "logps/rejected": -397.24261474609375, "loss": 0.0005, "losses/dpo": 9.006234868991214e-09, "losses/sft": 0.9038060903549194, "losses/total": 9.006234868991214e-09, "ref_logps/chosen": -239.8310546875, "ref_logps/rejected": -222.4031982421875, "rewards/accuracies": 1.0, "rewards/chosen": -2.1980490684509277, "rewards/margins": 15.285894393920898, "rewards/rejected": -17.483943939208984, "step": 3984 }, { "epoch": 0.96, "learning_rate": 9.706666666666666e-09, "logps/chosen": -239.90234375, "logps/rejected": -419.06927490234375, "loss": 0.0005, "losses/dpo": 3.1089115282156854e-07, "losses/sft": 0.6171000599861145, "losses/total": 3.1089115282156854e-07, "ref_logps/chosen": -222.257080078125, "ref_logps/rejected": -236.02008056640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7645256519317627, "rewards/margins": 16.540393829345703, "rewards/rejected": -18.30491828918457, "step": 3985 }, { "epoch": 0.96, "learning_rate": 9.653333333333333e-09, "logps/chosen": -228.99368286132812, "logps/rejected": -395.69403076171875, "loss": 0.0057, "losses/dpo": 5.2183472689648625e-06, "losses/sft": 0.698208749294281, "losses/total": 5.2183472689648625e-06, "ref_logps/chosen": -213.5109405517578, "ref_logps/rejected": -218.0263214111328, "rewards/accuracies": 1.0, "rewards/chosen": -1.5482739210128784, "rewards/margins": 16.218494415283203, "rewards/rejected": -17.766769409179688, "step": 3986 }, { "epoch": 0.96, "learning_rate": 9.6e-09, "logps/chosen": -238.5296173095703, "logps/rejected": -416.64093017578125, "loss": 0.0007, "losses/dpo": 4.2918672088498866e-12, "losses/sft": 0.5315989851951599, "losses/total": 4.2918672088498866e-12, "ref_logps/chosen": -221.81106567382812, "ref_logps/rejected": -230.19827270507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6718559265136719, "rewards/margins": 16.972412109375, "rewards/rejected": -18.644268035888672, "step": 3987 }, { "epoch": 0.96, "learning_rate": 9.546666666666667e-09, "logps/chosen": -258.82977294921875, "logps/rejected": -410.431396484375, "loss": 0.0003, "losses/dpo": 2.950851829552903e-08, "losses/sft": 0.5085548758506775, "losses/total": 2.950851829552903e-08, "ref_logps/chosen": -242.49365234375, "ref_logps/rejected": -224.9373779296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.6336138248443604, "rewards/margins": 16.915788650512695, "rewards/rejected": -18.54940414428711, "step": 3988 }, { "epoch": 0.96, "learning_rate": 9.493333333333332e-09, "logps/chosen": -230.60035705566406, "logps/rejected": -391.00579833984375, "loss": 0.0001, "losses/dpo": 3.754741919692606e-05, "losses/sft": 0.5575851202011108, "losses/total": 3.754741919692606e-05, "ref_logps/chosen": -210.60162353515625, "ref_logps/rejected": -213.53936767578125, "rewards/accuracies": 1.0, "rewards/chosen": -1.9998729228973389, "rewards/margins": 15.746769905090332, "rewards/rejected": -17.74664306640625, "step": 3989 }, { "epoch": 0.96, "learning_rate": 9.439999999999999e-09, "logps/chosen": -254.2100372314453, "logps/rejected": -446.9363098144531, "loss": 0.0, "losses/dpo": 1.2725925202516919e-08, "losses/sft": 0.6553441286087036, "losses/total": 1.2725925202516919e-08, "ref_logps/chosen": -230.56527709960938, "ref_logps/rejected": -243.09877014160156, "rewards/accuracies": 1.0, "rewards/chosen": -2.3644747734069824, "rewards/margins": 18.01927947998047, "rewards/rejected": -20.38375473022461, "step": 3990 }, { "epoch": 0.96, "learning_rate": 9.386666666666666e-09, "logps/chosen": -225.93341064453125, "logps/rejected": -391.42352294921875, "loss": 0.0012, "losses/dpo": 4.810552667322554e-08, "losses/sft": 0.6374824643135071, "losses/total": 4.810552667322554e-08, "ref_logps/chosen": -209.1602325439453, "ref_logps/rejected": -217.18356323242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.6773169040679932, "rewards/margins": 15.74668025970459, "rewards/rejected": -17.423995971679688, "step": 3991 }, { "epoch": 0.96, "learning_rate": 9.333333333333333e-09, "logps/chosen": -198.92420959472656, "logps/rejected": -385.8141174316406, "loss": 0.0005, "losses/dpo": 3.105884971432715e-08, "losses/sft": 0.583831250667572, "losses/total": 3.105884971432715e-08, "ref_logps/chosen": -183.73023986816406, "ref_logps/rejected": -209.5308837890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.5193960666656494, "rewards/margins": 16.10892677307129, "rewards/rejected": -17.62832260131836, "step": 3992 }, { "epoch": 0.96, "learning_rate": 9.28e-09, "logps/chosen": -237.60916137695312, "logps/rejected": -375.043212890625, "loss": 0.0036, "losses/dpo": 5.794434514427849e-07, "losses/sft": 0.5610439777374268, "losses/total": 5.794434514427849e-07, "ref_logps/chosen": -221.41050720214844, "ref_logps/rejected": -205.21075439453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6198642253875732, "rewards/margins": 15.363382339477539, "rewards/rejected": -16.983245849609375, "step": 3993 }, { "epoch": 0.96, "learning_rate": 9.226666666666666e-09, "logps/chosen": -272.094970703125, "logps/rejected": -422.28082275390625, "loss": 0.0, "losses/dpo": 3.5641334328317953e-09, "losses/sft": 0.8348506689071655, "losses/total": 3.5641334328317953e-09, "ref_logps/chosen": -254.07546997070312, "ref_logps/rejected": -247.19683837890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.8019497394561768, "rewards/margins": 15.706451416015625, "rewards/rejected": -17.508399963378906, "step": 3994 }, { "epoch": 0.96, "learning_rate": 9.173333333333333e-09, "logps/chosen": -187.58377075195312, "logps/rejected": -384.09130859375, "loss": 0.0005, "losses/dpo": 4.045176194011901e-09, "losses/sft": 0.34520506858825684, "losses/total": 4.045176194011901e-09, "ref_logps/chosen": -171.73159790039062, "ref_logps/rejected": -219.7255096435547, "rewards/accuracies": 1.0, "rewards/chosen": -1.585219144821167, "rewards/margins": 14.851359367370605, "rewards/rejected": -16.436580657958984, "step": 3995 }, { "epoch": 0.96, "learning_rate": 9.12e-09, "logps/chosen": -191.12985229492188, "logps/rejected": -368.22515869140625, "loss": 0.0013, "losses/dpo": 9.186368332692041e-10, "losses/sft": 0.73902428150177, "losses/total": 9.186368332692041e-10, "ref_logps/chosen": -175.5384521484375, "ref_logps/rejected": -206.43060302734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.5591399669647217, "rewards/margins": 14.620315551757812, "rewards/rejected": -16.179454803466797, "step": 3996 }, { "epoch": 0.96, "learning_rate": 9.066666666666667e-09, "logps/chosen": -249.57363891601562, "logps/rejected": -403.5060119628906, "loss": 0.0008, "losses/dpo": 2.705020847315609e-07, "losses/sft": 0.656421959400177, "losses/total": 2.705020847315609e-07, "ref_logps/chosen": -227.80252075195312, "ref_logps/rejected": -223.65908813476562, "rewards/accuracies": 1.0, "rewards/chosen": -2.177110195159912, "rewards/margins": 15.807581901550293, "rewards/rejected": -17.984691619873047, "step": 3997 }, { "epoch": 0.96, "learning_rate": 9.013333333333332e-09, "logps/chosen": -203.300537109375, "logps/rejected": -367.4317321777344, "loss": 0.002, "losses/dpo": 3.467628317821436e-08, "losses/sft": 0.570300817489624, "losses/total": 3.467628317821436e-08, "ref_logps/chosen": -182.4763946533203, "ref_logps/rejected": -201.63548278808594, "rewards/accuracies": 1.0, "rewards/chosen": -2.082413673400879, "rewards/margins": 14.497213363647461, "rewards/rejected": -16.579626083374023, "step": 3998 }, { "epoch": 0.96, "learning_rate": 8.959999999999999e-09, "logps/chosen": -269.20233154296875, "logps/rejected": -414.06402587890625, "loss": 0.0012, "losses/dpo": 1.11526378410165e-11, "losses/sft": 0.8137771487236023, "losses/total": 1.11526378410165e-11, "ref_logps/chosen": -248.89398193359375, "ref_logps/rejected": -233.79783630371094, "rewards/accuracies": 1.0, "rewards/chosen": -2.030834197998047, "rewards/margins": 15.995782852172852, "rewards/rejected": -18.0266170501709, "step": 3999 }, { "epoch": 0.96, "learning_rate": 8.906666666666666e-09, "logps/chosen": -215.74923706054688, "logps/rejected": -371.44085693359375, "loss": 0.0079, "losses/dpo": 4.300992340056098e-10, "losses/sft": 0.5637797713279724, "losses/total": 4.300992340056098e-10, "ref_logps/chosen": -198.44178771972656, "ref_logps/rejected": -203.44662475585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.7307440042495728, "rewards/margins": 15.06867790222168, "rewards/rejected": -16.799423217773438, "step": 4000 }, { "epoch": 0.96, "learning_rate": 8.853333333333334e-09, "logps/chosen": -274.1885070800781, "logps/rejected": -429.4688720703125, "loss": 0.0003, "losses/dpo": 7.430477921843703e-08, "losses/sft": 0.5690112113952637, "losses/total": 7.430477921843703e-08, "ref_logps/chosen": -252.81349182128906, "ref_logps/rejected": -240.445068359375, "rewards/accuracies": 1.0, "rewards/chosen": -2.1375012397766113, "rewards/margins": 16.76487922668457, "rewards/rejected": -18.902379989624023, "step": 4001 }, { "epoch": 0.96, "learning_rate": 8.8e-09, "logps/chosen": -284.45147705078125, "logps/rejected": -399.8749694824219, "loss": 0.0004, "losses/dpo": 3.273509321388701e-07, "losses/sft": 1.0575753450393677, "losses/total": 3.273509321388701e-07, "ref_logps/chosen": -264.3685302734375, "ref_logps/rejected": -220.93458557128906, "rewards/accuracies": 1.0, "rewards/chosen": -2.0082945823669434, "rewards/margins": 15.885744094848633, "rewards/rejected": -17.894039154052734, "step": 4002 }, { "epoch": 0.96, "learning_rate": 8.746666666666666e-09, "logps/chosen": -210.92547607421875, "logps/rejected": -416.3907470703125, "loss": 0.0001, "losses/dpo": 3.173959706170848e-11, "losses/sft": 0.6170707941055298, "losses/total": 3.173959706170848e-11, "ref_logps/chosen": -192.5389404296875, "ref_logps/rejected": -234.53009033203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.838653326034546, "rewards/margins": 16.347412109375, "rewards/rejected": -18.186067581176758, "step": 4003 }, { "epoch": 0.96, "learning_rate": 8.693333333333333e-09, "logps/chosen": -261.0806884765625, "logps/rejected": -410.6623840332031, "loss": 0.0051, "losses/dpo": 1.3138644339161232e-10, "losses/sft": 0.40237513184547424, "losses/total": 1.3138644339161232e-10, "ref_logps/chosen": -241.25460815429688, "ref_logps/rejected": -235.59085083007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.9826059341430664, "rewards/margins": 15.524545669555664, "rewards/rejected": -17.507152557373047, "step": 4004 }, { "epoch": 0.96, "learning_rate": 8.64e-09, "logps/chosen": -241.36029052734375, "logps/rejected": -402.2464599609375, "loss": 0.0003, "losses/dpo": 3.124397407461288e-09, "losses/sft": 0.7087437510490417, "losses/total": 3.124397407461288e-09, "ref_logps/chosen": -222.43582153320312, "ref_logps/rejected": -218.78280639648438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8924449682235718, "rewards/margins": 16.45391845703125, "rewards/rejected": -18.34636116027832, "step": 4005 }, { "epoch": 0.96, "learning_rate": 8.586666666666665e-09, "logps/chosen": -247.05117797851562, "logps/rejected": -438.1044006347656, "loss": 0.0004, "losses/dpo": 9.179430897040056e-14, "losses/sft": 0.5982000231742859, "losses/total": 9.179430897040056e-14, "ref_logps/chosen": -228.17030334472656, "ref_logps/rejected": -238.77590942382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.8880879878997803, "rewards/margins": 18.044761657714844, "rewards/rejected": -19.93284797668457, "step": 4006 }, { "epoch": 0.96, "learning_rate": 8.533333333333332e-09, "logps/chosen": -254.43603515625, "logps/rejected": -405.73065185546875, "loss": 0.0002, "losses/dpo": 2.6089388427408267e-09, "losses/sft": 0.5177523493766785, "losses/total": 2.6089388427408267e-09, "ref_logps/chosen": -237.26040649414062, "ref_logps/rejected": -215.89752197265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7175638675689697, "rewards/margins": 17.265750885009766, "rewards/rejected": -18.983314514160156, "step": 4007 }, { "epoch": 0.96, "learning_rate": 8.479999999999999e-09, "logps/chosen": -234.1763458251953, "logps/rejected": -419.2662658691406, "loss": 0.0001, "losses/dpo": 4.398368821934007e-11, "losses/sft": 0.5545967817306519, "losses/total": 4.398368821934007e-11, "ref_logps/chosen": -216.35914611816406, "ref_logps/rejected": -231.92776489257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.7817184925079346, "rewards/margins": 16.952131271362305, "rewards/rejected": -18.733848571777344, "step": 4008 }, { "epoch": 0.96, "learning_rate": 8.426666666666667e-09, "logps/chosen": -229.3668670654297, "logps/rejected": -383.78436279296875, "loss": 0.0004, "losses/dpo": 7.594302076086024e-09, "losses/sft": 0.7040539383888245, "losses/total": 7.594302076086024e-09, "ref_logps/chosen": -215.88262939453125, "ref_logps/rejected": -217.46682739257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.3484238386154175, "rewards/margins": 15.283331871032715, "rewards/rejected": -16.631755828857422, "step": 4009 }, { "epoch": 0.96, "learning_rate": 8.373333333333334e-09, "logps/chosen": -254.88232421875, "logps/rejected": -413.12371826171875, "loss": 0.0, "losses/dpo": 7.138840010156855e-05, "losses/sft": 0.4959677755832672, "losses/total": 7.138840010156855e-05, "ref_logps/chosen": -232.36553955078125, "ref_logps/rejected": -230.92022705078125, "rewards/accuracies": 1.0, "rewards/chosen": -2.2516798973083496, "rewards/margins": 15.968671798706055, "rewards/rejected": -18.220352172851562, "step": 4010 }, { "epoch": 0.96, "learning_rate": 8.32e-09, "logps/chosen": -243.64111328125, "logps/rejected": -399.53424072265625, "loss": 0.0134, "losses/dpo": 1.96688633877784e-05, "losses/sft": 0.40795668959617615, "losses/total": 1.96688633877784e-05, "ref_logps/chosen": -222.7210693359375, "ref_logps/rejected": -222.5531005859375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0920026302337646, "rewards/margins": 15.60611343383789, "rewards/rejected": -17.698116302490234, "step": 4011 }, { "epoch": 0.96, "learning_rate": 8.266666666666666e-09, "logps/chosen": -252.75473022460938, "logps/rejected": -404.1921691894531, "loss": 0.0001, "losses/dpo": 7.762153142465422e-10, "losses/sft": 0.5672834515571594, "losses/total": 7.762153142465422e-10, "ref_logps/chosen": -234.07916259765625, "ref_logps/rejected": -224.37049865722656, "rewards/accuracies": 1.0, "rewards/chosen": -1.8675559759140015, "rewards/margins": 16.114612579345703, "rewards/rejected": -17.98217010498047, "step": 4012 }, { "epoch": 0.96, "learning_rate": 8.213333333333333e-09, "logps/chosen": -257.01885986328125, "logps/rejected": -398.15771484375, "loss": 0.0021, "losses/dpo": 3.0547988583151664e-09, "losses/sft": 0.6183279752731323, "losses/total": 3.0547988583151664e-09, "ref_logps/chosen": -241.25531005859375, "ref_logps/rejected": -229.34051513671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5763540267944336, "rewards/margins": 15.305367469787598, "rewards/rejected": -16.88172149658203, "step": 4013 }, { "epoch": 0.96, "learning_rate": 8.16e-09, "logps/chosen": -245.15261840820312, "logps/rejected": -362.85992431640625, "loss": 0.0022, "losses/dpo": 2.474866596458014e-05, "losses/sft": 1.1630651950836182, "losses/total": 2.474866596458014e-05, "ref_logps/chosen": -228.05380249023438, "ref_logps/rejected": -205.5863800048828, "rewards/accuracies": 1.0, "rewards/chosen": -1.709883689880371, "rewards/margins": 14.01746940612793, "rewards/rejected": -15.7273530960083, "step": 4014 }, { "epoch": 0.96, "learning_rate": 8.106666666666665e-09, "logps/chosen": -238.21072387695312, "logps/rejected": -407.7239074707031, "loss": 0.0002, "losses/dpo": 1.063666388074916e-15, "losses/sft": 1.0389410257339478, "losses/total": 1.063666388074916e-15, "ref_logps/chosen": -221.62307739257812, "ref_logps/rejected": -220.84005737304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.6587638854980469, "rewards/margins": 17.029621124267578, "rewards/rejected": -18.688385009765625, "step": 4015 }, { "epoch": 0.96, "learning_rate": 8.053333333333332e-09, "logps/chosen": -323.42181396484375, "logps/rejected": -437.5037536621094, "loss": 0.0, "losses/dpo": 4.85709861219874e-10, "losses/sft": 0.6502588391304016, "losses/total": 4.85709861219874e-10, "ref_logps/chosen": -305.7113952636719, "ref_logps/rejected": -246.95579528808594, "rewards/accuracies": 1.0, "rewards/chosen": -1.771045446395874, "rewards/margins": 17.283750534057617, "rewards/rejected": -19.054798126220703, "step": 4016 }, { "epoch": 0.96, "learning_rate": 8e-09, "logps/chosen": -297.67266845703125, "logps/rejected": -465.5664978027344, "loss": 0.0002, "losses/dpo": 1.0171457631713565e-07, "losses/sft": 0.49335795640945435, "losses/total": 1.0171457631713565e-07, "ref_logps/chosen": -276.665283203125, "ref_logps/rejected": -271.40032958984375, "rewards/accuracies": 1.0, "rewards/chosen": -2.100735664367676, "rewards/margins": 17.31588363647461, "rewards/rejected": -19.41661834716797, "step": 4017 }, { "epoch": 0.96, "learning_rate": 7.946666666666667e-09, "logps/chosen": -218.6915283203125, "logps/rejected": -410.5780334472656, "loss": 0.0008, "losses/dpo": 5.139561380929081e-07, "losses/sft": 0.7312060594558716, "losses/total": 5.139561380929081e-07, "ref_logps/chosen": -201.942626953125, "ref_logps/rejected": -233.91806030273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6748907566070557, "rewards/margins": 15.991106986999512, "rewards/rejected": -17.665996551513672, "step": 4018 }, { "epoch": 0.96, "learning_rate": 7.893333333333333e-09, "logps/chosen": -217.63836669921875, "logps/rejected": -386.28240966796875, "loss": 0.0019, "losses/dpo": 5.297843830298632e-10, "losses/sft": 0.6771459579467773, "losses/total": 5.297843830298632e-10, "ref_logps/chosen": -202.30946350097656, "ref_logps/rejected": -218.9942169189453, "rewards/accuracies": 1.0, "rewards/chosen": -1.5328917503356934, "rewards/margins": 15.19592571258545, "rewards/rejected": -16.728816986083984, "step": 4019 }, { "epoch": 0.96, "learning_rate": 7.84e-09, "logps/chosen": -188.45950317382812, "logps/rejected": -356.2281494140625, "loss": 0.0015, "losses/dpo": 1.1534424526615794e-09, "losses/sft": 0.5305171608924866, "losses/total": 1.1534424526615794e-09, "ref_logps/chosen": -175.39027404785156, "ref_logps/rejected": -193.39419555664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3069219589233398, "rewards/margins": 14.976476669311523, "rewards/rejected": -16.28339958190918, "step": 4020 }, { "epoch": 0.96, "learning_rate": 7.786666666666666e-09, "logps/chosen": -226.63748168945312, "logps/rejected": -410.215087890625, "loss": 0.0001, "losses/dpo": 4.0779193910545075e-10, "losses/sft": 0.6499230861663818, "losses/total": 4.0779193910545075e-10, "ref_logps/chosen": -212.57981872558594, "ref_logps/rejected": -227.367431640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4057652950286865, "rewards/margins": 16.878999710083008, "rewards/rejected": -18.284765243530273, "step": 4021 }, { "epoch": 0.97, "learning_rate": 7.733333333333333e-09, "logps/chosen": -234.0323028564453, "logps/rejected": -414.39154052734375, "loss": 0.0036, "losses/dpo": 2.0612665740937786e-10, "losses/sft": 0.6048757433891296, "losses/total": 2.0612665740937786e-10, "ref_logps/chosen": -216.6741943359375, "ref_logps/rejected": -236.48727416992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7358124256134033, "rewards/margins": 16.054615020751953, "rewards/rejected": -17.790428161621094, "step": 4022 }, { "epoch": 0.97, "learning_rate": 7.679999999999998e-09, "logps/chosen": -270.7124938964844, "logps/rejected": -430.4521179199219, "loss": 0.0002, "losses/dpo": 2.1381323378477646e-09, "losses/sft": 0.5152398943901062, "losses/total": 2.1381323378477646e-09, "ref_logps/chosen": -254.28665161132812, "ref_logps/rejected": -245.60501098632812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6425822973251343, "rewards/margins": 16.842126846313477, "rewards/rejected": -18.484708786010742, "step": 4023 }, { "epoch": 0.97, "learning_rate": 7.626666666666665e-09, "logps/chosen": -242.01742553710938, "logps/rejected": -396.56884765625, "loss": 0.0033, "losses/dpo": 3.573897629394196e-06, "losses/sft": 0.41672563552856445, "losses/total": 3.573897629394196e-06, "ref_logps/chosen": -226.10592651367188, "ref_logps/rejected": -227.07484436035156, "rewards/accuracies": 1.0, "rewards/chosen": -1.5911507606506348, "rewards/margins": 15.35824966430664, "rewards/rejected": -16.94940185546875, "step": 4024 }, { "epoch": 0.97, "learning_rate": 7.573333333333334e-09, "logps/chosen": -289.19122314453125, "logps/rejected": -432.1414794921875, "loss": 0.0001, "losses/dpo": 1.6024321780605533e-07, "losses/sft": 1.2426249980926514, "losses/total": 1.6024321780605533e-07, "ref_logps/chosen": -270.97149658203125, "ref_logps/rejected": -245.54278564453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8219707012176514, "rewards/margins": 16.83789825439453, "rewards/rejected": -18.659868240356445, "step": 4025 }, { "epoch": 0.97, "learning_rate": 7.52e-09, "logps/chosen": -276.4586181640625, "logps/rejected": -390.28497314453125, "loss": 0.0003, "losses/dpo": 7.892936304543241e-10, "losses/sft": 0.6183251738548279, "losses/total": 7.892936304543241e-10, "ref_logps/chosen": -253.6166229248047, "ref_logps/rejected": -215.50904846191406, "rewards/accuracies": 1.0, "rewards/chosen": -2.284198760986328, "rewards/margins": 15.19339370727539, "rewards/rejected": -17.47759246826172, "step": 4026 }, { "epoch": 0.97, "learning_rate": 7.466666666666667e-09, "logps/chosen": -233.0592041015625, "logps/rejected": -408.3299560546875, "loss": 0.0001, "losses/dpo": 1.4726720953817107e-09, "losses/sft": 0.7659041881561279, "losses/total": 1.4726720953817107e-09, "ref_logps/chosen": -220.70071411132812, "ref_logps/rejected": -228.19955444335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.2358499765396118, "rewards/margins": 16.777191162109375, "rewards/rejected": -18.013042449951172, "step": 4027 }, { "epoch": 0.97, "learning_rate": 7.4133333333333325e-09, "logps/chosen": -247.63912963867188, "logps/rejected": -397.43975830078125, "loss": 0.001, "losses/dpo": 1.5308663705582148e-06, "losses/sft": 0.5180732607841492, "losses/total": 1.5308663705582148e-06, "ref_logps/chosen": -228.31631469726562, "ref_logps/rejected": -219.4534454345703, "rewards/accuracies": 1.0, "rewards/chosen": -1.932281494140625, "rewards/margins": 15.866351127624512, "rewards/rejected": -17.798633575439453, "step": 4028 }, { "epoch": 0.97, "learning_rate": 7.359999999999999e-09, "logps/chosen": -234.13949584960938, "logps/rejected": -454.54730224609375, "loss": 0.0005, "losses/dpo": 1.8251546407554997e-07, "losses/sft": 0.5864689946174622, "losses/total": 1.8251546407554997e-07, "ref_logps/chosen": -216.36209106445312, "ref_logps/rejected": -259.26263427734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.7777413129806519, "rewards/margins": 17.75072479248047, "rewards/rejected": -19.528467178344727, "step": 4029 }, { "epoch": 0.97, "learning_rate": 7.306666666666666e-09, "logps/chosen": -274.21612548828125, "logps/rejected": -388.7932434082031, "loss": 0.0005, "losses/dpo": 2.4814644916659745e-07, "losses/sft": 0.5489174723625183, "losses/total": 2.4814644916659745e-07, "ref_logps/chosen": -255.21310424804688, "ref_logps/rejected": -214.37124633789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9003021717071533, "rewards/margins": 15.541899681091309, "rewards/rejected": -17.442201614379883, "step": 4030 }, { "epoch": 0.97, "learning_rate": 7.253333333333334e-09, "logps/chosen": -230.82354736328125, "logps/rejected": -414.4912414550781, "loss": 0.0015, "losses/dpo": 3.1638896363928026e-11, "losses/sft": 0.8640909194946289, "losses/total": 3.1638896363928026e-11, "ref_logps/chosen": -210.8677978515625, "ref_logps/rejected": -236.80906677246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.9955778121948242, "rewards/margins": 15.772636413574219, "rewards/rejected": -17.768213272094727, "step": 4031 }, { "epoch": 0.97, "learning_rate": 7.199999999999999e-09, "logps/chosen": -191.1533203125, "logps/rejected": -341.4662780761719, "loss": 0.0191, "losses/dpo": 6.999218982173261e-08, "losses/sft": 0.7814293503761292, "losses/total": 6.999218982173261e-08, "ref_logps/chosen": -177.14471435546875, "ref_logps/rejected": -189.34681701660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.4008615016937256, "rewards/margins": 13.811088562011719, "rewards/rejected": -15.211949348449707, "step": 4032 }, { "epoch": 0.97, "learning_rate": 7.146666666666666e-09, "logps/chosen": -246.50350952148438, "logps/rejected": -434.15283203125, "loss": 0.0003, "losses/dpo": 1.342351765742933e-06, "losses/sft": 0.3790375292301178, "losses/total": 1.342351765742933e-06, "ref_logps/chosen": -228.24810791015625, "ref_logps/rejected": -250.5321502685547, "rewards/accuracies": 1.0, "rewards/chosen": -1.8255398273468018, "rewards/margins": 16.536531448364258, "rewards/rejected": -18.362071990966797, "step": 4033 }, { "epoch": 0.97, "learning_rate": 7.093333333333333e-09, "logps/chosen": -219.65328979492188, "logps/rejected": -364.50762939453125, "loss": 0.0017, "losses/dpo": 3.066289930298005e-11, "losses/sft": 0.6363116502761841, "losses/total": 3.066289930298005e-11, "ref_logps/chosen": -204.47943115234375, "ref_logps/rejected": -203.20623779296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5173863172531128, "rewards/margins": 14.612751960754395, "rewards/rejected": -16.130138397216797, "step": 4034 }, { "epoch": 0.97, "learning_rate": 7.0400000000000005e-09, "logps/chosen": -227.1190948486328, "logps/rejected": -423.028076171875, "loss": 0.0075, "losses/dpo": 2.2959668888322682e-11, "losses/sft": 0.5907479524612427, "losses/total": 2.2959668888322682e-11, "ref_logps/chosen": -208.60971069335938, "ref_logps/rejected": -243.1710205078125, "rewards/accuracies": 1.0, "rewards/chosen": -1.8509377241134644, "rewards/margins": 16.134769439697266, "rewards/rejected": -17.985708236694336, "step": 4035 }, { "epoch": 0.97, "learning_rate": 6.986666666666666e-09, "logps/chosen": -267.88775634765625, "logps/rejected": -416.92779541015625, "loss": 0.0, "losses/dpo": 7.973455649334937e-05, "losses/sft": 0.9663991332054138, "losses/total": 7.973455649334937e-05, "ref_logps/chosen": -244.24697875976562, "ref_logps/rejected": -228.4926300048828, "rewards/accuracies": 1.0, "rewards/chosen": -2.3640787601470947, "rewards/margins": 16.47943878173828, "rewards/rejected": -18.843517303466797, "step": 4036 }, { "epoch": 0.97, "learning_rate": 6.9333333333333326e-09, "logps/chosen": -270.6527404785156, "logps/rejected": -411.3494873046875, "loss": 0.0005, "losses/dpo": 5.2438107189800576e-09, "losses/sft": 0.5975283980369568, "losses/total": 5.2438107189800576e-09, "ref_logps/chosen": -250.72900390625, "ref_logps/rejected": -231.0203857421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9923739433288574, "rewards/margins": 16.04053497314453, "rewards/rejected": -18.032909393310547, "step": 4037 }, { "epoch": 0.97, "learning_rate": 6.879999999999999e-09, "logps/chosen": -229.52996826171875, "logps/rejected": -376.69879150390625, "loss": 0.0016, "losses/dpo": 7.920182565346323e-11, "losses/sft": 0.9409944415092468, "losses/total": 7.920182565346323e-11, "ref_logps/chosen": -207.21917724609375, "ref_logps/rejected": -209.57293701171875, "rewards/accuracies": 1.0, "rewards/chosen": -2.2310800552368164, "rewards/margins": 14.481504440307617, "rewards/rejected": -16.71258544921875, "step": 4038 }, { "epoch": 0.97, "learning_rate": 6.826666666666667e-09, "logps/chosen": -208.53875732421875, "logps/rejected": -359.5038757324219, "loss": 0.0034, "losses/dpo": 2.925997250713408e-06, "losses/sft": 0.5788851380348206, "losses/total": 2.925997250713408e-06, "ref_logps/chosen": -193.3614501953125, "ref_logps/rejected": -212.44174194335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.5177295207977295, "rewards/margins": 13.188481330871582, "rewards/rejected": -14.706212997436523, "step": 4039 }, { "epoch": 0.97, "learning_rate": 6.773333333333334e-09, "logps/chosen": -268.8095397949219, "logps/rejected": -408.82037353515625, "loss": 0.001, "losses/dpo": 8.444087029602088e-08, "losses/sft": 0.7512470483779907, "losses/total": 8.444087029602088e-08, "ref_logps/chosen": -247.71697998046875, "ref_logps/rejected": -231.7120361328125, "rewards/accuracies": 1.0, "rewards/chosen": -2.109255313873291, "rewards/margins": 15.601581573486328, "rewards/rejected": -17.71083641052246, "step": 4040 }, { "epoch": 0.97, "learning_rate": 6.719999999999999e-09, "logps/chosen": -283.59136962890625, "logps/rejected": -405.5744934082031, "loss": 0.0012, "losses/dpo": 5.0605559742678e-09, "losses/sft": 0.5933290719985962, "losses/total": 5.0605559742678e-09, "ref_logps/chosen": -266.7857666015625, "ref_logps/rejected": -243.6092987060547, "rewards/accuracies": 1.0, "rewards/chosen": -1.6805601119995117, "rewards/margins": 14.515960693359375, "rewards/rejected": -16.19651985168457, "step": 4041 }, { "epoch": 0.97, "learning_rate": 6.666666666666666e-09, "logps/chosen": -222.88348388671875, "logps/rejected": -421.5545654296875, "loss": 0.0466, "losses/dpo": 1.491235375404358, "losses/sft": 0.7887449860572815, "losses/total": 1.491235375404358, "ref_logps/chosen": -205.46136474609375, "ref_logps/rejected": -236.45510864257812, "rewards/accuracies": 0.96875, "rewards/chosen": -1.7422103881835938, "rewards/margins": 16.76773452758789, "rewards/rejected": -18.509944915771484, "step": 4042 }, { "epoch": 0.97, "learning_rate": 6.613333333333334e-09, "logps/chosen": -208.78236389160156, "logps/rejected": -391.85784912109375, "loss": 0.0003, "losses/dpo": 4.340268424130045e-05, "losses/sft": 1.249463438987732, "losses/total": 4.340268424130045e-05, "ref_logps/chosen": -193.60400390625, "ref_logps/rejected": -211.90220642089844, "rewards/accuracies": 1.0, "rewards/chosen": -1.5178353786468506, "rewards/margins": 16.47772979736328, "rewards/rejected": -17.99556541442871, "step": 4043 }, { "epoch": 0.97, "learning_rate": 6.5600000000000005e-09, "logps/chosen": -259.715576171875, "logps/rejected": -418.9627990722656, "loss": 0.0001, "losses/dpo": 6.26539485892863e-06, "losses/sft": 0.9929467439651489, "losses/total": 6.26539485892863e-06, "ref_logps/chosen": -242.26409912109375, "ref_logps/rejected": -240.51527404785156, "rewards/accuracies": 1.0, "rewards/chosen": -1.7451469898223877, "rewards/margins": 16.099605560302734, "rewards/rejected": -17.84475326538086, "step": 4044 }, { "epoch": 0.97, "learning_rate": 6.506666666666666e-09, "logps/chosen": -285.03704833984375, "logps/rejected": -453.9632873535156, "loss": 0.0006, "losses/dpo": 3.62601303149912e-11, "losses/sft": 0.6504592299461365, "losses/total": 3.62601303149912e-11, "ref_logps/chosen": -259.05810546875, "ref_logps/rejected": -260.4657897949219, "rewards/accuracies": 1.0, "rewards/chosen": -2.5978944301605225, "rewards/margins": 16.75185775756836, "rewards/rejected": -19.349754333496094, "step": 4045 }, { "epoch": 0.97, "learning_rate": 6.4533333333333326e-09, "logps/chosen": -222.58441162109375, "logps/rejected": -446.4718017578125, "loss": 0.0002, "losses/dpo": 4.295042240701902e-13, "losses/sft": 0.42552658915519714, "losses/total": 4.295042240701902e-13, "ref_logps/chosen": -206.5092315673828, "ref_logps/rejected": -246.98492431640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.607517957687378, "rewards/margins": 18.341171264648438, "rewards/rejected": -19.948688507080078, "step": 4046 }, { "epoch": 0.97, "learning_rate": 6.4e-09, "logps/chosen": -235.73822021484375, "logps/rejected": -418.46466064453125, "loss": 0.0013, "losses/dpo": 2.0128225475257722e-10, "losses/sft": 0.41664668917655945, "losses/total": 2.0128225475257722e-10, "ref_logps/chosen": -217.15194702148438, "ref_logps/rejected": -232.25015258789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.8586268424987793, "rewards/margins": 16.76282501220703, "rewards/rejected": -18.621450424194336, "step": 4047 }, { "epoch": 0.97, "learning_rate": 6.346666666666667e-09, "logps/chosen": -246.62347412109375, "logps/rejected": -396.3021240234375, "loss": 0.0002, "losses/dpo": 5.550215495531141e-11, "losses/sft": 0.5223743319511414, "losses/total": 5.550215495531141e-11, "ref_logps/chosen": -228.41006469726562, "ref_logps/rejected": -217.38668823242188, "rewards/accuracies": 1.0, "rewards/chosen": -1.8213413953781128, "rewards/margins": 16.07019805908203, "rewards/rejected": -17.89154052734375, "step": 4048 }, { "epoch": 0.97, "learning_rate": 6.293333333333332e-09, "logps/chosen": -247.52195739746094, "logps/rejected": -390.830322265625, "loss": 0.0033, "losses/dpo": 3.4975158880179436e-10, "losses/sft": 0.4832589626312256, "losses/total": 3.4975158880179436e-10, "ref_logps/chosen": -231.35812377929688, "ref_logps/rejected": -217.8800506591797, "rewards/accuracies": 1.0, "rewards/chosen": -1.6163831949234009, "rewards/margins": 15.678644180297852, "rewards/rejected": -17.295028686523438, "step": 4049 }, { "epoch": 0.97, "learning_rate": 6.239999999999999e-09, "logps/chosen": -257.0567321777344, "logps/rejected": -422.9172668457031, "loss": 0.0005, "losses/dpo": 4.015970223036902e-09, "losses/sft": 0.5509622097015381, "losses/total": 4.015970223036902e-09, "ref_logps/chosen": -240.16571044921875, "ref_logps/rejected": -241.97084045410156, "rewards/accuracies": 1.0, "rewards/chosen": -1.6891027688980103, "rewards/margins": 16.40554428100586, "rewards/rejected": -18.094646453857422, "step": 4050 }, { "epoch": 0.97, "learning_rate": 6.186666666666667e-09, "logps/chosen": -234.3603515625, "logps/rejected": -379.1692199707031, "loss": 0.0037, "losses/dpo": 7.356240416811488e-07, "losses/sft": 0.7302854061126709, "losses/total": 7.356240416811488e-07, "ref_logps/chosen": -214.09231567382812, "ref_logps/rejected": -200.34422302246094, "rewards/accuracies": 1.0, "rewards/chosen": -2.0268044471740723, "rewards/margins": 15.855696678161621, "rewards/rejected": -17.88249969482422, "step": 4051 }, { "epoch": 0.97, "learning_rate": 6.133333333333333e-09, "logps/chosen": -244.35374450683594, "logps/rejected": -423.39068603515625, "loss": 0.0, "losses/dpo": 4.2509755715514075e-09, "losses/sft": 0.6012014150619507, "losses/total": 4.2509755715514075e-09, "ref_logps/chosen": -229.3077392578125, "ref_logps/rejected": -245.60430908203125, "rewards/accuracies": 1.0, "rewards/chosen": -1.5046004056930542, "rewards/margins": 16.274036407470703, "rewards/rejected": -17.778635025024414, "step": 4052 }, { "epoch": 0.97, "learning_rate": 6.08e-09, "logps/chosen": -217.77410888671875, "logps/rejected": -425.8861999511719, "loss": 0.0008, "losses/dpo": 2.9259497047462446e-09, "losses/sft": 0.5094416737556458, "losses/total": 2.9259497047462446e-09, "ref_logps/chosen": -197.34683227539062, "ref_logps/rejected": -235.75775146484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0427298545837402, "rewards/margins": 16.970115661621094, "rewards/rejected": -19.01284408569336, "step": 4053 }, { "epoch": 0.97, "learning_rate": 6.0266666666666665e-09, "logps/chosen": -265.88067626953125, "logps/rejected": -418.0330505371094, "loss": 0.0004, "losses/dpo": 8.17154166554701e-09, "losses/sft": 0.6816834211349487, "losses/total": 8.17154166554701e-09, "ref_logps/chosen": -250.33071899414062, "ref_logps/rejected": -236.29930114746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5549952983856201, "rewards/margins": 16.618379592895508, "rewards/rejected": -18.173376083374023, "step": 4054 }, { "epoch": 0.97, "learning_rate": 5.973333333333333e-09, "logps/chosen": -286.37103271484375, "logps/rejected": -431.6376953125, "loss": 0.001, "losses/dpo": 7.430038095890268e-08, "losses/sft": 1.2344272136688232, "losses/total": 7.430038095890268e-08, "ref_logps/chosen": -265.3372802734375, "ref_logps/rejected": -257.8707275390625, "rewards/accuracies": 1.0, "rewards/chosen": -2.10337233543396, "rewards/margins": 15.273324012756348, "rewards/rejected": -17.376697540283203, "step": 4055 }, { "epoch": 0.97, "learning_rate": 5.92e-09, "logps/chosen": -258.255615234375, "logps/rejected": -407.193603515625, "loss": 0.0004, "losses/dpo": 1.1019722023775103e-06, "losses/sft": 0.5931324362754822, "losses/total": 1.1019722023775103e-06, "ref_logps/chosen": -240.45565795898438, "ref_logps/rejected": -236.6217498779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.7799971103668213, "rewards/margins": 15.277188301086426, "rewards/rejected": -17.057186126708984, "step": 4056 }, { "epoch": 0.97, "learning_rate": 5.866666666666666e-09, "logps/chosen": -240.23324584960938, "logps/rejected": -432.13116455078125, "loss": 0.0108, "losses/dpo": 3.35070887702571e-10, "losses/sft": 0.6291749477386475, "losses/total": 3.35070887702571e-10, "ref_logps/chosen": -220.70919799804688, "ref_logps/rejected": -237.48667907714844, "rewards/accuracies": 1.0, "rewards/chosen": -1.9524054527282715, "rewards/margins": 17.512041091918945, "rewards/rejected": -19.464447021484375, "step": 4057 }, { "epoch": 0.97, "learning_rate": 5.813333333333333e-09, "logps/chosen": -226.62460327148438, "logps/rejected": -381.1512145996094, "loss": 0.0001, "losses/dpo": 1.6571515288887895e-06, "losses/sft": 0.6261178851127625, "losses/total": 1.6571515288887895e-06, "ref_logps/chosen": -211.682861328125, "ref_logps/rejected": -212.63218688964844, "rewards/accuracies": 1.0, "rewards/chosen": -1.4941751956939697, "rewards/margins": 15.357728958129883, "rewards/rejected": -16.851903915405273, "step": 4058 }, { "epoch": 0.97, "learning_rate": 5.76e-09, "logps/chosen": -263.5323791503906, "logps/rejected": -421.5142822265625, "loss": 0.0004, "losses/dpo": 1.556080633235979e-07, "losses/sft": 0.7698265314102173, "losses/total": 1.556080633235979e-07, "ref_logps/chosen": -242.74725341796875, "ref_logps/rejected": -236.259521484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0785131454467773, "rewards/margins": 16.446964263916016, "rewards/rejected": -18.52547836303711, "step": 4059 }, { "epoch": 0.97, "learning_rate": 5.706666666666667e-09, "logps/chosen": -220.78634643554688, "logps/rejected": -414.80828857421875, "loss": 0.002, "losses/dpo": 1.0663830929402707e-10, "losses/sft": 0.5698854923248291, "losses/total": 1.0663830929402707e-10, "ref_logps/chosen": -202.64990234375, "ref_logps/rejected": -228.33351135253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.813643217086792, "rewards/margins": 16.83383560180664, "rewards/rejected": -18.647480010986328, "step": 4060 }, { "epoch": 0.97, "learning_rate": 5.653333333333333e-09, "logps/chosen": -233.1739044189453, "logps/rejected": -421.18798828125, "loss": 0.0, "losses/dpo": 6.763765103556807e-10, "losses/sft": 0.5747703909873962, "losses/total": 6.763765103556807e-10, "ref_logps/chosen": -216.74224853515625, "ref_logps/rejected": -236.21884155273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.6431635618209839, "rewards/margins": 16.85375213623047, "rewards/rejected": -18.496915817260742, "step": 4061 }, { "epoch": 0.97, "learning_rate": 5.6e-09, "logps/chosen": -215.81314086914062, "logps/rejected": -380.8680419921875, "loss": 0.0002, "losses/dpo": 1.22513763023413e-10, "losses/sft": 0.6069256067276001, "losses/total": 1.22513763023413e-10, "ref_logps/chosen": -195.70608520507812, "ref_logps/rejected": -207.3209228515625, "rewards/accuracies": 1.0, "rewards/chosen": -2.0107054710388184, "rewards/margins": 15.34400463104248, "rewards/rejected": -17.35470962524414, "step": 4062 }, { "epoch": 0.98, "learning_rate": 5.5466666666666665e-09, "logps/chosen": -269.0312805175781, "logps/rejected": -381.80255126953125, "loss": 0.0009, "losses/dpo": 5.172302763867265e-08, "losses/sft": 0.7491232752799988, "losses/total": 5.172302763867265e-08, "ref_logps/chosen": -249.170166015625, "ref_logps/rejected": -213.0151824951172, "rewards/accuracies": 1.0, "rewards/chosen": -1.9861130714416504, "rewards/margins": 14.892621994018555, "rewards/rejected": -16.878734588623047, "step": 4063 }, { "epoch": 0.98, "learning_rate": 5.493333333333333e-09, "logps/chosen": -271.8191833496094, "logps/rejected": -404.6517333984375, "loss": 0.0001, "losses/dpo": 5.552628408622695e-06, "losses/sft": 0.431500107049942, "losses/total": 5.552628408622695e-06, "ref_logps/chosen": -253.855224609375, "ref_logps/rejected": -233.31039428710938, "rewards/accuracies": 1.0, "rewards/chosen": -1.796396017074585, "rewards/margins": 15.337740898132324, "rewards/rejected": -17.13413429260254, "step": 4064 }, { "epoch": 0.98, "learning_rate": 5.439999999999999e-09, "logps/chosen": -233.25840759277344, "logps/rejected": -364.9039306640625, "loss": 0.0009, "losses/dpo": 3.968257389352914e-10, "losses/sft": 0.47432953119277954, "losses/total": 3.968257389352914e-10, "ref_logps/chosen": -213.72879028320312, "ref_logps/rejected": -198.31411743164062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9529595375061035, "rewards/margins": 14.70602035522461, "rewards/rejected": -16.658979415893555, "step": 4065 }, { "epoch": 0.98, "learning_rate": 5.386666666666666e-09, "logps/chosen": -303.0928039550781, "logps/rejected": -442.03619384765625, "loss": 0.0001, "losses/dpo": 1.183364251033936e-08, "losses/sft": 0.8361466526985168, "losses/total": 1.183364251033936e-08, "ref_logps/chosen": -285.6290283203125, "ref_logps/rejected": -264.0153503417969, "rewards/accuracies": 1.0, "rewards/chosen": -1.7463757991790771, "rewards/margins": 16.055707931518555, "rewards/rejected": -17.80208396911621, "step": 4066 }, { "epoch": 0.98, "learning_rate": 5.333333333333333e-09, "logps/chosen": -220.04241943359375, "logps/rejected": -381.0234069824219, "loss": 0.0067, "losses/dpo": 3.6170638395560673e-06, "losses/sft": 0.4966893792152405, "losses/total": 3.6170638395560673e-06, "ref_logps/chosen": -200.42462158203125, "ref_logps/rejected": -212.8173065185547, "rewards/accuracies": 1.0, "rewards/chosen": -1.9617830514907837, "rewards/margins": 14.858826637268066, "rewards/rejected": -16.82061004638672, "step": 4067 }, { "epoch": 0.98, "learning_rate": 5.28e-09, "logps/chosen": -241.33859252929688, "logps/rejected": -396.12139892578125, "loss": 0.0023, "losses/dpo": 1.8393437814512303e-12, "losses/sft": 0.5498513579368591, "losses/total": 1.8393437814512303e-12, "ref_logps/chosen": -223.8906707763672, "ref_logps/rejected": -224.96726989746094, "rewards/accuracies": 1.0, "rewards/chosen": -1.7447912693023682, "rewards/margins": 15.37061882019043, "rewards/rejected": -17.11541175842285, "step": 4068 }, { "epoch": 0.98, "learning_rate": 5.226666666666667e-09, "logps/chosen": -268.61309814453125, "logps/rejected": -461.4139099121094, "loss": 0.0052, "losses/dpo": 8.101207482624773e-10, "losses/sft": 0.6680597066879272, "losses/total": 8.101207482624773e-10, "ref_logps/chosen": -248.53756713867188, "ref_logps/rejected": -242.9420623779297, "rewards/accuracies": 1.0, "rewards/chosen": -2.007554531097412, "rewards/margins": 19.839630126953125, "rewards/rejected": -21.847187042236328, "step": 4069 }, { "epoch": 0.98, "learning_rate": 5.173333333333333e-09, "logps/chosen": -227.1529541015625, "logps/rejected": -358.85992431640625, "loss": 0.0015, "losses/dpo": 4.484325688558499e-10, "losses/sft": 0.6473811268806458, "losses/total": 4.484325688558499e-10, "ref_logps/chosen": -210.5508270263672, "ref_logps/rejected": -202.22503662109375, "rewards/accuracies": 1.0, "rewards/chosen": -1.660212755203247, "rewards/margins": 14.003274917602539, "rewards/rejected": -15.663487434387207, "step": 4070 }, { "epoch": 0.98, "learning_rate": 5.12e-09, "logps/chosen": -227.5149688720703, "logps/rejected": -364.4369201660156, "loss": 0.0035, "losses/dpo": 8.41530223283371e-09, "losses/sft": 0.5382915735244751, "losses/total": 8.41530223283371e-09, "ref_logps/chosen": -209.7255096435547, "ref_logps/rejected": -206.31893920898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.7789467573165894, "rewards/margins": 14.032854080200195, "rewards/rejected": -15.811800956726074, "step": 4071 }, { "epoch": 0.98, "learning_rate": 5.0666666666666665e-09, "logps/chosen": -270.4400634765625, "logps/rejected": -434.5914001464844, "loss": 0.0018, "losses/dpo": 1.4438961137841488e-09, "losses/sft": 0.5661011338233948, "losses/total": 1.4438961137841488e-09, "ref_logps/chosen": -245.68548583984375, "ref_logps/rejected": -243.40353393554688, "rewards/accuracies": 1.0, "rewards/chosen": -2.475456476211548, "rewards/margins": 16.64333152770996, "rewards/rejected": -19.11878776550293, "step": 4072 }, { "epoch": 0.98, "learning_rate": 5.013333333333333e-09, "logps/chosen": -258.5491027832031, "logps/rejected": -398.46820068359375, "loss": 0.0039, "losses/dpo": 4.309978596239716e-09, "losses/sft": 0.5647914409637451, "losses/total": 4.309978596239716e-09, "ref_logps/chosen": -245.72296142578125, "ref_logps/rejected": -229.02713012695312, "rewards/accuracies": 1.0, "rewards/chosen": -1.282615065574646, "rewards/margins": 15.661491394042969, "rewards/rejected": -16.944107055664062, "step": 4073 }, { "epoch": 0.98, "learning_rate": 4.9599999999999994e-09, "logps/chosen": -241.10882568359375, "logps/rejected": -433.5872802734375, "loss": 0.0007, "losses/dpo": 7.322769897347367e-17, "losses/sft": 0.6049546003341675, "losses/total": 7.322769897347367e-17, "ref_logps/chosen": -221.26522827148438, "ref_logps/rejected": -247.645263671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9843603372573853, "rewards/margins": 16.60984230041504, "rewards/rejected": -18.59420394897461, "step": 4074 }, { "epoch": 0.98, "learning_rate": 4.906666666666666e-09, "logps/chosen": -240.53756713867188, "logps/rejected": -423.9360046386719, "loss": 0.0, "losses/dpo": 3.679642590626031e-09, "losses/sft": 0.5233311057090759, "losses/total": 3.679642590626031e-09, "ref_logps/chosen": -222.57485961914062, "ref_logps/rejected": -232.31358337402344, "rewards/accuracies": 1.0, "rewards/chosen": -1.7962696552276611, "rewards/margins": 17.3659725189209, "rewards/rejected": -19.162240982055664, "step": 4075 }, { "epoch": 0.98, "learning_rate": 4.853333333333333e-09, "logps/chosen": -280.551513671875, "logps/rejected": -384.2409973144531, "loss": 0.0018, "losses/dpo": 1.3385095371631905e-05, "losses/sft": 1.020019769668579, "losses/total": 1.3385095371631905e-05, "ref_logps/chosen": -260.92578125, "ref_logps/rejected": -218.05435180664062, "rewards/accuracies": 1.0, "rewards/chosen": -1.9625710248947144, "rewards/margins": 14.656091690063477, "rewards/rejected": -16.618663787841797, "step": 4076 }, { "epoch": 0.98, "learning_rate": 4.8e-09, "logps/chosen": -236.70556640625, "logps/rejected": -425.10186767578125, "loss": 0.0023, "losses/dpo": 2.4324733199909687e-11, "losses/sft": 0.6694411039352417, "losses/total": 2.4324733199909687e-11, "ref_logps/chosen": -221.6663818359375, "ref_logps/rejected": -243.00730895996094, "rewards/accuracies": 1.0, "rewards/chosen": -1.5039176940917969, "rewards/margins": 16.70553970336914, "rewards/rejected": -18.209457397460938, "step": 4077 }, { "epoch": 0.98, "learning_rate": 4.746666666666666e-09, "logps/chosen": -196.5386962890625, "logps/rejected": -367.4621276855469, "loss": 0.0017, "losses/dpo": 3.3528699261431427e-10, "losses/sft": 0.5681727528572083, "losses/total": 3.3528699261431427e-10, "ref_logps/chosen": -181.57058715820312, "ref_logps/rejected": -213.10052490234375, "rewards/accuracies": 1.0, "rewards/chosen": -1.496809959411621, "rewards/margins": 13.939350128173828, "rewards/rejected": -15.43616008758545, "step": 4078 }, { "epoch": 0.98, "learning_rate": 4.693333333333333e-09, "logps/chosen": -252.34695434570312, "logps/rejected": -413.50592041015625, "loss": 0.0001, "losses/dpo": 2.0608941220245924e-09, "losses/sft": 0.4754098951816559, "losses/total": 2.0608941220245924e-09, "ref_logps/chosen": -231.29940795898438, "ref_logps/rejected": -229.50729370117188, "rewards/accuracies": 1.0, "rewards/chosen": -2.1047542095184326, "rewards/margins": 16.295106887817383, "rewards/rejected": -18.399860382080078, "step": 4079 }, { "epoch": 0.98, "learning_rate": 4.64e-09, "logps/chosen": -248.00082397460938, "logps/rejected": -406.4182434082031, "loss": 0.0002, "losses/dpo": 4.419414920825426e-12, "losses/sft": 0.6207448840141296, "losses/total": 4.419414920825426e-12, "ref_logps/chosen": -229.15989685058594, "ref_logps/rejected": -223.46231079101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.8840925693511963, "rewards/margins": 16.4114990234375, "rewards/rejected": -18.29559326171875, "step": 4080 }, { "epoch": 0.98, "learning_rate": 4.5866666666666665e-09, "logps/chosen": -248.17904663085938, "logps/rejected": -405.54339599609375, "loss": 0.0181, "losses/dpo": 1.2291571094280584e-09, "losses/sft": 0.46057260036468506, "losses/total": 1.2291571094280584e-09, "ref_logps/chosen": -228.38441467285156, "ref_logps/rejected": -224.5950164794922, "rewards/accuracies": 1.0, "rewards/chosen": -1.979465365409851, "rewards/margins": 16.115371704101562, "rewards/rejected": -18.09483528137207, "step": 4081 }, { "epoch": 0.98, "learning_rate": 4.533333333333333e-09, "logps/chosen": -270.92877197265625, "logps/rejected": -445.0105285644531, "loss": 0.0002, "losses/dpo": 1.844419102781103e-06, "losses/sft": 0.6156424880027771, "losses/total": 1.844419102781103e-06, "ref_logps/chosen": -250.67630004882812, "ref_logps/rejected": -251.58636474609375, "rewards/accuracies": 1.0, "rewards/chosen": -2.0252485275268555, "rewards/margins": 17.317167282104492, "rewards/rejected": -19.34241485595703, "step": 4082 }, { "epoch": 0.98, "learning_rate": 4.4799999999999994e-09, "logps/chosen": -234.79844665527344, "logps/rejected": -381.6954345703125, "loss": 0.0083, "losses/dpo": 4.2472456107667256e-10, "losses/sft": 0.5774924159049988, "losses/total": 4.2472456107667256e-10, "ref_logps/chosen": -214.67669677734375, "ref_logps/rejected": -213.79083251953125, "rewards/accuracies": 1.0, "rewards/chosen": -2.012174606323242, "rewards/margins": 14.778286933898926, "rewards/rejected": -16.79046058654785, "step": 4083 }, { "epoch": 0.98, "learning_rate": 4.426666666666667e-09, "logps/chosen": -251.36285400390625, "logps/rejected": -424.7539978027344, "loss": 0.0054, "losses/dpo": 2.51086621574359e-05, "losses/sft": 0.5891697406768799, "losses/total": 2.51086621574359e-05, "ref_logps/chosen": -223.33932495117188, "ref_logps/rejected": -228.90313720703125, "rewards/accuracies": 1.0, "rewards/chosen": -2.8023529052734375, "rewards/margins": 16.782733917236328, "rewards/rejected": -19.585086822509766, "step": 4084 }, { "epoch": 0.98, "learning_rate": 4.373333333333333e-09, "logps/chosen": -260.2824401855469, "logps/rejected": -440.3647766113281, "loss": 0.0, "losses/dpo": 2.038811786064798e-09, "losses/sft": 0.6490478515625, "losses/total": 2.038811786064798e-09, "ref_logps/chosen": -241.44326782226562, "ref_logps/rejected": -246.41140747070312, "rewards/accuracies": 1.0, "rewards/chosen": -1.8839178085327148, "rewards/margins": 17.511417388916016, "rewards/rejected": -19.39533805847168, "step": 4085 }, { "epoch": 0.98, "learning_rate": 4.32e-09, "logps/chosen": -240.8455810546875, "logps/rejected": -458.3633728027344, "loss": 0.0002, "losses/dpo": 2.525269750253134e-11, "losses/sft": 0.438721626996994, "losses/total": 2.525269750253134e-11, "ref_logps/chosen": -220.30685424804688, "ref_logps/rejected": -253.80108642578125, "rewards/accuracies": 1.0, "rewards/chosen": -2.053873300552368, "rewards/margins": 18.40235710144043, "rewards/rejected": -20.45623016357422, "step": 4086 }, { "epoch": 0.98, "learning_rate": 4.266666666666666e-09, "logps/chosen": -195.0837860107422, "logps/rejected": -349.5573425292969, "loss": 0.0052, "losses/dpo": 2.1873320577014965e-07, "losses/sft": 0.8380864262580872, "losses/total": 2.1873320577014965e-07, "ref_logps/chosen": -181.4204559326172, "ref_logps/rejected": -195.40249633789062, "rewards/accuracies": 1.0, "rewards/chosen": -1.3663334846496582, "rewards/margins": 14.049152374267578, "rewards/rejected": -15.415485382080078, "step": 4087 }, { "epoch": 0.98, "learning_rate": 4.213333333333334e-09, "logps/chosen": -213.53244018554688, "logps/rejected": -343.60540771484375, "loss": 0.0302, "losses/dpo": 5.543332008528523e-06, "losses/sft": 1.0829358100891113, "losses/total": 5.543332008528523e-06, "ref_logps/chosen": -196.5430145263672, "ref_logps/rejected": -193.17271423339844, "rewards/accuracies": 0.96875, "rewards/chosen": -1.6989414691925049, "rewards/margins": 13.344330787658691, "rewards/rejected": -15.043272018432617, "step": 4088 }, { "epoch": 0.98, "learning_rate": 4.16e-09, "logps/chosen": -227.47000122070312, "logps/rejected": -427.44537353515625, "loss": 0.0, "losses/dpo": 8.094303893813048e-07, "losses/sft": 0.8153586387634277, "losses/total": 8.094303893813048e-07, "ref_logps/chosen": -211.7553253173828, "ref_logps/rejected": -237.7830810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5714670419692993, "rewards/margins": 17.394760131835938, "rewards/rejected": -18.966228485107422, "step": 4089 }, { "epoch": 0.98, "learning_rate": 4.1066666666666665e-09, "logps/chosen": -243.41171264648438, "logps/rejected": -369.1680908203125, "loss": 0.0012, "losses/dpo": 2.604733026601025e-06, "losses/sft": 0.5378522276878357, "losses/total": 2.604733026601025e-06, "ref_logps/chosen": -230.24232482910156, "ref_logps/rejected": -205.86375427246094, "rewards/accuracies": 1.0, "rewards/chosen": -1.3169376850128174, "rewards/margins": 15.013495445251465, "rewards/rejected": -16.330432891845703, "step": 4090 }, { "epoch": 0.98, "learning_rate": 4.0533333333333326e-09, "logps/chosen": -248.19703674316406, "logps/rejected": -370.7267150878906, "loss": 0.0009, "losses/dpo": 9.894125105347484e-05, "losses/sft": 0.8948594331741333, "losses/total": 9.894125105347484e-05, "ref_logps/chosen": -231.10440063476562, "ref_logps/rejected": -206.6512451171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.7092649936676025, "rewards/margins": 14.698283195495605, "rewards/rejected": -16.407548904418945, "step": 4091 }, { "epoch": 0.98, "learning_rate": 4e-09, "logps/chosen": -239.38299560546875, "logps/rejected": -400.2192077636719, "loss": 0.0073, "losses/dpo": 1.8901685772121368e-10, "losses/sft": 0.9730403423309326, "losses/total": 1.8901685772121368e-10, "ref_logps/chosen": -223.02557373046875, "ref_logps/rejected": -217.44654846191406, "rewards/accuracies": 1.0, "rewards/chosen": -1.6357427835464478, "rewards/margins": 16.64151954650879, "rewards/rejected": -18.277263641357422, "step": 4092 }, { "epoch": 0.98, "learning_rate": 3.946666666666666e-09, "logps/chosen": -252.2349853515625, "logps/rejected": -393.02996826171875, "loss": 0.0003, "losses/dpo": 3.2687835077593874e-11, "losses/sft": 0.6157178282737732, "losses/total": 3.2687835077593874e-11, "ref_logps/chosen": -232.40121459960938, "ref_logps/rejected": -221.31398010253906, "rewards/accuracies": 1.0, "rewards/chosen": -1.9833765029907227, "rewards/margins": 15.18822193145752, "rewards/rejected": -17.171598434448242, "step": 4093 }, { "epoch": 0.98, "learning_rate": 3.893333333333333e-09, "logps/chosen": -267.1946716308594, "logps/rejected": -445.038818359375, "loss": 0.0001, "losses/dpo": 6.128320961806821e-09, "losses/sft": 0.6354326605796814, "losses/total": 6.128320961806821e-09, "ref_logps/chosen": -244.97055053710938, "ref_logps/rejected": -247.324462890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.222414970397949, "rewards/margins": 17.54901885986328, "rewards/rejected": -19.771434783935547, "step": 4094 }, { "epoch": 0.98, "learning_rate": 3.839999999999999e-09, "logps/chosen": -217.87037658691406, "logps/rejected": -388.10260009765625, "loss": 0.0107, "losses/dpo": 5.0961430630991345e-09, "losses/sft": 0.5074519515037537, "losses/total": 5.0961430630991345e-09, "ref_logps/chosen": -198.90594482421875, "ref_logps/rejected": -217.3012237548828, "rewards/accuracies": 1.0, "rewards/chosen": -1.8964438438415527, "rewards/margins": 15.183694839477539, "rewards/rejected": -17.080137252807617, "step": 4095 }, { "epoch": 0.98, "learning_rate": 3.786666666666667e-09, "logps/chosen": -234.70523071289062, "logps/rejected": -398.34136962890625, "loss": 0.004, "losses/dpo": 1.4458038322118227e-08, "losses/sft": 0.8093294501304626, "losses/total": 1.4458038322118227e-08, "ref_logps/chosen": -212.87496948242188, "ref_logps/rejected": -232.1450958251953, "rewards/accuracies": 1.0, "rewards/chosen": -2.183027505874634, "rewards/margins": 14.436602592468262, "rewards/rejected": -16.61962890625, "step": 4096 }, { "epoch": 0.98, "learning_rate": 3.733333333333334e-09, "logps/chosen": -170.47848510742188, "logps/rejected": -364.37640380859375, "loss": 0.0018, "losses/dpo": 2.4443808088747687e-10, "losses/sft": 0.6901707053184509, "losses/total": 2.4443808088747687e-10, "ref_logps/chosen": -156.21981811523438, "ref_logps/rejected": -202.50885009765625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4258689880371094, "rewards/margins": 14.760885238647461, "rewards/rejected": -16.186756134033203, "step": 4097 }, { "epoch": 0.98, "learning_rate": 3.6799999999999997e-09, "logps/chosen": -286.29803466796875, "logps/rejected": -404.63189697265625, "loss": 0.0002, "losses/dpo": 6.496869531247285e-08, "losses/sft": 0.6817206144332886, "losses/total": 6.496869531247285e-08, "ref_logps/chosen": -270.7979431152344, "ref_logps/rejected": -243.9871826171875, "rewards/accuracies": 1.0, "rewards/chosen": -1.5500067472457886, "rewards/margins": 14.51446533203125, "rewards/rejected": -16.064472198486328, "step": 4098 }, { "epoch": 0.98, "learning_rate": 3.626666666666667e-09, "logps/chosen": -270.6475524902344, "logps/rejected": -402.364013671875, "loss": 0.0003, "losses/dpo": 2.2151300527184503e-07, "losses/sft": 1.0713094472885132, "losses/total": 2.2151300527184503e-07, "ref_logps/chosen": -254.5496368408203, "ref_logps/rejected": -232.26431274414062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6097922325134277, "rewards/margins": 15.400177001953125, "rewards/rejected": -17.00996971130371, "step": 4099 }, { "epoch": 0.98, "learning_rate": 3.573333333333333e-09, "logps/chosen": -209.30340576171875, "logps/rejected": -371.36566162109375, "loss": 0.0002, "losses/dpo": 4.204760983839151e-09, "losses/sft": 0.6876820921897888, "losses/total": 4.204760983839151e-09, "ref_logps/chosen": -193.2996063232422, "ref_logps/rejected": -212.10165405273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.600379228591919, "rewards/margins": 14.326025009155273, "rewards/rejected": -15.92640495300293, "step": 4100 }, { "epoch": 0.98, "learning_rate": 3.5200000000000003e-09, "logps/chosen": -290.99542236328125, "logps/rejected": -450.1219482421875, "loss": 0.002, "losses/dpo": 2.2616888362136933e-09, "losses/sft": 0.6200652718544006, "losses/total": 2.2616888362136933e-09, "ref_logps/chosen": -267.9649963378906, "ref_logps/rejected": -258.10931396484375, "rewards/accuracies": 1.0, "rewards/chosen": -2.303041458129883, "rewards/margins": 16.898223876953125, "rewards/rejected": -19.20126724243164, "step": 4101 }, { "epoch": 0.98, "learning_rate": 3.4666666666666663e-09, "logps/chosen": -223.5694580078125, "logps/rejected": -379.3407287597656, "loss": 0.0033, "losses/dpo": 6.957534548490685e-11, "losses/sft": 0.7035995721817017, "losses/total": 6.957534548490685e-11, "ref_logps/chosen": -209.29873657226562, "ref_logps/rejected": -222.14743041992188, "rewards/accuracies": 1.0, "rewards/chosen": -1.4270730018615723, "rewards/margins": 14.292259216308594, "rewards/rejected": -15.719331741333008, "step": 4102 }, { "epoch": 0.98, "learning_rate": 3.4133333333333335e-09, "logps/chosen": -254.77442932128906, "logps/rejected": -431.72186279296875, "loss": 0.0005, "losses/dpo": 2.669375553931985e-10, "losses/sft": 0.4887681007385254, "losses/total": 2.669375553931985e-10, "ref_logps/chosen": -237.83447265625, "ref_logps/rejected": -236.52865600585938, "rewards/accuracies": 1.0, "rewards/chosen": -1.693996787071228, "rewards/margins": 17.82532501220703, "rewards/rejected": -19.519323348999023, "step": 4103 }, { "epoch": 0.98, "learning_rate": 3.3599999999999996e-09, "logps/chosen": -241.986572265625, "logps/rejected": -360.92706298828125, "loss": 0.0069, "losses/dpo": 1.775864620867651e-05, "losses/sft": 0.848760187625885, "losses/total": 1.775864620867651e-05, "ref_logps/chosen": -222.39053344726562, "ref_logps/rejected": -198.6080322265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.959604263305664, "rewards/margins": 14.272299766540527, "rewards/rejected": -16.231903076171875, "step": 4104 }, { "epoch": 0.99, "learning_rate": 3.306666666666667e-09, "logps/chosen": -287.5825500488281, "logps/rejected": -438.3114929199219, "loss": 0.0005, "losses/dpo": 1.2057101095308553e-08, "losses/sft": 0.4394311308860779, "losses/total": 1.2057101095308553e-08, "ref_logps/chosen": -269.2923889160156, "ref_logps/rejected": -240.6947479248047, "rewards/accuracies": 1.0, "rewards/chosen": -1.8290185928344727, "rewards/margins": 17.932655334472656, "rewards/rejected": -19.761672973632812, "step": 4105 }, { "epoch": 0.99, "learning_rate": 3.253333333333333e-09, "logps/chosen": -258.091552734375, "logps/rejected": -393.2445068359375, "loss": 0.0001, "losses/dpo": 1.2070797486551932e-11, "losses/sft": 0.6807427406311035, "losses/total": 1.2070797486551932e-11, "ref_logps/chosen": -237.97268676757812, "ref_logps/rejected": -216.00108337402344, "rewards/accuracies": 1.0, "rewards/chosen": -2.0118865966796875, "rewards/margins": 15.712459564208984, "rewards/rejected": -17.72434425354004, "step": 4106 }, { "epoch": 0.99, "learning_rate": 3.2e-09, "logps/chosen": -229.80557250976562, "logps/rejected": -415.3206481933594, "loss": 0.0027, "losses/dpo": 2.666496309779859e-13, "losses/sft": 0.6353884935379028, "losses/total": 2.666496309779859e-13, "ref_logps/chosen": -210.80792236328125, "ref_logps/rejected": -236.37936401367188, "rewards/accuracies": 1.0, "rewards/chosen": -1.899765968322754, "rewards/margins": 15.994363784790039, "rewards/rejected": -17.894128799438477, "step": 4107 }, { "epoch": 0.99, "learning_rate": 3.146666666666666e-09, "logps/chosen": -252.1195068359375, "logps/rejected": -450.85406494140625, "loss": 0.0003, "losses/dpo": 7.406769086948373e-10, "losses/sft": 0.5972864627838135, "losses/total": 7.406769086948373e-10, "ref_logps/chosen": -233.70797729492188, "ref_logps/rejected": -251.2391357421875, "rewards/accuracies": 1.0, "rewards/chosen": -1.841153621673584, "rewards/margins": 18.120342254638672, "rewards/rejected": -19.96149444580078, "step": 4108 }, { "epoch": 0.99, "learning_rate": 3.0933333333333334e-09, "logps/chosen": -208.4892120361328, "logps/rejected": -379.5207824707031, "loss": 0.0013, "losses/dpo": 3.970543184550479e-05, "losses/sft": 0.7769339680671692, "losses/total": 3.970543184550479e-05, "ref_logps/chosen": -193.01206970214844, "ref_logps/rejected": -206.6342315673828, "rewards/accuracies": 1.0, "rewards/chosen": -1.547713279724121, "rewards/margins": 15.740941047668457, "rewards/rejected": -17.288654327392578, "step": 4109 }, { "epoch": 0.99, "learning_rate": 3.04e-09, "logps/chosen": -245.77261352539062, "logps/rejected": -394.47698974609375, "loss": 0.0008, "losses/dpo": 2.8829436060640035e-10, "losses/sft": 0.9041277170181274, "losses/total": 2.8829436060640035e-10, "ref_logps/chosen": -230.44412231445312, "ref_logps/rejected": -228.06497192382812, "rewards/accuracies": 1.0, "rewards/chosen": -1.5328515768051147, "rewards/margins": 15.10835075378418, "rewards/rejected": -16.641202926635742, "step": 4110 }, { "epoch": 0.99, "learning_rate": 2.9866666666666667e-09, "logps/chosen": -217.07167053222656, "logps/rejected": -373.01483154296875, "loss": 0.0075, "losses/dpo": 2.575791029357788e-07, "losses/sft": 0.6232603788375854, "losses/total": 2.575791029357788e-07, "ref_logps/chosen": -199.3214111328125, "ref_logps/rejected": -201.761962890625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7750258445739746, "rewards/margins": 15.350262641906738, "rewards/rejected": -17.125288009643555, "step": 4111 }, { "epoch": 0.99, "learning_rate": 2.933333333333333e-09, "logps/chosen": -264.406494140625, "logps/rejected": -403.5468444824219, "loss": 0.0, "losses/dpo": 3.9055134126719793e-10, "losses/sft": 0.715388834476471, "losses/total": 3.9055134126719793e-10, "ref_logps/chosen": -244.57481384277344, "ref_logps/rejected": -227.06166076660156, "rewards/accuracies": 1.0, "rewards/chosen": -1.983168125152588, "rewards/margins": 15.665348052978516, "rewards/rejected": -17.648515701293945, "step": 4112 }, { "epoch": 0.99, "learning_rate": 2.88e-09, "logps/chosen": -236.23312377929688, "logps/rejected": -404.06121826171875, "loss": 0.0013, "losses/dpo": 2.7356060125494075e-12, "losses/sft": 0.6145645380020142, "losses/total": 2.7356060125494075e-12, "ref_logps/chosen": -220.72601318359375, "ref_logps/rejected": -226.45022583007812, "rewards/accuracies": 1.0, "rewards/chosen": -1.5507110357284546, "rewards/margins": 16.210391998291016, "rewards/rejected": -17.76110076904297, "step": 4113 }, { "epoch": 0.99, "learning_rate": 2.8266666666666664e-09, "logps/chosen": -194.11123657226562, "logps/rejected": -362.5224609375, "loss": 0.0021, "losses/dpo": 0.000446769263362512, "losses/sft": 0.7624515891075134, "losses/total": 0.000446769263362512, "ref_logps/chosen": -176.6391143798828, "ref_logps/rejected": -196.10064697265625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7472118139266968, "rewards/margins": 14.894968032836914, "rewards/rejected": -16.642179489135742, "step": 4114 }, { "epoch": 0.99, "learning_rate": 2.7733333333333333e-09, "logps/chosen": -248.00921630859375, "logps/rejected": -412.97344970703125, "loss": 0.0023, "losses/dpo": 8.849909138852752e-10, "losses/sft": 0.4875679016113281, "losses/total": 8.849909138852752e-10, "ref_logps/chosen": -229.914306640625, "ref_logps/rejected": -233.98365783691406, "rewards/accuracies": 1.0, "rewards/chosen": -1.8094911575317383, "rewards/margins": 16.08949089050293, "rewards/rejected": -17.89897918701172, "step": 4115 }, { "epoch": 0.99, "learning_rate": 2.7199999999999997e-09, "logps/chosen": -268.392578125, "logps/rejected": -450.62017822265625, "loss": 0.0001, "losses/dpo": 2.7591966489382003e-09, "losses/sft": 0.6026332974433899, "losses/total": 2.7591966489382003e-09, "ref_logps/chosen": -249.4752197265625, "ref_logps/rejected": -245.26986694335938, "rewards/accuracies": 1.0, "rewards/chosen": -1.8917334079742432, "rewards/margins": 18.643299102783203, "rewards/rejected": -20.5350341796875, "step": 4116 }, { "epoch": 0.99, "learning_rate": 2.6666666666666666e-09, "logps/chosen": -232.62442016601562, "logps/rejected": -394.9946594238281, "loss": 0.0008, "losses/dpo": 1.6512660128142898e-09, "losses/sft": 0.48234012722969055, "losses/total": 1.6512660128142898e-09, "ref_logps/chosen": -214.2765655517578, "ref_logps/rejected": -211.89071655273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.834784984588623, "rewards/margins": 16.475608825683594, "rewards/rejected": -18.310394287109375, "step": 4117 }, { "epoch": 0.99, "learning_rate": 2.6133333333333334e-09, "logps/chosen": -249.20016479492188, "logps/rejected": -435.9563903808594, "loss": 0.0002, "losses/dpo": 2.0831836877732712e-07, "losses/sft": 0.5559608340263367, "losses/total": 2.0831836877732712e-07, "ref_logps/chosen": -229.5028533935547, "ref_logps/rejected": -248.3189239501953, "rewards/accuracies": 1.0, "rewards/chosen": -1.9697295427322388, "rewards/margins": 16.794017791748047, "rewards/rejected": -18.76374626159668, "step": 4118 }, { "epoch": 0.99, "learning_rate": 2.56e-09, "logps/chosen": -252.3341064453125, "logps/rejected": -379.4931640625, "loss": 0.0006, "losses/dpo": 0.0001700437132967636, "losses/sft": 0.9076548218727112, "losses/total": 0.0001700437132967636, "ref_logps/chosen": -230.7897186279297, "ref_logps/rejected": -209.44125366210938, "rewards/accuracies": 1.0, "rewards/chosen": -2.1544411182403564, "rewards/margins": 14.850753784179688, "rewards/rejected": -17.00519561767578, "step": 4119 }, { "epoch": 0.99, "learning_rate": 2.5066666666666667e-09, "logps/chosen": -274.45611572265625, "logps/rejected": -429.45098876953125, "loss": 0.001, "losses/dpo": 1.7260090023896169e-09, "losses/sft": 0.7028641700744629, "losses/total": 1.7260090023896169e-09, "ref_logps/chosen": -253.30734252929688, "ref_logps/rejected": -251.5621337890625, "rewards/accuracies": 1.0, "rewards/chosen": -2.1148757934570312, "rewards/margins": 15.6740083694458, "rewards/rejected": -17.788883209228516, "step": 4120 }, { "epoch": 0.99, "learning_rate": 2.453333333333333e-09, "logps/chosen": -232.6537322998047, "logps/rejected": -353.033447265625, "loss": 0.0001, "losses/dpo": 1.208290800747136e-08, "losses/sft": 0.6687092185020447, "losses/total": 1.208290800747136e-08, "ref_logps/chosen": -212.42990112304688, "ref_logps/rejected": -199.79112243652344, "rewards/accuracies": 1.0, "rewards/chosen": -2.022383689880371, "rewards/margins": 13.301847457885742, "rewards/rejected": -15.324230194091797, "step": 4121 }, { "epoch": 0.99, "learning_rate": 2.4e-09, "logps/chosen": -257.9937438964844, "logps/rejected": -412.5142822265625, "loss": 0.0009, "losses/dpo": 3.6173574358144833e-07, "losses/sft": 0.6251382231712341, "losses/total": 3.6173574358144833e-07, "ref_logps/chosen": -235.78451538085938, "ref_logps/rejected": -234.66165161132812, "rewards/accuracies": 1.0, "rewards/chosen": -2.2209219932556152, "rewards/margins": 15.564342498779297, "rewards/rejected": -17.785263061523438, "step": 4122 }, { "epoch": 0.99, "learning_rate": 2.3466666666666664e-09, "logps/chosen": -207.03822326660156, "logps/rejected": -393.0020751953125, "loss": 0.0003, "losses/dpo": 5.450201401302479e-10, "losses/sft": 0.7769985198974609, "losses/total": 5.450201401302479e-10, "ref_logps/chosen": -192.7598114013672, "ref_logps/rejected": -225.63763427734375, "rewards/accuracies": 1.0, "rewards/chosen": -1.4278411865234375, "rewards/margins": 15.30860424041748, "rewards/rejected": -16.736446380615234, "step": 4123 }, { "epoch": 0.99, "learning_rate": 2.2933333333333333e-09, "logps/chosen": -237.7834930419922, "logps/rejected": -377.96661376953125, "loss": 0.0007, "losses/dpo": 3.0462346330750734e-05, "losses/sft": 0.5922948122024536, "losses/total": 3.0462346330750734e-05, "ref_logps/chosen": -220.36593627929688, "ref_logps/rejected": -205.578369140625, "rewards/accuracies": 1.0, "rewards/chosen": -1.741757869720459, "rewards/margins": 15.49706745147705, "rewards/rejected": -17.23882484436035, "step": 4124 }, { "epoch": 0.99, "learning_rate": 2.2399999999999997e-09, "logps/chosen": -207.1325225830078, "logps/rejected": -369.981689453125, "loss": 0.0002, "losses/dpo": 1.6157988724785355e-08, "losses/sft": 0.7301328778266907, "losses/total": 1.6157988724785355e-08, "ref_logps/chosen": -194.24267578125, "ref_logps/rejected": -215.49227905273438, "rewards/accuracies": 1.0, "rewards/chosen": -1.2889857292175293, "rewards/margins": 14.159955024719238, "rewards/rejected": -15.448941230773926, "step": 4125 }, { "epoch": 0.99, "learning_rate": 2.1866666666666666e-09, "logps/chosen": -231.2310791015625, "logps/rejected": -413.56610107421875, "loss": 0.003, "losses/dpo": 1.826279429906208e-07, "losses/sft": 0.6513069868087769, "losses/total": 1.826279429906208e-07, "ref_logps/chosen": -210.54248046875, "ref_logps/rejected": -236.89761352539062, "rewards/accuracies": 1.0, "rewards/chosen": -2.068859100341797, "rewards/margins": 15.59798812866211, "rewards/rejected": -17.666847229003906, "step": 4126 }, { "epoch": 0.99, "learning_rate": 2.133333333333333e-09, "logps/chosen": -245.29376220703125, "logps/rejected": -405.3524475097656, "loss": 0.0003, "losses/dpo": 4.31837987591166e-09, "losses/sft": 0.992884635925293, "losses/total": 4.31837987591166e-09, "ref_logps/chosen": -230.6783447265625, "ref_logps/rejected": -244.1641082763672, "rewards/accuracies": 1.0, "rewards/chosen": -1.4615434408187866, "rewards/margins": 14.657289505004883, "rewards/rejected": -16.118833541870117, "step": 4127 }, { "epoch": 0.99, "learning_rate": 2.08e-09, "logps/chosen": -253.5890655517578, "logps/rejected": -372.9982604980469, "loss": 0.0011, "losses/dpo": 2.5708581929961838e-09, "losses/sft": 0.5904974341392517, "losses/total": 2.5708581929961838e-09, "ref_logps/chosen": -235.69168090820312, "ref_logps/rejected": -212.45936584472656, "rewards/accuracies": 1.0, "rewards/chosen": -1.7897388935089111, "rewards/margins": 14.26414966583252, "rewards/rejected": -16.05388832092285, "step": 4128 }, { "epoch": 0.99, "learning_rate": 2.0266666666666663e-09, "logps/chosen": -227.76046752929688, "logps/rejected": -394.53729248046875, "loss": 0.0014, "losses/dpo": 9.81204362204835e-09, "losses/sft": 0.5767449736595154, "losses/total": 9.81204362204835e-09, "ref_logps/chosen": -215.19500732421875, "ref_logps/rejected": -222.5699005126953, "rewards/accuracies": 1.0, "rewards/chosen": -1.2565457820892334, "rewards/margins": 15.940191268920898, "rewards/rejected": -17.196735382080078, "step": 4129 }, { "epoch": 0.99, "learning_rate": 1.973333333333333e-09, "logps/chosen": -250.42828369140625, "logps/rejected": -401.87908935546875, "loss": 0.0004, "losses/dpo": 6.453025434893789e-06, "losses/sft": 0.5329847931861877, "losses/total": 6.453025434893789e-06, "ref_logps/chosen": -233.26504516601562, "ref_logps/rejected": -223.34176635742188, "rewards/accuracies": 1.0, "rewards/chosen": -1.7163236141204834, "rewards/margins": 16.137409210205078, "rewards/rejected": -17.85373306274414, "step": 4130 }, { "epoch": 0.99, "learning_rate": 1.9199999999999996e-09, "logps/chosen": -200.34860229492188, "logps/rejected": -383.5408935546875, "loss": 0.002, "losses/dpo": 1.3926134911912413e-08, "losses/sft": 0.6665917634963989, "losses/total": 1.3926134911912413e-08, "ref_logps/chosen": -184.22805786132812, "ref_logps/rejected": -218.91543579101562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6120532751083374, "rewards/margins": 14.850491523742676, "rewards/rejected": -16.46254539489746, "step": 4131 }, { "epoch": 0.99, "learning_rate": 1.866666666666667e-09, "logps/chosen": -257.57720947265625, "logps/rejected": -370.9984130859375, "loss": 0.0008, "losses/dpo": 8.314061261671668e-08, "losses/sft": 0.751914381980896, "losses/total": 8.314061261671668e-08, "ref_logps/chosen": -240.25820922851562, "ref_logps/rejected": -205.9820556640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.7318965196609497, "rewards/margins": 14.769737243652344, "rewards/rejected": -16.50163459777832, "step": 4132 }, { "epoch": 0.99, "learning_rate": 1.8133333333333335e-09, "logps/chosen": -251.4123992919922, "logps/rejected": -406.6363525390625, "loss": 0.0001, "losses/dpo": 2.345004577364307e-05, "losses/sft": 0.7057897448539734, "losses/total": 2.345004577364307e-05, "ref_logps/chosen": -232.2147979736328, "ref_logps/rejected": -227.64404296875, "rewards/accuracies": 1.0, "rewards/chosen": -1.9197615385055542, "rewards/margins": 15.979473114013672, "rewards/rejected": -17.899232864379883, "step": 4133 }, { "epoch": 0.99, "learning_rate": 1.7600000000000001e-09, "logps/chosen": -269.01702880859375, "logps/rejected": -434.7181396484375, "loss": 0.0, "losses/dpo": 1.0395748706315544e-09, "losses/sft": 0.8027241230010986, "losses/total": 1.0395748706315544e-09, "ref_logps/chosen": -250.36549377441406, "ref_logps/rejected": -245.67849731445312, "rewards/accuracies": 1.0, "rewards/chosen": -1.8651530742645264, "rewards/margins": 17.038808822631836, "rewards/rejected": -18.903963088989258, "step": 4134 }, { "epoch": 0.99, "learning_rate": 1.7066666666666668e-09, "logps/chosen": -282.69342041015625, "logps/rejected": -437.8626403808594, "loss": 0.0002, "losses/dpo": 1.6027652804950776e-07, "losses/sft": 0.49809420108795166, "losses/total": 1.6027652804950776e-07, "ref_logps/chosen": -258.0958251953125, "ref_logps/rejected": -250.9359588623047, "rewards/accuracies": 1.0, "rewards/chosen": -2.459757089614868, "rewards/margins": 16.232912063598633, "rewards/rejected": -18.692668914794922, "step": 4135 }, { "epoch": 0.99, "learning_rate": 1.6533333333333334e-09, "logps/chosen": -255.01266479492188, "logps/rejected": -393.802734375, "loss": 0.0001, "losses/dpo": 2.494271882369503e-07, "losses/sft": 0.46759071946144104, "losses/total": 2.494271882369503e-07, "ref_logps/chosen": -237.08721923828125, "ref_logps/rejected": -225.0054931640625, "rewards/accuracies": 1.0, "rewards/chosen": -1.792543649673462, "rewards/margins": 15.087181091308594, "rewards/rejected": -16.879722595214844, "step": 4136 }, { "epoch": 0.99, "learning_rate": 1.6e-09, "logps/chosen": -222.86077880859375, "logps/rejected": -355.2513427734375, "loss": 0.001, "losses/dpo": 4.5443179885751306e-08, "losses/sft": 0.5175303220748901, "losses/total": 4.5443179885751306e-08, "ref_logps/chosen": -206.5826416015625, "ref_logps/rejected": -204.5938720703125, "rewards/accuracies": 1.0, "rewards/chosen": -1.6278138160705566, "rewards/margins": 13.437934875488281, "rewards/rejected": -15.06574821472168, "step": 4137 }, { "epoch": 0.99, "learning_rate": 1.5466666666666667e-09, "logps/chosen": -275.68896484375, "logps/rejected": -456.6210632324219, "loss": 0.0, "losses/dpo": 2.8321128553443486e-08, "losses/sft": 0.40933653712272644, "losses/total": 2.8321128553443486e-08, "ref_logps/chosen": -261.66912841796875, "ref_logps/rejected": -259.57830810546875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4019842147827148, "rewards/margins": 18.302291870117188, "rewards/rejected": -19.70427703857422, "step": 4138 }, { "epoch": 0.99, "learning_rate": 1.4933333333333333e-09, "logps/chosen": -203.09506225585938, "logps/rejected": -372.1794128417969, "loss": 0.001, "losses/dpo": 8.521290340013365e-09, "losses/sft": 1.0993194580078125, "losses/total": 8.521290340013365e-09, "ref_logps/chosen": -189.424072265625, "ref_logps/rejected": -207.85923767089844, "rewards/accuracies": 1.0, "rewards/chosen": -1.3670982122421265, "rewards/margins": 15.064918518066406, "rewards/rejected": -16.432016372680664, "step": 4139 }, { "epoch": 0.99, "learning_rate": 1.44e-09, "logps/chosen": -255.05953979492188, "logps/rejected": -379.94403076171875, "loss": 0.0008, "losses/dpo": 1.0536613359590774e-07, "losses/sft": 0.7352312207221985, "losses/total": 1.0536613359590774e-07, "ref_logps/chosen": -236.67018127441406, "ref_logps/rejected": -219.3795623779297, "rewards/accuracies": 1.0, "rewards/chosen": -1.8389358520507812, "rewards/margins": 14.217513084411621, "rewards/rejected": -16.05644989013672, "step": 4140 }, { "epoch": 0.99, "learning_rate": 1.3866666666666666e-09, "logps/chosen": -271.9453430175781, "logps/rejected": -410.16357421875, "loss": 0.0003, "losses/dpo": 5.656103918560973e-10, "losses/sft": 0.6731630563735962, "losses/total": 5.656103918560973e-10, "ref_logps/chosen": -255.294189453125, "ref_logps/rejected": -231.39779663085938, "rewards/accuracies": 1.0, "rewards/chosen": -1.6651145219802856, "rewards/margins": 16.211467742919922, "rewards/rejected": -17.8765811920166, "step": 4141 }, { "epoch": 0.99, "learning_rate": 1.3333333333333333e-09, "logps/chosen": -277.7950439453125, "logps/rejected": -404.47998046875, "loss": 0.0, "losses/dpo": 2.1404763174359687e-05, "losses/sft": 0.923263669013977, "losses/total": 2.1404763174359687e-05, "ref_logps/chosen": -255.1221466064453, "ref_logps/rejected": -232.26443481445312, "rewards/accuracies": 1.0, "rewards/chosen": -2.26728892326355, "rewards/margins": 14.954266548156738, "rewards/rejected": -17.221553802490234, "step": 4142 }, { "epoch": 0.99, "learning_rate": 1.28e-09, "logps/chosen": -200.8186798095703, "logps/rejected": -383.2127380371094, "loss": 0.0004, "losses/dpo": 7.3856263327343186e-09, "losses/sft": 0.6066176891326904, "losses/total": 7.3856263327343186e-09, "ref_logps/chosen": -182.47821044921875, "ref_logps/rejected": -207.49668884277344, "rewards/accuracies": 1.0, "rewards/chosen": -1.8340449333190918, "rewards/margins": 15.737560272216797, "rewards/rejected": -17.571605682373047, "step": 4143 }, { "epoch": 0.99, "learning_rate": 1.2266666666666666e-09, "logps/chosen": -292.713623046875, "logps/rejected": -448.613525390625, "loss": 0.0, "losses/dpo": 9.232314912566153e-11, "losses/sft": 0.620937705039978, "losses/total": 9.232314912566153e-11, "ref_logps/chosen": -273.48175048828125, "ref_logps/rejected": -252.4978790283203, "rewards/accuracies": 1.0, "rewards/chosen": -1.923187255859375, "rewards/margins": 17.688377380371094, "rewards/rejected": -19.61156463623047, "step": 4144 }, { "epoch": 0.99, "learning_rate": 1.1733333333333332e-09, "logps/chosen": -261.19012451171875, "logps/rejected": -443.7205810546875, "loss": 0.0003, "losses/dpo": 2.672612481546821e-06, "losses/sft": 0.9915967583656311, "losses/total": 2.672612481546821e-06, "ref_logps/chosen": -241.24502563476562, "ref_logps/rejected": -252.5336456298828, "rewards/accuracies": 1.0, "rewards/chosen": -1.9945131540298462, "rewards/margins": 17.12417984008789, "rewards/rejected": -19.11869239807129, "step": 4145 }, { "epoch": 0.99, "learning_rate": 1.1199999999999999e-09, "logps/chosen": -251.36607360839844, "logps/rejected": -423.47137451171875, "loss": 0.0, "losses/dpo": 1.2443303332165812e-11, "losses/sft": 0.6214112639427185, "losses/total": 1.2443303332165812e-11, "ref_logps/chosen": -231.92752075195312, "ref_logps/rejected": -240.77513122558594, "rewards/accuracies": 1.0, "rewards/chosen": -1.9438542127609253, "rewards/margins": 16.32576560974121, "rewards/rejected": -18.269620895385742, "step": 4146 }, { "epoch": 1.0, "learning_rate": 1.0666666666666665e-09, "logps/chosen": -238.1381378173828, "logps/rejected": -346.2450866699219, "loss": 0.0111, "losses/dpo": 3.4423479622347486e-09, "losses/sft": 0.5293878316879272, "losses/total": 3.4423479622347486e-09, "ref_logps/chosen": -216.63287353515625, "ref_logps/rejected": -185.75643920898438, "rewards/accuracies": 1.0, "rewards/chosen": -2.150526523590088, "rewards/margins": 13.898340225219727, "rewards/rejected": -16.048866271972656, "step": 4147 }, { "epoch": 1.0, "learning_rate": 1.0133333333333331e-09, "logps/chosen": -270.9444580078125, "logps/rejected": -417.3063659667969, "loss": 0.0019, "losses/dpo": 1.1473234806658184e-08, "losses/sft": 0.5297524333000183, "losses/total": 1.1473234806658184e-08, "ref_logps/chosen": -251.23269653320312, "ref_logps/rejected": -234.41659545898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.971177577972412, "rewards/margins": 16.317800521850586, "rewards/rejected": -18.288978576660156, "step": 4148 }, { "epoch": 1.0, "learning_rate": 9.599999999999998e-10, "logps/chosen": -190.84609985351562, "logps/rejected": -372.3074951171875, "loss": 0.0037, "losses/dpo": 8.006195012510986e-11, "losses/sft": 0.7999357581138611, "losses/total": 8.006195012510986e-11, "ref_logps/chosen": -175.12535095214844, "ref_logps/rejected": -193.68118286132812, "rewards/accuracies": 1.0, "rewards/chosen": -1.5720750093460083, "rewards/margins": 16.29055404663086, "rewards/rejected": -17.862628936767578, "step": 4149 }, { "epoch": 1.0, "learning_rate": 9.066666666666667e-10, "logps/chosen": -230.11773681640625, "logps/rejected": -385.7341613769531, "loss": 0.0008, "losses/dpo": 8.481547908445464e-09, "losses/sft": 0.7854890823364258, "losses/total": 8.481547908445464e-09, "ref_logps/chosen": -209.24942016601562, "ref_logps/rejected": -212.09750366210938, "rewards/accuracies": 1.0, "rewards/chosen": -2.086829900741577, "rewards/margins": 15.276836395263672, "rewards/rejected": -17.363664627075195, "step": 4150 }, { "epoch": 1.0, "learning_rate": 8.533333333333334e-10, "logps/chosen": -229.19183349609375, "logps/rejected": -394.1565246582031, "loss": 0.0002, "losses/dpo": 0.00011561163410078734, "losses/sft": 0.6909365057945251, "losses/total": 0.00011561163410078734, "ref_logps/chosen": -208.53509521484375, "ref_logps/rejected": -220.1859588623047, "rewards/accuracies": 1.0, "rewards/chosen": -2.0656747817993164, "rewards/margins": 15.331382751464844, "rewards/rejected": -17.397056579589844, "step": 4151 }, { "epoch": 1.0, "learning_rate": 8e-10, "logps/chosen": -267.36212158203125, "logps/rejected": -366.618408203125, "loss": 0.0135, "losses/dpo": 6.328280122147589e-10, "losses/sft": 0.6303030848503113, "losses/total": 6.328280122147589e-10, "ref_logps/chosen": -250.61203002929688, "ref_logps/rejected": -198.81546020507812, "rewards/accuracies": 1.0, "rewards/chosen": -1.6750110387802124, "rewards/margins": 15.105283737182617, "rewards/rejected": -16.78029441833496, "step": 4152 }, { "epoch": 1.0, "learning_rate": 7.466666666666667e-10, "logps/chosen": -241.30145263671875, "logps/rejected": -446.5852966308594, "loss": 0.0, "losses/dpo": 6.795398466152847e-09, "losses/sft": 0.8973372578620911, "losses/total": 6.795398466152847e-09, "ref_logps/chosen": -218.8157196044922, "ref_logps/rejected": -253.37786865234375, "rewards/accuracies": 1.0, "rewards/chosen": -2.2485742568969727, "rewards/margins": 17.07217025756836, "rewards/rejected": -19.32074546813965, "step": 4153 }, { "epoch": 1.0, "learning_rate": 6.933333333333333e-10, "logps/chosen": -244.38754272460938, "logps/rejected": -395.4586181640625, "loss": 0.0023, "losses/dpo": 6.222106163633612e-10, "losses/sft": 1.147240161895752, "losses/total": 6.222106163633612e-10, "ref_logps/chosen": -229.74969482421875, "ref_logps/rejected": -234.23068237304688, "rewards/accuracies": 1.0, "rewards/chosen": -1.46378493309021, "rewards/margins": 14.659008979797363, "rewards/rejected": -16.122793197631836, "step": 4154 }, { "epoch": 1.0, "learning_rate": 6.4e-10, "logps/chosen": -283.3436279296875, "logps/rejected": -426.52423095703125, "loss": 0.0, "losses/dpo": 2.9211053131916742e-08, "losses/sft": 1.1723490953445435, "losses/total": 2.9211053131916742e-08, "ref_logps/chosen": -265.3243408203125, "ref_logps/rejected": -242.86135864257812, "rewards/accuracies": 1.0, "rewards/chosen": -1.8019275665283203, "rewards/margins": 16.564361572265625, "rewards/rejected": -18.366291046142578, "step": 4155 }, { "epoch": 1.0, "learning_rate": 5.866666666666666e-10, "logps/chosen": -216.9442901611328, "logps/rejected": -409.82745361328125, "loss": 0.0143, "losses/dpo": 9.19838021218311e-06, "losses/sft": 0.8020731210708618, "losses/total": 9.19838021218311e-06, "ref_logps/chosen": -201.4021759033203, "ref_logps/rejected": -219.81349182128906, "rewards/accuracies": 1.0, "rewards/chosen": -1.5542120933532715, "rewards/margins": 17.447185516357422, "rewards/rejected": -19.00139808654785, "step": 4156 }, { "epoch": 1.0, "learning_rate": 5.333333333333332e-10, "logps/chosen": -261.44293212890625, "logps/rejected": -407.7825927734375, "loss": 0.0004, "losses/dpo": 1.6670700375698289e-09, "losses/sft": 0.7988789677619934, "losses/total": 1.6670700375698289e-09, "ref_logps/chosen": -244.72230529785156, "ref_logps/rejected": -235.15567016601562, "rewards/accuracies": 1.0, "rewards/chosen": -1.6720614433288574, "rewards/margins": 15.590632438659668, "rewards/rejected": -17.262693405151367, "step": 4157 }, { "epoch": 1.0, "learning_rate": 4.799999999999999e-10, "logps/chosen": -229.0016632080078, "logps/rejected": -414.7772216796875, "loss": 0.002, "losses/dpo": 1.1532357291343942e-09, "losses/sft": 0.5036135911941528, "losses/total": 1.1532357291343942e-09, "ref_logps/chosen": -214.18597412109375, "ref_logps/rejected": -231.01629638671875, "rewards/accuracies": 1.0, "rewards/chosen": -1.4815680980682373, "rewards/margins": 16.894527435302734, "rewards/rejected": -18.376094818115234, "step": 4158 }, { "epoch": 1.0, "learning_rate": 4.266666666666667e-10, "logps/chosen": -249.31097412109375, "logps/rejected": -396.3387145996094, "loss": 0.0017, "losses/dpo": 1.7797491125293163e-07, "losses/sft": 0.5657548904418945, "losses/total": 1.7797491125293163e-07, "ref_logps/chosen": -231.30067443847656, "ref_logps/rejected": -225.85324096679688, "rewards/accuracies": 1.0, "rewards/chosen": -1.8010308742523193, "rewards/margins": 15.247519493103027, "rewards/rejected": -17.04854965209961, "step": 4159 }, { "epoch": 1.0, "learning_rate": 3.7333333333333334e-10, "logps/chosen": -225.14743041992188, "logps/rejected": -393.3639831542969, "loss": 0.0007, "losses/dpo": 6.70284180159797e-07, "losses/sft": 0.624460756778717, "losses/total": 6.70284180159797e-07, "ref_logps/chosen": -209.86264038085938, "ref_logps/rejected": -227.94189453125, "rewards/accuracies": 1.0, "rewards/chosen": -1.528480052947998, "rewards/margins": 15.0137300491333, "rewards/rejected": -16.54220962524414, "step": 4160 }, { "epoch": 1.0, "learning_rate": 3.2e-10, "logps/chosen": -329.8233642578125, "logps/rejected": -436.4799499511719, "loss": 0.0, "losses/dpo": 1.6177466477529379e-06, "losses/sft": 0.6285468935966492, "losses/total": 1.6177466477529379e-06, "ref_logps/chosen": -309.5126953125, "ref_logps/rejected": -257.3038635253906, "rewards/accuracies": 1.0, "rewards/chosen": -2.031066417694092, "rewards/margins": 15.886544227600098, "rewards/rejected": -17.91761016845703, "step": 4161 }, { "epoch": 1.0, "learning_rate": 2.666666666666666e-10, "logps/chosen": -259.5295104980469, "logps/rejected": -426.3839111328125, "loss": 0.011, "losses/dpo": 6.385119655227811e-10, "losses/sft": 0.6164373755455017, "losses/total": 6.385119655227811e-10, "ref_logps/chosen": -237.27789306640625, "ref_logps/rejected": -235.76565551757812, "rewards/accuracies": 1.0, "rewards/chosen": -2.2251620292663574, "rewards/margins": 16.8366641998291, "rewards/rejected": -19.061826705932617, "step": 4162 }, { "epoch": 1.0, "learning_rate": 2.1333333333333335e-10, "logps/chosen": -243.00592041015625, "logps/rejected": -401.72637939453125, "loss": 0.0001, "losses/dpo": 5.089758614573725e-10, "losses/sft": 0.767311692237854, "losses/total": 5.089758614573725e-10, "ref_logps/chosen": -229.51976013183594, "ref_logps/rejected": -220.2251434326172, "rewards/accuracies": 1.0, "rewards/chosen": -1.3486162424087524, "rewards/margins": 16.80150604248047, "rewards/rejected": -18.150121688842773, "step": 4163 }, { "epoch": 1.0, "learning_rate": 1.6e-10, "logps/chosen": -245.78057861328125, "logps/rejected": -409.08209228515625, "loss": 0.0005, "losses/dpo": 1.6757570886483109e-09, "losses/sft": 0.9181809425354004, "losses/total": 1.6757570886483109e-09, "ref_logps/chosen": -231.34628295898438, "ref_logps/rejected": -228.81451416015625, "rewards/accuracies": 1.0, "rewards/chosen": -1.4434292316436768, "rewards/margins": 16.583328247070312, "rewards/rejected": -18.026758193969727, "step": 4164 }, { "epoch": 1.0, "learning_rate": 1.0666666666666667e-10, "logps/chosen": -290.1867370605469, "logps/rejected": -418.6895751953125, "loss": 0.0003, "losses/dpo": 1.4667635106491161e-08, "losses/sft": 0.6284276843070984, "losses/total": 1.4667635106491161e-08, "ref_logps/chosen": -271.8544921875, "ref_logps/rejected": -234.46347045898438, "rewards/accuracies": 1.0, "rewards/chosen": -1.8332252502441406, "rewards/margins": 16.589385986328125, "rewards/rejected": -18.422611236572266, "step": 4165 }, { "epoch": 1.0, "learning_rate": 5.3333333333333337e-11, "logps/chosen": -295.1136474609375, "logps/rejected": -433.77557373046875, "loss": 0.0001, "losses/dpo": 2.9494094633264467e-07, "losses/sft": 0.5550050735473633, "losses/total": 2.9494094633264467e-07, "ref_logps/chosen": -278.8128356933594, "ref_logps/rejected": -247.43179321289062, "rewards/accuracies": 1.0, "rewards/chosen": -1.6300804615020752, "rewards/margins": 17.00429916381836, "rewards/rejected": -18.634376525878906, "step": 4166 }, { "epoch": 1.0, "learning_rate": 0.0, "logps/chosen": -263.17425537109375, "logps/rejected": -415.3414306640625, "loss": 0.0007, "losses/dpo": 8.926357253358219e-08, "losses/sft": 0.4851631820201874, "losses/total": 8.926357253358219e-08, "ref_logps/chosen": -245.44346618652344, "ref_logps/rejected": -250.26878356933594, "rewards/accuracies": 1.0, "rewards/chosen": -1.7730789184570312, "rewards/margins": 14.734188079833984, "rewards/rejected": -16.507266998291016, "step": 4167 }, { "epoch": 1.0, "step": 4167, "total_flos": 0.0, "train_loss": 0.042034030193247, "train_runtime": 55676.5199, "train_samples_per_second": 2.395, "train_steps_per_second": 0.075 } ], "logging_steps": 1.0, "max_steps": 4167, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }