diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,4423 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 100, + "global_step": 2428, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0, + "grad_norm": 0.35546875, + "learning_rate": 8.23045267489712e-09, + "logits/chosen": 0.24564924836158752, + "logits/rejected": 1.0062695741653442, + "logps/chosen": -229.83255004882812, + "logps/rejected": -164.65399169921875, + "loss": 0.6931, + "rewards/accuracies": 0.0, + "rewards/chosen": 0.0, + "rewards/margins": 0.0, + "rewards/margins_max": 0.0, + "rewards/margins_min": 0.0, + "rewards/margins_std": 0.0, + "rewards/rejected": 0.0, + "step": 1 + }, + { + "epoch": 0.0, + "grad_norm": 0.369140625, + "learning_rate": 8.230452674897118e-08, + "logits/chosen": -0.04918687045574188, + "logits/rejected": 0.6123232245445251, + "logps/chosen": -238.79006958007812, + "logps/rejected": -207.5037841796875, + "loss": 0.6931, + "rewards/accuracies": 0.5277777910232544, + "rewards/chosen": 0.00015826645540073514, + "rewards/margins": 0.0006196785252541304, + "rewards/margins_max": 0.002893384313210845, + "rewards/margins_min": -0.0016540272627025843, + "rewards/margins_std": 0.0032155057415366173, + "rewards/rejected": -0.0004614120698533952, + "step": 10 + }, + { + "epoch": 0.01, + "grad_norm": 0.443359375, + "learning_rate": 1.6460905349794237e-07, + "logits/chosen": 0.04978996887803078, + "logits/rejected": 0.601681649684906, + "logps/chosen": -255.1076202392578, + "logps/rejected": -220.27145385742188, + "loss": 0.6931, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 2.9706948225793894e-06, + "rewards/margins": 0.00022058103058952838, + "rewards/margins_max": 0.00360403535887599, + "rewards/margins_min": -0.0031628732103854418, + "rewards/margins_std": 0.004784926772117615, + "rewards/rejected": -0.0002176103589590639, + "step": 20 + }, + { + "epoch": 0.01, + "grad_norm": 0.361328125, + "learning_rate": 2.4691358024691354e-07, + "logits/chosen": 0.0722523182630539, + "logits/rejected": 0.5806540250778198, + "logps/chosen": -242.0666046142578, + "logps/rejected": -229.0381317138672, + "loss": 0.693, + "rewards/accuracies": 0.4625000059604645, + "rewards/chosen": -0.00025353097589686513, + "rewards/margins": -0.0001711220684228465, + "rewards/margins_max": 0.00201609218493104, + "rewards/margins_min": -0.002358336467295885, + "rewards/margins_std": 0.003093188162893057, + "rewards/rejected": -8.240890747401863e-05, + "step": 30 + }, + { + "epoch": 0.02, + "grad_norm": 0.408203125, + "learning_rate": 3.2921810699588474e-07, + "logits/chosen": 0.0854184553027153, + "logits/rejected": 0.6598686575889587, + "logps/chosen": -272.9035339355469, + "logps/rejected": -232.7262725830078, + "loss": 0.6926, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 8.360335777979344e-05, + "rewards/margins": 0.001020856318064034, + "rewards/margins_max": 0.003615677822381258, + "rewards/margins_min": -0.0015739649534225464, + "rewards/margins_std": 0.003669631900265813, + "rewards/rejected": -0.0009372529457323253, + "step": 40 + }, + { + "epoch": 0.02, + "grad_norm": 0.40234375, + "learning_rate": 4.11522633744856e-07, + "logits/chosen": 0.03861381113529205, + "logits/rejected": 0.42459020018577576, + "logps/chosen": -248.6800537109375, + "logps/rejected": -249.634033203125, + "loss": 0.6923, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.0004395098949316889, + "rewards/margins": 0.00210589449852705, + "rewards/margins_max": 0.004740457516163588, + "rewards/margins_min": -0.0005286684026941657, + "rewards/margins_std": 0.003725834656506777, + "rewards/rejected": -0.0016663845162838697, + "step": 50 + }, + { + "epoch": 0.02, + "grad_norm": 0.412109375, + "learning_rate": 4.938271604938271e-07, + "logits/chosen": 0.028602436184883118, + "logits/rejected": 0.599826991558075, + "logps/chosen": -243.0851287841797, + "logps/rejected": -205.10818481445312, + "loss": 0.6915, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.0011325245723128319, + "rewards/margins": 0.002941467333585024, + "rewards/margins_max": 0.005611724685877562, + "rewards/margins_min": 0.0002712096902541816, + "rewards/margins_std": 0.003776314901188016, + "rewards/rejected": -0.0018089428776875138, + "step": 60 + }, + { + "epoch": 0.03, + "grad_norm": 0.43359375, + "learning_rate": 5.761316872427983e-07, + "logits/chosen": 0.12637177109718323, + "logits/rejected": 0.6491262912750244, + "logps/chosen": -233.43142700195312, + "logps/rejected": -179.89846801757812, + "loss": 0.6918, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.0002556543913669884, + "rewards/margins": 0.0022196024656295776, + "rewards/margins_max": 0.004624036606401205, + "rewards/margins_min": -0.00018483158783055842, + "rewards/margins_std": 0.0034003830514848232, + "rewards/rejected": -0.001963948365300894, + "step": 70 + }, + { + "epoch": 0.03, + "grad_norm": 0.44140625, + "learning_rate": 6.584362139917695e-07, + "logits/chosen": -0.029578953981399536, + "logits/rejected": 0.4067414402961731, + "logps/chosen": -235.6042022705078, + "logps/rejected": -224.54019165039062, + "loss": 0.6906, + "rewards/accuracies": 0.800000011920929, + "rewards/chosen": 0.0017460808157920837, + "rewards/margins": 0.004769898019731045, + "rewards/margins_max": 0.007818765938282013, + "rewards/margins_min": 0.0017210301011800766, + "rewards/margins_std": 0.004311750642955303, + "rewards/rejected": -0.0030238174367696047, + "step": 80 + }, + { + "epoch": 0.04, + "grad_norm": 0.4921875, + "learning_rate": 7.407407407407406e-07, + "logits/chosen": 0.2614774703979492, + "logits/rejected": 0.6254442930221558, + "logps/chosen": -205.9300994873047, + "logps/rejected": -194.76925659179688, + "loss": 0.6901, + "rewards/accuracies": 0.875, + "rewards/chosen": 0.002798306755721569, + "rewards/margins": 0.006482880562543869, + "rewards/margins_max": 0.01096098218113184, + "rewards/margins_min": 0.002004781039431691, + "rewards/margins_std": 0.0063329897820949554, + "rewards/rejected": -0.0036845742724835873, + "step": 90 + }, + { + "epoch": 0.04, + "grad_norm": 0.63671875, + "learning_rate": 8.23045267489712e-07, + "logits/chosen": -0.02924344502389431, + "logits/rejected": 0.43279844522476196, + "logps/chosen": -237.76242065429688, + "logps/rejected": -233.0888671875, + "loss": 0.6891, + "rewards/accuracies": 0.8125, + "rewards/chosen": 0.002704631770029664, + "rewards/margins": 0.008112462237477303, + "rewards/margins_max": 0.013658873736858368, + "rewards/margins_min": 0.0025660484097898006, + "rewards/margins_std": 0.007843811996281147, + "rewards/rejected": -0.0054078297689557076, + "step": 100 + }, + { + "epoch": 0.05, + "grad_norm": 0.5, + "learning_rate": 9.053497942386831e-07, + "logits/chosen": 0.05362590029835701, + "logits/rejected": 0.6371204257011414, + "logps/chosen": -253.3213348388672, + "logps/rejected": -201.6106414794922, + "loss": 0.6881, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.0040958598256111145, + "rewards/margins": 0.010091823525726795, + "rewards/margins_max": 0.015223483555018902, + "rewards/margins_min": 0.004960163962095976, + "rewards/margins_std": 0.007257262710481882, + "rewards/rejected": -0.005995963700115681, + "step": 110 + }, + { + "epoch": 0.05, + "grad_norm": 0.375, + "learning_rate": 9.876543209876542e-07, + "logits/chosen": 0.02120272070169449, + "logits/rejected": 0.5330603718757629, + "logps/chosen": -230.68896484375, + "logps/rejected": -202.53201293945312, + "loss": 0.6871, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": 0.004152396693825722, + "rewards/margins": 0.01102996151894331, + "rewards/margins_max": 0.016634680330753326, + "rewards/margins_min": 0.005425242241472006, + "rewards/margins_std": 0.007926270365715027, + "rewards/rejected": -0.006877565290778875, + "step": 120 + }, + { + "epoch": 0.05, + "grad_norm": 0.470703125, + "learning_rate": 1.0699588477366254e-06, + "logits/chosen": 0.13731414079666138, + "logits/rejected": 0.6454218626022339, + "logps/chosen": -265.0476379394531, + "logps/rejected": -232.1322479248047, + "loss": 0.6851, + "rewards/accuracies": 0.9375, + "rewards/chosen": 0.008298086933791637, + "rewards/margins": 0.01611563190817833, + "rewards/margins_max": 0.0227045975625515, + "rewards/margins_min": 0.009526659734547138, + "rewards/margins_std": 0.009318210184574127, + "rewards/rejected": -0.007817542180418968, + "step": 130 + }, + { + "epoch": 0.06, + "grad_norm": 0.46875, + "learning_rate": 1.1522633744855967e-06, + "logits/chosen": 0.0916055217385292, + "logits/rejected": 0.5900839567184448, + "logps/chosen": -250.656005859375, + "logps/rejected": -215.56851196289062, + "loss": 0.6834, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.008936955593526363, + "rewards/margins": 0.018578212708234787, + "rewards/margins_max": 0.027034681290388107, + "rewards/margins_min": 0.010121742263436317, + "rewards/margins_std": 0.011959253810346127, + "rewards/rejected": -0.009641257114708424, + "step": 140 + }, + { + "epoch": 0.06, + "grad_norm": 0.50390625, + "learning_rate": 1.2345679012345677e-06, + "logits/chosen": 0.02975723147392273, + "logits/rejected": 0.7174798250198364, + "logps/chosen": -273.29351806640625, + "logps/rejected": -229.1119842529297, + "loss": 0.6808, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.016104739159345627, + "rewards/margins": 0.025698691606521606, + "rewards/margins_max": 0.037033237516880035, + "rewards/margins_min": 0.014364147558808327, + "rewards/margins_std": 0.016029467806220055, + "rewards/rejected": -0.00959395244717598, + "step": 150 + }, + { + "epoch": 0.07, + "grad_norm": 0.396484375, + "learning_rate": 1.316872427983539e-06, + "logits/chosen": 0.014136433601379395, + "logits/rejected": 0.4981762766838074, + "logps/chosen": -229.10940551757812, + "logps/rejected": -197.9926300048828, + "loss": 0.6811, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.014829346910119057, + "rewards/margins": 0.026064058765769005, + "rewards/margins_max": 0.03911607339978218, + "rewards/margins_min": 0.01301204226911068, + "rewards/margins_std": 0.01845833659172058, + "rewards/rejected": -0.011234709993004799, + "step": 160 + }, + { + "epoch": 0.07, + "grad_norm": 0.3984375, + "learning_rate": 1.3991769547325102e-06, + "logits/chosen": 0.057651955634355545, + "logits/rejected": 0.5741917490959167, + "logps/chosen": -231.759033203125, + "logps/rejected": -235.8512420654297, + "loss": 0.6767, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.02400249056518078, + "rewards/margins": 0.03238454461097717, + "rewards/margins_max": 0.04800540953874588, + "rewards/margins_min": 0.01676369085907936, + "rewards/margins_std": 0.022091226652264595, + "rewards/rejected": -0.008382054045796394, + "step": 170 + }, + { + "epoch": 0.07, + "grad_norm": 0.375, + "learning_rate": 1.4814814814814812e-06, + "logits/chosen": 0.21878819167613983, + "logits/rejected": 0.5812119245529175, + "logps/chosen": -207.7227783203125, + "logps/rejected": -216.443115234375, + "loss": 0.6781, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.014631425030529499, + "rewards/margins": 0.029586512595415115, + "rewards/margins_max": 0.042366527020931244, + "rewards/margins_min": 0.016806500032544136, + "rewards/margins_std": 0.018073670566082, + "rewards/rejected": -0.014955088496208191, + "step": 180 + }, + { + "epoch": 0.08, + "grad_norm": 0.50390625, + "learning_rate": 1.5637860082304525e-06, + "logits/chosen": 0.1410341113805771, + "logits/rejected": 0.7296265363693237, + "logps/chosen": -253.40957641601562, + "logps/rejected": -219.5933837890625, + "loss": 0.6718, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.02252976968884468, + "rewards/margins": 0.03989989683032036, + "rewards/margins_max": 0.05754275247454643, + "rewards/margins_min": 0.022257043048739433, + "rewards/margins_std": 0.02495076134800911, + "rewards/rejected": -0.017370129004120827, + "step": 190 + }, + { + "epoch": 0.08, + "grad_norm": 0.474609375, + "learning_rate": 1.646090534979424e-06, + "logits/chosen": 0.050902754068374634, + "logits/rejected": 0.6948504447937012, + "logps/chosen": -269.0355529785156, + "logps/rejected": -228.97781372070312, + "loss": 0.6694, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.031442780047655106, + "rewards/margins": 0.049986980855464935, + "rewards/margins_max": 0.07442543655633926, + "rewards/margins_min": 0.025548523291945457, + "rewards/margins_std": 0.03456118702888489, + "rewards/rejected": -0.018544193357229233, + "step": 200 + }, + { + "epoch": 0.09, + "grad_norm": 0.46875, + "learning_rate": 1.7283950617283948e-06, + "logits/chosen": 0.055720794945955276, + "logits/rejected": 0.5427404642105103, + "logps/chosen": -244.46435546875, + "logps/rejected": -219.5703887939453, + "loss": 0.6705, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.029670244082808495, + "rewards/margins": 0.050485529005527496, + "rewards/margins_max": 0.07362981140613556, + "rewards/margins_min": 0.027341246604919434, + "rewards/margins_std": 0.03273095563054085, + "rewards/rejected": -0.020815281197428703, + "step": 210 + }, + { + "epoch": 0.09, + "grad_norm": 0.416015625, + "learning_rate": 1.8106995884773662e-06, + "logits/chosen": 0.09668431431055069, + "logits/rejected": 0.6245120167732239, + "logps/chosen": -235.32608032226562, + "logps/rejected": -208.3621063232422, + "loss": 0.665, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.030144259333610535, + "rewards/margins": 0.059492819011211395, + "rewards/margins_max": 0.0863434299826622, + "rewards/margins_min": 0.03264220803976059, + "rewards/margins_std": 0.037972498685121536, + "rewards/rejected": -0.02934856340289116, + "step": 220 + }, + { + "epoch": 0.09, + "grad_norm": 0.431640625, + "learning_rate": 1.8930041152263375e-06, + "logits/chosen": -0.016712257638573647, + "logits/rejected": 0.5332568883895874, + "logps/chosen": -288.5390319824219, + "logps/rejected": -247.04513549804688, + "loss": 0.6598, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.03193775564432144, + "rewards/margins": 0.07212281227111816, + "rewards/margins_max": 0.10512430965900421, + "rewards/margins_min": 0.039121340960264206, + "rewards/margins_std": 0.04667114093899727, + "rewards/rejected": -0.04018506780266762, + "step": 230 + }, + { + "epoch": 0.1, + "grad_norm": 0.45703125, + "learning_rate": 1.9753086419753083e-06, + "logits/chosen": 0.020423922687768936, + "logits/rejected": 0.5501176118850708, + "logps/chosen": -246.56204223632812, + "logps/rejected": -229.26943969726562, + "loss": 0.6555, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.03230474889278412, + "rewards/margins": 0.07798168063163757, + "rewards/margins_max": 0.11025931686162949, + "rewards/margins_min": 0.045704036951065063, + "rewards/margins_std": 0.045647479593753815, + "rewards/rejected": -0.045676928013563156, + "step": 240 + }, + { + "epoch": 0.1, + "grad_norm": 0.458984375, + "learning_rate": 1.999949352352126e-06, + "logits/chosen": -0.020852217450737953, + "logits/rejected": 0.5500503778457642, + "logps/chosen": -271.888916015625, + "logps/rejected": -256.8134765625, + "loss": 0.6557, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.04405756667256355, + "rewards/margins": 0.07757656276226044, + "rewards/margins_max": 0.10984426736831665, + "rewards/margins_min": 0.04530886188149452, + "rewards/margins_std": 0.04563341662287712, + "rewards/rejected": -0.033518996089696884, + "step": 250 + }, + { + "epoch": 0.11, + "grad_norm": 0.443359375, + "learning_rate": 1.999701294590502e-06, + "logits/chosen": 0.1329582929611206, + "logits/rejected": 0.7303738594055176, + "logps/chosen": -267.12274169921875, + "logps/rejected": -211.52099609375, + "loss": 0.6531, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": 0.022596510127186775, + "rewards/margins": 0.0730314701795578, + "rewards/margins_max": 0.11627304553985596, + "rewards/margins_min": 0.029789889231324196, + "rewards/margins_std": 0.061152826994657516, + "rewards/rejected": -0.050434958189725876, + "step": 260 + }, + { + "epoch": 0.11, + "grad_norm": 0.439453125, + "learning_rate": 1.9992465753011367e-06, + "logits/chosen": 0.02824712172150612, + "logits/rejected": 0.6052254438400269, + "logps/chosen": -287.28057861328125, + "logps/rejected": -242.43618774414062, + "loss": 0.643, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.04510802775621414, + "rewards/margins": 0.10490649938583374, + "rewards/margins_max": 0.149946391582489, + "rewards/margins_min": 0.059866636991500854, + "rewards/margins_std": 0.0636960119009018, + "rewards/rejected": -0.059798479080200195, + "step": 270 + }, + { + "epoch": 0.12, + "grad_norm": 0.478515625, + "learning_rate": 1.9985852884850918e-06, + "logits/chosen": 0.16367292404174805, + "logits/rejected": 0.6942164301872253, + "logps/chosen": -255.4570770263672, + "logps/rejected": -232.69760131835938, + "loss": 0.6471, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": 0.027844402939081192, + "rewards/margins": 0.09164806455373764, + "rewards/margins_max": 0.12632620334625244, + "rewards/margins_min": 0.05696992203593254, + "rewards/margins_std": 0.04904230311512947, + "rewards/rejected": -0.06380365788936615, + "step": 280 + }, + { + "epoch": 0.12, + "grad_norm": 0.419921875, + "learning_rate": 1.9977175708457446e-06, + "logits/chosen": 0.1536540985107422, + "logits/rejected": 0.6605737209320068, + "logps/chosen": -240.44729614257812, + "logps/rejected": -224.4777374267578, + "loss": 0.644, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.02331436611711979, + "rewards/margins": 0.11056084930896759, + "rewards/margins_max": 0.1643027514219284, + "rewards/margins_min": 0.05681893974542618, + "rewards/margins_std": 0.07600252330303192, + "rewards/rejected": -0.08724648505449295, + "step": 290 + }, + { + "epoch": 0.12, + "grad_norm": 0.46484375, + "learning_rate": 1.9966436017605294e-06, + "logits/chosen": 0.07062125205993652, + "logits/rejected": 0.6594554781913757, + "logps/chosen": -252.1466064453125, + "logps/rejected": -233.97787475585938, + "loss": 0.6346, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.028239211067557335, + "rewards/margins": 0.13431617617607117, + "rewards/margins_max": 0.19096195697784424, + "rewards/margins_min": 0.0776703953742981, + "rewards/margins_std": 0.08010922372341156, + "rewards/rejected": -0.10607695579528809, + "step": 300 + }, + { + "epoch": 0.13, + "grad_norm": 0.408203125, + "learning_rate": 1.995363603243855e-06, + "logits/chosen": 0.2892570495605469, + "logits/rejected": 0.6574875712394714, + "logps/chosen": -216.6258087158203, + "logps/rejected": -206.6038818359375, + "loss": 0.6333, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.019283967092633247, + "rewards/margins": 0.121584951877594, + "rewards/margins_max": 0.1772836297750473, + "rewards/margins_min": 0.06588628143072128, + "rewards/margins_std": 0.07876982539892197, + "rewards/rejected": -0.1023009866476059, + "step": 310 + }, + { + "epoch": 0.13, + "grad_norm": 0.45703125, + "learning_rate": 1.9938778399012094e-06, + "logits/chosen": 0.1439567506313324, + "logits/rejected": 0.6365154385566711, + "logps/chosen": -236.16378784179688, + "logps/rejected": -216.11618041992188, + "loss": 0.6301, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.011489281430840492, + "rewards/margins": 0.13021530210971832, + "rewards/margins_max": 0.18579894304275513, + "rewards/margins_min": 0.07463165372610092, + "rewards/margins_std": 0.07860714942216873, + "rewards/rejected": -0.11872602999210358, + "step": 320 + }, + { + "epoch": 0.14, + "grad_norm": 0.455078125, + "learning_rate": 1.9921866188744596e-06, + "logits/chosen": 0.03234120458364487, + "logits/rejected": 0.6771044731140137, + "logps/chosen": -229.37313842773438, + "logps/rejected": -193.59500122070312, + "loss": 0.6243, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": 0.01423537265509367, + "rewards/margins": 0.14501173794269562, + "rewards/margins_max": 0.19469766318798065, + "rewards/margins_min": 0.09532581269741058, + "rewards/margins_std": 0.07026650756597519, + "rewards/rejected": -0.13077637553215027, + "step": 330 + }, + { + "epoch": 0.14, + "grad_norm": 0.466796875, + "learning_rate": 1.990290289778359e-06, + "logits/chosen": 0.2403167188167572, + "logits/rejected": 0.7011794447898865, + "logps/chosen": -252.5185089111328, + "logps/rejected": -227.126708984375, + "loss": 0.6276, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.0113031892105937, + "rewards/margins": 0.128191739320755, + "rewards/margins_max": 0.19695988297462463, + "rewards/margins_min": 0.05942361429333687, + "rewards/margins_std": 0.09725283086299896, + "rewards/rejected": -0.13949494063854218, + "step": 340 + }, + { + "epoch": 0.14, + "grad_norm": 0.4765625, + "learning_rate": 1.988189244628272e-06, + "logits/chosen": 0.10928988456726074, + "logits/rejected": 0.6726782917976379, + "logps/chosen": -255.8716278076172, + "logps/rejected": -242.6270751953125, + "loss": 0.6212, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.0170623529702425, + "rewards/margins": 0.17394407093524933, + "rewards/margins_max": 0.25714507699012756, + "rewards/margins_min": 0.0907430499792099, + "rewards/margins_std": 0.11766400188207626, + "rewards/rejected": -0.15688170492649078, + "step": 350 + }, + { + "epoch": 0.15, + "grad_norm": 0.5625, + "learning_rate": 1.9858839177591384e-06, + "logits/chosen": 0.1698768585920334, + "logits/rejected": 0.7562354803085327, + "logps/chosen": -246.46035766601562, + "logps/rejected": -253.3776092529297, + "loss": 0.6066, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": 0.019840896129608154, + "rewards/margins": 0.19313883781433105, + "rewards/margins_max": 0.2668205201625824, + "rewards/margins_min": 0.11945716291666031, + "rewards/margins_std": 0.10420163720846176, + "rewards/rejected": -0.1732979416847229, + "step": 360 + }, + { + "epoch": 0.15, + "grad_norm": 0.427734375, + "learning_rate": 1.9833747857356827e-06, + "logits/chosen": 0.0925469920039177, + "logits/rejected": 0.6798457503318787, + "logps/chosen": -227.651123046875, + "logps/rejected": -220.4510498046875, + "loss": 0.6026, + "rewards/accuracies": 1.0, + "rewards/chosen": 0.004902270622551441, + "rewards/margins": 0.19105985760688782, + "rewards/margins_max": 0.27683204412460327, + "rewards/margins_min": 0.10528764873743057, + "rewards/margins_std": 0.12130022048950195, + "rewards/rejected": -0.18615756928920746, + "step": 370 + }, + { + "epoch": 0.16, + "grad_norm": 0.43359375, + "learning_rate": 1.9806623672538997e-06, + "logits/chosen": 0.0311798807233572, + "logits/rejected": 0.5755618810653687, + "logps/chosen": -231.2305450439453, + "logps/rejected": -229.2065887451172, + "loss": 0.606, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": 0.006447208113968372, + "rewards/margins": 0.17201881110668182, + "rewards/margins_max": 0.2625262141227722, + "rewards/margins_min": 0.08151140064001083, + "rewards/margins_std": 0.12799681723117828, + "rewards/rejected": -0.16557160019874573, + "step": 380 + }, + { + "epoch": 0.16, + "grad_norm": 0.50390625, + "learning_rate": 1.9777472230338267e-06, + "logits/chosen": 0.015010332688689232, + "logits/rejected": 0.6248622536659241, + "logps/chosen": -252.9764404296875, + "logps/rejected": -239.882568359375, + "loss": 0.587, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.004931238479912281, + "rewards/margins": 0.2389887571334839, + "rewards/margins_max": 0.34165245294570923, + "rewards/margins_min": 0.13632504642009735, + "rewards/margins_std": 0.14518840610980988, + "rewards/rejected": -0.23405751585960388, + "step": 390 + }, + { + "epoch": 0.16, + "grad_norm": 0.3984375, + "learning_rate": 1.9746299557036303e-06, + "logits/chosen": 0.10073033720254898, + "logits/rejected": 0.8144109845161438, + "logps/chosen": -293.6244201660156, + "logps/rejected": -236.57235717773438, + "loss": 0.5871, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.005161326378583908, + "rewards/margins": 0.2277032881975174, + "rewards/margins_max": 0.33263522386550903, + "rewards/margins_min": 0.12277133762836456, + "rewards/margins_std": 0.14839616417884827, + "rewards/rejected": -0.2328646183013916, + "step": 400 + }, + { + "epoch": 0.17, + "grad_norm": 0.51953125, + "learning_rate": 1.9713112096750285e-06, + "logits/chosen": -0.02684302069246769, + "logits/rejected": 0.59294593334198, + "logps/chosen": -248.1615753173828, + "logps/rejected": -245.7884979248047, + "loss": 0.5744, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": 0.011066530831158161, + "rewards/margins": 0.2501886487007141, + "rewards/margins_max": 0.3555333614349365, + "rewards/margins_min": 0.1448439061641693, + "rewards/margins_std": 0.14897994697093964, + "rewards/rejected": -0.23912210762500763, + "step": 410 + }, + { + "epoch": 0.17, + "grad_norm": 0.5078125, + "learning_rate": 1.967791671010076e-06, + "logits/chosen": 0.14378827810287476, + "logits/rejected": 0.6853595972061157, + "logps/chosen": -264.4352722167969, + "logps/rejected": -287.1185607910156, + "loss": 0.5597, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.02877393364906311, + "rewards/margins": 0.3156929314136505, + "rewards/margins_max": 0.4402744770050049, + "rewards/margins_min": 0.19111141562461853, + "rewards/margins_std": 0.17618489265441895, + "rewards/rejected": -0.344466894865036, + "step": 420 + }, + { + "epoch": 0.18, + "grad_norm": 0.5234375, + "learning_rate": 1.96407206727934e-06, + "logits/chosen": 0.010093556717038155, + "logits/rejected": 0.5722212195396423, + "logps/chosen": -261.48260498046875, + "logps/rejected": -242.7886505126953, + "loss": 0.5802, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.03540874272584915, + "rewards/margins": 0.2602460980415344, + "rewards/margins_max": 0.41468754410743713, + "rewards/margins_min": 0.1058046966791153, + "rewards/margins_std": 0.21841315925121307, + "rewards/rejected": -0.29565486311912537, + "step": 430 + }, + { + "epoch": 0.18, + "grad_norm": 0.458984375, + "learning_rate": 1.9601531674114928e-06, + "logits/chosen": 0.1727772355079651, + "logits/rejected": 0.7151114344596863, + "logps/chosen": -261.04229736328125, + "logps/rejected": -249.06704711914062, + "loss": 0.5599, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.037648189812898636, + "rewards/margins": 0.3025735020637512, + "rewards/margins_max": 0.4183417856693268, + "rewards/margins_min": 0.18680524826049805, + "rewards/margins_std": 0.1637210100889206, + "rewards/rejected": -0.34022170305252075, + "step": 440 + }, + { + "epoch": 0.19, + "grad_norm": 0.6015625, + "learning_rate": 1.9560357815343576e-06, + "logits/chosen": -0.02275443822145462, + "logits/rejected": 0.6079251766204834, + "logps/chosen": -291.15667724609375, + "logps/rejected": -295.98114013671875, + "loss": 0.5534, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.03242644667625427, + "rewards/margins": 0.3713623881340027, + "rewards/margins_max": 0.5583276748657227, + "rewards/margins_min": 0.18439707159996033, + "rewards/margins_std": 0.2644089162349701, + "rewards/rejected": -0.40378880500793457, + "step": 450 + }, + { + "epoch": 0.19, + "grad_norm": 0.5390625, + "learning_rate": 1.9517207608074365e-06, + "logits/chosen": 0.010817606933414936, + "logits/rejected": 0.457902729511261, + "logps/chosen": -247.76101684570312, + "logps/rejected": -259.99969482421875, + "loss": 0.5481, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.046189289540052414, + "rewards/margins": 0.35407206416130066, + "rewards/margins_max": 0.5428398251533508, + "rewards/margins_min": 0.16530433297157288, + "rewards/margins_std": 0.2669579088687897, + "rewards/rejected": -0.40026140213012695, + "step": 460 + }, + { + "epoch": 0.19, + "grad_norm": 0.4765625, + "learning_rate": 1.9472089972459547e-06, + "logits/chosen": -0.01565355248749256, + "logits/rejected": 0.6114306449890137, + "logps/chosen": -269.9651794433594, + "logps/rejected": -249.69210815429688, + "loss": 0.5448, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.06878812611103058, + "rewards/margins": 0.32420676946640015, + "rewards/margins_max": 0.5209608674049377, + "rewards/margins_min": 0.12745265662670135, + "rewards/margins_std": 0.2782523036003113, + "rewards/rejected": -0.39299488067626953, + "step": 470 + }, + { + "epoch": 0.2, + "grad_norm": 0.5703125, + "learning_rate": 1.942501423536461e-06, + "logits/chosen": 0.1253264844417572, + "logits/rejected": 0.615772545337677, + "logps/chosen": -234.8943634033203, + "logps/rejected": -258.9188232421875, + "loss": 0.5388, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.08462107926607132, + "rewards/margins": 0.3709794580936432, + "rewards/margins_max": 0.5461065173149109, + "rewards/margins_min": 0.19585230946540833, + "rewards/margins_std": 0.24766714870929718, + "rewards/rejected": -0.45560044050216675, + "step": 480 + }, + { + "epoch": 0.2, + "grad_norm": 0.5625, + "learning_rate": 1.93759901284402e-06, + "logits/chosen": 0.05123847723007202, + "logits/rejected": 0.5316873788833618, + "logps/chosen": -255.9114532470703, + "logps/rejected": -303.42718505859375, + "loss": 0.5123, + "rewards/accuracies": 1.0, + "rewards/chosen": -0.06182605028152466, + "rewards/margins": 0.5358118414878845, + "rewards/margins_max": 0.822005569934845, + "rewards/margins_min": 0.24961814284324646, + "rewards/margins_std": 0.40473905205726624, + "rewards/rejected": -0.597637951374054, + "step": 490 + }, + { + "epoch": 0.21, + "grad_norm": 0.6328125, + "learning_rate": 1.932502778611036e-06, + "logits/chosen": -0.022655535489320755, + "logits/rejected": 0.6211397647857666, + "logps/chosen": -233.3824462890625, + "logps/rejected": -231.17599487304688, + "loss": 0.5255, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.07823699712753296, + "rewards/margins": 0.4163808822631836, + "rewards/margins_max": 0.658298134803772, + "rewards/margins_min": 0.1744636446237564, + "rewards/margins_std": 0.3421226441860199, + "rewards/rejected": -0.49461787939071655, + "step": 500 + }, + { + "epoch": 0.21, + "grad_norm": 0.61328125, + "learning_rate": 1.9272137743477504e-06, + "logits/chosen": 0.20029589533805847, + "logits/rejected": 0.7615786790847778, + "logps/chosen": -251.86819458007812, + "logps/rejected": -269.39971923828125, + "loss": 0.4883, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.10473309457302094, + "rewards/margins": 0.5166608095169067, + "rewards/margins_max": 0.7467631101608276, + "rewards/margins_min": 0.28655844926834106, + "rewards/margins_std": 0.32541388273239136, + "rewards/rejected": -0.6213939189910889, + "step": 510 + }, + { + "epoch": 0.21, + "grad_norm": 0.57421875, + "learning_rate": 1.9217330934144564e-06, + "logits/chosen": 0.03549078106880188, + "logits/rejected": 0.6382437348365784, + "logps/chosen": -269.1963806152344, + "logps/rejected": -284.4259338378906, + "loss": 0.5078, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.13496743142604828, + "rewards/margins": 0.5025084614753723, + "rewards/margins_max": 0.7574218511581421, + "rewards/margins_min": 0.24759499728679657, + "rewards/margins_std": 0.3605020344257355, + "rewards/rejected": -0.637475848197937, + "step": 520 + }, + { + "epoch": 0.22, + "grad_norm": 0.66796875, + "learning_rate": 1.916061868795478e-06, + "logits/chosen": 0.2350475788116455, + "logits/rejected": 0.724064290523529, + "logps/chosen": -263.99786376953125, + "logps/rejected": -286.0734558105469, + "loss": 0.5188, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.15573835372924805, + "rewards/margins": 0.44272828102111816, + "rewards/margins_max": 0.688580334186554, + "rewards/margins_min": 0.19687625765800476, + "rewards/margins_std": 0.3476872742176056, + "rewards/rejected": -0.598466694355011, + "step": 530 + }, + { + "epoch": 0.22, + "grad_norm": 0.6953125, + "learning_rate": 1.910201272864954e-06, + "logits/chosen": 0.1834249347448349, + "logits/rejected": 0.7275029420852661, + "logps/chosen": -267.31304931640625, + "logps/rejected": -274.42120361328125, + "loss": 0.4977, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.1938314139842987, + "rewards/margins": 0.532240092754364, + "rewards/margins_max": 0.8296705484390259, + "rewards/margins_min": 0.23480959236621857, + "rewards/margins_std": 0.42063021659851074, + "rewards/rejected": -0.7260714769363403, + "step": 540 + }, + { + "epoch": 0.23, + "grad_norm": 0.578125, + "learning_rate": 1.9041525171444798e-06, + "logits/chosen": -0.012256382033228874, + "logits/rejected": 0.5923458933830261, + "logps/chosen": -266.64044189453125, + "logps/rejected": -261.02215576171875, + "loss": 0.5211, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.19421935081481934, + "rewards/margins": 0.3819182515144348, + "rewards/margins_max": 0.5985667705535889, + "rewards/margins_min": 0.16526973247528076, + "rewards/margins_std": 0.3063872456550598, + "rewards/rejected": -0.5761376023292542, + "step": 550 + }, + { + "epoch": 0.23, + "grad_norm": 0.7265625, + "learning_rate": 1.897916852052661e-06, + "logits/chosen": -0.12678642570972443, + "logits/rejected": 0.4983861446380615, + "logps/chosen": -287.7276306152344, + "logps/rejected": -330.13482666015625, + "loss": 0.4613, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.2061140239238739, + "rewards/margins": 0.6443026065826416, + "rewards/margins_max": 0.9833240509033203, + "rewards/margins_min": 0.30528122186660767, + "rewards/margins_std": 0.4794486463069916, + "rewards/rejected": -0.8504166603088379, + "step": 560 + }, + { + "epoch": 0.23, + "grad_norm": 0.56640625, + "learning_rate": 1.8914955666466205e-06, + "logits/chosen": 0.03088958188891411, + "logits/rejected": 0.6151102781295776, + "logps/chosen": -258.5025634765625, + "logps/rejected": -306.0191650390625, + "loss": 0.4808, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.18039652705192566, + "rewards/margins": 0.6223662495613098, + "rewards/margins_max": 0.9027007222175598, + "rewards/margins_min": 0.3420317769050598, + "rewards/margins_std": 0.39645272493362427, + "rewards/rejected": -0.8027628064155579, + "step": 570 + }, + { + "epoch": 0.24, + "grad_norm": 0.625, + "learning_rate": 1.8848899883555203e-06, + "logits/chosen": 0.09567205607891083, + "logits/rejected": 0.7999943494796753, + "logps/chosen": -286.7126770019531, + "logps/rejected": -330.4248962402344, + "loss": 0.4626, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2216329574584961, + "rewards/margins": 0.682709813117981, + "rewards/margins_max": 1.0534820556640625, + "rewards/margins_min": 0.3119375705718994, + "rewards/margins_std": 0.5243510007858276, + "rewards/rejected": -0.904342770576477, + "step": 580 + }, + { + "epoch": 0.24, + "grad_norm": 0.60546875, + "learning_rate": 1.8781014827061518e-06, + "logits/chosen": 0.051412492990493774, + "logits/rejected": 0.7583128213882446, + "logps/chosen": -257.7097473144531, + "logps/rejected": -265.9875183105469, + "loss": 0.4877, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -0.2396390736103058, + "rewards/margins": 0.5867950916290283, + "rewards/margins_max": 0.9322711825370789, + "rewards/margins_min": 0.24131877720355988, + "rewards/margins_std": 0.4885772168636322, + "rewards/rejected": -0.8264341354370117, + "step": 590 + }, + { + "epoch": 0.25, + "grad_norm": 0.62890625, + "learning_rate": 1.8711314530406498e-06, + "logits/chosen": 0.02027386799454689, + "logits/rejected": 0.6661813259124756, + "logps/chosen": -280.32208251953125, + "logps/rejected": -305.6834411621094, + "loss": 0.4487, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.2480877935886383, + "rewards/margins": 0.7212269306182861, + "rewards/margins_max": 1.1429131031036377, + "rewards/margins_min": 0.29954075813293457, + "rewards/margins_std": 0.5963543653488159, + "rewards/rejected": -0.969314694404602, + "step": 600 + }, + { + "epoch": 0.25, + "grad_norm": 0.66796875, + "learning_rate": 1.8639813402263877e-06, + "logits/chosen": -0.020800206810235977, + "logits/rejected": 0.6128827333450317, + "logps/chosen": -306.2142333984375, + "logps/rejected": -309.4848327636719, + "loss": 0.4649, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.3131290376186371, + "rewards/margins": 0.7527261972427368, + "rewards/margins_max": 1.202072024345398, + "rewards/margins_min": 0.30338022112846375, + "rewards/margins_std": 0.6354711055755615, + "rewards/rejected": -1.0658552646636963, + "step": 610 + }, + { + "epoch": 0.26, + "grad_norm": 0.78515625, + "learning_rate": 1.8566526223581192e-06, + "logits/chosen": 0.039588745683431625, + "logits/rejected": 0.5985323786735535, + "logps/chosen": -284.71929931640625, + "logps/rejected": -321.4588928222656, + "loss": 0.4402, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.3328380584716797, + "rewards/margins": 0.8725109100341797, + "rewards/margins_max": 1.4007904529571533, + "rewards/margins_min": 0.3442313075065613, + "rewards/margins_std": 0.7471002340316772, + "rewards/rejected": -1.2053489685058594, + "step": 620 + }, + { + "epoch": 0.26, + "grad_norm": 0.58984375, + "learning_rate": 1.8491468144524177e-06, + "logits/chosen": -0.07715997099876404, + "logits/rejected": 0.4883267283439636, + "logps/chosen": -325.30877685546875, + "logps/rejected": -369.89923095703125, + "loss": 0.4566, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.43882113695144653, + "rewards/margins": 0.86943519115448, + "rewards/margins_max": 1.459219217300415, + "rewards/margins_min": 0.2796511948108673, + "rewards/margins_std": 0.8340805768966675, + "rewards/rejected": -1.3082562685012817, + "step": 630 + }, + { + "epoch": 0.26, + "grad_norm": 0.59765625, + "learning_rate": 1.8414654681344916e-06, + "logits/chosen": -0.09930239617824554, + "logits/rejected": 0.520045280456543, + "logps/chosen": -286.84912109375, + "logps/rejected": -331.6852111816406, + "loss": 0.4534, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.4153270125389099, + "rewards/margins": 0.733604907989502, + "rewards/margins_max": 1.1434333324432373, + "rewards/margins_min": 0.32377633452415466, + "rewards/margins_std": 0.579585075378418, + "rewards/rejected": -1.1489319801330566, + "step": 640 + }, + { + "epoch": 0.27, + "grad_norm": 0.640625, + "learning_rate": 1.833610171317424e-06, + "logits/chosen": 0.07414983212947845, + "logits/rejected": 0.6430131196975708, + "logps/chosen": -300.6047668457031, + "logps/rejected": -354.2503967285156, + "loss": 0.4345, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.4335756301879883, + "rewards/margins": 0.7363289594650269, + "rewards/margins_max": 1.2120670080184937, + "rewards/margins_min": 0.2605907917022705, + "rewards/margins_std": 0.6727953553199768, + "rewards/rejected": -1.1699045896530151, + "step": 650 + }, + { + "epoch": 0.27, + "grad_norm": 0.5859375, + "learning_rate": 1.8255825478739157e-06, + "logits/chosen": 0.15367427468299866, + "logits/rejected": 0.6044631004333496, + "logps/chosen": -254.28286743164062, + "logps/rejected": -345.909912109375, + "loss": 0.4196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.33262401819229126, + "rewards/margins": 0.8059485554695129, + "rewards/margins_max": 1.1957746744155884, + "rewards/margins_min": 0.4161224365234375, + "rewards/margins_std": 0.5512973666191101, + "rewards/rejected": -1.1385724544525146, + "step": 660 + }, + { + "epoch": 0.28, + "grad_norm": 0.68359375, + "learning_rate": 1.8173842573005922e-06, + "logits/chosen": -0.05639176443219185, + "logits/rejected": 0.4384500980377197, + "logps/chosen": -290.5333557128906, + "logps/rejected": -347.9354553222656, + "loss": 0.4435, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5654058456420898, + "rewards/margins": 0.7975835204124451, + "rewards/margins_max": 1.297826886177063, + "rewards/margins_min": 0.2973402142524719, + "rewards/margins_std": 0.707450807094574, + "rewards/rejected": -1.3629894256591797, + "step": 670 + }, + { + "epoch": 0.28, + "grad_norm": 0.72265625, + "learning_rate": 1.8090169943749474e-06, + "logits/chosen": 0.07231085002422333, + "logits/rejected": 0.709048330783844, + "logps/chosen": -317.9612731933594, + "logps/rejected": -366.837890625, + "loss": 0.4035, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -0.4872046113014221, + "rewards/margins": 0.969443678855896, + "rewards/margins_max": 1.5566800832748413, + "rewards/margins_min": 0.3822072446346283, + "rewards/margins_std": 0.8304777145385742, + "rewards/rejected": -1.456648349761963, + "step": 680 + }, + { + "epoch": 0.28, + "grad_norm": 0.6953125, + "learning_rate": 1.8004824888049936e-06, + "logits/chosen": -0.02150675281882286, + "logits/rejected": 0.6057881116867065, + "logps/chosen": -290.603271484375, + "logps/rejected": -371.76788330078125, + "loss": 0.4219, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -0.45583152770996094, + "rewards/margins": 1.104190707206726, + "rewards/margins_max": 1.8512020111083984, + "rewards/margins_min": 0.3571794033050537, + "rewards/margins_std": 1.0564334392547607, + "rewards/rejected": -1.560022234916687, + "step": 690 + }, + { + "epoch": 0.29, + "grad_norm": 0.76171875, + "learning_rate": 1.791782504871691e-06, + "logits/chosen": -0.041911929845809937, + "logits/rejected": 0.615075409412384, + "logps/chosen": -325.01495361328125, + "logps/rejected": -324.512451171875, + "loss": 0.4271, + "rewards/accuracies": 0.875, + "rewards/chosen": -0.687096118927002, + "rewards/margins": 0.7125922441482544, + "rewards/margins_max": 1.197318196296692, + "rewards/margins_min": 0.22786636650562286, + "rewards/margins_std": 0.6855059266090393, + "rewards/rejected": -1.3996882438659668, + "step": 700 + }, + { + "epoch": 0.29, + "grad_norm": 0.7265625, + "learning_rate": 1.7829188410642288e-06, + "logits/chosen": 0.03392393887042999, + "logits/rejected": 0.7335731983184814, + "logps/chosen": -328.3592224121094, + "logps/rejected": -401.75653076171875, + "loss": 0.3674, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.5749274492263794, + "rewards/margins": 1.163051962852478, + "rewards/margins_max": 1.8984100818634033, + "rewards/margins_min": 0.42769408226013184, + "rewards/margins_std": 1.0399531126022339, + "rewards/rejected": -1.737979531288147, + "step": 710 + }, + { + "epoch": 0.3, + "grad_norm": 0.578125, + "learning_rate": 1.7738933297082363e-06, + "logits/chosen": -0.004636755678802729, + "logits/rejected": 0.5097047090530396, + "logps/chosen": -307.2280578613281, + "logps/rejected": -362.5736389160156, + "loss": 0.3987, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6072796583175659, + "rewards/margins": 1.077012300491333, + "rewards/margins_max": 1.6295830011367798, + "rewards/margins_min": 0.5244414806365967, + "rewards/margins_std": 0.781453013420105, + "rewards/rejected": -1.6842920780181885, + "step": 720 + }, + { + "epoch": 0.3, + "grad_norm": 0.71484375, + "learning_rate": 1.7647078365869988e-06, + "logits/chosen": -0.08409127593040466, + "logits/rejected": 0.4622929096221924, + "logps/chosen": -307.9881896972656, + "logps/rejected": -387.77264404296875, + "loss": 0.3973, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.5732977986335754, + "rewards/margins": 0.9682035446166992, + "rewards/margins_max": 1.5226585865020752, + "rewards/margins_min": 0.41374826431274414, + "rewards/margins_std": 0.7841179966926575, + "rewards/rejected": -1.5415012836456299, + "step": 730 + }, + { + "epoch": 0.3, + "grad_norm": 0.6953125, + "learning_rate": 1.7553642605557558e-06, + "logits/chosen": 0.028938591480255127, + "logits/rejected": 0.5773764848709106, + "logps/chosen": -324.56439208984375, + "logps/rejected": -408.5997314453125, + "loss": 0.3654, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.5612246990203857, + "rewards/margins": 1.1907762289047241, + "rewards/margins_max": 1.8759260177612305, + "rewards/margins_min": 0.5056263208389282, + "rewards/margins_std": 0.968948245048523, + "rewards/rejected": -1.7520010471343994, + "step": 740 + }, + { + "epoch": 0.31, + "grad_norm": 0.7265625, + "learning_rate": 1.745864533149165e-06, + "logits/chosen": -0.1704981029033661, + "logits/rejected": 0.4236753582954407, + "logps/chosen": -336.75787353515625, + "logps/rejected": -425.0238342285156, + "loss": 0.4012, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.7402617335319519, + "rewards/margins": 1.3461772203445435, + "rewards/margins_max": 2.0805575847625732, + "rewards/margins_min": 0.6117968559265137, + "rewards/margins_std": 1.0385706424713135, + "rewards/rejected": -2.0864386558532715, + "step": 750 + }, + { + "epoch": 0.31, + "grad_norm": 0.8984375, + "learning_rate": 1.7362106181820062e-06, + "logits/chosen": -0.05409733206033707, + "logits/rejected": 0.6319289803504944, + "logps/chosen": -323.01031494140625, + "logps/rejected": -408.16046142578125, + "loss": 0.4064, + "rewards/accuracies": 0.9375, + "rewards/chosen": -0.6317715048789978, + "rewards/margins": 1.3276995420455933, + "rewards/margins_max": 2.0891098976135254, + "rewards/margins_min": 0.566289484500885, + "rewards/margins_std": 1.076796531677246, + "rewards/rejected": -1.9594709873199463, + "step": 760 + }, + { + "epoch": 0.32, + "grad_norm": 0.66796875, + "learning_rate": 1.7264045113432197e-06, + "logits/chosen": -0.05517064407467842, + "logits/rejected": 0.5958997011184692, + "logps/chosen": -377.96453857421875, + "logps/rejected": -445.7513732910156, + "loss": 0.3622, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7109798192977905, + "rewards/margins": 1.3723578453063965, + "rewards/margins_max": 2.0370492935180664, + "rewards/margins_min": 0.7076665163040161, + "rewards/margins_std": 0.9400156140327454, + "rewards/rejected": -2.0833375453948975, + "step": 770 + }, + { + "epoch": 0.32, + "grad_norm": 0.984375, + "learning_rate": 1.7164482397833462e-06, + "logits/chosen": 0.047315459698438644, + "logits/rejected": 0.658032238483429, + "logps/chosen": -330.7193298339844, + "logps/rejected": -425.60394287109375, + "loss": 0.3565, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -0.7782403826713562, + "rewards/margins": 1.2866474390029907, + "rewards/margins_max": 1.9514182806015015, + "rewards/margins_min": 0.6218767762184143, + "rewards/margins_std": 0.9401277303695679, + "rewards/rejected": -2.064887762069702, + "step": 780 + }, + { + "epoch": 0.33, + "grad_norm": 0.81640625, + "learning_rate": 1.70634386169547e-06, + "logits/chosen": -0.034783005714416504, + "logits/rejected": 0.5936989188194275, + "logps/chosen": -369.987548828125, + "logps/rejected": -421.5458068847656, + "loss": 0.4285, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.8623000979423523, + "rewards/margins": 1.0687025785446167, + "rewards/margins_max": 1.701873779296875, + "rewards/margins_min": 0.43553146719932556, + "rewards/margins_std": 0.8954392671585083, + "rewards/rejected": -1.9310028553009033, + "step": 790 + }, + { + "epoch": 0.33, + "grad_norm": 1.1796875, + "learning_rate": 1.696093465889743e-06, + "logits/chosen": -0.0015446215402334929, + "logits/rejected": 0.5988802313804626, + "logps/chosen": -329.92071533203125, + "logps/rejected": -458.45196533203125, + "loss": 0.3701, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -0.9185296297073364, + "rewards/margins": 1.5133209228515625, + "rewards/margins_max": 2.527653694152832, + "rewards/margins_min": 0.49898791313171387, + "rewards/margins_std": 1.4344834089279175, + "rewards/rejected": -2.4318506717681885, + "step": 800 + }, + { + "epoch": 0.33, + "grad_norm": 0.91796875, + "learning_rate": 1.6856991713615775e-06, + "logits/chosen": -0.036314308643341064, + "logits/rejected": 0.5499120354652405, + "logps/chosen": -344.57598876953125, + "logps/rejected": -425.4794921875, + "loss": 0.3308, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.8832759857177734, + "rewards/margins": 1.2336915731430054, + "rewards/margins_max": 1.9916486740112305, + "rewards/margins_min": 0.4757346212863922, + "rewards/margins_std": 1.0719130039215088, + "rewards/rejected": -2.1169676780700684, + "step": 810 + }, + { + "epoch": 0.34, + "grad_norm": 1.2421875, + "learning_rate": 1.6751631268536018e-06, + "logits/chosen": -0.07272686064243317, + "logits/rejected": 0.5183537602424622, + "logps/chosen": -353.64892578125, + "logps/rejected": -507.5834045410156, + "loss": 0.3119, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.9213719367980957, + "rewards/margins": 1.8004786968231201, + "rewards/margins_max": 2.6694061756134033, + "rewards/margins_min": 0.9315509796142578, + "rewards/margins_std": 1.2288492918014526, + "rewards/rejected": -2.721850872039795, + "step": 820 + }, + { + "epoch": 0.34, + "grad_norm": 0.8359375, + "learning_rate": 1.664487510411464e-06, + "logits/chosen": -0.09257794171571732, + "logits/rejected": 0.5011342167854309, + "logps/chosen": -348.8028259277344, + "logps/rejected": -537.207275390625, + "loss": 0.3326, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.0882186889648438, + "rewards/margins": 2.0627102851867676, + "rewards/margins_max": 3.325389862060547, + "rewards/margins_min": 0.8000311851501465, + "rewards/margins_std": 1.7856981754302979, + "rewards/rejected": -3.1509292125701904, + "step": 830 + }, + { + "epoch": 0.35, + "grad_norm": 0.78515625, + "learning_rate": 1.65367452893358e-06, + "logits/chosen": -0.002668508794158697, + "logits/rejected": 0.6281024813652039, + "logps/chosen": -369.59014892578125, + "logps/rejected": -580.8179931640625, + "loss": 0.3385, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.1532074213027954, + "rewards/margins": 2.41599702835083, + "rewards/margins_max": 3.841174364089966, + "rewards/margins_min": 0.9908199310302734, + "rewards/margins_std": 2.015505313873291, + "rewards/rejected": -3.569204807281494, + "step": 840 + }, + { + "epoch": 0.35, + "grad_norm": 1.4921875, + "learning_rate": 1.6427264177149165e-06, + "logits/chosen": 0.042776815593242645, + "logits/rejected": 0.6086363792419434, + "logps/chosen": -355.4585876464844, + "logps/rejected": -518.1069946289062, + "loss": 0.3465, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.1320966482162476, + "rewards/margins": 1.8160098791122437, + "rewards/margins_max": 2.7302584648132324, + "rewards/margins_min": 0.9017614126205444, + "rewards/margins_std": 1.2929426431655884, + "rewards/rejected": -2.9481067657470703, + "step": 850 + }, + { + "epoch": 0.35, + "grad_norm": 0.66015625, + "learning_rate": 1.6316454399849025e-06, + "logits/chosen": 0.07836954295635223, + "logits/rejected": 0.7413384914398193, + "logps/chosen": -389.7508544921875, + "logps/rejected": -541.7508544921875, + "loss": 0.3002, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.192826509475708, + "rewards/margins": 1.808393120765686, + "rewards/margins_max": 2.9427132606506348, + "rewards/margins_min": 0.674072802066803, + "rewards/margins_std": 1.6041711568832397, + "rewards/rejected": -3.0012195110321045, + "step": 860 + }, + { + "epoch": 0.36, + "grad_norm": 1.546875, + "learning_rate": 1.620433886439568e-06, + "logits/chosen": 0.029862603172659874, + "logits/rejected": 0.5954081416130066, + "logps/chosen": -338.1944885253906, + "logps/rejected": -488.2452087402344, + "loss": 0.3331, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -0.9371468424797058, + "rewards/margins": 1.646079659461975, + "rewards/margins_max": 2.532169818878174, + "rewards/margins_min": 0.7599895596504211, + "rewards/margins_std": 1.25312077999115, + "rewards/rejected": -2.5832266807556152, + "step": 870 + }, + { + "epoch": 0.36, + "grad_norm": 0.6171875, + "learning_rate": 1.6090940747680032e-06, + "logits/chosen": -0.009293178096413612, + "logits/rejected": 0.6269145607948303, + "logps/chosen": -374.23455810546875, + "logps/rejected": -615.919189453125, + "loss": 0.3139, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.2220368385314941, + "rewards/margins": 2.7851784229278564, + "rewards/margins_max": 4.606410980224609, + "rewards/margins_min": 0.963945746421814, + "rewards/margins_std": 2.5756115913391113, + "rewards/rejected": -4.0072150230407715, + "step": 880 + }, + { + "epoch": 0.37, + "grad_norm": 0.58203125, + "learning_rate": 1.5976283491732386e-06, + "logits/chosen": -0.046679772436618805, + "logits/rejected": 0.6183110475540161, + "logps/chosen": -391.77337646484375, + "logps/rejected": -539.9234619140625, + "loss": 0.2852, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -1.5056828260421753, + "rewards/margins": 2.0037107467651367, + "rewards/margins_max": 3.213691234588623, + "rewards/margins_min": 0.7937299609184265, + "rewards/margins_std": 1.7111711502075195, + "rewards/rejected": -3.5093936920166016, + "step": 890 + }, + { + "epoch": 0.37, + "grad_norm": 1.2421875, + "learning_rate": 1.5860390798876432e-06, + "logits/chosen": 0.10775299370288849, + "logits/rejected": 0.7166673541069031, + "logps/chosen": -399.3635559082031, + "logps/rejected": -538.3640747070312, + "loss": 0.3538, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.2117259502410889, + "rewards/margins": 1.6585693359375, + "rewards/margins_max": 2.707691192626953, + "rewards/margins_min": 0.6094473004341125, + "rewards/margins_std": 1.4836825132369995, + "rewards/rejected": -2.870295286178589, + "step": 900 + }, + { + "epoch": 0.37, + "grad_norm": 0.59765625, + "learning_rate": 1.5743286626829435e-06, + "logits/chosen": 0.022806577384471893, + "logits/rejected": 0.5851965546607971, + "logps/chosen": -390.1748962402344, + "logps/rejected": -625.4842529296875, + "loss": 0.3135, + "rewards/accuracies": 0.875, + "rewards/chosen": -1.6653339862823486, + "rewards/margins": 2.4102187156677246, + "rewards/margins_max": 4.1714911460876465, + "rewards/margins_min": 0.6489461660385132, + "rewards/margins_std": 2.4908154010772705, + "rewards/rejected": -4.075552463531494, + "step": 910 + }, + { + "epoch": 0.38, + "grad_norm": 1.0546875, + "learning_rate": 1.5624995183749601e-06, + "logits/chosen": -0.08865977823734283, + "logits/rejected": 0.5650321245193481, + "logps/chosen": -385.13409423828125, + "logps/rejected": -572.8970336914062, + "loss": 0.3019, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.4613841772079468, + "rewards/margins": 2.3046936988830566, + "rewards/margins_max": 3.730020523071289, + "rewards/margins_min": 0.8793666958808899, + "rewards/margins_std": 2.015716552734375, + "rewards/rejected": -3.766078233718872, + "step": 920 + }, + { + "epoch": 0.38, + "grad_norm": 1.421875, + "learning_rate": 1.5505540923231695e-06, + "logits/chosen": 0.09656616300344467, + "logits/rejected": 0.7050750851631165, + "logps/chosen": -410.744873046875, + "logps/rejected": -602.7825927734375, + "loss": 0.2936, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.506037950515747, + "rewards/margins": 2.2272439002990723, + "rewards/margins_max": 3.3739781379699707, + "rewards/margins_min": 1.0805096626281738, + "rewards/margins_std": 1.6217267513275146, + "rewards/rejected": -3.7332820892333984, + "step": 930 + }, + { + "epoch": 0.39, + "grad_norm": 2.40625, + "learning_rate": 1.5384948539251919e-06, + "logits/chosen": -0.046519361436367035, + "logits/rejected": 0.5364492535591125, + "logps/chosen": -407.41961669921875, + "logps/rejected": -627.1873168945312, + "loss": 0.3125, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.7679170370101929, + "rewards/margins": 2.495460033416748, + "rewards/margins_max": 4.152594566345215, + "rewards/margins_min": 0.8383258581161499, + "rewards/margins_std": 2.3435416221618652, + "rewards/rejected": -4.2633771896362305, + "step": 940 + }, + { + "epoch": 0.39, + "grad_norm": 0.65234375, + "learning_rate": 1.5263242961063074e-06, + "logits/chosen": -0.01217577327042818, + "logits/rejected": 0.7021702527999878, + "logps/chosen": -418.34674072265625, + "logps/rejected": -653.3511352539062, + "loss": 0.2666, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.6848056316375732, + "rewards/margins": 2.7814083099365234, + "rewards/margins_max": 4.369351387023926, + "rewards/margins_min": 1.1934659481048584, + "rewards/margins_std": 2.245690107345581, + "rewards/rejected": -4.466213703155518, + "step": 950 + }, + { + "epoch": 0.4, + "grad_norm": 0.7421875, + "learning_rate": 1.5140449348041133e-06, + "logits/chosen": 0.15041589736938477, + "logits/rejected": 0.7314427495002747, + "logps/chosen": -418.79571533203125, + "logps/rejected": -602.0596923828125, + "loss": 0.3008, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.6542141437530518, + "rewards/margins": 2.1634926795959473, + "rewards/margins_max": 3.671837568283081, + "rewards/margins_min": 0.6551474928855896, + "rewards/margins_std": 2.133122205734253, + "rewards/rejected": -3.817707061767578, + "step": 960 + }, + { + "epoch": 0.4, + "grad_norm": 0.88671875, + "learning_rate": 1.5016593084484188e-06, + "logits/chosen": 0.006452396512031555, + "logits/rejected": 0.6475186347961426, + "logps/chosen": -441.0831604003906, + "logps/rejected": -657.4892578125, + "loss": 0.3244, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.5441867113113403, + "rewards/margins": 2.7112343311309814, + "rewards/margins_max": 4.389430999755859, + "rewards/margins_min": 1.0330379009246826, + "rewards/margins_std": 2.373328685760498, + "rewards/rejected": -4.2554216384887695, + "step": 970 + }, + { + "epoch": 0.4, + "grad_norm": 1.21875, + "learning_rate": 1.4891699774364925e-06, + "logits/chosen": -0.04653478413820267, + "logits/rejected": 0.585922122001648, + "logps/chosen": -463.1758728027344, + "logps/rejected": -709.259765625, + "loss": 0.2991, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.9931418895721436, + "rewards/margins": 2.939608097076416, + "rewards/margins_max": 4.592248439788818, + "rewards/margins_min": 1.2869676351547241, + "rewards/margins_std": 2.337186336517334, + "rewards/rejected": -4.932750225067139, + "step": 980 + }, + { + "epoch": 0.41, + "grad_norm": 1.109375, + "learning_rate": 1.4765795236037705e-06, + "logits/chosen": 0.11729402840137482, + "logits/rejected": 0.6863471269607544, + "logps/chosen": -499.93182373046875, + "logps/rejected": -771.2833251953125, + "loss": 0.2888, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.091594696044922, + "rewards/margins": 3.2571346759796143, + "rewards/margins_max": 5.196051597595215, + "rewards/margins_min": 1.3182172775268555, + "rewards/margins_std": 2.7420434951782227, + "rewards/rejected": -5.348729133605957, + "step": 990 + }, + { + "epoch": 0.41, + "grad_norm": 0.6875, + "learning_rate": 1.463890549690129e-06, + "logits/chosen": -0.0073835537768900394, + "logits/rejected": 0.5565542578697205, + "logps/chosen": -399.1741638183594, + "logps/rejected": -644.0343017578125, + "loss": 0.3183, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -1.6458442211151123, + "rewards/margins": 2.509960412979126, + "rewards/margins_max": 3.9676921367645264, + "rewards/margins_min": 1.0522279739379883, + "rewards/margins_std": 2.061544418334961, + "rewards/rejected": -4.155804634094238, + "step": 1000 + }, + { + "epoch": 0.42, + "grad_norm": 0.89453125, + "learning_rate": 1.4511056788018387e-06, + "logits/chosen": 0.03009922243654728, + "logits/rejected": 0.7003548741340637, + "logps/chosen": -442.25994873046875, + "logps/rejected": -612.8889770507812, + "loss": 0.3125, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.7707973718643188, + "rewards/margins": 2.1549880504608154, + "rewards/margins_max": 3.36381196975708, + "rewards/margins_min": 0.9461652636528015, + "rewards/margins_std": 1.7095340490341187, + "rewards/rejected": -3.925785541534424, + "step": 1010 + }, + { + "epoch": 0.42, + "grad_norm": 2.546875, + "learning_rate": 1.438227553869307e-06, + "logits/chosen": 0.08984429389238358, + "logits/rejected": 0.6354426145553589, + "logps/chosen": -430.12957763671875, + "logps/rejected": -771.4137573242188, + "loss": 0.2714, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -1.9618059396743774, + "rewards/margins": 3.78582763671875, + "rewards/margins_max": 6.3192243576049805, + "rewards/margins_min": 1.25243079662323, + "rewards/margins_std": 3.5827643871307373, + "rewards/rejected": -5.747633457183838, + "step": 1020 + }, + { + "epoch": 0.42, + "grad_norm": 3.859375, + "learning_rate": 1.4252588371007226e-06, + "logits/chosen": 0.038023028522729874, + "logits/rejected": 0.7126880884170532, + "logps/chosen": -465.6451721191406, + "logps/rejected": -643.1898803710938, + "loss": 0.3392, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -1.6084156036376953, + "rewards/margins": 2.4245221614837646, + "rewards/margins_max": 4.242518424987793, + "rewards/margins_min": 0.6065254807472229, + "rewards/margins_std": 2.571035861968994, + "rewards/rejected": -4.032937526702881, + "step": 1030 + }, + { + "epoch": 0.43, + "grad_norm": 0.77734375, + "learning_rate": 1.412202209431716e-06, + "logits/chosen": 0.014304918237030506, + "logits/rejected": 0.6482391357421875, + "logps/chosen": -429.71514892578125, + "logps/rejected": -718.8599243164062, + "loss": 0.2488, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -1.7444334030151367, + "rewards/margins": 3.125716209411621, + "rewards/margins_max": 5.137998580932617, + "rewards/margins_min": 1.1134343147277832, + "rewards/margins_std": 2.845796823501587, + "rewards/rejected": -4.870149612426758, + "step": 1040 + }, + { + "epoch": 0.43, + "grad_norm": 6.40625, + "learning_rate": 1.3990603699711468e-06, + "logits/chosen": 0.13320419192314148, + "logits/rejected": 0.7178879976272583, + "logps/chosen": -432.60931396484375, + "logps/rejected": -785.9718017578125, + "loss": 0.2916, + "rewards/accuracies": 0.9375, + "rewards/chosen": -1.9751720428466797, + "rewards/margins": 3.503911256790161, + "rewards/margins_max": 5.681495666503906, + "rewards/margins_min": 1.3263267278671265, + "rewards/margins_std": 3.0795693397521973, + "rewards/rejected": -5.479083061218262, + "step": 1050 + }, + { + "epoch": 0.44, + "grad_norm": 4.1875, + "learning_rate": 1.3858360354431353e-06, + "logits/chosen": -0.04740985855460167, + "logits/rejected": 0.6082924008369446, + "logps/chosen": -461.19854736328125, + "logps/rejected": -738.2750244140625, + "loss": 0.3134, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.0386252403259277, + "rewards/margins": 3.0969669818878174, + "rewards/margins_max": 4.9145002365112305, + "rewards/margins_min": 1.2794336080551147, + "rewards/margins_std": 2.570380449295044, + "rewards/rejected": -5.135591983795166, + "step": 1060 + }, + { + "epoch": 0.44, + "grad_norm": 0.6171875, + "learning_rate": 1.3725319396254528e-06, + "logits/chosen": 0.06939555704593658, + "logits/rejected": 0.721479058265686, + "logps/chosen": -423.41082763671875, + "logps/rejected": -740.5270385742188, + "loss": 0.279, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.8382021188735962, + "rewards/margins": 3.333662509918213, + "rewards/margins_max": 5.442681312561035, + "rewards/margins_min": 1.2246429920196533, + "rewards/margins_std": 2.9826035499572754, + "rewards/rejected": -5.171864032745361, + "step": 1070 + }, + { + "epoch": 0.44, + "grad_norm": 0.71875, + "learning_rate": 1.3591508327843857e-06, + "logits/chosen": 0.05436503142118454, + "logits/rejected": 0.669663667678833, + "logps/chosen": -454.3190002441406, + "logps/rejected": -752.100830078125, + "loss": 0.2395, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.265321969985962, + "rewards/margins": 3.1536312103271484, + "rewards/margins_max": 4.983828544616699, + "rewards/margins_min": 1.3234339952468872, + "rewards/margins_std": 2.588289737701416, + "rewards/rejected": -5.418953895568848, + "step": 1080 + }, + { + "epoch": 0.45, + "grad_norm": 2.21875, + "learning_rate": 1.3456954811061907e-06, + "logits/chosen": 0.13526487350463867, + "logits/rejected": 0.5835285782814026, + "logps/chosen": -445.73199462890625, + "logps/rejected": -693.9923706054688, + "loss": 0.3059, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.054556131362915, + "rewards/margins": 2.580620288848877, + "rewards/margins_max": 4.460507869720459, + "rewards/margins_min": 0.7007322907447815, + "rewards/margins_std": 2.6585628986358643, + "rewards/rejected": -4.635176658630371, + "step": 1090 + }, + { + "epoch": 0.45, + "grad_norm": 0.6796875, + "learning_rate": 1.3321686661252624e-06, + "logits/chosen": -0.02377261593937874, + "logits/rejected": 0.5029060244560242, + "logps/chosen": -429.89276123046875, + "logps/rejected": -790.6046142578125, + "loss": 0.2515, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.046562910079956, + "rewards/margins": 3.737776517868042, + "rewards/margins_max": 6.0582804679870605, + "rewards/margins_min": 1.4172735214233398, + "rewards/margins_std": 3.2816872596740723, + "rewards/rejected": -5.78433895111084, + "step": 1100 + }, + { + "epoch": 0.46, + "grad_norm": 0.79296875, + "learning_rate": 1.3185731841491217e-06, + "logits/chosen": -0.05110805109143257, + "logits/rejected": 0.5351995229721069, + "logps/chosen": -491.8407287597656, + "logps/rejected": -814.8461303710938, + "loss": 0.3023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.366361141204834, + "rewards/margins": 3.5456528663635254, + "rewards/margins_max": 5.71653938293457, + "rewards/margins_min": 1.3747665882110596, + "rewards/margins_std": 3.070096969604492, + "rewards/rejected": -5.912014484405518, + "step": 1110 + }, + { + "epoch": 0.46, + "grad_norm": 1.09375, + "learning_rate": 1.3049118456803566e-06, + "logits/chosen": 0.06179860979318619, + "logits/rejected": 0.6705427169799805, + "logps/chosen": -449.32220458984375, + "logps/rejected": -707.3748779296875, + "loss": 0.3074, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1201488971710205, + "rewards/margins": 3.0255444049835205, + "rewards/margins_max": 4.990096569061279, + "rewards/margins_min": 1.0609924793243408, + "rewards/margins_std": 2.7782959938049316, + "rewards/rejected": -5.145693778991699, + "step": 1120 + }, + { + "epoch": 0.47, + "grad_norm": 1.03125, + "learning_rate": 1.2911874748356252e-06, + "logits/chosen": -0.028266632929444313, + "logits/rejected": 0.6430469751358032, + "logps/chosen": -443.5455017089844, + "logps/rejected": -795.4043579101562, + "loss": 0.2608, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.1805038452148438, + "rewards/margins": 3.5634307861328125, + "rewards/margins_max": 5.326353073120117, + "rewards/margins_min": 1.800508737564087, + "rewards/margins_std": 2.4931483268737793, + "rewards/rejected": -5.743934631347656, + "step": 1130 + }, + { + "epoch": 0.47, + "grad_norm": 0.66796875, + "learning_rate": 1.2774029087618445e-06, + "logits/chosen": -0.06501082330942154, + "logits/rejected": 0.5853902101516724, + "logps/chosen": -521.4249877929688, + "logps/rejected": -714.2313232421875, + "loss": 0.2854, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.3054754734039307, + "rewards/margins": 2.6544671058654785, + "rewards/margins_max": 4.386072635650635, + "rewards/margins_min": 0.9228616952896118, + "rewards/margins_std": 2.448859691619873, + "rewards/rejected": -4.959942817687988, + "step": 1140 + }, + { + "epoch": 0.47, + "grad_norm": 1.6328125, + "learning_rate": 1.263560997049687e-06, + "logits/chosen": -0.01226266659796238, + "logits/rejected": 0.6223964691162109, + "logps/chosen": -500.0978088378906, + "logps/rejected": -738.5953369140625, + "loss": 0.2613, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.160482883453369, + "rewards/margins": 3.0736801624298096, + "rewards/margins_max": 4.78936767578125, + "rewards/margins_min": 1.3579928874969482, + "rewards/margins_std": 2.4263482093811035, + "rewards/rejected": -5.234162330627441, + "step": 1150 + }, + { + "epoch": 0.48, + "grad_norm": 1.4140625, + "learning_rate": 1.2496646011445024e-06, + "logits/chosen": 0.13887211680412292, + "logits/rejected": 0.6845839619636536, + "logps/chosen": -482.53924560546875, + "logps/rejected": -763.1187744140625, + "loss": 0.2828, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.2161526679992676, + "rewards/margins": 3.281670331954956, + "rewards/margins_max": 5.1992340087890625, + "rewards/margins_min": 1.36410653591156, + "rewards/margins_std": 2.7118449211120605, + "rewards/rejected": -5.497823715209961, + "step": 1160 + }, + { + "epoch": 0.48, + "grad_norm": 0.87890625, + "learning_rate": 1.2357165937547932e-06, + "logits/chosen": 0.11630807816982269, + "logits/rejected": 0.8136134147644043, + "logps/chosen": -416.43377685546875, + "logps/rejected": -637.5525512695312, + "loss": 0.3023, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -1.602250099182129, + "rewards/margins": 2.4919466972351074, + "rewards/margins_max": 4.262382507324219, + "rewards/margins_min": 0.7215104103088379, + "rewards/margins_std": 2.503774642944336, + "rewards/rejected": -4.0941972732543945, + "step": 1170 + }, + { + "epoch": 0.49, + "grad_norm": 1.1015625, + "learning_rate": 1.2217198582583553e-06, + "logits/chosen": 0.1416541039943695, + "logits/rejected": 0.6444225907325745, + "logps/chosen": -453.56085205078125, + "logps/rejected": -785.13623046875, + "loss": 0.3186, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.104236602783203, + "rewards/margins": 3.521073818206787, + "rewards/margins_max": 6.118127822875977, + "rewards/margins_min": 0.9240198135375977, + "rewards/margins_std": 3.6727893352508545, + "rewards/rejected": -5.62531042098999, + "step": 1180 + }, + { + "epoch": 0.49, + "grad_norm": 1.203125, + "learning_rate": 1.20767728810622e-06, + "logits/chosen": 0.098558709025383, + "logits/rejected": 0.7595298886299133, + "logps/chosen": -484.1192932128906, + "logps/rejected": -835.5067138671875, + "loss": 0.2177, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.26068115234375, + "rewards/margins": 3.935065746307373, + "rewards/margins_max": 5.916754722595215, + "rewards/margins_min": 1.9533783197402954, + "rewards/margins_std": 2.802530288696289, + "rewards/rejected": -6.195747375488281, + "step": 1190 + }, + { + "epoch": 0.49, + "grad_norm": 1.2578125, + "learning_rate": 1.1935917862245069e-06, + "logits/chosen": 0.009665842168033123, + "logits/rejected": 0.692471981048584, + "logps/chosen": -446.05364990234375, + "logps/rejected": -773.1881103515625, + "loss": 0.3002, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.023267984390259, + "rewards/margins": 3.7675445079803467, + "rewards/margins_max": 6.028232574462891, + "rewards/margins_min": 1.506856083869934, + "rewards/margins_std": 3.197096347808838, + "rewards/rejected": -5.7908124923706055, + "step": 1200 + }, + { + "epoch": 0.5, + "grad_norm": 0.7890625, + "learning_rate": 1.1794662644143256e-06, + "logits/chosen": 0.022773366421461105, + "logits/rejected": 0.6556586027145386, + "logps/chosen": -499.6419982910156, + "logps/rejected": -979.7394409179688, + "loss": 0.2683, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6731841564178467, + "rewards/margins": 5.120915412902832, + "rewards/margins_max": 8.691125869750977, + "rewards/margins_min": 1.5507053136825562, + "rewards/margins_std": 5.049039840698242, + "rewards/rejected": -7.7940993309021, + "step": 1210 + }, + { + "epoch": 0.5, + "grad_norm": 0.9453125, + "learning_rate": 1.1653036427498352e-06, + "logits/chosen": 0.06103574112057686, + "logits/rejected": 0.5934966206550598, + "logps/chosen": -460.0519104003906, + "logps/rejected": -799.0787353515625, + "loss": 0.2657, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.192873239517212, + "rewards/margins": 3.3739676475524902, + "rewards/margins_max": 5.504680156707764, + "rewards/margins_min": 1.2432544231414795, + "rewards/margins_std": 3.0132834911346436, + "rewards/rejected": -5.566840648651123, + "step": 1220 + }, + { + "epoch": 0.51, + "grad_norm": 0.75390625, + "learning_rate": 1.1511068489745986e-06, + "logits/chosen": 0.07468974590301514, + "logits/rejected": 0.8071237802505493, + "logps/chosen": -477.79571533203125, + "logps/rejected": -913.1275634765625, + "loss": 0.249, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.136107921600342, + "rewards/margins": 4.6890764236450195, + "rewards/margins_max": 7.924353122711182, + "rewards/margins_min": 1.4538004398345947, + "rewards/margins_std": 4.575371742248535, + "rewards/rejected": -6.8251848220825195, + "step": 1230 + }, + { + "epoch": 0.51, + "grad_norm": 0.63671875, + "learning_rate": 1.1368788178963491e-06, + "logits/chosen": 0.06184614449739456, + "logits/rejected": 0.6528698205947876, + "logps/chosen": -485.544189453125, + "logps/rejected": -858.0904541015625, + "loss": 0.253, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4247965812683105, + "rewards/margins": 3.964677333831787, + "rewards/margins_max": 5.929955959320068, + "rewards/margins_min": 1.9993988275527954, + "rewards/margins_std": 2.7793235778808594, + "rewards/rejected": -6.389473915100098, + "step": 1240 + }, + { + "epoch": 0.51, + "grad_norm": 1.2109375, + "learning_rate": 1.1226224907802983e-06, + "logits/chosen": 0.134813591837883, + "logits/rejected": 0.5963112115859985, + "logps/chosen": -488.6258239746094, + "logps/rejected": -963.7542724609375, + "loss": 0.2625, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.413456439971924, + "rewards/margins": 5.094055652618408, + "rewards/margins_max": 8.218836784362793, + "rewards/margins_min": 1.9692729711532593, + "rewards/margins_std": 4.419109344482422, + "rewards/rejected": -7.507512092590332, + "step": 1250 + }, + { + "epoch": 0.52, + "grad_norm": 3.046875, + "learning_rate": 1.1083408147411073e-06, + "logits/chosen": 0.2207455337047577, + "logits/rejected": 0.8767908811569214, + "logps/chosen": -470.7706604003906, + "logps/rejected": -718.1776123046875, + "loss": 0.2267, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.117788791656494, + "rewards/margins": 3.199702024459839, + "rewards/margins_max": 5.136730670928955, + "rewards/margins_min": 1.2626738548278809, + "rewards/margins_std": 2.7393720149993896, + "rewards/rejected": -5.317490577697754, + "step": 1260 + }, + { + "epoch": 0.52, + "grad_norm": 1.40625, + "learning_rate": 1.0940367421336488e-06, + "logits/chosen": 0.10231053829193115, + "logits/rejected": 0.6745079755783081, + "logps/chosen": -482.825927734375, + "logps/rejected": -798.8470458984375, + "loss": 0.2976, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.555229663848877, + "rewards/margins": 3.457376480102539, + "rewards/margins_max": 5.613675594329834, + "rewards/margins_min": 1.301077127456665, + "rewards/margins_std": 3.0494678020477295, + "rewards/rejected": -6.012606143951416, + "step": 1270 + }, + { + "epoch": 0.53, + "grad_norm": 3.0625, + "learning_rate": 1.079713229942688e-06, + "logits/chosen": 0.14812633395195007, + "logits/rejected": 0.7574166059494019, + "logps/chosen": -506.6991271972656, + "logps/rejected": -787.7262573242188, + "loss": 0.2928, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.6476612091064453, + "rewards/margins": 3.1967766284942627, + "rewards/margins_max": 5.269640922546387, + "rewards/margins_min": 1.1239116191864014, + "rewards/margins_std": 2.931473731994629, + "rewards/rejected": -5.844437599182129, + "step": 1280 + }, + { + "epoch": 0.53, + "grad_norm": 1.21875, + "learning_rate": 1.0653732391716053e-06, + "logits/chosen": 0.12779296934604645, + "logits/rejected": 0.6562881469726562, + "logps/chosen": -452.0044860839844, + "logps/rejected": -755.9925537109375, + "loss": 0.2875, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.2710297107696533, + "rewards/margins": 3.172046184539795, + "rewards/margins_max": 5.278564929962158, + "rewards/margins_min": 1.0655282735824585, + "rewards/margins_std": 2.9790663719177246, + "rewards/rejected": -5.443076133728027, + "step": 1290 + }, + { + "epoch": 0.54, + "grad_norm": 0.88671875, + "learning_rate": 1.0510197342302864e-06, + "logits/chosen": 0.15161243081092834, + "logits/rejected": 0.683529257774353, + "logps/chosen": -446.7079162597656, + "logps/rejected": -819.0192260742188, + "loss": 0.3016, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.081666946411133, + "rewards/margins": 3.9550280570983887, + "rewards/margins_max": 6.584043025970459, + "rewards/margins_min": 1.3260126113891602, + "rewards/margins_std": 3.717989444732666, + "rewards/rejected": -6.036694526672363, + "step": 1300 + }, + { + "epoch": 0.54, + "grad_norm": 1.28125, + "learning_rate": 1.0366556823223101e-06, + "logits/chosen": 0.20373359322547913, + "logits/rejected": 0.6877504587173462, + "logps/chosen": -456.41015625, + "logps/rejected": -850.4407958984375, + "loss": 0.2665, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.4085049629211426, + "rewards/margins": 4.007854461669922, + "rewards/margins_max": 6.256316184997559, + "rewards/margins_min": 1.7593927383422852, + "rewards/margins_std": 3.1798055171966553, + "rewards/rejected": -6.416359901428223, + "step": 1310 + }, + { + "epoch": 0.54, + "grad_norm": 1.53125, + "learning_rate": 1.02228405283156e-06, + "logits/chosen": -0.020195502787828445, + "logits/rejected": 0.6102248430252075, + "logps/chosen": -493.6954040527344, + "logps/rejected": -811.216064453125, + "loss": 0.2313, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.36457896232605, + "rewards/margins": 3.4458298683166504, + "rewards/margins_max": 5.659657955169678, + "rewards/margins_min": 1.2320020198822021, + "rewards/margins_std": 3.130825996398926, + "rewards/rejected": -5.810408592224121, + "step": 1320 + }, + { + "epoch": 0.55, + "grad_norm": 0.8828125, + "learning_rate": 1.0079078167083814e-06, + "logits/chosen": 0.16562719643115997, + "logits/rejected": 0.8193610906600952, + "logps/chosen": -526.57763671875, + "logps/rejected": -813.3748168945312, + "loss": 0.2867, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.440490245819092, + "rewards/margins": 3.532073497772217, + "rewards/margins_max": 5.491944789886475, + "rewards/margins_min": 1.5722014904022217, + "rewards/margins_std": 2.7716774940490723, + "rewards/rejected": -5.972563743591309, + "step": 1330 + }, + { + "epoch": 0.55, + "grad_norm": 1.1171875, + "learning_rate": 9.935299458554181e-07, + "logits/chosen": -0.01127061527222395, + "logits/rejected": 0.612311601638794, + "logps/chosen": -485.59979248046875, + "logps/rejected": -858.2190551757812, + "loss": 0.2744, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.3641650676727295, + "rewards/margins": 4.078332901000977, + "rewards/margins_max": 6.7388482093811035, + "rewards/margins_min": 1.4178178310394287, + "rewards/margins_std": 3.7625365257263184, + "rewards/rejected": -6.442498683929443, + "step": 1340 + }, + { + "epoch": 0.56, + "grad_norm": 2.671875, + "learning_rate": 9.791534125132508e-07, + "logits/chosen": 0.03967234492301941, + "logits/rejected": 0.7782914042472839, + "logps/chosen": -573.2506713867188, + "logps/rejected": -899.2452392578125, + "loss": 0.2705, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7396163940429688, + "rewards/margins": 4.099890232086182, + "rewards/margins_max": 6.30058479309082, + "rewards/margins_min": 1.8991953134536743, + "rewards/margins_std": 3.1122524738311768, + "rewards/rejected": -6.839505672454834, + "step": 1350 + }, + { + "epoch": 0.56, + "grad_norm": 0.83984375, + "learning_rate": 9.64781188645965e-07, + "logits/chosen": 0.10714595019817352, + "logits/rejected": 0.7092632055282593, + "logps/chosen": -544.226806640625, + "logps/rejected": -752.5347290039062, + "loss": 0.3475, + "rewards/accuracies": 0.8500000238418579, + "rewards/chosen": -2.7692503929138184, + "rewards/margins": 2.6476550102233887, + "rewards/margins_max": 4.169407844543457, + "rewards/margins_min": 1.125902533531189, + "rewards/margins_std": 2.152083396911621, + "rewards/rejected": -5.416905403137207, + "step": 1360 + }, + { + "epoch": 0.56, + "grad_norm": 1.328125, + "learning_rate": 9.504162453267776e-07, + "logits/chosen": -0.025841986760497093, + "logits/rejected": 0.49216756224632263, + "logps/chosen": -457.9346618652344, + "logps/rejected": -994.6002807617188, + "loss": 0.2278, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.288224458694458, + "rewards/margins": 5.407242774963379, + "rewards/margins_max": 8.10576343536377, + "rewards/margins_min": 2.70872163772583, + "rewards/margins_std": 3.8162853717803955, + "rewards/rejected": -7.695467472076416, + "step": 1370 + }, + { + "epoch": 0.57, + "grad_norm": 0.9453125, + "learning_rate": 9.360615521238475e-07, + "logits/chosen": 0.24673600494861603, + "logits/rejected": 0.7878357172012329, + "logps/chosen": -515.7044067382812, + "logps/rejected": -769.2423706054688, + "loss": 0.2768, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.598513603210449, + "rewards/margins": 2.9555516242980957, + "rewards/margins_max": 4.985101699829102, + "rewards/margins_min": 0.9260021448135376, + "rewards/margins_std": 2.8702168464660645, + "rewards/rejected": -5.554066181182861, + "step": 1380 + }, + { + "epoch": 0.57, + "grad_norm": 0.56640625, + "learning_rate": 9.217200764863956e-07, + "logits/chosen": 0.13058429956436157, + "logits/rejected": 0.7461265325546265, + "logps/chosen": -489.1720275878906, + "logps/rejected": -916.9035034179688, + "loss": 0.2409, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.4763474464416504, + "rewards/margins": 4.656981468200684, + "rewards/margins_max": 7.720522880554199, + "rewards/margins_min": 1.5934394598007202, + "rewards/margins_std": 4.332502365112305, + "rewards/rejected": -7.133328914642334, + "step": 1390 + }, + { + "epoch": 0.58, + "grad_norm": 4.125, + "learning_rate": 9.073947831312634e-07, + "logits/chosen": 0.19845367968082428, + "logits/rejected": 0.6116907000541687, + "logps/chosen": -449.885498046875, + "logps/rejected": -832.296875, + "loss": 0.2708, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.4242727756500244, + "rewards/margins": 3.8387694358825684, + "rewards/margins_max": 6.290716648101807, + "rewards/margins_min": 1.386821985244751, + "rewards/margins_std": 3.4675774574279785, + "rewards/rejected": -6.263042449951172, + "step": 1400 + }, + { + "epoch": 0.58, + "grad_norm": 0.71875, + "learning_rate": 8.930886334300395e-07, + "logits/chosen": 0.06111987307667732, + "logits/rejected": 0.71096271276474, + "logps/chosen": -545.9169921875, + "logps/rejected": -853.5006103515625, + "loss": 0.2583, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.761983633041382, + "rewards/margins": 3.652026653289795, + "rewards/margins_max": 6.0150017738342285, + "rewards/margins_min": 1.2890517711639404, + "rewards/margins_std": 3.3417510986328125, + "rewards/rejected": -6.414010047912598, + "step": 1410 + }, + { + "epoch": 0.58, + "grad_norm": 1.3125, + "learning_rate": 8.78804584796872e-07, + "logits/chosen": 0.035090453922748566, + "logits/rejected": 0.6259672045707703, + "logps/chosen": -474.5460510253906, + "logps/rejected": -847.3800048828125, + "loss": 0.2675, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.222008466720581, + "rewards/margins": 3.796189785003662, + "rewards/margins_max": 6.324049949645996, + "rewards/margins_min": 1.2683302164077759, + "rewards/margins_std": 3.5749340057373047, + "rewards/rejected": -6.018198013305664, + "step": 1420 + }, + { + "epoch": 0.59, + "grad_norm": 2.6875, + "learning_rate": 8.645455900771052e-07, + "logits/chosen": 0.11879072338342667, + "logits/rejected": 0.7143687009811401, + "logps/chosen": -543.6822509765625, + "logps/rejected": -943.021484375, + "loss": 0.2134, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0677709579467773, + "rewards/margins": 4.426670551300049, + "rewards/margins_max": 7.0038580894470215, + "rewards/margins_min": 1.8494832515716553, + "rewards/margins_std": 3.644692897796631, + "rewards/rejected": -7.494442939758301, + "step": 1430 + }, + { + "epoch": 0.59, + "grad_norm": 0.86328125, + "learning_rate": 8.503145969368561e-07, + "logits/chosen": 0.0862460657954216, + "logits/rejected": 0.5994366407394409, + "logps/chosen": -504.5082092285156, + "logps/rejected": -958.7373046875, + "loss": 0.2234, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.5698065757751465, + "rewards/margins": 4.695794105529785, + "rewards/margins_max": 7.5340986251831055, + "rewards/margins_min": 1.857489824295044, + "rewards/margins_std": 4.013968467712402, + "rewards/rejected": -7.265600681304932, + "step": 1440 + }, + { + "epoch": 0.6, + "grad_norm": 1.2578125, + "learning_rate": 8.361145472536617e-07, + "logits/chosen": 0.148963063955307, + "logits/rejected": 0.7144413590431213, + "logps/chosen": -518.219970703125, + "logps/rejected": -832.0569458007812, + "loss": 0.2983, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5618433952331543, + "rewards/margins": 3.5098252296447754, + "rewards/margins_max": 5.876803398132324, + "rewards/margins_min": 1.1428462266921997, + "rewards/margins_std": 3.3474135398864746, + "rewards/rejected": -6.071669101715088, + "step": 1450 + }, + { + "epoch": 0.6, + "grad_norm": 0.83984375, + "learning_rate": 8.219483765083293e-07, + "logits/chosen": 0.03291046619415283, + "logits/rejected": 0.5989701151847839, + "logps/chosen": -541.6590576171875, + "logps/rejected": -917.2067260742188, + "loss": 0.2071, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.0608572959899902, + "rewards/margins": 4.072875022888184, + "rewards/margins_max": 6.147627830505371, + "rewards/margins_min": 1.9981224536895752, + "rewards/margins_std": 2.934143304824829, + "rewards/rejected": -7.133731842041016, + "step": 1460 + }, + { + "epoch": 0.61, + "grad_norm": 0.703125, + "learning_rate": 8.078190131780982e-07, + "logits/chosen": 0.10352887213230133, + "logits/rejected": 0.5709009766578674, + "logps/chosen": -459.8164978027344, + "logps/rejected": -901.9567260742188, + "loss": 0.211, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.477700710296631, + "rewards/margins": 4.241365909576416, + "rewards/margins_max": 6.99854040145874, + "rewards/margins_min": 1.4841907024383545, + "rewards/margins_std": 3.8992340564727783, + "rewards/rejected": -6.719066619873047, + "step": 1470 + }, + { + "epoch": 0.61, + "grad_norm": 1.1953125, + "learning_rate": 7.9372937813126e-07, + "logits/chosen": 0.08338715136051178, + "logits/rejected": 0.7014255523681641, + "logps/chosen": -550.0524291992188, + "logps/rejected": -882.2293090820312, + "loss": 0.2304, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9287476539611816, + "rewards/margins": 3.599247455596924, + "rewards/margins_max": 5.723820686340332, + "rewards/margins_min": 1.4746736288070679, + "rewards/margins_std": 3.004601001739502, + "rewards/rejected": -6.5279951095581055, + "step": 1480 + }, + { + "epoch": 0.61, + "grad_norm": 2.6875, + "learning_rate": 7.796823840233442e-07, + "logits/chosen": 0.04040234535932541, + "logits/rejected": 0.7684676051139832, + "logps/chosen": -565.4691772460938, + "logps/rejected": -789.9411010742188, + "loss": 0.3451, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.7227869033813477, + "rewards/margins": 2.916978359222412, + "rewards/margins_max": 4.791220664978027, + "rewards/margins_min": 1.0427358150482178, + "rewards/margins_std": 2.6505794525146484, + "rewards/rejected": -5.639765739440918, + "step": 1490 + }, + { + "epoch": 0.62, + "grad_norm": 1.3828125, + "learning_rate": 7.656809346950066e-07, + "logits/chosen": 0.04201055318117142, + "logits/rejected": 0.6305156946182251, + "logps/chosen": -497.396240234375, + "logps/rejected": -962.208984375, + "loss": 0.2567, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.571776866912842, + "rewards/margins": 4.9352216720581055, + "rewards/margins_max": 8.420190811157227, + "rewards/margins_min": 1.450252890586853, + "rewards/margins_std": 4.928489685058594, + "rewards/rejected": -7.506998538970947, + "step": 1500 + }, + { + "epoch": 0.62, + "grad_norm": 1.046875, + "learning_rate": 7.517279245717367e-07, + "logits/chosen": 0.11693109571933746, + "logits/rejected": 0.6607118844985962, + "logps/chosen": -458.0367126464844, + "logps/rejected": -978.6554565429688, + "loss": 0.2966, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.4729740619659424, + "rewards/margins": 5.092294216156006, + "rewards/margins_max": 8.261327743530273, + "rewards/margins_min": 1.9232604503631592, + "rewards/margins_std": 4.481690406799316, + "rewards/rejected": -7.565268039703369, + "step": 1510 + }, + { + "epoch": 0.63, + "grad_norm": 1.1875, + "learning_rate": 7.378262380655118e-07, + "logits/chosen": 0.055606938898563385, + "logits/rejected": 0.7165523171424866, + "logps/chosen": -509.15155029296875, + "logps/rejected": -901.2708740234375, + "loss": 0.2279, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.534592390060425, + "rewards/margins": 4.365265846252441, + "rewards/margins_max": 7.091165065765381, + "rewards/margins_min": 1.6393667459487915, + "rewards/margins_std": 3.8550033569335938, + "rewards/rejected": -6.899857997894287, + "step": 1520 + }, + { + "epoch": 0.63, + "grad_norm": 1.6484375, + "learning_rate": 7.239787489785247e-07, + "logits/chosen": 0.1286771148443222, + "logits/rejected": 0.7139743566513062, + "logps/chosen": -519.9439086914062, + "logps/rejected": -866.30078125, + "loss": 0.2376, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.677018404006958, + "rewards/margins": 4.071950912475586, + "rewards/margins_max": 7.0152387619018555, + "rewards/margins_min": 1.1286628246307373, + "rewards/margins_std": 4.16243839263916, + "rewards/rejected": -6.748970031738281, + "step": 1530 + }, + { + "epoch": 0.63, + "grad_norm": 0.75, + "learning_rate": 7.101883199090987e-07, + "logits/chosen": 0.1824348419904709, + "logits/rejected": 0.6094152331352234, + "logps/chosen": -533.3997802734375, + "logps/rejected": -980.6068115234375, + "loss": 0.1974, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.984278440475464, + "rewards/margins": 4.562946796417236, + "rewards/margins_max": 7.375452518463135, + "rewards/margins_min": 1.7504408359527588, + "rewards/margins_std": 3.9774837493896484, + "rewards/rejected": -7.547224998474121, + "step": 1540 + }, + { + "epoch": 0.64, + "grad_norm": 1.1015625, + "learning_rate": 6.964578016599238e-07, + "logits/chosen": 0.1528719961643219, + "logits/rejected": 0.7347079515457153, + "logps/chosen": -511.112060546875, + "logps/rejected": -1012.8218994140625, + "loss": 0.2411, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.4701285362243652, + "rewards/margins": 5.128227233886719, + "rewards/margins_max": 7.814431667327881, + "rewards/margins_min": 2.4420230388641357, + "rewards/margins_std": 3.7988662719726562, + "rewards/rejected": -7.598355770111084, + "step": 1550 + }, + { + "epoch": 0.64, + "grad_norm": 0.87890625, + "learning_rate": 6.827900326487286e-07, + "logits/chosen": 0.12157417833805084, + "logits/rejected": 0.8184272646903992, + "logps/chosen": -529.847900390625, + "logps/rejected": -1091.0225830078125, + "loss": 0.2172, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8752691745758057, + "rewards/margins": 5.952185153961182, + "rewards/margins_max": 9.909235000610352, + "rewards/margins_min": 1.995133399963379, + "rewards/margins_std": 5.596114635467529, + "rewards/rejected": -8.82745361328125, + "step": 1560 + }, + { + "epoch": 0.65, + "grad_norm": 3.078125, + "learning_rate": 6.691878383215141e-07, + "logits/chosen": 0.09048546850681305, + "logits/rejected": 0.5541390776634216, + "logps/chosen": -566.58349609375, + "logps/rejected": -987.8902587890625, + "loss": 0.3041, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.2955403327941895, + "rewards/margins": 4.377233028411865, + "rewards/margins_max": 6.9765167236328125, + "rewards/margins_min": 1.7779486179351807, + "rewards/margins_std": 3.6759426593780518, + "rewards/rejected": -7.672772407531738, + "step": 1570 + }, + { + "epoch": 0.65, + "grad_norm": 2.84375, + "learning_rate": 6.556540305684669e-07, + "logits/chosen": -0.011808687821030617, + "logits/rejected": 0.6870378851890564, + "logps/chosen": -546.3944091796875, + "logps/rejected": -931.5330200195312, + "loss": 0.2546, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.909376621246338, + "rewards/margins": 4.274510860443115, + "rewards/margins_max": 6.635194301605225, + "rewards/margins_min": 1.913827896118164, + "rewards/margins_std": 3.338510513305664, + "rewards/rejected": -7.183887481689453, + "step": 1580 + }, + { + "epoch": 0.65, + "grad_norm": 1.6015625, + "learning_rate": 6.421914071426778e-07, + "logits/chosen": -0.04181584715843201, + "logits/rejected": 0.5624270439147949, + "logps/chosen": -541.0858154296875, + "logps/rejected": -971.2429809570312, + "loss": 0.3229, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.9971630573272705, + "rewards/margins": 4.618222236633301, + "rewards/margins_max": 7.345058441162109, + "rewards/margins_min": 1.891385793685913, + "rewards/margins_std": 3.8563284873962402, + "rewards/rejected": -7.615384578704834, + "step": 1590 + }, + { + "epoch": 0.66, + "grad_norm": 0.63671875, + "learning_rate": 6.288027510817791e-07, + "logits/chosen": 0.15334565937519073, + "logits/rejected": 0.8467614054679871, + "logps/chosen": -588.7959594726562, + "logps/rejected": -1015.6530151367188, + "loss": 0.238, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.3042540550231934, + "rewards/margins": 4.745846748352051, + "rewards/margins_max": 7.398883819580078, + "rewards/margins_min": 2.092808246612549, + "rewards/margins_std": 3.751962184906006, + "rewards/rejected": -8.050100326538086, + "step": 1600 + }, + { + "epoch": 0.66, + "grad_norm": 0.86328125, + "learning_rate": 6.154908301326289e-07, + "logits/chosen": 0.07106464356184006, + "logits/rejected": 0.6679781675338745, + "logps/chosen": -521.3948364257812, + "logps/rejected": -967.5283203125, + "loss": 0.3055, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4520983695983887, + "rewards/margins": 4.829917907714844, + "rewards/margins_max": 7.988490104675293, + "rewards/margins_min": 1.6713447570800781, + "rewards/margins_std": 4.4668965339660645, + "rewards/rejected": -7.282015800476074, + "step": 1610 + }, + { + "epoch": 0.67, + "grad_norm": 1.0859375, + "learning_rate": 6.022583961791494e-07, + "logits/chosen": 0.06975733488798141, + "logits/rejected": 0.6143258810043335, + "logps/chosen": -547.225341796875, + "logps/rejected": -929.7529296875, + "loss": 0.2615, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8573384284973145, + "rewards/margins": 4.046209335327148, + "rewards/margins_max": 6.497920036315918, + "rewards/margins_min": 1.5944980382919312, + "rewards/margins_std": 3.467243194580078, + "rewards/rejected": -6.903547763824463, + "step": 1620 + }, + { + "epoch": 0.67, + "grad_norm": 4.28125, + "learning_rate": 5.891081846734518e-07, + "logits/chosen": 0.03396327421069145, + "logits/rejected": 0.680204451084137, + "logps/chosen": -582.3570556640625, + "logps/rejected": -1022.9677734375, + "loss": 0.2667, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.1391189098358154, + "rewards/margins": 4.894112586975098, + "rewards/margins_max": 7.7569475173950195, + "rewards/margins_min": 2.0312764644622803, + "rewards/margins_std": 4.048661231994629, + "rewards/rejected": -8.033230781555176, + "step": 1630 + }, + { + "epoch": 0.68, + "grad_norm": 1.5, + "learning_rate": 5.760429140703533e-07, + "logits/chosen": 0.1480637490749359, + "logits/rejected": 0.6867285370826721, + "logps/chosen": -511.46832275390625, + "logps/rejected": -906.4397583007812, + "loss": 0.2314, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.893805980682373, + "rewards/margins": 4.286385536193848, + "rewards/margins_max": 6.519837856292725, + "rewards/margins_min": 2.052934169769287, + "rewards/margins_std": 3.1585774421691895, + "rewards/rejected": -7.180192470550537, + "step": 1640 + }, + { + "epoch": 0.68, + "grad_norm": 2.3125, + "learning_rate": 5.63065285265409e-07, + "logits/chosen": 0.016267577186226845, + "logits/rejected": 0.6522185206413269, + "logps/chosen": -537.4530029296875, + "logps/rejected": -844.8182373046875, + "loss": 0.31, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0314884185791016, + "rewards/margins": 3.5092055797576904, + "rewards/margins_max": 5.994574546813965, + "rewards/margins_min": 1.0238367319107056, + "rewards/margins_std": 3.514842987060547, + "rewards/rejected": -6.540694236755371, + "step": 1650 + }, + { + "epoch": 0.68, + "grad_norm": 1.2421875, + "learning_rate": 5.501779810365744e-07, + "logits/chosen": 0.10497574508190155, + "logits/rejected": 0.6778794527053833, + "logps/chosen": -606.8820190429688, + "logps/rejected": -939.759765625, + "loss": 0.2511, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.223036527633667, + "rewards/margins": 3.837195873260498, + "rewards/margins_max": 6.3507466316223145, + "rewards/margins_min": 1.323644757270813, + "rewards/margins_std": 3.5546982288360596, + "rewards/rejected": -7.060232639312744, + "step": 1660 + }, + { + "epoch": 0.69, + "grad_norm": 4.71875, + "learning_rate": 5.373836654896127e-07, + "logits/chosen": 0.21054425835609436, + "logits/rejected": 0.7104513645172119, + "logps/chosen": -551.0523071289062, + "logps/rejected": -927.8137817382812, + "loss": 0.2092, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.922978162765503, + "rewards/margins": 4.11228084564209, + "rewards/margins_max": 6.838304042816162, + "rewards/margins_min": 1.3862587213516235, + "rewards/margins_std": 3.855178117752075, + "rewards/rejected": -7.035260200500488, + "step": 1670 + }, + { + "epoch": 0.69, + "grad_norm": 3.4375, + "learning_rate": 5.246849835073623e-07, + "logits/chosen": 0.22590751945972443, + "logits/rejected": 0.653782844543457, + "logps/chosen": -501.31475830078125, + "logps/rejected": -810.4156494140625, + "loss": 0.2812, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.845599412918091, + "rewards/margins": 3.173737049102783, + "rewards/margins_max": 4.869901657104492, + "rewards/margins_min": 1.4775731563568115, + "rewards/margins_std": 2.398738384246826, + "rewards/rejected": -6.019336700439453, + "step": 1680 + }, + { + "epoch": 0.7, + "grad_norm": 0.80859375, + "learning_rate": 5.120845602029775e-07, + "logits/chosen": 0.1999841332435608, + "logits/rejected": 0.7564531564712524, + "logps/chosen": -498.1185607910156, + "logps/rejected": -965.1281127929688, + "loss": 0.1513, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7307372093200684, + "rewards/margins": 4.529786586761475, + "rewards/margins_max": 6.7064385414123535, + "rewards/margins_min": 2.3531341552734375, + "rewards/margins_std": 3.0782508850097656, + "rewards/rejected": -7.260523796081543, + "step": 1690 + }, + { + "epoch": 0.7, + "grad_norm": 1.2109375, + "learning_rate": 4.995850003772563e-07, + "logits/chosen": 0.1419237107038498, + "logits/rejected": 0.6594254970550537, + "logps/chosen": -526.7871704101562, + "logps/rejected": -946.35400390625, + "loss": 0.2515, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -2.9440999031066895, + "rewards/margins": 4.337802886962891, + "rewards/margins_max": 7.0636725425720215, + "rewards/margins_min": 1.6119331121444702, + "rewards/margins_std": 3.854962110519409, + "rewards/rejected": -7.281902313232422, + "step": 1700 + }, + { + "epoch": 0.7, + "grad_norm": 2.875, + "learning_rate": 4.871888879801684e-07, + "logits/chosen": 0.1439136564731598, + "logits/rejected": 0.6816688776016235, + "logps/chosen": -516.1966552734375, + "logps/rejected": -867.43115234375, + "loss": 0.2889, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8488383293151855, + "rewards/margins": 3.843712568283081, + "rewards/margins_max": 6.232985019683838, + "rewards/margins_min": 1.4544405937194824, + "rewards/margins_std": 3.378941297531128, + "rewards/rejected": -6.692551612854004, + "step": 1710 + }, + { + "epoch": 0.71, + "grad_norm": 7.5625, + "learning_rate": 4.7489878557669236e-07, + "logits/chosen": 0.1883472502231598, + "logits/rejected": 0.6354864835739136, + "logps/chosen": -522.1241455078125, + "logps/rejected": -909.0133666992188, + "loss": 0.2573, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9454784393310547, + "rewards/margins": 4.083091735839844, + "rewards/margins_max": 7.010195732116699, + "rewards/margins_min": 1.1559871435165405, + "rewards/margins_std": 4.139551639556885, + "rewards/rejected": -7.028570652008057, + "step": 1720 + }, + { + "epoch": 0.71, + "grad_norm": 2.875, + "learning_rate": 4.6271723381707204e-07, + "logits/chosen": -0.010915858671069145, + "logits/rejected": 0.5235196352005005, + "logps/chosen": -535.598876953125, + "logps/rejected": -1009.626953125, + "loss": 0.2645, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0873775482177734, + "rewards/margins": 4.849740028381348, + "rewards/margins_max": 7.774973392486572, + "rewards/margins_min": 1.9245054721832275, + "rewards/margins_std": 4.136904716491699, + "rewards/rejected": -7.937117099761963, + "step": 1730 + }, + { + "epoch": 0.72, + "grad_norm": 3.203125, + "learning_rate": 4.5064675091160777e-07, + "logits/chosen": -0.08643798530101776, + "logits/rejected": 0.6465299725532532, + "logps/chosen": -634.9217529296875, + "logps/rejected": -949.5773315429688, + "loss": 0.5499, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.7211272716522217, + "rewards/margins": 3.7824363708496094, + "rewards/margins_max": 6.287593841552734, + "rewards/margins_min": 1.2772791385650635, + "rewards/margins_std": 3.5428271293640137, + "rewards/rejected": -7.503562927246094, + "step": 1740 + }, + { + "epoch": 0.72, + "grad_norm": 0.94921875, + "learning_rate": 4.386898321100817e-07, + "logits/chosen": 0.16737410426139832, + "logits/rejected": 0.7846443057060242, + "logps/chosen": -571.4229736328125, + "logps/rejected": -1064.5064697265625, + "loss": 0.2579, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.09366774559021, + "rewards/margins": 5.503853797912598, + "rewards/margins_max": 9.53122615814209, + "rewards/margins_min": 1.476481318473816, + "rewards/margins_std": 5.695565223693848, + "rewards/rejected": -8.59752082824707, + "step": 1750 + }, + { + "epoch": 0.72, + "grad_norm": 0.6796875, + "learning_rate": 4.268489491859335e-07, + "logits/chosen": 0.14373087882995605, + "logits/rejected": 0.619976818561554, + "logps/chosen": -571.4114990234375, + "logps/rejected": -1039.7352294921875, + "loss": 0.2761, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.399026870727539, + "rewards/margins": 4.906405448913574, + "rewards/margins_max": 7.806390285491943, + "rewards/margins_min": 2.006420850753784, + "rewards/margins_std": 4.101198196411133, + "rewards/rejected": -8.30543327331543, + "step": 1760 + }, + { + "epoch": 0.73, + "grad_norm": 3.421875, + "learning_rate": 4.151265499252841e-07, + "logits/chosen": 0.10607640445232391, + "logits/rejected": 0.7857314944267273, + "logps/chosen": -559.5725708007812, + "logps/rejected": -937.8776245117188, + "loss": 0.2363, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.8877224922180176, + "rewards/margins": 4.257446765899658, + "rewards/margins_max": 6.9602532386779785, + "rewards/margins_min": 1.554640293121338, + "rewards/margins_std": 3.82234525680542, + "rewards/rejected": -7.145169258117676, + "step": 1770 + }, + { + "epoch": 0.73, + "grad_norm": 1.328125, + "learning_rate": 4.0352505762092436e-07, + "logits/chosen": 0.0463348887860775, + "logits/rejected": 0.6185473799705505, + "logps/chosen": -528.8240356445312, + "logps/rejected": -927.42822265625, + "loss": 0.227, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.7649338245391846, + "rewards/margins": 4.2219719886779785, + "rewards/margins_max": 6.481306552886963, + "rewards/margins_min": 1.962636947631836, + "rewards/margins_std": 3.1951823234558105, + "rewards/rejected": -6.986905574798584, + "step": 1780 + }, + { + "epoch": 0.74, + "grad_norm": 1.7734375, + "learning_rate": 3.920468705713629e-07, + "logits/chosen": 0.15475311875343323, + "logits/rejected": 0.6595792174339294, + "logps/chosen": -510.0287170410156, + "logps/rejected": -1023.1057739257812, + "loss": 0.242, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.653700113296509, + "rewards/margins": 5.281457424163818, + "rewards/margins_max": 8.370940208435059, + "rewards/margins_min": 2.191974639892578, + "rewards/margins_std": 4.3691887855529785, + "rewards/rejected": -7.93515682220459, + "step": 1790 + }, + { + "epoch": 0.74, + "grad_norm": 1.4296875, + "learning_rate": 3.8069436158504163e-07, + "logits/chosen": 0.11404214799404144, + "logits/rejected": 0.6743693351745605, + "logps/chosen": -570.0484619140625, + "logps/rejected": -1008.6219482421875, + "loss": 0.2144, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.973473072052002, + "rewards/margins": 4.615906715393066, + "rewards/margins_max": 6.867118835449219, + "rewards/margins_min": 2.3646950721740723, + "rewards/margins_std": 3.183694362640381, + "rewards/rejected": -7.58937931060791, + "step": 1800 + }, + { + "epoch": 0.75, + "grad_norm": 0.99609375, + "learning_rate": 3.6946987748982196e-07, + "logits/chosen": -0.07372093200683594, + "logits/rejected": 0.47911280393600464, + "logps/chosen": -601.9035034179688, + "logps/rejected": -1067.716064453125, + "loss": 0.248, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.395397186279297, + "rewards/margins": 4.819011211395264, + "rewards/margins_max": 7.499534606933594, + "rewards/margins_min": 2.1384873390197754, + "rewards/margins_std": 3.790832996368408, + "rewards/rejected": -8.214407920837402, + "step": 1810 + }, + { + "epoch": 0.75, + "grad_norm": 1.7109375, + "learning_rate": 3.5837573864783886e-07, + "logits/chosen": -0.05284532159566879, + "logits/rejected": 0.6970399618148804, + "logps/chosen": -556.5332641601562, + "logps/rejected": -932.3153076171875, + "loss": 0.2203, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.8831331729888916, + "rewards/margins": 4.284741401672363, + "rewards/margins_max": 6.570008277893066, + "rewards/margins_min": 1.9994745254516602, + "rewards/margins_std": 3.2318553924560547, + "rewards/rejected": -7.167874813079834, + "step": 1820 + }, + { + "epoch": 0.75, + "grad_norm": 1.234375, + "learning_rate": 3.4741423847583127e-07, + "logits/chosen": 0.2800007462501526, + "logits/rejected": 0.9037263989448547, + "logps/chosen": -551.2252807617188, + "logps/rejected": -1097.2529296875, + "loss": 0.2205, + "rewards/accuracies": 0.987500011920929, + "rewards/chosen": -2.862596035003662, + "rewards/margins": 5.937441825866699, + "rewards/margins_max": 9.964279174804688, + "rewards/margins_min": 1.9106025695800781, + "rewards/margins_std": 5.694809913635254, + "rewards/rejected": -8.800037384033203, + "step": 1830 + }, + { + "epoch": 0.76, + "grad_norm": 1.734375, + "learning_rate": 3.365876429710366e-07, + "logits/chosen": 0.026043016463518143, + "logits/rejected": 0.6701821088790894, + "logps/chosen": -580.5740356445312, + "logps/rejected": -984.2448120117188, + "loss": 0.227, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0298023223876953, + "rewards/margins": 4.495330333709717, + "rewards/margins_max": 7.44777774810791, + "rewards/margins_min": 1.5428824424743652, + "rewards/margins_std": 4.175391674041748, + "rewards/rejected": -7.525132179260254, + "step": 1840 + }, + { + "epoch": 0.76, + "grad_norm": 1.375, + "learning_rate": 3.2589819024275744e-07, + "logits/chosen": 0.14268314838409424, + "logits/rejected": 0.7200323939323425, + "logps/chosen": -548.6128540039062, + "logps/rejected": -1005.54736328125, + "loss": 0.2673, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.04203462600708, + "rewards/margins": 4.95261812210083, + "rewards/margins_max": 7.634843349456787, + "rewards/margins_min": 2.2703919410705566, + "rewards/margins_std": 3.7932395935058594, + "rewards/rejected": -7.99465274810791, + "step": 1850 + }, + { + "epoch": 0.77, + "grad_norm": 1.3515625, + "learning_rate": 3.1534809004969186e-07, + "logits/chosen": 0.04652264714241028, + "logits/rejected": 0.6002200841903687, + "logps/chosen": -528.8763427734375, + "logps/rejected": -903.3206787109375, + "loss": 0.2614, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.9814038276672363, + "rewards/margins": 4.039307594299316, + "rewards/margins_max": 6.4738287925720215, + "rewards/margins_min": 1.6047871112823486, + "rewards/margins_std": 3.442932605743408, + "rewards/rejected": -7.020711421966553, + "step": 1860 + }, + { + "epoch": 0.77, + "grad_norm": 1.921875, + "learning_rate": 3.049395233431259e-07, + "logits/chosen": 0.07690130174160004, + "logits/rejected": 0.6124612092971802, + "logps/chosen": -497.55072021484375, + "logps/rejected": -863.54052734375, + "loss": 0.2795, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6492507457733154, + "rewards/margins": 3.8400909900665283, + "rewards/margins_max": 6.673186302185059, + "rewards/margins_min": 1.0069960355758667, + "rewards/margins_std": 4.006600856781006, + "rewards/rejected": -6.489341735839844, + "step": 1870 + }, + { + "epoch": 0.77, + "grad_norm": 1.3515625, + "learning_rate": 2.946746418160787e-07, + "logits/chosen": 0.10635950416326523, + "logits/rejected": 0.8063497543334961, + "logps/chosen": -539.3719482421875, + "logps/rejected": -899.5970458984375, + "loss": 0.222, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.922797441482544, + "rewards/margins": 3.8881256580352783, + "rewards/margins_max": 6.072175025939941, + "rewards/margins_min": 1.7040764093399048, + "rewards/margins_std": 3.088712215423584, + "rewards/rejected": -6.810922145843506, + "step": 1880 + }, + { + "epoch": 0.78, + "grad_norm": 0.66015625, + "learning_rate": 2.8455556745849905e-07, + "logits/chosen": 0.22083833813667297, + "logits/rejected": 0.6882720589637756, + "logps/chosen": -562.1170043945312, + "logps/rejected": -982.6512451171875, + "loss": 0.2326, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0150840282440186, + "rewards/margins": 4.539645671844482, + "rewards/margins_max": 7.401745796203613, + "rewards/margins_min": 1.6775459051132202, + "rewards/margins_std": 4.04762077331543, + "rewards/rejected": -7.554730415344238, + "step": 1890 + }, + { + "epoch": 0.78, + "grad_norm": 1.1953125, + "learning_rate": 2.745843921185991e-07, + "logits/chosen": 0.10289929062128067, + "logits/rejected": 0.6530742049217224, + "logps/chosen": -491.39947509765625, + "logps/rejected": -889.4318237304688, + "loss": 0.1988, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -2.5300493240356445, + "rewards/margins": 4.14646053314209, + "rewards/margins_max": 6.6356048583984375, + "rewards/margins_min": 1.6573164463043213, + "rewards/margins_std": 3.520181179046631, + "rewards/rejected": -6.676509857177734, + "step": 1900 + }, + { + "epoch": 0.79, + "grad_norm": 6.75, + "learning_rate": 2.647631770704217e-07, + "logits/chosen": 0.0862136036157608, + "logits/rejected": 0.6793027520179749, + "logps/chosen": -587.4644775390625, + "logps/rejected": -1091.2603759765625, + "loss": 0.3395, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.415440320968628, + "rewards/margins": 5.334762096405029, + "rewards/margins_max": 8.447580337524414, + "rewards/margins_min": 2.2219443321228027, + "rewards/margins_std": 4.4021897315979, + "rewards/rejected": -8.750203132629395, + "step": 1910 + }, + { + "epoch": 0.79, + "grad_norm": 3.96875, + "learning_rate": 2.550939525877269e-07, + "logits/chosen": 0.1680660843849182, + "logits/rejected": 0.8692294955253601, + "logps/chosen": -567.0643310546875, + "logps/rejected": -903.1422729492188, + "loss": 0.2927, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.0026445388793945, + "rewards/margins": 3.917665481567383, + "rewards/margins_max": 6.735628604888916, + "rewards/margins_min": 1.0997036695480347, + "rewards/margins_std": 3.9852001667022705, + "rewards/rejected": -6.920310020446777, + "step": 1920 + }, + { + "epoch": 0.79, + "grad_norm": 1.15625, + "learning_rate": 2.455787175242867e-07, + "logits/chosen": -0.026486584916710854, + "logits/rejected": 0.6239403486251831, + "logps/chosen": -556.9910888671875, + "logps/rejected": -1082.27490234375, + "loss": 0.2382, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8589916229248047, + "rewards/margins": 5.537055015563965, + "rewards/margins_max": 8.529805183410645, + "rewards/margins_min": 2.5443062782287598, + "rewards/margins_std": 4.232387065887451, + "rewards/rejected": -8.396047592163086, + "step": 1930 + }, + { + "epoch": 0.8, + "grad_norm": 2.203125, + "learning_rate": 2.3621943890067608e-07, + "logits/chosen": 0.13468703627586365, + "logits/rejected": 0.7454935908317566, + "logps/chosen": -593.1705322265625, + "logps/rejected": -986.3148193359375, + "loss": 0.3127, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -3.286363124847412, + "rewards/margins": 4.50880765914917, + "rewards/margins_max": 7.584027290344238, + "rewards/margins_min": 1.4335881471633911, + "rewards/margins_std": 4.349017143249512, + "rewards/rejected": -7.795170783996582, + "step": 1940 + }, + { + "epoch": 0.8, + "grad_norm": 1.859375, + "learning_rate": 2.2701805149764287e-07, + "logits/chosen": 0.30822715163230896, + "logits/rejected": 0.8703896403312683, + "logps/chosen": -640.6173095703125, + "logps/rejected": -919.14990234375, + "loss": 0.3887, + "rewards/accuracies": 0.875, + "rewards/chosen": -3.50831937789917, + "rewards/margins": 3.5380778312683105, + "rewards/margins_max": 5.758213996887207, + "rewards/margins_min": 1.3179413080215454, + "rewards/margins_std": 3.139747142791748, + "rewards/rejected": -7.0463972091674805, + "step": 1950 + }, + { + "epoch": 0.81, + "grad_norm": 4.65625, + "learning_rate": 2.1797645745614522e-07, + "logits/chosen": 0.2973627746105194, + "logits/rejected": 0.7991029024124146, + "logps/chosen": -523.9281616210938, + "logps/rejected": -1035.779541015625, + "loss": 0.237, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.9458510875701904, + "rewards/margins": 5.273934841156006, + "rewards/margins_max": 8.809687614440918, + "rewards/margins_min": 1.738181710243225, + "rewards/margins_std": 5.000309944152832, + "rewards/rejected": -8.219786643981934, + "step": 1960 + }, + { + "epoch": 0.81, + "grad_norm": 1.953125, + "learning_rate": 2.090965258841334e-07, + "logits/chosen": 0.11088068783283234, + "logits/rejected": 0.6458398103713989, + "logps/chosen": -493.0948181152344, + "logps/rejected": -834.015625, + "loss": 0.3476, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.821851968765259, + "rewards/margins": 3.7064621448516846, + "rewards/margins_max": 6.326642036437988, + "rewards/margins_min": 1.08628249168396, + "rewards/margins_std": 3.705493450164795, + "rewards/rejected": -6.528314113616943, + "step": 1970 + }, + { + "epoch": 0.82, + "grad_norm": 1.4296875, + "learning_rate": 2.0038009247016317e-07, + "logits/chosen": 0.10617595911026001, + "logits/rejected": 0.6900930404663086, + "logps/chosen": -544.4014892578125, + "logps/rejected": -984.28369140625, + "loss": 0.3149, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.090888738632202, + "rewards/margins": 4.560122489929199, + "rewards/margins_max": 7.143895149230957, + "rewards/margins_min": 1.9763494729995728, + "rewards/margins_std": 3.6540064811706543, + "rewards/rejected": -7.651010990142822, + "step": 1980 + }, + { + "epoch": 0.82, + "grad_norm": 1.046875, + "learning_rate": 1.918289591039137e-07, + "logits/chosen": 0.14296357333660126, + "logits/rejected": 0.6799240708351135, + "logps/chosen": -543.3214111328125, + "logps/rejected": -1012.4182739257812, + "loss": 0.2345, + "rewards/accuracies": 0.8999999761581421, + "rewards/chosen": -3.016738176345825, + "rewards/margins": 5.098849773406982, + "rewards/margins_max": 8.255114555358887, + "rewards/margins_min": 1.9425843954086304, + "rewards/margins_std": 4.463633060455322, + "rewards/rejected": -8.11558723449707, + "step": 1990 + }, + { + "epoch": 0.82, + "grad_norm": 0.9453125, + "learning_rate": 1.8344489350369775e-07, + "logits/chosen": 0.078754723072052, + "logits/rejected": 0.7793623805046082, + "logps/chosen": -538.4935913085938, + "logps/rejected": -1064.3187255859375, + "loss": 0.1977, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.8698158264160156, + "rewards/margins": 5.53084659576416, + "rewards/margins_max": 9.207681655883789, + "rewards/margins_min": 1.854010820388794, + "rewards/margins_std": 5.199830532073975, + "rewards/rejected": -8.400662422180176, + "step": 2000 + }, + { + "epoch": 0.83, + "grad_norm": 1.1171875, + "learning_rate": 1.7522962885103143e-07, + "logits/chosen": 0.22342924773693085, + "logits/rejected": 0.8045142889022827, + "logps/chosen": -557.4118041992188, + "logps/rejected": -1015.7853393554688, + "loss": 0.216, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8877434730529785, + "rewards/margins": 5.013927459716797, + "rewards/margins_max": 8.602627754211426, + "rewards/margins_min": 1.425227403640747, + "rewards/margins_std": 5.075188159942627, + "rewards/rejected": -7.901670932769775, + "step": 2010 + }, + { + "epoch": 0.83, + "grad_norm": 0.57421875, + "learning_rate": 1.6718486343234627e-07, + "logits/chosen": 0.22265294194221497, + "logits/rejected": 0.8611626625061035, + "logps/chosen": -595.16748046875, + "logps/rejected": -976.0812377929688, + "loss": 0.2934, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.374727249145508, + "rewards/margins": 4.3763909339904785, + "rewards/margins_max": 7.205048561096191, + "rewards/margins_min": 1.547734022140503, + "rewards/margins_std": 4.0003252029418945, + "rewards/rejected": -7.7511186599731445, + "step": 2020 + }, + { + "epoch": 0.84, + "grad_norm": 3.109375, + "learning_rate": 1.5931226028791323e-07, + "logits/chosen": 0.17107948660850525, + "logits/rejected": 0.7077086567878723, + "logps/chosen": -550.4708251953125, + "logps/rejected": -1006.26318359375, + "loss": 0.1998, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0446720123291016, + "rewards/margins": 4.2325286865234375, + "rewards/margins_max": 6.568942070007324, + "rewards/margins_min": 1.8961141109466553, + "rewards/margins_std": 3.3041882514953613, + "rewards/rejected": -7.277200222015381, + "step": 2030 + }, + { + "epoch": 0.84, + "grad_norm": 4.84375, + "learning_rate": 1.516134468680532e-07, + "logits/chosen": 0.18136277794837952, + "logits/rejected": 0.7017911076545715, + "logps/chosen": -527.5110473632812, + "logps/rejected": -993.22900390625, + "loss": 0.2196, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.743804693222046, + "rewards/margins": 4.674644470214844, + "rewards/margins_max": 7.117551326751709, + "rewards/margins_min": 2.231738567352295, + "rewards/margins_std": 3.4547908306121826, + "rewards/rejected": -7.418449401855469, + "step": 2040 + }, + { + "epoch": 0.84, + "grad_norm": 0.84375, + "learning_rate": 1.4409001469670613e-07, + "logits/chosen": 0.09716422855854034, + "logits/rejected": 0.7657040357589722, + "logps/chosen": -583.7706298828125, + "logps/rejected": -961.2703247070312, + "loss": 0.2395, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.9377636909484863, + "rewards/margins": 4.50144100189209, + "rewards/margins_max": 7.113333225250244, + "rewards/margins_min": 1.889548897743225, + "rewards/margins_std": 3.6937732696533203, + "rewards/rejected": -7.439205169677734, + "step": 2050 + }, + { + "epoch": 0.85, + "grad_norm": 1.453125, + "learning_rate": 1.3674351904242608e-07, + "logits/chosen": 0.11017270386219025, + "logits/rejected": 0.6662808656692505, + "logps/chosen": -541.803955078125, + "logps/rejected": -1012.2199096679688, + "loss": 0.2271, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.0164852142333984, + "rewards/margins": 4.881779670715332, + "rewards/margins_max": 7.408524513244629, + "rewards/margins_min": 2.3550355434417725, + "rewards/margins_std": 3.5733566284179688, + "rewards/rejected": -7.898265838623047, + "step": 2060 + }, + { + "epoch": 0.85, + "grad_norm": 0.9140625, + "learning_rate": 1.295754785968698e-07, + "logits/chosen": 0.12141978740692139, + "logits/rejected": 0.7523366808891296, + "logps/chosen": -567.1585693359375, + "logps/rejected": -1006.70166015625, + "loss": 0.2292, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1486592292785645, + "rewards/margins": 4.691858768463135, + "rewards/margins_max": 7.296378135681152, + "rewards/margins_min": 2.087339401245117, + "rewards/margins_std": 3.6833465099334717, + "rewards/rejected": -7.840517997741699, + "step": 2070 + }, + { + "epoch": 0.86, + "grad_norm": 1.6640625, + "learning_rate": 1.2258737516084827e-07, + "logits/chosen": 0.009871700778603554, + "logits/rejected": 0.49689459800720215, + "logps/chosen": -511.316162109375, + "logps/rejected": -989.1583862304688, + "loss": 0.208, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.903729200363159, + "rewards/margins": 4.876491546630859, + "rewards/margins_max": 7.534029960632324, + "rewards/margins_min": 2.218952178955078, + "rewards/margins_std": 3.7583279609680176, + "rewards/rejected": -7.780220031738281, + "step": 2080 + }, + { + "epoch": 0.86, + "grad_norm": 1.203125, + "learning_rate": 1.1578065333800457e-07, + "logits/chosen": 0.0854678601026535, + "logits/rejected": 0.6038640737533569, + "logps/chosen": -579.2017822265625, + "logps/rejected": -1042.4700927734375, + "loss": 0.3034, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.3256430625915527, + "rewards/margins": 4.763891696929932, + "rewards/margins_max": 7.874256134033203, + "rewards/margins_min": 1.6535274982452393, + "rewards/margins_std": 4.398719787597656, + "rewards/rejected": -8.089534759521484, + "step": 2090 + }, + { + "epoch": 0.86, + "grad_norm": 2.046875, + "learning_rate": 1.091567202361805e-07, + "logits/chosen": -0.015169775113463402, + "logits/rejected": 0.5395482778549194, + "logps/chosen": -629.2236938476562, + "logps/rejected": -1089.643798828125, + "loss": 0.4088, + "rewards/accuracies": 0.862500011920929, + "rewards/chosen": -3.6674251556396484, + "rewards/margins": 4.9454522132873535, + "rewards/margins_max": 8.529561042785645, + "rewards/margins_min": 1.3613433837890625, + "rewards/margins_std": 5.068695068359375, + "rewards/rejected": -8.612876892089844, + "step": 2100 + }, + { + "epoch": 0.87, + "grad_norm": 0.8046875, + "learning_rate": 1.0271694517653395e-07, + "logits/chosen": 0.17594434320926666, + "logits/rejected": 0.7346351742744446, + "logps/chosen": -545.63818359375, + "logps/rejected": -901.31640625, + "loss": 0.3593, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8124184608459473, + "rewards/margins": 4.03906774520874, + "rewards/margins_max": 6.741438388824463, + "rewards/margins_min": 1.3366973400115967, + "rewards/margins_std": 3.8217289447784424, + "rewards/rejected": -6.8514862060546875, + "step": 2110 + }, + { + "epoch": 0.87, + "grad_norm": 0.921875, + "learning_rate": 9.646265941046916e-08, + "logits/chosen": 0.10627947002649307, + "logits/rejected": 0.7213876843452454, + "logps/chosen": -531.1597900390625, + "logps/rejected": -987.6064453125, + "loss": 0.2611, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.926069974899292, + "rewards/margins": 4.628062725067139, + "rewards/margins_max": 7.4127936363220215, + "rewards/margins_min": 1.8433315753936768, + "rewards/margins_std": 3.938204526901245, + "rewards/rejected": -7.554131984710693, + "step": 2120 + }, + { + "epoch": 0.88, + "grad_norm": 1.015625, + "learning_rate": 9.039515584443558e-08, + "logits/chosen": 0.10931396484375, + "logits/rejected": 0.675295889377594, + "logps/chosen": -513.3177490234375, + "logps/rejected": -960.24072265625, + "loss": 0.2292, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.982128143310547, + "rewards/margins": 4.812419891357422, + "rewards/margins_max": 8.374956130981445, + "rewards/margins_min": 1.2498825788497925, + "rewards/margins_std": 5.0381879806518555, + "rewards/rejected": -7.794547080993652, + "step": 2130 + }, + { + "epoch": 0.88, + "grad_norm": 1.4765625, + "learning_rate": 8.451568877265425e-08, + "logits/chosen": 0.1267772912979126, + "logits/rejected": 0.7309524416923523, + "logps/chosen": -550.2372436523438, + "logps/rejected": -1106.2911376953125, + "loss": 0.285, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.0835139751434326, + "rewards/margins": 5.875071048736572, + "rewards/margins_max": 9.52598762512207, + "rewards/margins_min": 2.224155902862549, + "rewards/margins_std": 5.163174629211426, + "rewards/rejected": -8.958585739135742, + "step": 2140 + }, + { + "epoch": 0.89, + "grad_norm": 1.2890625, + "learning_rate": 7.882547361782587e-08, + "logits/chosen": 0.07169238477945328, + "logits/rejected": 0.6479736566543579, + "logps/chosen": -541.5816650390625, + "logps/rejected": -951.8138427734375, + "loss": 0.2547, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.038245677947998, + "rewards/margins": 4.422773838043213, + "rewards/margins_max": 7.200972557067871, + "rewards/margins_min": 1.6445751190185547, + "rewards/margins_std": 3.9289660453796387, + "rewards/rejected": -7.461019039154053, + "step": 2150 + }, + { + "epoch": 0.89, + "grad_norm": 1.9375, + "learning_rate": 7.332568667987482e-08, + "logits/chosen": 0.19428284466266632, + "logits/rejected": 0.808147132396698, + "logps/chosen": -556.2681274414062, + "logps/rejected": -916.072265625, + "loss": 0.2601, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.920149326324463, + "rewards/margins": 3.948967695236206, + "rewards/margins_max": 6.100172996520996, + "rewards/margins_min": 1.7977619171142578, + "rewards/margins_std": 3.042264223098755, + "rewards/rejected": -6.86911678314209, + "step": 2160 + }, + { + "epoch": 0.89, + "grad_norm": 2.59375, + "learning_rate": 6.801746489277993e-08, + "logits/chosen": 0.07816837728023529, + "logits/rejected": 0.6985357999801636, + "logps/chosen": -507.4588928222656, + "logps/rejected": -870.8380737304688, + "loss": 0.3532, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.5561635494232178, + "rewards/margins": 4.063462257385254, + "rewards/margins_max": 6.586556911468506, + "rewards/margins_min": 1.5403677225112915, + "rewards/margins_std": 3.56819486618042, + "rewards/rejected": -6.619626045227051, + "step": 2170 + }, + { + "epoch": 0.9, + "grad_norm": 1.5078125, + "learning_rate": 6.290190558954478e-08, + "logits/chosen": -0.012972557917237282, + "logits/rejected": 0.689728856086731, + "logps/chosen": -533.2125244140625, + "logps/rejected": -1008.7321166992188, + "loss": 0.2846, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.664839029312134, + "rewards/margins": 5.193449974060059, + "rewards/margins_max": 7.740516662597656, + "rewards/margins_min": 2.6463842391967773, + "rewards/margins_std": 3.602095365524292, + "rewards/rejected": -7.8582892417907715, + "step": 2180 + }, + { + "epoch": 0.9, + "grad_norm": 8.125, + "learning_rate": 5.798006627535279e-08, + "logits/chosen": 0.0373331718146801, + "logits/rejected": 0.6454305648803711, + "logps/chosen": -633.1779174804688, + "logps/rejected": -1055.0909423828125, + "loss": 0.3153, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.5589823722839355, + "rewards/margins": 4.771185874938965, + "rewards/margins_max": 7.790403842926025, + "rewards/margins_min": 1.751967430114746, + "rewards/margins_std": 4.269819736480713, + "rewards/rejected": -8.330168724060059, + "step": 2190 + }, + { + "epoch": 0.91, + "grad_norm": 2.296875, + "learning_rate": 5.325296440895621e-08, + "logits/chosen": 0.20634958148002625, + "logits/rejected": 0.8938538432121277, + "logps/chosen": -470.0528259277344, + "logps/rejected": -813.2053833007812, + "loss": 0.2234, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.4969820976257324, + "rewards/margins": 3.7381529808044434, + "rewards/margins_max": 5.691822528839111, + "rewards/margins_min": 1.7844839096069336, + "rewards/margins_std": 2.7629055976867676, + "rewards/rejected": -6.235135555267334, + "step": 2200 + }, + { + "epoch": 0.91, + "grad_norm": 0.7578125, + "learning_rate": 4.872157719234438e-08, + "logits/chosen": 0.12718608975410461, + "logits/rejected": 0.7145224213600159, + "logps/chosen": -562.0367431640625, + "logps/rejected": -1052.066162109375, + "loss": 0.206, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -3.0564522743225098, + "rewards/margins": 5.207182884216309, + "rewards/margins_max": 8.013465881347656, + "rewards/margins_min": 2.4009008407592773, + "rewards/margins_std": 3.9686825275421143, + "rewards/rejected": -8.263635635375977, + "step": 2210 + }, + { + "epoch": 0.91, + "grad_norm": 1.53125, + "learning_rate": 4.438684136873217e-08, + "logits/chosen": 0.08831767737865448, + "logits/rejected": 0.6083860993385315, + "logps/chosen": -542.2150268554688, + "logps/rejected": -1053.9603271484375, + "loss": 0.1925, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0658152103424072, + "rewards/margins": 5.440025329589844, + "rewards/margins_max": 8.818717956542969, + "rewards/margins_min": 2.0613327026367188, + "rewards/margins_std": 4.77819299697876, + "rewards/rejected": -8.505840301513672, + "step": 2220 + }, + { + "epoch": 0.92, + "grad_norm": 2.34375, + "learning_rate": 4.02496530289147e-08, + "logits/chosen": 0.12742793560028076, + "logits/rejected": 0.7163810729980469, + "logps/chosen": -584.5067749023438, + "logps/rejected": -1026.2125244140625, + "loss": 0.1992, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.1832902431488037, + "rewards/margins": 4.808717250823975, + "rewards/margins_max": 7.214354515075684, + "rewards/margins_min": 2.4030795097351074, + "rewards/margins_std": 3.402085542678833, + "rewards/rejected": -7.992007255554199, + "step": 2230 + }, + { + "epoch": 0.92, + "grad_norm": 5.0625, + "learning_rate": 3.6310867426023295e-08, + "logits/chosen": 0.09458984434604645, + "logits/rejected": 0.7976502180099487, + "logps/chosen": -559.86962890625, + "logps/rejected": -1006.2230224609375, + "loss": 0.2743, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.063464403152466, + "rewards/margins": 4.954630374908447, + "rewards/margins_max": 7.5468573570251465, + "rewards/margins_min": 2.3624024391174316, + "rewards/margins_std": 3.665963649749756, + "rewards/rejected": -8.018095016479492, + "step": 2240 + }, + { + "epoch": 0.93, + "grad_norm": 2.609375, + "learning_rate": 3.2571298798726e-08, + "logits/chosen": 0.09177270531654358, + "logits/rejected": 0.6422449946403503, + "logps/chosen": -540.8308715820312, + "logps/rejected": -921.3922119140625, + "loss": 0.22, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.0615222454071045, + "rewards/margins": 4.212675094604492, + "rewards/margins_max": 6.487915992736816, + "rewards/margins_min": 1.9374347925186157, + "rewards/margins_std": 3.2176766395568848, + "rewards/rejected": -7.274197578430176, + "step": 2250 + }, + { + "epoch": 0.93, + "grad_norm": 2.5, + "learning_rate": 2.9031720202904008e-08, + "logits/chosen": 0.150480717420578, + "logits/rejected": 0.8441624641418457, + "logps/chosen": -568.8901977539062, + "logps/rejected": -986.38525390625, + "loss": 0.2288, + "rewards/accuracies": 0.9375, + "rewards/chosen": -3.069028377532959, + "rewards/margins": 4.737165927886963, + "rewards/margins_max": 7.883784294128418, + "rewards/margins_min": 1.5905473232269287, + "rewards/margins_std": 4.449990749359131, + "rewards/rejected": -7.8061933517456055, + "step": 2260 + }, + { + "epoch": 0.93, + "grad_norm": 0.9453125, + "learning_rate": 2.5692863351844175e-08, + "logits/chosen": 0.08601401001214981, + "logits/rejected": 0.7968393564224243, + "logps/chosen": -508.9917907714844, + "logps/rejected": -905.7818603515625, + "loss": 0.2264, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.7462658882141113, + "rewards/margins": 4.208594799041748, + "rewards/margins_max": 6.139928340911865, + "rewards/margins_min": 2.2772605419158936, + "rewards/margins_std": 2.731318712234497, + "rewards/rejected": -6.954860687255859, + "step": 2270 + }, + { + "epoch": 0.94, + "grad_norm": 1.71875, + "learning_rate": 2.2555418464976884e-08, + "logits/chosen": 0.24783821403980255, + "logits/rejected": 0.7340123057365417, + "logps/chosen": -573.2374877929688, + "logps/rejected": -1094.7659912109375, + "loss": 0.2276, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -3.265376567840576, + "rewards/margins": 5.4828691482543945, + "rewards/margins_max": 8.924985885620117, + "rewards/margins_min": 2.040750503540039, + "rewards/margins_std": 4.867890357971191, + "rewards/rejected": -8.748245239257812, + "step": 2280 + }, + { + "epoch": 0.94, + "grad_norm": 0.95703125, + "learning_rate": 1.9620034125190643e-08, + "logits/chosen": 0.021796632558107376, + "logits/rejected": 0.609171986579895, + "logps/chosen": -615.38818359375, + "logps/rejected": -1102.287353515625, + "loss": 0.1806, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -3.3852379322052, + "rewards/margins": 5.42401123046875, + "rewards/margins_max": 8.265124320983887, + "rewards/margins_min": 2.5828983783721924, + "rewards/margins_std": 4.017940044403076, + "rewards/rejected": -8.809249877929688, + "step": 2290 + }, + { + "epoch": 0.95, + "grad_norm": 2.421875, + "learning_rate": 1.6887317144755776e-08, + "logits/chosen": 0.16773036122322083, + "logits/rejected": 0.7562099695205688, + "logps/chosen": -571.23828125, + "logps/rejected": -980.9654541015625, + "loss": 0.2831, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.1232476234436035, + "rewards/margins": 4.673791885375977, + "rewards/margins_max": 7.8030548095703125, + "rewards/margins_min": 1.544529914855957, + "rewards/margins_std": 4.425445556640625, + "rewards/rejected": -7.797039985656738, + "step": 2300 + }, + { + "epoch": 0.95, + "grad_norm": 0.4609375, + "learning_rate": 1.4357832439881868e-08, + "logits/chosen": 0.08502840995788574, + "logits/rejected": 0.6817075610160828, + "logps/chosen": -530.4733276367188, + "logps/rejected": -875.8533325195312, + "loss": 0.285, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -3.0532920360565186, + "rewards/margins": 3.653752088546753, + "rewards/margins_max": 6.376016616821289, + "rewards/margins_min": 0.9314873814582825, + "rewards/margins_std": 3.8498637676239014, + "rewards/rejected": -6.7070441246032715, + "step": 2310 + }, + { + "epoch": 0.96, + "grad_norm": 1.3984375, + "learning_rate": 1.2032102913936525e-08, + "logits/chosen": 0.08291347324848175, + "logits/rejected": 0.7909741401672363, + "logps/chosen": -521.2633666992188, + "logps/rejected": -997.1781005859375, + "loss": 0.2373, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.600470781326294, + "rewards/margins": 5.073606967926025, + "rewards/margins_max": 8.561439514160156, + "rewards/margins_min": 1.5857731103897095, + "rewards/margins_std": 4.932542324066162, + "rewards/rejected": -7.674078464508057, + "step": 2320 + }, + { + "epoch": 0.96, + "grad_norm": 1.0390625, + "learning_rate": 9.910609349348953e-09, + "logits/chosen": 0.1890447735786438, + "logits/rejected": 0.7709522247314453, + "logps/chosen": -516.685791015625, + "logps/rejected": -850.3052978515625, + "loss": 0.2563, + "rewards/accuracies": 0.9125000238418579, + "rewards/chosen": -2.8008828163146973, + "rewards/margins": 3.603776216506958, + "rewards/margins_max": 6.171905517578125, + "rewards/margins_min": 1.0356473922729492, + "rewards/margins_std": 3.631882429122925, + "rewards/rejected": -6.404659271240234, + "step": 2330 + }, + { + "epoch": 0.96, + "grad_norm": 1.3515625, + "learning_rate": 7.993790308221227e-09, + "logits/chosen": 0.09116091579198837, + "logits/rejected": 0.6433526873588562, + "logps/chosen": -550.9741821289062, + "logps/rejected": -902.2535400390625, + "loss": 0.264, + "rewards/accuracies": 0.9750000238418579, + "rewards/chosen": -2.5910322666168213, + "rewards/margins": 3.7961928844451904, + "rewards/margins_max": 5.869515419006348, + "rewards/margins_min": 1.7228702306747437, + "rewards/margins_std": 2.9321210384368896, + "rewards/rejected": -6.387225151062012, + "step": 2340 + }, + { + "epoch": 0.97, + "grad_norm": 1.21875, + "learning_rate": 6.282042041667046e-09, + "logits/chosen": 0.027905773371458054, + "logits/rejected": 0.596718966960907, + "logps/chosen": -605.1668701171875, + "logps/rejected": -1050.55078125, + "loss": 0.2155, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -3.3858304023742676, + "rewards/margins": 4.683663368225098, + "rewards/margins_max": 7.590599060058594, + "rewards/margins_min": 1.7767282724380493, + "rewards/margins_std": 4.111027717590332, + "rewards/rejected": -8.069494247436523, + "step": 2350 + }, + { + "epoch": 0.97, + "grad_norm": 1.0078125, + "learning_rate": 4.775718407897811e-09, + "logits/chosen": 0.1464410424232483, + "logits/rejected": 0.5830925703048706, + "logps/chosen": -496.97662353515625, + "logps/rejected": -940.4425048828125, + "loss": 0.1986, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.711729049682617, + "rewards/margins": 4.474714756011963, + "rewards/margins_max": 7.022757053375244, + "rewards/margins_min": 1.9266719818115234, + "rewards/margins_std": 3.6034762859344482, + "rewards/rejected": -7.186443328857422, + "step": 2360 + }, + { + "epoch": 0.98, + "grad_norm": 2.234375, + "learning_rate": 3.4751307990712466e-09, + "logits/chosen": 0.14855477213859558, + "logits/rejected": 0.7474950551986694, + "logps/chosen": -538.835693359375, + "logps/rejected": -997.9420776367188, + "loss": 0.2956, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.9597933292388916, + "rewards/margins": 4.721034049987793, + "rewards/margins_max": 7.788567543029785, + "rewards/margins_min": 1.6535009145736694, + "rewards/margins_std": 4.338146686553955, + "rewards/rejected": -7.680828094482422, + "step": 2370 + }, + { + "epoch": 0.98, + "grad_norm": 0.8203125, + "learning_rate": 2.38054807692023e-09, + "logits/chosen": 0.09776248037815094, + "logits/rejected": 0.6629201173782349, + "logps/chosen": -488.8106994628906, + "logps/rejected": -883.71826171875, + "loss": 0.2274, + "rewards/accuracies": 0.9375, + "rewards/chosen": -2.568514347076416, + "rewards/margins": 4.160928249359131, + "rewards/margins_max": 6.740607261657715, + "rewards/margins_min": 1.5812499523162842, + "rewards/margins_std": 3.6482162475585938, + "rewards/rejected": -6.729442596435547, + "step": 2380 + }, + { + "epoch": 0.98, + "grad_norm": 1.890625, + "learning_rate": 1.4921965171720286e-09, + "logits/chosen": 0.15427419543266296, + "logits/rejected": 0.7093546986579895, + "logps/chosen": -487.88214111328125, + "logps/rejected": -885.75390625, + "loss": 0.2778, + "rewards/accuracies": 0.949999988079071, + "rewards/chosen": -2.636676788330078, + "rewards/margins": 4.090733528137207, + "rewards/margins_max": 7.045225620269775, + "rewards/margins_min": 1.1362407207489014, + "rewards/margins_std": 4.178283214569092, + "rewards/rejected": -6.727410316467285, + "step": 2390 + }, + { + "epoch": 0.99, + "grad_norm": 2.171875, + "learning_rate": 8.102597627722696e-10, + "logits/chosen": 0.16055794060230255, + "logits/rejected": 0.7714218497276306, + "logps/chosen": -504.7491149902344, + "logps/rejected": -1007.6917724609375, + "loss": 0.2977, + "rewards/accuracies": 0.9624999761581421, + "rewards/chosen": -2.8060338497161865, + "rewards/margins": 5.096834182739258, + "rewards/margins_max": 7.9286065101623535, + "rewards/margins_min": 2.265061140060425, + "rewards/margins_std": 4.004731178283691, + "rewards/rejected": -7.902867794036865, + "step": 2400 + }, + { + "epoch": 0.99, + "grad_norm": 0.9296875, + "learning_rate": 3.34878785921755e-10, + "logits/chosen": 0.27159827947616577, + "logits/rejected": 0.8951881527900696, + "logps/chosen": -534.0177001953125, + "logps/rejected": -945.98876953125, + "loss": 0.2609, + "rewards/accuracies": 0.887499988079071, + "rewards/chosen": -2.8999059200286865, + "rewards/margins": 4.4170026779174805, + "rewards/margins_max": 6.8793535232543945, + "rewards/margins_min": 1.9546514749526978, + "rewards/margins_std": 3.482290267944336, + "rewards/rejected": -7.316908359527588, + "step": 2410 + }, + { + "epoch": 1.0, + "grad_norm": 0.953125, + "learning_rate": 6.615185893366072e-11, + "logits/chosen": 0.05070207267999649, + "logits/rejected": 0.7962743043899536, + "logps/chosen": -517.2967529296875, + "logps/rejected": -865.5984497070312, + "loss": 0.2338, + "rewards/accuracies": 0.925000011920929, + "rewards/chosen": -2.6130902767181396, + "rewards/margins": 4.025864124298096, + "rewards/margins_max": 6.2240309715271, + "rewards/margins_min": 1.8276973962783813, + "rewards/margins_std": 3.1086769104003906, + "rewards/rejected": -6.638954162597656, + "step": 2420 + }, + { + "epoch": 1.0, + "eval_logits/chosen": 0.7758960127830505, + "eval_logits/rejected": 0.9632418155670166, + "eval_logps/chosen": -548.3036499023438, + "eval_logps/rejected": -562.2833862304688, + "eval_loss": 0.7594465613365173, + "eval_rewards/accuracies": 0.5525000095367432, + "eval_rewards/chosen": -2.099947452545166, + "eval_rewards/margins": 0.3320136070251465, + "eval_rewards/margins_max": 2.8946704864501953, + "eval_rewards/margins_min": -1.5015698671340942, + "eval_rewards/margins_std": 1.4200025796890259, + "eval_rewards/rejected": -2.4319608211517334, + "eval_runtime": 1667.9803, + "eval_samples_per_second": 4.796, + "eval_steps_per_second": 0.3, + "step": 2428 + }, + { + "epoch": 1.0, + "step": 2428, + "total_flos": 0.0, + "train_loss": 0.36623305416185736, + "train_runtime": 22442.9766, + "train_samples_per_second": 1.731, + "train_steps_per_second": 0.108 + } + ], + "logging_steps": 10, + "max_steps": 2428, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 100, + "total_flos": 0.0, + "train_batch_size": 2, + "trial_name": null, + "trial_params": null +}