{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 4164, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 2.336021900177002, "learning_rate": 1.199040767386091e-10, "logits/chosen": -1.3860063552856445, "logits/rejected": -1.3949532508850098, "logps/chosen": -34.621925354003906, "logps/rejected": -37.30891418457031, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 2.7931599617004395, "learning_rate": 1.199040767386091e-09, "logits/chosen": -1.546767234802246, "logits/rejected": -1.5282517671585083, "logps/chosen": -42.52494812011719, "logps/rejected": -44.546756744384766, "loss": 0.6932, "rewards/accuracies": 0.3680555522441864, "rewards/chosen": -0.00010908626427408308, "rewards/margins": -0.00013866486551705748, "rewards/rejected": 2.95786012429744e-05, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 2.9333579540252686, "learning_rate": 2.398081534772182e-09, "logits/chosen": -1.5552335977554321, "logits/rejected": -1.5412750244140625, "logps/chosen": -44.075599670410156, "logps/rejected": -46.59809112548828, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 9.86563172773458e-05, "rewards/margins": 0.00012048264034092426, "rewards/rejected": -2.182633033953607e-05, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 3.4939088821411133, "learning_rate": 3.597122302158273e-09, "logits/chosen": -1.5116419792175293, "logits/rejected": -1.5045888423919678, "logps/chosen": -47.83784866333008, "logps/rejected": -50.80131149291992, "loss": 0.693, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.000109563407022506, "rewards/margins": 0.00020976344239898026, "rewards/rejected": -0.00010020002082455903, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 2.5776360034942627, "learning_rate": 4.796163069544364e-09, "logits/chosen": -1.558410406112671, "logits/rejected": -1.5545051097869873, "logps/chosen": -43.07380294799805, "logps/rejected": -45.55127716064453, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -0.00022044967045076191, "rewards/margins": -0.00018095636914949864, "rewards/rejected": -3.949330493924208e-05, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 2.6562020778656006, "learning_rate": 5.995203836930456e-09, "logits/chosen": -1.4691417217254639, "logits/rejected": -1.46826171875, "logps/chosen": -43.00727081298828, "logps/rejected": -44.83103942871094, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1715678144573758e-07, "rewards/margins": 4.3453786929603666e-05, "rewards/rejected": -4.367092333268374e-05, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 3.931988477706909, "learning_rate": 7.194244604316546e-09, "logits/chosen": -1.567317247390747, "logits/rejected": -1.5605990886688232, "logps/chosen": -50.6888542175293, "logps/rejected": -52.02692794799805, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2426731447922066e-05, "rewards/margins": 3.057774301851168e-05, "rewards/rejected": -5.3004478104412556e-05, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 2.3015387058258057, "learning_rate": 8.393285371702639e-09, "logits/chosen": -1.5361818075180054, "logits/rejected": -1.528193473815918, "logps/chosen": -50.07262420654297, "logps/rejected": -52.7786750793457, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -4.649541006074287e-05, "rewards/margins": -4.221068138576811e-06, "rewards/rejected": -4.2274350562365726e-05, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 3.439854145050049, "learning_rate": 9.592326139088728e-09, "logits/chosen": -1.5699741840362549, "logits/rejected": -1.562044382095337, "logps/chosen": -51.11206817626953, "logps/rejected": -52.69794464111328, "loss": 0.6931, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -4.887674094788963e-06, "rewards/margins": 0.0001481349318055436, "rewards/rejected": -0.00015302257088478655, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 2.781512975692749, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -1.5034945011138916, "logits/rejected": -1.500799536705017, "logps/chosen": -49.011016845703125, "logps/rejected": -51.1546516418457, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 5.785684334114194e-05, "rewards/margins": -0.00010168743028771132, "rewards/rejected": 0.0001595442445250228, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 3.0006866455078125, "learning_rate": 1.1990407673860912e-08, "logits/chosen": -1.584449052810669, "logits/rejected": -1.5737826824188232, "logps/chosen": -45.691627502441406, "logps/rejected": -48.739620208740234, "loss": 0.6932, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -1.0443580322316848e-05, "rewards/margins": -0.00019995025650132447, "rewards/rejected": 0.00018950665253214538, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 2.3066394329071045, "learning_rate": 1.3189448441247003e-08, "logits/chosen": -1.4547593593597412, "logits/rejected": -1.4310551881790161, "logps/chosen": -48.961212158203125, "logps/rejected": -51.16516876220703, "loss": 0.6933, "rewards/accuracies": 0.3687500059604645, "rewards/chosen": -0.0002893832861445844, "rewards/margins": -0.00031176736229099333, "rewards/rejected": 2.2384085241355933e-05, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 2.296614408493042, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -1.4833275079727173, "logits/rejected": -1.480101227760315, "logps/chosen": -44.263572692871094, "logps/rejected": -46.608760833740234, "loss": 0.6932, "rewards/accuracies": 0.4375, "rewards/chosen": -0.0001023497898131609, "rewards/margins": -1.7168078557006083e-05, "rewards/rejected": -8.518168760929257e-05, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 3.2075729370117188, "learning_rate": 1.5587529976019183e-08, "logits/chosen": -1.5763187408447266, "logits/rejected": -1.5709102153778076, "logps/chosen": -49.41350555419922, "logps/rejected": -51.297760009765625, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -7.49648897908628e-05, "rewards/margins": -0.00010434426076244563, "rewards/rejected": 2.9379374609561637e-05, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 2.821803092956543, "learning_rate": 1.6786570743405277e-08, "logits/chosen": -1.4521141052246094, "logits/rejected": -1.442638874053955, "logps/chosen": -45.84361267089844, "logps/rejected": -50.25464630126953, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 3.1858147849561647e-05, "rewards/margins": 0.00010751285299193114, "rewards/rejected": -7.56547087803483e-05, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 3.5821127891540527, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -1.471673607826233, "logits/rejected": -1.464890956878662, "logps/chosen": -48.29644775390625, "logps/rejected": -51.45283889770508, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 3.749009920284152e-05, "rewards/margins": 0.00023978715762495995, "rewards/rejected": -0.0002022970438702032, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 2.457958459854126, "learning_rate": 1.9184652278177456e-08, "logits/chosen": -1.5046113729476929, "logits/rejected": -1.4862873554229736, "logps/chosen": -41.24737548828125, "logps/rejected": -44.58059310913086, "loss": 0.6932, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 7.707150507485494e-05, "rewards/margins": -9.014904935611412e-05, "rewards/rejected": 0.00016722058353479952, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 3.1706480979919434, "learning_rate": 2.038369304556355e-08, "logits/chosen": -1.5169397592544556, "logits/rejected": -1.4981354475021362, "logps/chosen": -44.85956954956055, "logps/rejected": -46.871952056884766, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -6.305011629592627e-05, "rewards/margins": -7.952237501740456e-05, "rewards/rejected": 1.6472253264510073e-05, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 2.6257331371307373, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -1.5798842906951904, "logits/rejected": -1.5681228637695312, "logps/chosen": -45.10202407836914, "logps/rejected": -46.846622467041016, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -9.954520646715537e-05, "rewards/margins": -8.836419874569401e-05, "rewards/rejected": -1.1181010449945461e-05, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 2.9169774055480957, "learning_rate": 2.278177458033573e-08, "logits/chosen": -1.5893226861953735, "logits/rejected": -1.5855244398117065, "logps/chosen": -42.26114273071289, "logps/rejected": -45.406829833984375, "loss": 0.6931, "rewards/accuracies": 0.46875, "rewards/chosen": 6.678106728941202e-06, "rewards/margins": 2.9820144845871255e-05, "rewards/rejected": -2.3142014470067807e-05, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 3.5420353412628174, "learning_rate": 2.3980815347721823e-08, "logits/chosen": -1.5361340045928955, "logits/rejected": -1.5295995473861694, "logps/chosen": -43.47291946411133, "logps/rejected": -47.111000061035156, "loss": 0.6931, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": 0.0001089770594262518, "rewards/margins": 1.1494899808894843e-05, "rewards/rejected": 9.748217416927218e-05, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 3.056239128112793, "learning_rate": 2.517985611510791e-08, "logits/chosen": -1.5651589632034302, "logits/rejected": -1.5538842678070068, "logps/chosen": -43.047855377197266, "logps/rejected": -43.400611877441406, "loss": 0.6932, "rewards/accuracies": 0.4312500059604645, "rewards/chosen": -0.00010786174971144646, "rewards/margins": -0.00011335410817991942, "rewards/rejected": 5.4923671086726245e-06, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 2.8404786586761475, "learning_rate": 2.6378896882494006e-08, "logits/chosen": -1.4806840419769287, "logits/rejected": -1.4740344285964966, "logps/chosen": -47.371559143066406, "logps/rejected": -52.52393341064453, "loss": 0.6933, "rewards/accuracies": 0.40625, "rewards/chosen": -0.00014377260231412947, "rewards/margins": -0.00026818824699148536, "rewards/rejected": 0.00012441558646969497, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 2.566518545150757, "learning_rate": 2.7577937649880097e-08, "logits/chosen": -1.5321465730667114, "logits/rejected": -1.5307929515838623, "logps/chosen": -44.468902587890625, "logps/rejected": -48.26350021362305, "loss": 0.6931, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 9.540587598166894e-06, "rewards/margins": 0.00015555776190012693, "rewards/rejected": -0.00014601717703044415, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 2.9977493286132812, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -1.579411268234253, "logits/rejected": -1.5686959028244019, "logps/chosen": -49.147926330566406, "logps/rejected": -51.22269821166992, "loss": 0.6931, "rewards/accuracies": 0.5, "rewards/chosen": -6.021944136591628e-05, "rewards/margins": 0.00012635164603125304, "rewards/rejected": -0.00018657106556929648, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 4.014063835144043, "learning_rate": 2.997601918465228e-08, "logits/chosen": -1.4550889730453491, "logits/rejected": -1.4445072412490845, "logps/chosen": -49.6921501159668, "logps/rejected": -50.77552795410156, "loss": 0.693, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.0001581499818712473, "rewards/margins": 0.0003015203692484647, "rewards/rejected": -0.0001433703873772174, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 3.186095952987671, "learning_rate": 3.1175059952038366e-08, "logits/chosen": -1.495941162109375, "logits/rejected": -1.4901201725006104, "logps/chosen": -49.999717712402344, "logps/rejected": -51.457984924316406, "loss": 0.6931, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 9.421829599887133e-05, "rewards/margins": 0.0001734937832225114, "rewards/rejected": -7.927548722364008e-05, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 3.3234660625457764, "learning_rate": 3.237410071942446e-08, "logits/chosen": -1.5844049453735352, "logits/rejected": -1.5691239833831787, "logps/chosen": -46.7442512512207, "logps/rejected": -49.30167007446289, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0002405096747679636, "rewards/margins": 1.5671601431677118e-05, "rewards/rejected": 0.00022483807697426528, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 2.917389154434204, "learning_rate": 3.3573141486810555e-08, "logits/chosen": -1.5494643449783325, "logits/rejected": -1.533740758895874, "logps/chosen": -42.46552276611328, "logps/rejected": -43.715232849121094, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 5.060394687461667e-05, "rewards/margins": 1.1760695088014472e-05, "rewards/rejected": 3.884324905811809e-05, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 2.7840898036956787, "learning_rate": 3.477218225419664e-08, "logits/chosen": -1.579872727394104, "logits/rejected": -1.5693047046661377, "logps/chosen": -44.499839782714844, "logps/rejected": -45.791568756103516, "loss": 0.6931, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 0.00012782707926817238, "rewards/margins": 0.00019142820383422077, "rewards/rejected": -6.360108818626031e-05, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 3.146667003631592, "learning_rate": 3.597122302158273e-08, "logits/chosen": -1.5557529926300049, "logits/rejected": -1.5390459299087524, "logps/chosen": -48.216209411621094, "logps/rejected": -50.678340911865234, "loss": 0.6933, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.00012882667942903936, "rewards/margins": -0.00033005099976435304, "rewards/rejected": 0.00020122430578339845, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 3.016376495361328, "learning_rate": 3.717026378896883e-08, "logits/chosen": -1.4412884712219238, "logits/rejected": -1.431753158569336, "logps/chosen": -48.03168487548828, "logps/rejected": -50.060646057128906, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 9.238120401278138e-05, "rewards/margins": 7.04801277606748e-05, "rewards/rejected": 2.1901068976148963e-05, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 2.544971466064453, "learning_rate": 3.836930455635491e-08, "logits/chosen": -1.5533429384231567, "logits/rejected": -1.53842031955719, "logps/chosen": -47.08930206298828, "logps/rejected": -51.531005859375, "loss": 0.6932, "rewards/accuracies": 0.46875, "rewards/chosen": 0.00025622855173423886, "rewards/margins": -9.28807639866136e-05, "rewards/rejected": 0.0003491092938929796, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 2.311227798461914, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -1.5248558521270752, "logits/rejected": -1.5185081958770752, "logps/chosen": -50.7647819519043, "logps/rejected": -49.56135177612305, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -8.110304770525545e-05, "rewards/margins": -0.00011024585546692833, "rewards/rejected": 2.914278593380004e-05, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 2.6722328662872314, "learning_rate": 4.07673860911271e-08, "logits/chosen": -1.5785841941833496, "logits/rejected": -1.5681426525115967, "logps/chosen": -51.070430755615234, "logps/rejected": -52.11237335205078, "loss": 0.693, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0002661117759998888, "rewards/margins": 0.0003887194616254419, "rewards/rejected": -0.00012260770017746836, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 3.674198865890503, "learning_rate": 4.1966426858513185e-08, "logits/chosen": -1.5093214511871338, "logits/rejected": -1.5045934915542603, "logps/chosen": -45.63981628417969, "logps/rejected": -48.58732986450195, "loss": 0.6931, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0002857402723748237, "rewards/margins": 0.00017633094103075564, "rewards/rejected": 0.00010940933134406805, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 3.499232053756714, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -1.5016661882400513, "logits/rejected": -1.4912707805633545, "logps/chosen": -53.919525146484375, "logps/rejected": -56.54018020629883, "loss": 0.6932, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00013425681390799582, "rewards/margins": -2.2593365429202095e-05, "rewards/rejected": 0.0001568501756992191, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 3.743687152862549, "learning_rate": 4.4364508393285374e-08, "logits/chosen": -1.4821672439575195, "logits/rejected": -1.4804311990737915, "logps/chosen": -48.30991744995117, "logps/rejected": -53.03217697143555, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.00033552644890733063, "rewards/margins": 0.0004464868106879294, "rewards/rejected": -0.00011096037633251399, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 2.4759514331817627, "learning_rate": 4.556354916067146e-08, "logits/chosen": -1.5723472833633423, "logits/rejected": -1.5651805400848389, "logps/chosen": -46.992698669433594, "logps/rejected": -48.03093719482422, "loss": 0.6932, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": 7.932665903354064e-05, "rewards/margins": -7.613336492795497e-05, "rewards/rejected": 0.00015546004578936845, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 2.923274040222168, "learning_rate": 4.676258992805755e-08, "logits/chosen": -1.544915795326233, "logits/rejected": -1.535043716430664, "logps/chosen": -48.17082595825195, "logps/rejected": -51.49504470825195, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.000253693520789966, "rewards/margins": 0.0001750610099406913, "rewards/rejected": 7.86325108492747e-05, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 3.412151575088501, "learning_rate": 4.796163069544365e-08, "logits/chosen": -1.5585193634033203, "logits/rejected": -1.553504467010498, "logps/chosen": -44.887977600097656, "logps/rejected": -46.11194610595703, "loss": 0.6932, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.00015390969929285347, "rewards/margins": -2.5745690436451696e-05, "rewards/rejected": 0.00017965536972042173, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 3.6315953731536865, "learning_rate": 4.916067146282973e-08, "logits/chosen": -1.509155511856079, "logits/rejected": -1.5060787200927734, "logps/chosen": -47.50922393798828, "logps/rejected": -49.60902786254883, "loss": 0.6932, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 9.884338214760646e-05, "rewards/margins": -4.629031172953546e-05, "rewards/rejected": 0.00014513370115309954, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 2.4514944553375244, "learning_rate": 4.999992091672379e-08, "logits/chosen": -1.4691174030303955, "logits/rejected": -1.4793603420257568, "logps/chosen": -45.651798248291016, "logps/rejected": -48.96102523803711, "loss": 0.6931, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.00014277843001764268, "rewards/margins": 2.968159424199257e-05, "rewards/rejected": 0.00011309684487059712, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 2.288839817047119, "learning_rate": 4.999851500573209e-08, "logits/chosen": -1.497201681137085, "logits/rejected": -1.4975249767303467, "logps/chosen": -46.066871643066406, "logps/rejected": -46.192413330078125, "loss": 0.6931, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00031205304549075663, "rewards/margins": 4.402028935146518e-05, "rewards/rejected": 0.00026803271612152457, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 2.4444422721862793, "learning_rate": 4.999535180235972e-08, "logits/chosen": -1.4983166456222534, "logits/rejected": -1.4901825189590454, "logps/chosen": -46.0124397277832, "logps/rejected": -49.43706512451172, "loss": 0.6931, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.000347124005202204, "rewards/margins": 0.00018896095571108162, "rewards/rejected": 0.0001581630203872919, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 3.173124313354492, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -1.5115959644317627, "logits/rejected": -1.4905065298080444, "logps/chosen": -53.1868896484375, "logps/rejected": -51.287513732910156, "loss": 0.693, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.00036136750713922083, "rewards/margins": 0.00031076278537511826, "rewards/rejected": 5.0604761781869456e-05, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 3.7471864223480225, "learning_rate": 4.9983754531428326e-08, "logits/chosen": -1.5160473585128784, "logits/rejected": -1.4985129833221436, "logps/chosen": -53.772499084472656, "logps/rejected": -55.6755256652832, "loss": 0.6928, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0006765121361240745, "rewards/margins": 0.0007202303386293352, "rewards/rejected": -4.371829709270969e-05, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 3.7323288917541504, "learning_rate": 4.997532127910954e-08, "logits/chosen": -1.5780543088912964, "logits/rejected": -1.5486719608306885, "logps/chosen": -52.630889892578125, "logps/rejected": -53.20234298706055, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0005277583841234446, "rewards/margins": 0.00045975102693773806, "rewards/rejected": 6.800738628953695e-05, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 3.6508657932281494, "learning_rate": 4.996513236483331e-08, "logits/chosen": -1.6470165252685547, "logits/rejected": -1.6330372095108032, "logps/chosen": -42.519134521484375, "logps/rejected": -45.442176818847656, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0003384821757208556, "rewards/margins": 0.00045337239862419665, "rewards/rejected": -0.00011489020835142583, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 4.240896701812744, "learning_rate": 4.9953188504838225e-08, "logits/chosen": -1.5246374607086182, "logits/rejected": -1.5129501819610596, "logps/chosen": -46.44121170043945, "logps/rejected": -49.50823974609375, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00048602544120512903, "rewards/margins": 0.00015522187459282577, "rewards/rejected": 0.0003308035375084728, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 2.8267297744750977, "learning_rate": 4.993949053872834e-08, "logits/chosen": -1.5284509658813477, "logits/rejected": -1.5051023960113525, "logps/chosen": -42.638057708740234, "logps/rejected": -45.888179779052734, "loss": 0.6929, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0007548181456513703, "rewards/margins": 0.0005205943016335368, "rewards/rejected": 0.0002342238585697487, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 2.894747257232666, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -1.6394548416137695, "logits/rejected": -1.6217561960220337, "logps/chosen": -45.99773025512695, "logps/rejected": -47.97636795043945, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.000765104778110981, "rewards/margins": 0.0005781695945188403, "rewards/rejected": 0.000186935139936395, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 3.5549099445343018, "learning_rate": 4.990683626304467e-08, "logits/chosen": -1.5345748662948608, "logits/rejected": -1.5293605327606201, "logps/chosen": -53.918365478515625, "logps/rejected": -56.037811279296875, "loss": 0.693, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.00044068036368116736, "rewards/margins": 0.00020219176076352596, "rewards/rejected": 0.00023848857381381094, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 3.221543788909912, "learning_rate": 4.9887882248931646e-08, "logits/chosen": -1.4587208032608032, "logits/rejected": -1.4379479885101318, "logps/chosen": -46.45323181152344, "logps/rejected": -47.54587173461914, "loss": 0.6931, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": 0.0005941767594777048, "rewards/margins": 6.761190161341801e-05, "rewards/rejected": 0.0005265648360364139, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 3.1585187911987305, "learning_rate": 4.986717871946393e-08, "logits/chosen": -1.4854298830032349, "logits/rejected": -1.4644863605499268, "logps/chosen": -45.88875198364258, "logps/rejected": -47.8057975769043, "loss": 0.6929, "rewards/accuracies": 0.53125, "rewards/chosen": 0.0008031625184230506, "rewards/margins": 0.0005653280531987548, "rewards/rejected": 0.00023783447977621108, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 3.0785837173461914, "learning_rate": 4.984472713001416e-08, "logits/chosen": -1.4299240112304688, "logits/rejected": -1.4215822219848633, "logps/chosen": -48.3651008605957, "logps/rejected": -48.36215591430664, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0007953125168569386, "rewards/margins": 0.000552699959371239, "rewards/rejected": 0.0002426125865895301, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 3.2375829219818115, "learning_rate": 4.982052905883637e-08, "logits/chosen": -1.5734624862670898, "logits/rejected": -1.5630805492401123, "logps/chosen": -48.487159729003906, "logps/rejected": -49.93968200683594, "loss": 0.6929, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0006789817707613111, "rewards/margins": 0.0005560799618251622, "rewards/rejected": 0.00012290175072848797, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 2.9218199253082275, "learning_rate": 4.979458620695505e-08, "logits/chosen": -1.552829384803772, "logits/rejected": -1.523561716079712, "logps/chosen": -52.486000061035156, "logps/rejected": -54.49695587158203, "loss": 0.6928, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0009576571173965931, "rewards/margins": 0.0007622435805387795, "rewards/rejected": 0.00019541350775398314, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 3.183899164199829, "learning_rate": 4.976690039804555e-08, "logits/chosen": -1.5768444538116455, "logits/rejected": -1.563186764717102, "logps/chosen": -42.62350845336914, "logps/rejected": -44.09015655517578, "loss": 0.6928, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.000733331311494112, "rewards/margins": 0.0006372180068865418, "rewards/rejected": 9.611332643544301e-05, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 2.7339372634887695, "learning_rate": 4.973747357830592e-08, "logits/chosen": -1.527267336845398, "logits/rejected": -1.525882601737976, "logps/chosen": -47.51892852783203, "logps/rejected": -53.13426971435547, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0010354900732636452, "rewards/margins": 0.0007501026266254485, "rewards/rejected": 0.0002853873302228749, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 2.791902780532837, "learning_rate": 4.970630781632009e-08, "logits/chosen": -1.6299854516983032, "logits/rejected": -1.619668960571289, "logps/chosen": -45.41339874267578, "logps/rejected": -49.07583236694336, "loss": 0.6928, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0008391918381676078, "rewards/margins": 0.0007512712036259472, "rewards/rejected": 8.792075095698237e-05, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 3.9196627140045166, "learning_rate": 4.967340530291242e-08, "logits/chosen": -1.534325361251831, "logits/rejected": -1.5173766613006592, "logps/chosen": -50.456092834472656, "logps/rejected": -51.076194763183594, "loss": 0.693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0006875869585201144, "rewards/margins": 0.00029638074920512736, "rewards/rejected": 0.0003912062384188175, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 2.6910152435302734, "learning_rate": 4.9638768350993755e-08, "logits/chosen": -1.5682487487792969, "logits/rejected": -1.5539703369140625, "logps/chosen": -42.393123626708984, "logps/rejected": -44.458831787109375, "loss": 0.6929, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0009301385725848377, "rewards/margins": 0.0005425423150882125, "rewards/rejected": 0.00038759634480811656, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 2.3632237911224365, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -1.5710407495498657, "logits/rejected": -1.5638505220413208, "logps/chosen": -43.065277099609375, "logps/rejected": -46.55408477783203, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0009118100861087441, "rewards/margins": 0.0007094824686646461, "rewards/rejected": 0.00020232764654792845, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 2.9160289764404297, "learning_rate": 4.9564300992714914e-08, "logits/chosen": -1.42928946018219, "logits/rejected": -1.4244521856307983, "logps/chosen": -45.373146057128906, "logps/rejected": -48.03164291381836, "loss": 0.6926, "rewards/accuracies": 0.59375, "rewards/chosen": 0.001190593116916716, "rewards/margins": 0.001122686779126525, "rewards/rejected": 6.790638872189447e-05, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 3.4644691944122314, "learning_rate": 4.952447582110253e-08, "logits/chosen": -1.6130802631378174, "logits/rejected": -1.5843290090560913, "logps/chosen": -45.46699905395508, "logps/rejected": -45.392459869384766, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0012219400377944112, "rewards/margins": 0.0004954574396833777, "rewards/rejected": 0.0007264827145263553, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 3.4215919971466064, "learning_rate": 4.948292668010676e-08, "logits/chosen": -1.5425606966018677, "logits/rejected": -1.5405076742172241, "logps/chosen": -47.15483474731445, "logps/rejected": -50.021751403808594, "loss": 0.6927, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0011111641069874167, "rewards/margins": 0.000992415938526392, "rewards/rejected": 0.00011874813208123669, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 3.4226503372192383, "learning_rate": 4.943965649046064e-08, "logits/chosen": -1.501985788345337, "logits/rejected": -1.4745370149612427, "logps/chosen": -49.8314323425293, "logps/rejected": -51.143699645996094, "loss": 0.6929, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": 0.0009843518491834402, "rewards/margins": 0.00042856420623138547, "rewards/rejected": 0.0005557876429520547, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 4.60683536529541, "learning_rate": 4.9394668293879835e-08, "logits/chosen": -1.4445635080337524, "logits/rejected": -1.4311821460723877, "logps/chosen": -49.72310256958008, "logps/rejected": -49.60923767089844, "loss": 0.6925, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0016157480422407389, "rewards/margins": 0.0012294099433347583, "rewards/rejected": 0.00038633812800981104, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 3.328009605407715, "learning_rate": 4.93479652528488e-08, "logits/chosen": -1.5311024188995361, "logits/rejected": -1.5204837322235107, "logps/chosen": -47.83661651611328, "logps/rejected": -50.61973190307617, "loss": 0.6928, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0011463447008281946, "rewards/margins": 0.0007241896819323301, "rewards/rejected": 0.0004221551935188472, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 2.7923624515533447, "learning_rate": 4.929955065039848e-08, "logits/chosen": -1.544571042060852, "logits/rejected": -1.5311849117279053, "logps/chosen": -46.463233947753906, "logps/rejected": -49.25968933105469, "loss": 0.6927, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0013559869257733226, "rewards/margins": 0.00096817099256441, "rewards/rejected": 0.00038781590410508215, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 2.8159823417663574, "learning_rate": 4.92494278898755e-08, "logits/chosen": -1.5252270698547363, "logits/rejected": -1.509064793586731, "logps/chosen": -41.342586517333984, "logps/rejected": -43.392704010009766, "loss": 0.6927, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0013612203765660524, "rewards/margins": 0.000969057553447783, "rewards/rejected": 0.0003921627067029476, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 3.322333574295044, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -1.4962725639343262, "logits/rejected": -1.4811670780181885, "logps/chosen": -49.289588928222656, "logps/rejected": -52.451210021972656, "loss": 0.6926, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0014330081176012754, "rewards/margins": 0.0010211131302639842, "rewards/rejected": 0.00041189510375261307, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 2.875361442565918, "learning_rate": 4.9144072108132725e-08, "logits/chosen": -1.5103423595428467, "logits/rejected": -1.490912675857544, "logps/chosen": -48.928245544433594, "logps/rejected": -51.06798553466797, "loss": 0.6927, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0015080540906637907, "rewards/margins": 0.000958929886110127, "rewards/rejected": 0.0005491242045536637, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 2.9019076824188232, "learning_rate": 4.908884649298937e-08, "logits/chosen": -1.5038750171661377, "logits/rejected": -1.5000956058502197, "logps/chosen": -46.73221206665039, "logps/rejected": -46.28728485107422, "loss": 0.6929, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0010901311179623008, "rewards/margins": 0.0005718341562896967, "rewards/rejected": 0.0005182969616726041, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 2.882866859436035, "learning_rate": 4.903192753140557e-08, "logits/chosen": -1.5271108150482178, "logits/rejected": -1.510434865951538, "logps/chosen": -48.88886260986328, "logps/rejected": -50.089317321777344, "loss": 0.6924, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0017092444468289614, "rewards/margins": 0.001493553863838315, "rewards/rejected": 0.00021569046657532454, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 3.3220198154449463, "learning_rate": 4.897331922454931e-08, "logits/chosen": -1.4528629779815674, "logits/rejected": -1.4513108730316162, "logps/chosen": -45.530757904052734, "logps/rejected": -48.61653518676758, "loss": 0.6927, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0014288409147411585, "rewards/margins": 0.0009447381016798317, "rewards/rejected": 0.00048410287126898766, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 3.242431640625, "learning_rate": 4.891302569234256e-08, "logits/chosen": -1.4737539291381836, "logits/rejected": -1.4677354097366333, "logps/chosen": -43.22222137451172, "logps/rejected": -45.910953521728516, "loss": 0.6923, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0018933138344436884, "rewards/margins": 0.0016789849614724517, "rewards/rejected": 0.0002143288729712367, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 2.821153163909912, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -1.4989246129989624, "logits/rejected": -1.4895321130752563, "logps/chosen": -48.4155387878418, "logps/rejected": -50.1673583984375, "loss": 0.6927, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0015832766657695174, "rewards/margins": 0.0008711686241440475, "rewards/rejected": 0.0007121083326637745, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 2.891462802886963, "learning_rate": 4.87874000235894e-08, "logits/chosen": -1.5470997095108032, "logits/rejected": -1.5371620655059814, "logps/chosen": -49.890480041503906, "logps/rejected": -53.466880798339844, "loss": 0.6923, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0018944181501865387, "rewards/margins": 0.001705197966657579, "rewards/rejected": 0.00018922006711363792, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 3.3358006477355957, "learning_rate": 4.872207671800876e-08, "logits/chosen": -1.5252196788787842, "logits/rejected": -1.5138275623321533, "logps/chosen": -46.89189910888672, "logps/rejected": -47.887367248535156, "loss": 0.6926, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0016973629826679826, "rewards/margins": 0.0011269793612882495, "rewards/rejected": 0.0005703835631720722, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 2.7009220123291016, "learning_rate": 4.865508584838841e-08, "logits/chosen": -1.5175960063934326, "logits/rejected": -1.5209373235702515, "logps/chosen": -44.73839569091797, "logps/rejected": -47.89008712768555, "loss": 0.6925, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0014606156619265676, "rewards/margins": 0.0012506326893344522, "rewards/rejected": 0.00020998305990360677, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 2.719752788543701, "learning_rate": 4.858643212390985e-08, "logits/chosen": -1.552513837814331, "logits/rejected": -1.5306655168533325, "logps/chosen": -46.937049865722656, "logps/rejected": -47.570526123046875, "loss": 0.6924, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0019067225512117147, "rewards/margins": 0.0016014000866562128, "rewards/rejected": 0.0003053225518669933, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 2.6290316581726074, "learning_rate": 4.851612037064643e-08, "logits/chosen": -1.5101115703582764, "logits/rejected": -1.5031507015228271, "logps/chosen": -41.800697326660156, "logps/rejected": -44.64197540283203, "loss": 0.6924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.002227164339274168, "rewards/margins": 0.0015375663060694933, "rewards/rejected": 0.0006895981496199965, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 2.2590839862823486, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -1.5194236040115356, "logits/rejected": -1.5119448900222778, "logps/chosen": -47.19294357299805, "logps/rejected": -47.494895935058594, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0023757501039654016, "rewards/margins": 0.0018679177155718207, "rewards/rejected": 0.0005078322719782591, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 3.8300580978393555, "learning_rate": 4.8370542664473805e-08, "logits/chosen": -1.5282552242279053, "logits/rejected": -1.5171794891357422, "logps/chosen": -47.18886947631836, "logps/rejected": -50.46569061279297, "loss": 0.6922, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0021307552233338356, "rewards/margins": 0.001966592390090227, "rewards/rejected": 0.00016416283324360847, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 2.7995476722717285, "learning_rate": 4.829528694507624e-08, "logits/chosen": -1.5349972248077393, "logits/rejected": -1.5197311639785767, "logps/chosen": -56.855506896972656, "logps/rejected": -56.77360153198242, "loss": 0.6923, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.002222384326159954, "rewards/margins": 0.0018006416503340006, "rewards/rejected": 0.00042174261761829257, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 3.1079301834106445, "learning_rate": 4.821839366319768e-08, "logits/chosen": -1.5740025043487549, "logits/rejected": -1.563123345375061, "logps/chosen": -47.606285095214844, "logps/rejected": -50.62641906738281, "loss": 0.6922, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0018488764762878418, "rewards/margins": 0.001883768243715167, "rewards/rejected": -3.4891847462859005e-05, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 3.0539097785949707, "learning_rate": 4.813986822411833e-08, "logits/chosen": -1.5950686931610107, "logits/rejected": -1.5871632099151611, "logps/chosen": -46.467430114746094, "logps/rejected": -47.59926223754883, "loss": 0.6924, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0018272616434842348, "rewards/margins": 0.0014566404279321432, "rewards/rejected": 0.0003706212155520916, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 2.957125663757324, "learning_rate": 4.805971614785231e-08, "logits/chosen": -1.5934646129608154, "logits/rejected": -1.5831199884414673, "logps/chosen": -44.18696975708008, "logps/rejected": -45.84946823120117, "loss": 0.6922, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0023696511052548885, "rewards/margins": 0.001978666288778186, "rewards/rejected": 0.00039098464185371995, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 3.1889781951904297, "learning_rate": 4.797794306875963e-08, "logits/chosen": -1.4427862167358398, "logits/rejected": -1.4459459781646729, "logps/chosen": -52.905487060546875, "logps/rejected": -56.0252685546875, "loss": 0.6925, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0017143869772553444, "rewards/margins": 0.0013808589428663254, "rewards/rejected": 0.000333528034389019, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 3.1252031326293945, "learning_rate": 4.7894554735150076e-08, "logits/chosen": -1.4938929080963135, "logits/rejected": -1.486290693283081, "logps/chosen": -50.43352508544922, "logps/rejected": -51.920677185058594, "loss": 0.6926, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0016551424050703645, "rewards/margins": 0.0010273735970258713, "rewards/rejected": 0.0006277688080444932, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 2.4982540607452393, "learning_rate": 4.7809557008879185e-08, "logits/chosen": -1.5264360904693604, "logits/rejected": -1.514701247215271, "logps/chosen": -42.05514907836914, "logps/rejected": -43.98528289794922, "loss": 0.692, "rewards/accuracies": 0.625, "rewards/chosen": 0.002506498945876956, "rewards/margins": 0.0023819494526833296, "rewards/rejected": 0.00012454968236852437, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 3.3386528491973877, "learning_rate": 4.772295586493613e-08, "logits/chosen": -1.5923378467559814, "logits/rejected": -1.5789897441864014, "logps/chosen": -46.37192916870117, "logps/rejected": -48.806602478027344, "loss": 0.6922, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.002459385199472308, "rewards/margins": 0.001840058364905417, "rewards/rejected": 0.0006193270673975348, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 2.304319143295288, "learning_rate": 4.763475739102374e-08, "logits/chosen": -1.4732818603515625, "logits/rejected": -1.468638300895691, "logps/chosen": -54.89690017700195, "logps/rejected": -55.61511993408203, "loss": 0.6923, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.002316520782187581, "rewards/margins": 0.00171504239551723, "rewards/rejected": 0.0006014782702550292, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 2.9147212505340576, "learning_rate": 4.754496778713054e-08, "logits/chosen": -1.4295583963394165, "logits/rejected": -1.443414330482483, "logps/chosen": -46.24966049194336, "logps/rejected": -50.85737228393555, "loss": 0.6925, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00283303065225482, "rewards/margins": 0.001242567435838282, "rewards/rejected": 0.0015904635656625032, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 2.9757370948791504, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -1.5650156736373901, "logits/rejected": -1.5567947626113892, "logps/chosen": -48.92793655395508, "logps/rejected": -51.33110809326172, "loss": 0.6923, "rewards/accuracies": 0.518750011920929, "rewards/chosen": 0.00195170973893255, "rewards/margins": 0.0016631295438855886, "rewards/rejected": 0.00028858016594313085, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 3.6602158546447754, "learning_rate": 4.736064054816145e-08, "logits/chosen": -1.5796701908111572, "logits/rejected": -1.571428656578064, "logps/chosen": -44.42335510253906, "logps/rejected": -47.71165084838867, "loss": 0.6918, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.003234829055145383, "rewards/margins": 0.0026938277296721935, "rewards/rejected": 0.0005410015000961721, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 2.7763288021087646, "learning_rate": 4.726611587052933e-08, "logits/chosen": -1.4306137561798096, "logits/rejected": -1.4332586526870728, "logps/chosen": -50.7025260925293, "logps/rejected": -55.78404998779297, "loss": 0.6925, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0023340326733887196, "rewards/margins": 0.0012934005353599787, "rewards/rejected": 0.0010406316723674536, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 3.8869450092315674, "learning_rate": 4.71700259768931e-08, "logits/chosen": -1.5388015508651733, "logits/rejected": -1.53265380859375, "logps/chosen": -50.47968673706055, "logps/rejected": -51.989112854003906, "loss": 0.6922, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.002496583852916956, "rewards/margins": 0.0018227867549285293, "rewards/rejected": 0.0006737969815731049, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 2.7834904193878174, "learning_rate": 4.707237762197549e-08, "logits/chosen": -1.5226459503173828, "logits/rejected": -1.5110045671463013, "logps/chosen": -47.024845123291016, "logps/rejected": -49.01807403564453, "loss": 0.6922, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0033415439538657665, "rewards/margins": 0.0018541000317782164, "rewards/rejected": 0.0014874439220875502, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 3.953099250793457, "learning_rate": 4.697317767005265e-08, "logits/chosen": -1.5304441452026367, "logits/rejected": -1.518947958946228, "logps/chosen": -43.01828384399414, "logps/rejected": -44.82352066040039, "loss": 0.692, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0028657042421400547, "rewards/margins": 0.0023243515752255917, "rewards/rejected": 0.0005413526087068021, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 2.848383665084839, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -1.5489187240600586, "logits/rejected": -1.5362749099731445, "logps/chosen": -46.616783142089844, "logps/rejected": -48.41503143310547, "loss": 0.6927, "rewards/accuracies": 0.5, "rewards/chosen": 0.002097527962177992, "rewards/margins": 0.0009780559921637177, "rewards/rejected": 0.0011194719700142741, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 2.6786868572235107, "learning_rate": 4.677015097715994e-08, "logits/chosen": -1.4802360534667969, "logits/rejected": -1.472486972808838, "logps/chosen": -43.47395706176758, "logps/rejected": -46.79792785644531, "loss": 0.6916, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0036659184843301773, "rewards/margins": 0.003058892907574773, "rewards/rejected": 0.0006070258095860481, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 2.420900583267212, "learning_rate": 4.666633850812825e-08, "logits/chosen": -1.523895502090454, "logits/rejected": -1.5075430870056152, "logps/chosen": -46.26005554199219, "logps/rejected": -48.20013427734375, "loss": 0.6921, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.002487817080691457, "rewards/margins": 0.002060279715806246, "rewards/rejected": 0.0004275373066775501, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 2.367501974105835, "learning_rate": 4.656100298496439e-08, "logits/chosen": -1.43381929397583, "logits/rejected": -1.4203495979309082, "logps/chosen": -41.25200271606445, "logps/rejected": -44.32691955566406, "loss": 0.6917, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0034980247728526592, "rewards/margins": 0.002962891710922122, "rewards/rejected": 0.0005351334111765027, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 3.0824179649353027, "learning_rate": 4.6454151812320715e-08, "logits/chosen": -1.5102834701538086, "logits/rejected": -1.4844160079956055, "logps/chosen": -47.216556549072266, "logps/rejected": -48.71002960205078, "loss": 0.6916, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0031948499381542206, "rewards/margins": 0.0031615805346518755, "rewards/rejected": 3.326934529468417e-05, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 3.8348448276519775, "learning_rate": 4.6345792501393434e-08, "logits/chosen": -1.4996126890182495, "logits/rejected": -1.4940050840377808, "logps/chosen": -53.705413818359375, "logps/rejected": -57.729454040527344, "loss": 0.6917, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0037941508926451206, "rewards/margins": 0.002940600039437413, "rewards/rejected": 0.0008535509696230292, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 3.0553150177001953, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -1.5072972774505615, "logits/rejected": -1.499051809310913, "logps/chosen": -48.09918975830078, "logps/rejected": -51.08711624145508, "loss": 0.6916, "rewards/accuracies": 0.625, "rewards/chosen": 0.004021850414574146, "rewards/margins": 0.0031438730657100677, "rewards/rejected": 0.0008779771742410958, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 3.5560803413391113, "learning_rate": 4.612458003901698e-08, "logits/chosen": -1.5295929908752441, "logits/rejected": -1.5242204666137695, "logps/chosen": -52.4840202331543, "logps/rejected": -56.06623458862305, "loss": 0.6914, "rewards/accuracies": 0.625, "rewards/chosen": 0.002851675031706691, "rewards/margins": 0.0034307793248444796, "rewards/rejected": -0.0005791039438918233, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 3.1871278285980225, "learning_rate": 4.6011742437890476e-08, "logits/chosen": -1.5383789539337158, "logits/rejected": -1.515812873840332, "logps/chosen": -47.310516357421875, "logps/rejected": -48.80550765991211, "loss": 0.6918, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0031328003387898207, "rewards/margins": 0.002742994111031294, "rewards/rejected": 0.0003898058203049004, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 2.296072483062744, "learning_rate": 4.589742779803259e-08, "logits/chosen": -1.5470924377441406, "logits/rejected": -1.5346300601959229, "logps/chosen": -46.48540115356445, "logps/rejected": -48.65108108520508, "loss": 0.6919, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0027128090150654316, "rewards/margins": 0.002509825862944126, "rewards/rejected": 0.00020298334129620343, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 2.6368446350097656, "learning_rate": 4.5781644155290486e-08, "logits/chosen": -1.4871833324432373, "logits/rejected": -1.4772343635559082, "logps/chosen": -45.52531433105469, "logps/rejected": -46.510433197021484, "loss": 0.6915, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004077502526342869, "rewards/margins": 0.00330347940325737, "rewards/rejected": 0.0007740228320471942, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 2.8489768505096436, "learning_rate": 4.566439964877613e-08, "logits/chosen": -1.5221761465072632, "logits/rejected": -1.5160510540008545, "logps/chosen": -43.423152923583984, "logps/rejected": -45.22438049316406, "loss": 0.6921, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.002408969681710005, "rewards/margins": 0.002129070693627, "rewards/rejected": 0.0002798990753944963, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 2.916175127029419, "learning_rate": 4.554570252029421e-08, "logits/chosen": -1.5706638097763062, "logits/rejected": -1.561694622039795, "logps/chosen": -46.79415512084961, "logps/rejected": -49.107398986816406, "loss": 0.6911, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004531461279839277, "rewards/margins": 0.004187966696918011, "rewards/rejected": 0.0003434947575442493, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 2.6654767990112305, "learning_rate": 4.542556111376274e-08, "logits/chosen": -1.5651055574417114, "logits/rejected": -1.55381178855896, "logps/chosen": -48.72823715209961, "logps/rejected": -50.8982048034668, "loss": 0.6917, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.003072004299610853, "rewards/margins": 0.0029076444916427135, "rewards/rejected": 0.00016435966244898736, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 3.0894460678100586, "learning_rate": 4.5303983874626506e-08, "logits/chosen": -1.541265606880188, "logits/rejected": -1.5295203924179077, "logps/chosen": -50.63302993774414, "logps/rejected": -51.34119415283203, "loss": 0.6919, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0029607131145894527, "rewards/margins": 0.0025523048825562, "rewards/rejected": 0.00040840805741027, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 3.537862539291382, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.460115909576416, "logits/rejected": -1.4346593618392944, "logps/chosen": -46.76906967163086, "logps/rejected": -46.864540100097656, "loss": 0.6916, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003340616822242737, "rewards/margins": 0.0031544927041977644, "rewards/rejected": 0.00018612373969517648, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 3.8841352462768555, "learning_rate": 4.505655618438363e-08, "logits/chosen": -1.4248067140579224, "logits/rejected": -1.4110429286956787, "logps/chosen": -48.690521240234375, "logps/rejected": -49.68914794921875, "loss": 0.6918, "rewards/accuracies": 0.5625, "rewards/chosen": 0.002877553692087531, "rewards/margins": 0.0027670259587466717, "rewards/rejected": 0.00011052779154852033, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 2.915313482284546, "learning_rate": 4.4930723126421945e-08, "logits/chosen": -1.5898973941802979, "logits/rejected": -1.5667184591293335, "logps/chosen": -49.06376647949219, "logps/rejected": -50.36428451538086, "loss": 0.6916, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0030966580379754305, "rewards/margins": 0.0030564782209694386, "rewards/rejected": 4.0179769712267444e-05, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 3.2781777381896973, "learning_rate": 4.48034890209227e-08, "logits/chosen": -1.465415120124817, "logits/rejected": -1.4456936120986938, "logps/chosen": -51.64118576049805, "logps/rejected": -53.60643768310547, "loss": 0.6914, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00304031022824347, "rewards/margins": 0.003465033369138837, "rewards/rejected": -0.000424722908064723, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 2.642411470413208, "learning_rate": 4.4674862811918155e-08, "logits/chosen": -1.4467787742614746, "logits/rejected": -1.4440176486968994, "logps/chosen": -43.359764099121094, "logps/rejected": -46.50818634033203, "loss": 0.6915, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.003694910556077957, "rewards/margins": 0.003382860217243433, "rewards/rejected": 0.00031205095001496375, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 3.445678234100342, "learning_rate": 4.454485354129966e-08, "logits/chosen": -1.4948651790618896, "logits/rejected": -1.4892648458480835, "logps/chosen": -46.580467224121094, "logps/rejected": -50.10879135131836, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004008608870208263, "rewards/margins": 0.003896749345585704, "rewards/rejected": 0.0001118591899285093, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 2.9829981327056885, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -1.4465805292129517, "logits/rejected": -1.4232655763626099, "logps/chosen": -48.75006866455078, "logps/rejected": -50.792945861816406, "loss": 0.6913, "rewards/accuracies": 0.53125, "rewards/chosen": 0.003733579069375992, "rewards/margins": 0.0036661014892160892, "rewards/rejected": 6.747785664629191e-05, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 3.606907367706299, "learning_rate": 4.42807224682615e-08, "logits/chosen": -1.502768874168396, "logits/rejected": -1.490201473236084, "logps/chosen": -42.87495040893555, "logps/rejected": -45.96978759765625, "loss": 0.6909, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0038322817999869585, "rewards/margins": 0.004567673895508051, "rewards/rejected": -0.0007353918626904488, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 2.5278074741363525, "learning_rate": 4.4146619233165604e-08, "logits/chosen": -1.5510034561157227, "logits/rejected": -1.546442985534668, "logps/chosen": -50.644317626953125, "logps/rejected": -54.01775360107422, "loss": 0.6917, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0030824330169707537, "rewards/margins": 0.0030220781918615103, "rewards/rejected": 6.0354708693921566e-05, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 3.128469944000244, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -1.5054194927215576, "logits/rejected": -1.521511435508728, "logps/chosen": -46.49076461791992, "logps/rejected": -53.44841766357422, "loss": 0.6916, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00285265501588583, "rewards/margins": 0.003102459479123354, "rewards/rejected": -0.00024980431771837175, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 3.1893060207366943, "learning_rate": 4.387438449967594e-08, "logits/chosen": -1.4547739028930664, "logits/rejected": -1.4414390325546265, "logps/chosen": -45.3841438293457, "logps/rejected": -47.97360610961914, "loss": 0.6907, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00473793875426054, "rewards/margins": 0.005001851357519627, "rewards/rejected": -0.0002639126614667475, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 3.4499995708465576, "learning_rate": 4.373627213825983e-08, "logits/chosen": -1.609289526939392, "logits/rejected": -1.599395751953125, "logps/chosen": -46.193485260009766, "logps/rejected": -49.694297790527344, "loss": 0.6905, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004828107543289661, "rewards/margins": 0.005287128034979105, "rewards/rejected": -0.00045902031706646085, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 2.4953017234802246, "learning_rate": 4.359684269427848e-08, "logits/chosen": -1.5663963556289673, "logits/rejected": -1.563528299331665, "logps/chosen": -45.60202407836914, "logps/rejected": -49.13114929199219, "loss": 0.6911, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.004381075501441956, "rewards/margins": 0.0041623106226325035, "rewards/rejected": 0.00021876460232306272, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 3.026409149169922, "learning_rate": 4.34561059690461e-08, "logits/chosen": -1.6096134185791016, "logits/rejected": -1.608337163925171, "logps/chosen": -47.30002975463867, "logps/rejected": -48.903472900390625, "loss": 0.6922, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.0020707384683191776, "rewards/margins": 0.0018296821508556604, "rewards/rejected": 0.00024105608463287354, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 2.6735422611236572, "learning_rate": 4.3314071855773314e-08, "logits/chosen": -1.5702834129333496, "logits/rejected": -1.5721920728683472, "logps/chosen": -41.86522674560547, "logps/rejected": -45.095703125, "loss": 0.6912, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0037654649931937456, "rewards/margins": 0.004000894725322723, "rewards/rejected": -0.0002354306634515524, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 3.103926420211792, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -1.5061908960342407, "logits/rejected": -1.4911664724349976, "logps/chosen": -46.52951431274414, "logps/rejected": -49.689754486083984, "loss": 0.6909, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.004671442788094282, "rewards/margins": 0.004490147810429335, "rewards/rejected": 0.00018129443924408406, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 3.006434917449951, "learning_rate": 4.3026151493252414e-08, "logits/chosen": -1.549617886543274, "logits/rejected": -1.5283677577972412, "logps/chosen": -51.5010871887207, "logps/rejected": -52.90639114379883, "loss": 0.6909, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.003953360952436924, "rewards/margins": 0.0045930324122309685, "rewards/rejected": -0.0006396712851710618, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 3.5157463550567627, "learning_rate": 4.2880285483616895e-08, "logits/chosen": -1.5331405401229858, "logits/rejected": -1.5302000045776367, "logps/chosen": -45.698429107666016, "logps/rejected": -48.99650955200195, "loss": 0.6912, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004010093864053488, "rewards/margins": 0.003920042887330055, "rewards/rejected": 9.005082392832264e-05, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 2.748664617538452, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.4038106203079224, "logits/rejected": -1.39895498752594, "logps/chosen": -52.240440368652344, "logps/rejected": -53.147804260253906, "loss": 0.6912, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004330686293542385, "rewards/margins": 0.004021846689283848, "rewards/rejected": 0.00030883977888152003, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 3.36702299118042, "learning_rate": 4.258479307576576e-08, "logits/chosen": -1.5006659030914307, "logits/rejected": -1.4956198930740356, "logps/chosen": -43.774024963378906, "logps/rejected": -45.699832916259766, "loss": 0.6903, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0054828086867928505, "rewards/margins": 0.005769900046288967, "rewards/rejected": -0.0002870913012884557, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 2.7340095043182373, "learning_rate": 4.243518744944626e-08, "logits/chosen": -1.504451870918274, "logits/rejected": -1.5006357431411743, "logps/chosen": -43.27858352661133, "logps/rejected": -47.12054443359375, "loss": 0.6909, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.003977509681135416, "rewards/margins": 0.004465220961719751, "rewards/rejected": -0.0004877113678958267, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 3.410355567932129, "learning_rate": 4.22843562014427e-08, "logits/chosen": -1.4497559070587158, "logits/rejected": -1.439967155456543, "logps/chosen": -46.891605377197266, "logps/rejected": -49.04447937011719, "loss": 0.6918, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003908143378794193, "rewards/margins": 0.002761934418231249, "rewards/rejected": 0.0011462090769782662, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 3.0687458515167236, "learning_rate": 4.2132309934569e-08, "logits/chosen": -1.5673506259918213, "logits/rejected": -1.5622578859329224, "logps/chosen": -43.75770950317383, "logps/rejected": -46.118858337402344, "loss": 0.6913, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005200340412557125, "rewards/margins": 0.003705868497490883, "rewards/rejected": 0.0014944719150662422, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 2.4820141792297363, "learning_rate": 4.197905933704989e-08, "logits/chosen": -1.4312834739685059, "logits/rejected": -1.4214531183242798, "logps/chosen": -47.255332946777344, "logps/rejected": -49.971588134765625, "loss": 0.6895, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005209643859416246, "rewards/margins": 0.007327331695705652, "rewards/rejected": -0.0021176892332732677, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 2.7282636165618896, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -1.4861654043197632, "logits/rejected": -1.4929125308990479, "logps/chosen": -43.8185920715332, "logps/rejected": -47.74663162231445, "loss": 0.6902, "rewards/accuracies": 0.625, "rewards/chosen": 0.004913496784865856, "rewards/margins": 0.005964468698948622, "rewards/rejected": -0.0010509720304980874, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 3.1228253841400146, "learning_rate": 4.1668988325514434e-08, "logits/chosen": -1.5240795612335205, "logits/rejected": -1.5136892795562744, "logps/chosen": -49.41614532470703, "logps/rejected": -52.12762451171875, "loss": 0.6899, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004956229589879513, "rewards/margins": 0.006649328861385584, "rewards/rejected": -0.0016930990386754274, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 3.0137906074523926, "learning_rate": 4.1512189708209844e-08, "logits/chosen": -1.5741335153579712, "logits/rejected": -1.5646674633026123, "logps/chosen": -38.225059509277344, "logps/rejected": -39.43558883666992, "loss": 0.6906, "rewards/accuracies": 0.625, "rewards/chosen": 0.005329563282430172, "rewards/margins": 0.005151194520294666, "rewards/rejected": 0.00017836911138147116, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 3.6123361587524414, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -1.5040456056594849, "logits/rejected": -1.4911547899246216, "logps/chosen": -56.4525260925293, "logps/rejected": -56.65864944458008, "loss": 0.6907, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0040727341547608376, "rewards/margins": 0.004969144240021706, "rewards/rejected": -0.0008964102598838508, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 2.5805535316467285, "learning_rate": 4.119512136122882e-08, "logits/chosen": -1.608700156211853, "logits/rejected": -1.621206283569336, "logps/chosen": -42.28131866455078, "logps/rejected": -48.45494842529297, "loss": 0.6897, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004296852741390467, "rewards/margins": 0.0070196837186813354, "rewards/rejected": -0.002722830278798938, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 3.4811322689056396, "learning_rate": 4.103487392014795e-08, "logits/chosen": -1.4754607677459717, "logits/rejected": -1.4568121433258057, "logps/chosen": -46.388267517089844, "logps/rejected": -51.01404571533203, "loss": 0.689, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.005554754287004471, "rewards/margins": 0.008482200093567371, "rewards/rejected": -0.002927445573732257, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 2.9166300296783447, "learning_rate": 4.087349929364192e-08, "logits/chosen": -1.5654090642929077, "logits/rejected": -1.544526219367981, "logps/chosen": -42.55342483520508, "logps/rejected": -45.895294189453125, "loss": 0.6896, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004680173471570015, "rewards/margins": 0.007236185017973185, "rewards/rejected": -0.0025560115464031696, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 2.4293131828308105, "learning_rate": 4.0711008825680645e-08, "logits/chosen": -1.5042253732681274, "logits/rejected": -1.4851328134536743, "logps/chosen": -47.321651458740234, "logps/rejected": -50.18535614013672, "loss": 0.6904, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.004479935858398676, "rewards/margins": 0.005603041499853134, "rewards/rejected": -0.0011231055250391364, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 3.547532796859741, "learning_rate": 4.054741393867306e-08, "logits/chosen": -1.4755966663360596, "logits/rejected": -1.4644145965576172, "logps/chosen": -54.06425857543945, "logps/rejected": -55.69597244262695, "loss": 0.6902, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004462078679352999, "rewards/margins": 0.005896457936614752, "rewards/rejected": -0.0014343796065077186, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 2.9653351306915283, "learning_rate": 4.038272613266419e-08, "logits/chosen": -1.5455403327941895, "logits/rejected": -1.5216772556304932, "logps/chosen": -44.87678909301758, "logps/rejected": -47.38240432739258, "loss": 0.6902, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.005075398366898298, "rewards/margins": 0.006029829382896423, "rewards/rejected": -0.0009544305503368378, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 3.3833298683166504, "learning_rate": 4.0216956984526784e-08, "logits/chosen": -1.5539488792419434, "logits/rejected": -1.5496807098388672, "logps/chosen": -42.88352966308594, "logps/rejected": -45.572662353515625, "loss": 0.6896, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005250538233667612, "rewards/margins": 0.007218127138912678, "rewards/rejected": -0.001967588672414422, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 3.101933002471924, "learning_rate": 4.0050118147147446e-08, "logits/chosen": -1.5179487466812134, "logits/rejected": -1.5099337100982666, "logps/chosen": -53.398643493652344, "logps/rejected": -52.119972229003906, "loss": 0.6921, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.0032478254288434982, "rewards/margins": 0.0021998791489750147, "rewards/rejected": 0.0010479463962838054, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 3.1006407737731934, "learning_rate": 3.988222134860755e-08, "logits/chosen": -1.5638402700424194, "logits/rejected": -1.550837516784668, "logps/chosen": -47.294315338134766, "logps/rejected": -51.660377502441406, "loss": 0.69, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0049809180200099945, "rewards/margins": 0.006470340304076672, "rewards/rejected": -0.0014894230989739299, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 3.0943164825439453, "learning_rate": 3.9713278391358724e-08, "logits/chosen": -1.5746233463287354, "logits/rejected": -1.5629007816314697, "logps/chosen": -45.94882583618164, "logps/rejected": -49.18882369995117, "loss": 0.6904, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005064360331743956, "rewards/margins": 0.005639818962663412, "rewards/rejected": -0.0005754587473347783, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 2.466256618499756, "learning_rate": 3.954330115139328e-08, "logits/chosen": -1.5431041717529297, "logits/rejected": -1.532755732536316, "logps/chosen": -46.7624397277832, "logps/rejected": -48.83042526245117, "loss": 0.6899, "rewards/accuracies": 0.625, "rewards/chosen": 0.004654773510992527, "rewards/margins": 0.006525079254060984, "rewards/rejected": -0.0018703056266531348, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 4.066368103027344, "learning_rate": 3.937230157740931e-08, "logits/chosen": -1.591922402381897, "logits/rejected": -1.573209285736084, "logps/chosen": -47.90293884277344, "logps/rejected": -51.45854949951172, "loss": 0.6893, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.006319095846265554, "rewards/margins": 0.007731067482382059, "rewards/rejected": -0.001411972800269723, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 2.3516957759857178, "learning_rate": 3.920029168997077e-08, "logits/chosen": -1.5559136867523193, "logits/rejected": -1.5427807569503784, "logps/chosen": -48.74748992919922, "logps/rejected": -51.54194259643555, "loss": 0.6904, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.004629113245755434, "rewards/margins": 0.005624239332973957, "rewards/rejected": -0.0009951259708032012, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 3.9600844383239746, "learning_rate": 3.9027283580662476e-08, "logits/chosen": -1.519817590713501, "logits/rejected": -1.5075610876083374, "logps/chosen": -49.621826171875, "logps/rejected": -52.77042770385742, "loss": 0.6888, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.005040620919317007, "rewards/margins": 0.008789965882897377, "rewards/rejected": -0.0037493451964110136, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 3.975217819213867, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.5018306970596313, "logits/rejected": -1.4887568950653076, "logps/chosen": -45.943092346191406, "logps/rejected": -50.64377975463867, "loss": 0.6896, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.005147297866642475, "rewards/margins": 0.007178432308137417, "rewards/rejected": -0.0020311346743255854, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 3.0368916988372803, "learning_rate": 3.867832141277539e-08, "logits/chosen": -1.5485479831695557, "logits/rejected": -1.5292450189590454, "logps/chosen": -49.10287857055664, "logps/rejected": -51.29143142700195, "loss": 0.6899, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004347370471805334, "rewards/margins": 0.00650381064042449, "rewards/rejected": -0.002156440168619156, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 3.4196557998657227, "learning_rate": 3.850239188479606e-08, "logits/chosen": -1.4620041847229004, "logits/rejected": -1.4584031105041504, "logps/chosen": -46.734825134277344, "logps/rejected": -49.128395080566406, "loss": 0.6898, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0048998757265508175, "rewards/margins": 0.0066976905800402164, "rewards/rejected": -0.0017978150863200426, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 3.5657782554626465, "learning_rate": 3.832551319442151e-08, "logits/chosen": -1.5856021642684937, "logits/rejected": -1.5821958780288696, "logps/chosen": -49.59635543823242, "logps/rejected": -53.7983283996582, "loss": 0.6899, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0052263312973082066, "rewards/margins": 0.006498755421489477, "rewards/rejected": -0.0012724247062578797, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 4.230812072753906, "learning_rate": 3.81476977754933e-08, "logits/chosen": -1.3997455835342407, "logits/rejected": -1.3851207494735718, "logps/chosen": -51.304656982421875, "logps/rejected": -50.67051315307617, "loss": 0.6901, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004272806458175182, "rewards/margins": 0.006219337694346905, "rewards/rejected": -0.0019465312361717224, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 2.398345470428467, "learning_rate": 3.796895812770114e-08, "logits/chosen": -1.5023247003555298, "logits/rejected": -1.4925954341888428, "logps/chosen": -45.7723503112793, "logps/rejected": -47.24066925048828, "loss": 0.6894, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0059529999271035194, "rewards/margins": 0.0075266240164637566, "rewards/rejected": -0.0015736244386062026, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 3.0664162635803223, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -1.5271222591400146, "logits/rejected": -1.5166699886322021, "logps/chosen": -40.933349609375, "logps/rejected": -42.03236770629883, "loss": 0.6908, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.003515125485137105, "rewards/margins": 0.00480083329603076, "rewards/rejected": -0.001285707694478333, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 2.647942543029785, "learning_rate": 3.760875646824795e-08, "logits/chosen": -1.3960988521575928, "logits/rejected": -1.3973405361175537, "logps/chosen": -46.11708068847656, "logps/rejected": -48.433963775634766, "loss": 0.6896, "rewards/accuracies": 0.625, "rewards/chosen": 0.0037715521175414324, "rewards/margins": 0.007300334516912699, "rewards/rejected": -0.003528781933709979, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 3.641119956970215, "learning_rate": 3.742731977727623e-08, "logits/chosen": -1.5391809940338135, "logits/rejected": -1.5319098234176636, "logps/chosen": -45.204803466796875, "logps/rejected": -49.10814666748047, "loss": 0.6896, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.006137064192444086, "rewards/margins": 0.007269109599292278, "rewards/rejected": -0.0011320464545860887, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 3.703112840652466, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.4356962442398071, "logits/rejected": -1.4204628467559814, "logps/chosen": -45.42398452758789, "logps/rejected": -49.50120162963867, "loss": 0.6892, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.003938040696084499, "rewards/margins": 0.008085368201136589, "rewards/rejected": -0.004147327970713377, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 2.649240732192993, "learning_rate": 3.7061838443196886e-08, "logits/chosen": -1.511671543121338, "logits/rejected": -1.5024120807647705, "logps/chosen": -50.048927307128906, "logps/rejected": -52.17205810546875, "loss": 0.6885, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005982678383588791, "rewards/margins": 0.009426699951291084, "rewards/rejected": -0.003444021102041006, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 2.996338129043579, "learning_rate": 3.68778194919179e-08, "logits/chosen": -1.4723883867263794, "logits/rejected": -1.4665647745132446, "logps/chosen": -50.0838737487793, "logps/rejected": -53.330780029296875, "loss": 0.6878, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.008046514354646206, "rewards/margins": 0.010942642576992512, "rewards/rejected": -0.0028961277566850185, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 3.614102840423584, "learning_rate": 3.66929655789747e-08, "logits/chosen": -1.570845365524292, "logits/rejected": -1.5528652667999268, "logps/chosen": -41.95561599731445, "logps/rejected": -46.511497497558594, "loss": 0.689, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.005627007223665714, "rewards/margins": 0.008476624265313148, "rewards/rejected": -0.0028496168088167906, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 2.334775447845459, "learning_rate": 3.6507289698834064e-08, "logits/chosen": -1.4717720746994019, "logits/rejected": -1.455172061920166, "logps/chosen": -43.546348571777344, "logps/rejected": -46.07920455932617, "loss": 0.689, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005223124288022518, "rewards/margins": 0.008515788242220879, "rewards/rejected": -0.003292663721367717, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 4.027002334594727, "learning_rate": 3.6320804903743684e-08, "logits/chosen": -1.5162203311920166, "logits/rejected": -1.5112934112548828, "logps/chosen": -45.405494689941406, "logps/rejected": -49.15242385864258, "loss": 0.6888, "rewards/accuracies": 0.65625, "rewards/chosen": 0.0035115727223455906, "rewards/margins": 0.008787490427494049, "rewards/rejected": -0.005275918636471033, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 2.6576426029205322, "learning_rate": 3.61335243028146e-08, "logits/chosen": -1.495697021484375, "logits/rejected": -1.489630937576294, "logps/chosen": -48.88677978515625, "logps/rejected": -51.62388229370117, "loss": 0.6891, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.004441169556230307, "rewards/margins": 0.008156510069966316, "rewards/rejected": -0.0037153400480747223, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 3.2648770809173584, "learning_rate": 3.5945461061099736e-08, "logits/chosen": -1.4391025304794312, "logits/rejected": -1.4078892469406128, "logps/chosen": -50.779335021972656, "logps/rejected": -49.67884063720703, "loss": 0.6872, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.006459861062467098, "rewards/margins": 0.012035077437758446, "rewards/rejected": -0.005575217306613922, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 2.9683494567871094, "learning_rate": 3.5756628398668446e-08, "logits/chosen": -1.5588018894195557, "logits/rejected": -1.557953953742981, "logps/chosen": -51.247032165527344, "logps/rejected": -53.68225860595703, "loss": 0.6886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0037513356655836105, "rewards/margins": 0.009223448112607002, "rewards/rejected": -0.005472113378345966, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 2.657179832458496, "learning_rate": 3.556703958967716e-08, "logits/chosen": -1.557582974433899, "logits/rejected": -1.5438480377197266, "logps/chosen": -44.341522216796875, "logps/rejected": -47.97272491455078, "loss": 0.6896, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0037257769145071507, "rewards/margins": 0.007122798822820187, "rewards/rejected": -0.003397023305296898, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 4.005496978759766, "learning_rate": 3.5376707961436297e-08, "logits/chosen": -1.5314931869506836, "logits/rejected": -1.5151678323745728, "logps/chosen": -53.3531494140625, "logps/rejected": -53.47139358520508, "loss": 0.6906, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004263926763087511, "rewards/margins": 0.005168497562408447, "rewards/rejected": -0.000904570915736258, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 2.4250881671905518, "learning_rate": 3.51856468934734e-08, "logits/chosen": -1.4921042919158936, "logits/rejected": -1.4953409433364868, "logps/chosen": -46.357215881347656, "logps/rejected": -48.65216064453125, "loss": 0.6912, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004532798193395138, "rewards/margins": 0.003982014954090118, "rewards/rejected": 0.0005507826572284102, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 3.3605597019195557, "learning_rate": 3.499386981659262e-08, "logits/chosen": -1.5788064002990723, "logits/rejected": -1.570356011390686, "logps/chosen": -45.479488372802734, "logps/rejected": -51.7054328918457, "loss": 0.689, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.006253694649785757, "rewards/margins": 0.008554017171263695, "rewards/rejected": -0.0023003218229860067, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 2.548804759979248, "learning_rate": 3.480139021193057e-08, "logits/chosen": -1.462729811668396, "logits/rejected": -1.4625592231750488, "logps/chosen": -46.50673294067383, "logps/rejected": -49.94841384887695, "loss": 0.6896, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.003766898764297366, "rewards/margins": 0.007147823926061392, "rewards/rejected": -0.0033809244632720947, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 4.094874382019043, "learning_rate": 3.4608221610008666e-08, "logits/chosen": -1.5544965267181396, "logits/rejected": -1.5445128679275513, "logps/chosen": -40.694557189941406, "logps/rejected": -45.3368034362793, "loss": 0.6878, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00552348280325532, "rewards/margins": 0.010925527662038803, "rewards/rejected": -0.005402045324444771, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 2.3106565475463867, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.4896498918533325, "logits/rejected": -1.4890059232711792, "logps/chosen": -44.28178787231445, "logps/rejected": -46.68126678466797, "loss": 0.689, "rewards/accuracies": 0.625, "rewards/chosen": 0.003096462693065405, "rewards/margins": 0.00837036594748497, "rewards/rejected": -0.0052739037200808525, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 2.239630937576294, "learning_rate": 3.4219871777684745e-08, "logits/chosen": -1.5045303106307983, "logits/rejected": -1.48006272315979, "logps/chosen": -48.24463653564453, "logps/rejected": -49.66504669189453, "loss": 0.6888, "rewards/accuracies": 0.5625, "rewards/chosen": 0.004429197870194912, "rewards/margins": 0.0088044423609972, "rewards/rejected": -0.004375244490802288, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 3.131451368331909, "learning_rate": 3.4024717846672364e-08, "logits/chosen": -1.5544954538345337, "logits/rejected": -1.5410051345825195, "logps/chosen": -43.851314544677734, "logps/rejected": -47.097129821777344, "loss": 0.6885, "rewards/accuracies": 0.625, "rewards/chosen": 0.0032080274540930986, "rewards/margins": 0.009510315954685211, "rewards/rejected": -0.006302288733422756, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 3.247105121612549, "learning_rate": 3.382892951526036e-08, "logits/chosen": -1.5086390972137451, "logits/rejected": -1.4984405040740967, "logps/chosen": -48.57241439819336, "logps/rejected": -53.5483283996582, "loss": 0.688, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004579062573611736, "rewards/margins": 0.010537253692746162, "rewards/rejected": -0.005958191119134426, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 3.0748696327209473, "learning_rate": 3.3632520546559974e-08, "logits/chosen": -1.4774866104125977, "logits/rejected": -1.450300693511963, "logps/chosen": -42.174537658691406, "logps/rejected": -46.27776336669922, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005281184334307909, "rewards/margins": 0.009902546182274818, "rewards/rejected": -0.004621362779289484, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 3.4227564334869385, "learning_rate": 3.34355047473107e-08, "logits/chosen": -1.509340763092041, "logits/rejected": -1.4928722381591797, "logps/chosen": -49.13660430908203, "logps/rejected": -50.32002639770508, "loss": 0.6892, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.003565728198736906, "rewards/margins": 0.008065813221037388, "rewards/rejected": -0.004500086419284344, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 3.2549450397491455, "learning_rate": 3.323789596690971e-08, "logits/chosen": -1.4439764022827148, "logits/rejected": -1.440734624862671, "logps/chosen": -46.024192810058594, "logps/rejected": -50.3931999206543, "loss": 0.6884, "rewards/accuracies": 0.625, "rewards/chosen": 0.004367457702755928, "rewards/margins": 0.009686267003417015, "rewards/rejected": -0.005318809300661087, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 2.0359508991241455, "learning_rate": 3.303970809643828e-08, "logits/chosen": -1.5255951881408691, "logits/rejected": -1.5279831886291504, "logps/chosen": -45.330116271972656, "logps/rejected": -49.04480743408203, "loss": 0.689, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005456699524074793, "rewards/margins": 0.008413759991526604, "rewards/rejected": -0.002957060467451811, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 3.008654832839966, "learning_rate": 3.2840955067685356e-08, "logits/chosen": -1.5634180307388306, "logits/rejected": -1.5633050203323364, "logps/chosen": -45.995826721191406, "logps/rejected": -50.449180603027344, "loss": 0.6875, "rewards/accuracies": 0.65625, "rewards/chosen": 0.005350454244762659, "rewards/margins": 0.011478688567876816, "rewards/rejected": -0.006128234788775444, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 2.7495784759521484, "learning_rate": 3.264165085216817e-08, "logits/chosen": -1.5800001621246338, "logits/rejected": -1.572776436805725, "logps/chosen": -38.539920806884766, "logps/rejected": -43.823753356933594, "loss": 0.6888, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.004405132494866848, "rewards/margins": 0.008914101868867874, "rewards/rejected": -0.004508969374001026, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 4.20028018951416, "learning_rate": 3.244180946015008e-08, "logits/chosen": -1.4439995288848877, "logits/rejected": -1.435498833656311, "logps/chosen": -52.166969299316406, "logps/rejected": -53.882225036621094, "loss": 0.6898, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.004471774213016033, "rewards/margins": 0.0068550496362149715, "rewards/rejected": -0.0023832768201828003, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 2.5121569633483887, "learning_rate": 3.224144493965578e-08, "logits/chosen": -1.5799609422683716, "logits/rejected": -1.5780445337295532, "logps/chosen": -43.653690338134766, "logps/rejected": -45.741546630859375, "loss": 0.6892, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0035719252191483974, "rewards/margins": 0.00797609519213438, "rewards/rejected": -0.0044041709043085575, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 2.816251516342163, "learning_rate": 3.204057137548371e-08, "logits/chosen": -1.5314784049987793, "logits/rejected": -1.5235313177108765, "logps/chosen": -43.7066650390625, "logps/rejected": -47.314064025878906, "loss": 0.6877, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0040044053457677364, "rewards/margins": 0.011081613600254059, "rewards/rejected": -0.007077208254486322, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 3.713770866394043, "learning_rate": 3.183920288821597e-08, "logits/chosen": -1.4900578260421753, "logits/rejected": -1.4816348552703857, "logps/chosen": -45.30292510986328, "logps/rejected": -49.97011184692383, "loss": 0.6873, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.004732588771730661, "rewards/margins": 0.011991357430815697, "rewards/rejected": -0.007258768193423748, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 3.808242082595825, "learning_rate": 3.1637353633225735e-08, "logits/chosen": -1.539838433265686, "logits/rejected": -1.5290555953979492, "logps/chosen": -41.22531509399414, "logps/rejected": -45.71930694580078, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003598598064854741, "rewards/margins": 0.012951062992215157, "rewards/rejected": -0.00935246329754591, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 3.283512592315674, "learning_rate": 3.143503779968213e-08, "logits/chosen": -1.5066019296646118, "logits/rejected": -1.5066450834274292, "logps/chosen": -45.44121551513672, "logps/rejected": -49.75178909301758, "loss": 0.6892, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.0011785195674747229, "rewards/margins": 0.008287757635116577, "rewards/rejected": -0.007109238300472498, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 3.336965322494507, "learning_rate": 3.1232269609552875e-08, "logits/chosen": -1.518235445022583, "logits/rejected": -1.507875680923462, "logps/chosen": -43.67535400390625, "logps/rejected": -46.17144012451172, "loss": 0.6887, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004166613798588514, "rewards/margins": 0.00908889900892973, "rewards/rejected": -0.0049222856760025024, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 2.1761865615844727, "learning_rate": 3.102906331660444e-08, "logits/chosen": -1.5566504001617432, "logits/rejected": -1.543027639389038, "logps/chosen": -41.95298767089844, "logps/rejected": -48.24913024902344, "loss": 0.6868, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": 0.0051358104683458805, "rewards/margins": 0.013014426454901695, "rewards/rejected": -0.007878614589571953, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 3.1758456230163574, "learning_rate": 3.082543320540015e-08, "logits/chosen": -1.46958327293396, "logits/rejected": -1.454538106918335, "logps/chosen": -43.856781005859375, "logps/rejected": -47.52972412109375, "loss": 0.6882, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0025951929856091738, "rewards/margins": 0.010161316022276878, "rewards/rejected": -0.007566122803837061, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 4.123020648956299, "learning_rate": 3.062139359029599e-08, "logits/chosen": -1.5575106143951416, "logits/rejected": -1.5533676147460938, "logps/chosen": -46.472938537597656, "logps/rejected": -48.92716598510742, "loss": 0.6881, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0033696771133691072, "rewards/margins": 0.010232589207589626, "rewards/rejected": -0.006862912327051163, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 3.4289743900299072, "learning_rate": 3.041695881443437e-08, "logits/chosen": -1.5759422779083252, "logits/rejected": -1.5675259828567505, "logps/chosen": -46.361610412597656, "logps/rejected": -50.346927642822266, "loss": 0.69, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0033702836371958256, "rewards/margins": 0.006468233652412891, "rewards/rejected": -0.003097949782386422, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 4.054693698883057, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -1.5312144756317139, "logits/rejected": -1.5288541316986084, "logps/chosen": -49.77804946899414, "logps/rejected": -54.46125411987305, "loss": 0.6875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.004172780551016331, "rewards/margins": 0.011559790931642056, "rewards/rejected": -0.007387010846287012, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 3.1554675102233887, "learning_rate": 3.0006961290889077e-08, "logits/chosen": -1.5213099718093872, "logits/rejected": -1.4963198900222778, "logps/chosen": -50.681434631347656, "logps/rejected": -53.1854362487793, "loss": 0.6876, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.004848620388656855, "rewards/margins": 0.011397367343306541, "rewards/rejected": -0.006548746023327112, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 2.6636829376220703, "learning_rate": 2.980142736433833e-08, "logits/chosen": -1.5465320348739624, "logits/rejected": -1.5224246978759766, "logps/chosen": -44.290504455566406, "logps/rejected": -44.52730941772461, "loss": 0.6885, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0015873590018600225, "rewards/margins": 0.009515630081295967, "rewards/rejected": -0.007928271777927876, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 4.1119771003723145, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -1.5561182498931885, "logits/rejected": -1.5290348529815674, "logps/chosen": -51.56610107421875, "logps/rejected": -53.313232421875, "loss": 0.6883, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0019942389335483313, "rewards/margins": 0.009912279434502125, "rewards/rejected": -0.00791804026812315, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 3.134394407272339, "learning_rate": 2.9389361421596725e-08, "logits/chosen": -1.4347946643829346, "logits/rejected": -1.4317066669464111, "logps/chosen": -49.19475555419922, "logps/rejected": -53.55085372924805, "loss": 0.6871, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0042834230698645115, "rewards/margins": 0.012365362606942654, "rewards/rejected": -0.00808194000273943, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 2.575726270675659, "learning_rate": 2.9182858371940126e-08, "logits/chosen": -1.5316616296768188, "logits/rejected": -1.5172832012176514, "logps/chosen": -42.60432052612305, "logps/rejected": -46.17161560058594, "loss": 0.6866, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0029449663124978542, "rewards/margins": 0.013502433896064758, "rewards/rejected": -0.010557468049228191, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 3.6238691806793213, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.4699041843414307, "logits/rejected": -1.4809154272079468, "logps/chosen": -41.60236358642578, "logps/rejected": -45.32775115966797, "loss": 0.6875, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.004988711792975664, "rewards/margins": 0.011551633477210999, "rewards/rejected": -0.006562922149896622, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 3.270362377166748, "learning_rate": 2.8768984696593384e-08, "logits/chosen": -1.4800060987472534, "logits/rejected": -1.4634923934936523, "logps/chosen": -44.76579666137695, "logps/rejected": -47.92957305908203, "loss": 0.6868, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005053057335317135, "rewards/margins": 0.013063912279903889, "rewards/rejected": -0.008010854944586754, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 3.2324070930480957, "learning_rate": 2.8561643164513637e-08, "logits/chosen": -1.3341636657714844, "logits/rejected": -1.3184261322021484, "logps/chosen": -51.973052978515625, "logps/rejected": -54.29719924926758, "loss": 0.6888, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0038118623197078705, "rewards/margins": 0.008959764614701271, "rewards/rejected": -0.005147902760654688, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 3.309232711791992, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -1.4665729999542236, "logits/rejected": -1.4669950008392334, "logps/chosen": -50.305789947509766, "logps/rejected": -52.8392219543457, "loss": 0.6886, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002960771322250366, "rewards/margins": 0.009312191978096962, "rewards/rejected": -0.006351419724524021, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 4.678018569946289, "learning_rate": 2.8146223586784573e-08, "logits/chosen": -1.4573040008544922, "logits/rejected": -1.4442112445831299, "logps/chosen": -52.07715606689453, "logps/rejected": -54.99871063232422, "loss": 0.687, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0037856134586036205, "rewards/margins": 0.012529050931334496, "rewards/rejected": -0.008743437938392162, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 3.4094982147216797, "learning_rate": 2.7938174743416205e-08, "logits/chosen": -1.3623288869857788, "logits/rejected": -1.3554866313934326, "logps/chosen": -51.5221061706543, "logps/rejected": -55.462249755859375, "loss": 0.6877, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0025853284168988466, "rewards/margins": 0.011182976886630058, "rewards/rejected": -0.00859764777123928, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 3.0521068572998047, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -1.5039377212524414, "logits/rejected": -1.494292974472046, "logps/chosen": -52.330955505371094, "logps/rejected": -53.54069900512695, "loss": 0.6869, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0036649019457399845, "rewards/margins": 0.012714678421616554, "rewards/rejected": -0.009049774147570133, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 3.8235228061676025, "learning_rate": 2.7521472071516772e-08, "logits/chosen": -1.4729235172271729, "logits/rejected": -1.4665067195892334, "logps/chosen": -43.68346405029297, "logps/rejected": -47.45942306518555, "loss": 0.6883, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005660043563693762, "rewards/margins": 0.01002978254109621, "rewards/rejected": -0.004369738511741161, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 3.896113395690918, "learning_rate": 2.731284753546289e-08, "logits/chosen": -1.4809584617614746, "logits/rejected": -1.474462866783142, "logps/chosen": -53.04487228393555, "logps/rejected": -56.80231857299805, "loss": 0.6894, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.00045274553121998906, "rewards/margins": 0.007639092858880758, "rewards/rejected": -0.008091839030385017, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 4.060333728790283, "learning_rate": 2.710406041577751e-08, "logits/chosen": -1.551286220550537, "logits/rejected": -1.5481075048446655, "logps/chosen": -48.00239181518555, "logps/rejected": -53.792564392089844, "loss": 0.6882, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.004222895950078964, "rewards/margins": 0.010167112573981285, "rewards/rejected": -0.005944215692579746, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 3.3298470973968506, "learning_rate": 2.6895125389333017e-08, "logits/chosen": -1.5374701023101807, "logits/rejected": -1.5225059986114502, "logps/chosen": -48.45729446411133, "logps/rejected": -52.615623474121094, "loss": 0.6852, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007050990127027035, "rewards/margins": 0.01621903106570244, "rewards/rejected": -0.009168041869997978, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 3.1585147380828857, "learning_rate": 2.6686057143399028e-08, "logits/chosen": -1.50589919090271, "logits/rejected": -1.4978208541870117, "logps/chosen": -48.517723083496094, "logps/rejected": -50.0194091796875, "loss": 0.6882, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004859555047005415, "rewards/margins": 0.01032261736690998, "rewards/rejected": -0.005463062319904566, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 3.698626756668091, "learning_rate": 2.647687037460996e-08, "logits/chosen": -1.4842262268066406, "logits/rejected": -1.4768081903457642, "logps/chosen": -52.859100341796875, "logps/rejected": -58.45001220703125, "loss": 0.6862, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.006431101355701685, "rewards/margins": 0.014263585209846497, "rewards/rejected": -0.007832483388483524, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 3.2058353424072266, "learning_rate": 2.626757978793187e-08, "logits/chosen": -1.506158471107483, "logits/rejected": -1.4989348649978638, "logps/chosen": -48.894813537597656, "logps/rejected": -52.491554260253906, "loss": 0.6889, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0004457598552107811, "rewards/margins": 0.008815920911729336, "rewards/rejected": -0.009261680766940117, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 2.961463212966919, "learning_rate": 2.6058200095628797e-08, "logits/chosen": -1.5059995651245117, "logits/rejected": -1.5062806606292725, "logps/chosen": -40.8658447265625, "logps/rejected": -46.769744873046875, "loss": 0.6849, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0045011648908257484, "rewards/margins": 0.01694817841053009, "rewards/rejected": -0.012447012588381767, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 3.2749526500701904, "learning_rate": 2.584874601622854e-08, "logits/chosen": -1.566329836845398, "logits/rejected": -1.5490646362304688, "logps/chosen": -49.39610290527344, "logps/rejected": -53.27736282348633, "loss": 0.6888, "rewards/accuracies": 0.59375, "rewards/chosen": 0.002292500576004386, "rewards/margins": 0.009129652753472328, "rewards/rejected": -0.006837151013314724, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 3.0580456256866455, "learning_rate": 2.5639232273487993e-08, "logits/chosen": -1.4602752923965454, "logits/rejected": -1.440234899520874, "logps/chosen": -44.33481216430664, "logps/rejected": -47.66165542602539, "loss": 0.6877, "rewards/accuracies": 0.59375, "rewards/chosen": 0.004179822281002998, "rewards/margins": 0.011104853823781013, "rewards/rejected": -0.00692503247410059, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 3.618812322616577, "learning_rate": 2.5429673595358142e-08, "logits/chosen": -1.5238935947418213, "logits/rejected": -1.509108304977417, "logps/chosen": -45.8094596862793, "logps/rejected": -48.57301330566406, "loss": 0.6878, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0015709620201960206, "rewards/margins": 0.010965234600007534, "rewards/rejected": -0.009394274093210697, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 3.3065967559814453, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.4578711986541748, "logits/rejected": -1.4458692073822021, "logps/chosen": -52.10942459106445, "logps/rejected": -55.22698211669922, "loss": 0.6898, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.002750293118879199, "rewards/margins": 0.006976880133152008, "rewards/rejected": -0.004226587247103453, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 3.7602572441101074, "learning_rate": 2.5010480359492838e-08, "logits/chosen": -1.4637352228164673, "logits/rejected": -1.4515202045440674, "logps/chosen": -49.456268310546875, "logps/rejected": -49.46593475341797, "loss": 0.6859, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0037472727708518505, "rewards/margins": 0.014802709221839905, "rewards/rejected": -0.011055436916649342, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 2.980210542678833, "learning_rate": 2.480087526931091e-08, "logits/chosen": -1.5034412145614624, "logits/rejected": -1.4840527772903442, "logps/chosen": -43.39094161987305, "logps/rejected": -45.42670822143555, "loss": 0.6862, "rewards/accuracies": 0.625, "rewards/chosen": 0.00358552485704422, "rewards/margins": 0.014299413189291954, "rewards/rejected": -0.010713890194892883, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 3.3928635120391846, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -1.4473450183868408, "logits/rejected": -1.4353264570236206, "logps/chosen": -55.103553771972656, "logps/rejected": -56.5111198425293, "loss": 0.689, "rewards/accuracies": 0.5625, "rewards/chosen": 0.003477326361462474, "rewards/margins": 0.008474309928715229, "rewards/rejected": -0.0049969833344221115, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 2.8873450756073, "learning_rate": 2.4381721815274443e-08, "logits/chosen": -1.520268201828003, "logits/rejected": -1.5136438608169556, "logps/chosen": -43.23371887207031, "logps/rejected": -46.416725158691406, "loss": 0.687, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0015050893416628242, "rewards/margins": 0.01274427305907011, "rewards/rejected": -0.011239184066653252, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 3.149796724319458, "learning_rate": 2.4172202916176936e-08, "logits/chosen": -1.5638529062271118, "logits/rejected": -1.555055856704712, "logps/chosen": -43.024620056152344, "logps/rejected": -47.86211013793945, "loss": 0.686, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": 0.0007431974518112838, "rewards/margins": 0.014958178624510765, "rewards/rejected": -0.014214983209967613, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 3.653278112411499, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -1.4480760097503662, "logits/rejected": -1.4380666017532349, "logps/chosen": -41.63762283325195, "logps/rejected": -45.63017654418945, "loss": 0.6849, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.00497156148776412, "rewards/margins": 0.016974590718746185, "rewards/rejected": -0.012003025971353054, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 3.627781391143799, "learning_rate": 2.3753354414355334e-08, "logits/chosen": -1.4231648445129395, "logits/rejected": -1.4004006385803223, "logps/chosen": -53.55913543701172, "logps/rejected": -55.218833923339844, "loss": 0.6873, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.00025733548682183027, "rewards/margins": 0.012184584513306618, "rewards/rejected": -0.011927250772714615, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 3.3585216999053955, "learning_rate": 2.3544054254951408e-08, "logits/chosen": -1.464521050453186, "logits/rejected": -1.4445630311965942, "logps/chosen": -42.864479064941406, "logps/rejected": -48.42354202270508, "loss": 0.6842, "rewards/accuracies": 0.668749988079071, "rewards/chosen": 0.004192016087472439, "rewards/margins": 0.018358776345849037, "rewards/rejected": -0.014166759327054024, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 3.362706184387207, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -1.5583127737045288, "logits/rejected": -1.539954662322998, "logps/chosen": -51.2715950012207, "logps/rejected": -51.411582946777344, "loss": 0.688, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0023382273502647877, "rewards/margins": 0.010567070916295052, "rewards/rejected": -0.008228843100368977, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 3.5896084308624268, "learning_rate": 2.3125775682807826e-08, "logits/chosen": -1.5538270473480225, "logits/rejected": -1.5523487329483032, "logps/chosen": -49.91779708862305, "logps/rejected": -53.7571907043457, "loss": 0.686, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.002975709503516555, "rewards/margins": 0.014789762906730175, "rewards/rejected": -0.011814054101705551, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 2.732419013977051, "learning_rate": 2.291682667332464e-08, "logits/chosen": -1.6124897003173828, "logits/rejected": -1.5989201068878174, "logps/chosen": -46.4678840637207, "logps/rejected": -49.60542678833008, "loss": 0.6887, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0006780127296224236, "rewards/margins": 0.009133217856287956, "rewards/rejected": -0.008455204777419567, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 2.9192984104156494, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -1.5306718349456787, "logits/rejected": -1.5253630876541138, "logps/chosen": -46.716922760009766, "logps/rejected": -51.75889205932617, "loss": 0.685, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0032539735548198223, "rewards/margins": 0.016655398532748222, "rewards/rejected": -0.013401424512267113, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 3.7035152912139893, "learning_rate": 2.2499382647765797e-08, "logits/chosen": -1.4964570999145508, "logits/rejected": -1.4977965354919434, "logps/chosen": -48.43117141723633, "logps/rejected": -52.01741409301758, "loss": 0.6875, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.00011159153655171394, "rewards/margins": 0.011605454608798027, "rewards/rejected": -0.011717047542333603, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 2.8924202919006348, "learning_rate": 2.2290916976281427e-08, "logits/chosen": -1.4779541492462158, "logits/rejected": -1.4635918140411377, "logps/chosen": -43.7186279296875, "logps/rejected": -46.1024055480957, "loss": 0.6862, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00032794266007840633, "rewards/margins": 0.014410694129765034, "rewards/rejected": -0.014738637022674084, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 3.557410478591919, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -1.471459150314331, "logits/rejected": -1.46261727809906, "logps/chosen": -45.6954231262207, "logps/rejected": -51.618690490722656, "loss": 0.6865, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0008009333396330476, "rewards/margins": 0.013774615712463856, "rewards/rejected": -0.012973681092262268, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 3.1066579818725586, "learning_rate": 2.1874571586252177e-08, "logits/chosen": -1.5453145503997803, "logits/rejected": -1.5335826873779297, "logps/chosen": -45.59928512573242, "logps/rejected": -48.342613220214844, "loss": 0.6878, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0005938579561188817, "rewards/margins": 0.010966666042804718, "rewards/rejected": -0.010372808203101158, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 2.4217562675476074, "learning_rate": 2.1666721135069037e-08, "logits/chosen": -1.5157561302185059, "logits/rejected": -1.5022157430648804, "logps/chosen": -49.85866165161133, "logps/rejected": -51.332054138183594, "loss": 0.6874, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0015822596615180373, "rewards/margins": 0.011859697289764881, "rewards/rejected": -0.010277437046170235, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 2.6538779735565186, "learning_rate": 2.145910499962628e-08, "logits/chosen": -1.57613205909729, "logits/rejected": -1.5550073385238647, "logps/chosen": -44.032875061035156, "logps/rejected": -46.20510482788086, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.003437323495745659, "rewards/margins": 0.017632577568292618, "rewards/rejected": -0.014195254072546959, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 3.9013473987579346, "learning_rate": 2.1251737774480915e-08, "logits/chosen": -1.548778772354126, "logits/rejected": -1.5395104885101318, "logps/chosen": -53.24138259887695, "logps/rejected": -55.38903045654297, "loss": 0.6871, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0015886023174971342, "rewards/margins": 0.01247491780668497, "rewards/rejected": -0.010886315256357193, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 2.5475399494171143, "learning_rate": 2.104463403669264e-08, "logits/chosen": -1.4768997430801392, "logits/rejected": -1.456456184387207, "logps/chosen": -49.00716018676758, "logps/rejected": -51.28715133666992, "loss": 0.6861, "rewards/accuracies": 0.59375, "rewards/chosen": 0.0008974798256531358, "rewards/margins": 0.014535580761730671, "rewards/rejected": -0.013638099655508995, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 2.6631736755371094, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.4518641233444214, "logits/rejected": -1.4358189105987549, "logps/chosen": -43.846717834472656, "logps/rejected": -47.554298400878906, "loss": 0.6837, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.006967812776565552, "rewards/margins": 0.019296620041131973, "rewards/rejected": -0.012328808195888996, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 3.2146389484405518, "learning_rate": 2.063127523779219e-08, "logits/chosen": -1.4298336505889893, "logits/rejected": -1.4289504289627075, "logps/chosen": -44.85996627807617, "logps/rejected": -51.302490234375, "loss": 0.6836, "rewards/accuracies": 0.706250011920929, "rewards/chosen": 0.0029459139332175255, "rewards/margins": 0.019514020532369614, "rewards/rejected": -0.016568105667829514, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 3.835034132003784, "learning_rate": 2.0425049234096737e-08, "logits/chosen": -1.4856529235839844, "logits/rejected": -1.4708282947540283, "logps/chosen": -49.11738967895508, "logps/rejected": -51.82581329345703, "loss": 0.686, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.00014816033944953233, "rewards/margins": 0.014810358174145222, "rewards/rejected": -0.014662196859717369, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 2.643429756164551, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.4603726863861084, "logits/rejected": -1.4508287906646729, "logps/chosen": -49.00832748413086, "logps/rejected": -52.694000244140625, "loss": 0.6847, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0007474374724552035, "rewards/margins": 0.017317909747362137, "rewards/rejected": -0.016570471227169037, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 2.901050329208374, "learning_rate": 2.0013576501378823e-08, "logits/chosen": -1.4358056783676147, "logits/rejected": -1.4261062145233154, "logps/chosen": -44.63908767700195, "logps/rejected": -48.853946685791016, "loss": 0.6816, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.007906489074230194, "rewards/margins": 0.0237591415643692, "rewards/rejected": -0.015852652490139008, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 3.5685510635375977, "learning_rate": 1.9808358697190426e-08, "logits/chosen": -1.4616773128509521, "logits/rejected": -1.4614180326461792, "logps/chosen": -40.03239059448242, "logps/rejected": -45.37065124511719, "loss": 0.6845, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.000598257698584348, "rewards/margins": 0.017768951132893562, "rewards/rejected": -0.018367204815149307, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 3.0533125400543213, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -1.4894813299179077, "logits/rejected": -1.4689750671386719, "logps/chosen": -41.06487274169922, "logps/rejected": -46.349754333496094, "loss": 0.6864, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0008318226900883019, "rewards/margins": 0.013914955779910088, "rewards/rejected": -0.013083134777843952, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 3.040861129760742, "learning_rate": 1.9399032341961886e-08, "logits/chosen": -1.4607442617416382, "logits/rejected": -1.440852165222168, "logps/chosen": -44.09001922607422, "logps/rejected": -45.968475341796875, "loss": 0.6868, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003010817337781191, "rewards/margins": 0.013135477900505066, "rewards/rejected": -0.010124661028385162, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 3.694607734680176, "learning_rate": 1.9194952564874323e-08, "logits/chosen": -1.4902753829956055, "logits/rejected": -1.4777010679244995, "logps/chosen": -49.39779281616211, "logps/rejected": -52.769874572753906, "loss": 0.6858, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0012734916526824236, "rewards/margins": 0.015179460868239403, "rewards/rejected": -0.013905969448387623, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 3.0213842391967773, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.4653866291046143, "logits/rejected": -1.444471836090088, "logps/chosen": -48.05756378173828, "logps/rejected": -49.7245979309082, "loss": 0.6862, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 0.0008609207579866052, "rewards/margins": 0.014405569061636925, "rewards/rejected": -0.013544648885726929, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 3.783416986465454, "learning_rate": 1.8788031540572327e-08, "logits/chosen": -1.432162880897522, "logits/rejected": -1.4183642864227295, "logps/chosen": -43.341087341308594, "logps/rejected": -47.24171447753906, "loss": 0.6846, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0013560210354626179, "rewards/margins": 0.01773322932422161, "rewards/rejected": -0.01637720875442028, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 3.501404285430908, "learning_rate": 1.858521889822565e-08, "logits/chosen": -1.4806432723999023, "logits/rejected": -1.47100031375885, "logps/chosen": -44.776634216308594, "logps/rejected": -47.38679504394531, "loss": 0.6872, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00211677816696465, "rewards/margins": 0.012272249907255173, "rewards/rejected": -0.010155471973121166, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 3.0854015350341797, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -1.4793776273727417, "logits/rejected": -1.4643954038619995, "logps/chosen": -43.39844512939453, "logps/rejected": -46.25333023071289, "loss": 0.6856, "rewards/accuracies": 0.543749988079071, "rewards/chosen": 0.0010049303527921438, "rewards/margins": 0.015818919986486435, "rewards/rejected": -0.014813992194831371, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 3.0658137798309326, "learning_rate": 1.8180960636595234e-08, "logits/chosen": -1.4343183040618896, "logits/rejected": -1.4235293865203857, "logps/chosen": -45.46432876586914, "logps/rejected": -48.85834503173828, "loss": 0.6841, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0007860889891162515, "rewards/margins": 0.01868055760860443, "rewards/rejected": -0.01789446920156479, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 2.6446595191955566, "learning_rate": 1.7979543434998015e-08, "logits/chosen": -1.51717209815979, "logits/rejected": -1.5126534700393677, "logps/chosen": -54.11791229248047, "logps/rejected": -55.764686584472656, "loss": 0.6887, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.002511689905077219, "rewards/margins": 0.009304236620664597, "rewards/rejected": -0.011815925128757954, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 3.2104849815368652, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -1.4984880685806274, "logits/rejected": -1.4783477783203125, "logps/chosen": -48.82425308227539, "logps/rejected": -50.621795654296875, "loss": 0.6861, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0017830973956733942, "rewards/margins": 0.014747503213584423, "rewards/rejected": -0.0165305994451046, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 5.295479774475098, "learning_rate": 1.757820368323213e-08, "logits/chosen": -1.4480249881744385, "logits/rejected": -1.4318348169326782, "logps/chosen": -55.6407585144043, "logps/rejected": -60.88798141479492, "loss": 0.6852, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0003665873664431274, "rewards/margins": 0.01638123393058777, "rewards/rejected": -0.016747821122407913, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 2.7129533290863037, "learning_rate": 1.7378309345590803e-08, "logits/chosen": -1.5183932781219482, "logits/rejected": -1.5215718746185303, "logps/chosen": -48.15589141845703, "logps/rejected": -51.79344940185547, "loss": 0.6858, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.00031967152608558536, "rewards/margins": 0.015088710002601147, "rewards/rejected": -0.014769040048122406, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 3.052567481994629, "learning_rate": 1.717895078133088e-08, "logits/chosen": -1.5369809865951538, "logits/rejected": -1.5272055864334106, "logps/chosen": -45.72820281982422, "logps/rejected": -50.92518615722656, "loss": 0.6844, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.001268348889425397, "rewards/margins": 0.018100250512361526, "rewards/rejected": -0.01683189906179905, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 2.9650213718414307, "learning_rate": 1.698014200453624e-08, "logits/chosen": -1.512068510055542, "logits/rejected": -1.5153911113739014, "logps/chosen": -48.55554962158203, "logps/rejected": -53.2547492980957, "loss": 0.6892, "rewards/accuracies": 0.581250011920929, "rewards/chosen": 4.29423516834504e-06, "rewards/margins": 0.008227944374084473, "rewards/rejected": -0.00822365004569292, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 3.127119541168213, "learning_rate": 1.6781896990642964e-08, "logits/chosen": -1.416456937789917, "logits/rejected": -1.406944751739502, "logps/chosen": -53.6823844909668, "logps/rejected": -55.647216796875, "loss": 0.6873, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": 0.0006507608923129737, "rewards/margins": 0.01228669099509716, "rewards/rejected": -0.011635931208729744, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 3.6985666751861572, "learning_rate": 1.658422967545693e-08, "logits/chosen": -1.544832706451416, "logits/rejected": -1.523085117340088, "logps/chosen": -46.62820816040039, "logps/rejected": -48.89812469482422, "loss": 0.6864, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.002641532802954316, "rewards/margins": 0.01398603618144989, "rewards/rejected": -0.016627568751573563, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 3.3566489219665527, "learning_rate": 1.638715395417418e-08, "logits/chosen": -1.5159873962402344, "logits/rejected": -1.4998763799667358, "logps/chosen": -47.75004959106445, "logps/rejected": -50.23518753051758, "loss": 0.6872, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0016118374187499285, "rewards/margins": 0.01232110895216465, "rewards/rejected": -0.013932946138083935, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 3.4124369621276855, "learning_rate": 1.619068368040416e-08, "logits/chosen": -1.5037791728973389, "logits/rejected": -1.4937578439712524, "logps/chosen": -42.351898193359375, "logps/rejected": -47.97382354736328, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007811610703356564, "rewards/margins": 0.017164845019578934, "rewards/rejected": -0.016383685171604156, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 3.3016412258148193, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -1.4340431690216064, "logits/rejected": -1.4277336597442627, "logps/chosen": -46.43476486206055, "logps/rejected": -48.81419372558594, "loss": 0.6874, "rewards/accuracies": 0.59375, "rewards/chosen": 0.00010869167454075068, "rewards/margins": 0.011941631324589252, "rewards/rejected": -0.011832939460873604, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 3.3040690422058105, "learning_rate": 1.5799614676066906e-08, "logits/chosen": -1.561361312866211, "logits/rejected": -1.5565513372421265, "logps/chosen": -42.6262092590332, "logps/rejected": -47.142024993896484, "loss": 0.6851, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0014737072633579373, "rewards/margins": 0.01672922447323799, "rewards/rejected": -0.018202928826212883, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 2.7992374897003174, "learning_rate": 1.560504343603587e-08, "logits/chosen": -1.4569079875946045, "logits/rejected": -1.4597570896148682, "logps/chosen": -47.62577438354492, "logps/rejected": -53.16133499145508, "loss": 0.6863, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0012593867722898722, "rewards/margins": 0.014034323394298553, "rewards/rejected": -0.012774934992194176, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 2.700779438018799, "learning_rate": 1.541113262265748e-08, "logits/chosen": -1.558211088180542, "logits/rejected": -1.553525686264038, "logps/chosen": -47.86023712158203, "logps/rejected": -52.087554931640625, "loss": 0.6854, "rewards/accuracies": 0.65625, "rewards/chosen": 0.00030795374186709523, "rewards/margins": 0.01597772166132927, "rewards/rejected": -0.015669768676161766, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 2.8260514736175537, "learning_rate": 1.5217895867061227e-08, "logits/chosen": -1.4797998666763306, "logits/rejected": -1.4682440757751465, "logps/chosen": -49.11394119262695, "logps/rejected": -51.766929626464844, "loss": 0.6856, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0009157865424640477, "rewards/margins": 0.015710871666669846, "rewards/rejected": -0.016626659780740738, "step": 2770 }, { "epoch": 2.0028818443804033, "grad_norm": 3.2303998470306396, "learning_rate": 1.5025346752993098e-08, "logits/chosen": -1.4731628894805908, "logits/rejected": -1.4776121377944946, "logps/chosen": -47.23828887939453, "logps/rejected": -51.46478271484375, "loss": 0.6885, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003107684198766947, "rewards/margins": 0.009776955470442772, "rewards/rejected": -0.01288464106619358, "step": 2780 }, { "epoch": 2.010086455331412, "grad_norm": 3.200279951095581, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -1.603215217590332, "logits/rejected": -1.5945051908493042, "logps/chosen": -44.726016998291016, "logps/rejected": -49.45168685913086, "loss": 0.6833, "rewards/accuracies": 0.625, "rewards/chosen": 0.0007763226167298853, "rewards/margins": 0.020336374640464783, "rewards/rejected": -0.01956005021929741, "step": 2790 }, { "epoch": 2.0172910662824206, "grad_norm": 3.470811128616333, "learning_rate": 1.4642365541781993e-08, "logits/chosen": -1.4187241792678833, "logits/rejected": -1.402005910873413, "logps/chosen": -46.40681838989258, "logps/rejected": -51.32469940185547, "loss": 0.6849, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0019146741833537817, "rewards/margins": 0.017073018476366997, "rewards/rejected": -0.018987691029906273, "step": 2800 }, { "epoch": 2.0244956772334293, "grad_norm": 3.550839424133301, "learning_rate": 1.4451960366636745e-08, "logits/chosen": -1.5050057172775269, "logits/rejected": -1.5091335773468018, "logps/chosen": -50.25291061401367, "logps/rejected": -54.865325927734375, "loss": 0.6861, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.4477363038167823e-05, "rewards/margins": 0.014642780646681786, "rewards/rejected": -0.014657258987426758, "step": 2810 }, { "epoch": 2.031700288184438, "grad_norm": 2.99293851852417, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -1.505652666091919, "logits/rejected": -1.490431547164917, "logps/chosen": -43.927162170410156, "logps/rejected": -48.62024688720703, "loss": 0.6849, "rewards/accuracies": 0.625, "rewards/chosen": -0.0005578006966970861, "rewards/margins": 0.016974329948425293, "rewards/rejected": -0.01753213070333004, "step": 2820 }, { "epoch": 2.0389048991354466, "grad_norm": 3.470144748687744, "learning_rate": 1.407338779981389e-08, "logits/chosen": -1.473926305770874, "logits/rejected": -1.4623931646347046, "logps/chosen": -41.433082580566406, "logps/rejected": -46.476688385009766, "loss": 0.6829, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0010272187646478415, "rewards/margins": 0.021077865734696388, "rewards/rejected": -0.022105086594820023, "step": 2830 }, { "epoch": 2.0461095100864553, "grad_norm": 3.15632963180542, "learning_rate": 1.3885247020224534e-08, "logits/chosen": -1.4739320278167725, "logits/rejected": -1.4632718563079834, "logps/chosen": -40.990055084228516, "logps/rejected": -44.34033203125, "loss": 0.6834, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": 0.000578380364459008, "rewards/margins": 0.020333198830485344, "rewards/rejected": -0.019754819571971893, "step": 2840 }, { "epoch": 2.053314121037464, "grad_norm": 2.7791974544525146, "learning_rate": 1.369788756187445e-08, "logits/chosen": -1.5228140354156494, "logits/rejected": -1.511235237121582, "logps/chosen": -46.87430953979492, "logps/rejected": -48.13962936401367, "loss": 0.6878, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0030249585397541523, "rewards/margins": 0.011208303272724152, "rewards/rejected": -0.014233263209462166, "step": 2850 }, { "epoch": 2.0605187319884726, "grad_norm": 3.1895058155059814, "learning_rate": 1.3511322595359925e-08, "logits/chosen": -1.5314232110977173, "logits/rejected": -1.5200746059417725, "logps/chosen": -43.31315994262695, "logps/rejected": -49.05580139160156, "loss": 0.6838, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.0011509342584758997, "rewards/margins": 0.019369563087821007, "rewards/rejected": -0.020520497113466263, "step": 2860 }, { "epoch": 2.0677233429394812, "grad_norm": 3.250976324081421, "learning_rate": 1.3325565235427716e-08, "logits/chosen": -1.5527724027633667, "logits/rejected": -1.544170618057251, "logps/chosen": -45.31568145751953, "logps/rejected": -49.30888366699219, "loss": 0.6846, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.001304257777519524, "rewards/margins": 0.017727408558130264, "rewards/rejected": -0.019031664356589317, "step": 2870 }, { "epoch": 2.07492795389049, "grad_norm": 3.654803991317749, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -1.4575417041778564, "logits/rejected": -1.4557491540908813, "logps/chosen": -45.90990447998047, "logps/rejected": -49.446495056152344, "loss": 0.6854, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.002215947024524212, "rewards/margins": 0.015816405415534973, "rewards/rejected": -0.013600456528365612, "step": 2880 }, { "epoch": 2.0821325648414986, "grad_norm": 3.999239921569824, "learning_rate": 1.2956525509522451e-08, "logits/chosen": -1.4346693754196167, "logits/rejected": -1.4396179914474487, "logps/chosen": -47.850128173828125, "logps/rejected": -51.41837692260742, "loss": 0.6873, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0015433436492457986, "rewards/margins": 0.012201479636132717, "rewards/rejected": -0.010658138431608677, "step": 2890 }, { "epoch": 2.089337175792507, "grad_norm": 3.8457813262939453, "learning_rate": 1.2773269085518267e-08, "logits/chosen": -1.5163942575454712, "logits/rejected": -1.5108063220977783, "logps/chosen": -52.5305061340332, "logps/rejected": -56.18115234375, "loss": 0.6864, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.00015631233691237867, "rewards/margins": 0.01401402335613966, "rewards/rejected": -0.013857712037861347, "step": 2900 }, { "epoch": 2.096541786743516, "grad_norm": 2.6759727001190186, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -1.5920841693878174, "logits/rejected": -1.5757710933685303, "logps/chosen": -45.695072174072266, "logps/rejected": -47.82882308959961, "loss": 0.6838, "rewards/accuracies": 0.625, "rewards/chosen": -0.004866642877459526, "rewards/margins": 0.019523626193404198, "rewards/rejected": -0.024390270933508873, "step": 2910 }, { "epoch": 2.1037463976945245, "grad_norm": 2.84696888923645, "learning_rate": 1.2409347525350775e-08, "logits/chosen": -1.4993377923965454, "logits/rejected": -1.480965256690979, "logps/chosen": -47.44709396362305, "logps/rejected": -51.3841667175293, "loss": 0.6831, "rewards/accuracies": 0.643750011920929, "rewards/chosen": 0.0008200405864045024, "rewards/margins": 0.02079201303422451, "rewards/rejected": -0.019971970468759537, "step": 2920 }, { "epoch": 2.110951008645533, "grad_norm": 3.4382824897766113, "learning_rate": 1.2228707971370421e-08, "logits/chosen": -1.4966394901275635, "logits/rejected": -1.4779356718063354, "logps/chosen": -42.06577682495117, "logps/rejected": -44.59346008300781, "loss": 0.6838, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0021643335931003094, "rewards/margins": 0.01934720203280449, "rewards/rejected": -0.017182866111397743, "step": 2930 }, { "epoch": 2.118155619596542, "grad_norm": 4.404411792755127, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -1.5253870487213135, "logits/rejected": -1.495600700378418, "logps/chosen": -52.13329315185547, "logps/rejected": -55.06341552734375, "loss": 0.686, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.001562885008752346, "rewards/margins": 0.014812910929322243, "rewards/rejected": -0.016375798732042313, "step": 2940 }, { "epoch": 2.1253602305475505, "grad_norm": 3.313408374786377, "learning_rate": 1.187013480579762e-08, "logits/chosen": -1.4894028902053833, "logits/rejected": -1.4830772876739502, "logps/chosen": -45.41261291503906, "logps/rejected": -49.40631103515625, "loss": 0.684, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.004708284046500921, "rewards/margins": 0.019039448350667953, "rewards/rejected": -0.023747732862830162, "step": 2950 }, { "epoch": 2.132564841498559, "grad_norm": 4.3451385498046875, "learning_rate": 1.1692226400418073e-08, "logits/chosen": -1.4124035835266113, "logits/rejected": -1.4043338298797607, "logps/chosen": -49.05751419067383, "logps/rejected": -52.05934524536133, "loss": 0.6855, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.004499994218349457, "rewards/margins": 0.01585240475833416, "rewards/rejected": -0.020352398976683617, "step": 2960 }, { "epoch": 2.139769452449568, "grad_norm": 2.498106002807617, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -1.4480842351913452, "logits/rejected": -1.4422247409820557, "logps/chosen": -44.44374084472656, "logps/rejected": -51.034915924072266, "loss": 0.6841, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.003585060592740774, "rewards/margins": 0.018647151067852974, "rewards/rejected": -0.022232210263609886, "step": 2970 }, { "epoch": 2.1469740634005765, "grad_norm": 3.3477675914764404, "learning_rate": 1.133922847472496e-08, "logits/chosen": -1.4905388355255127, "logits/rejected": -1.4867520332336426, "logps/chosen": -52.541542053222656, "logps/rejected": -55.05914306640625, "loss": 0.685, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0006752757471986115, "rewards/margins": 0.017081119120121002, "rewards/rejected": -0.017756396904587746, "step": 2980 }, { "epoch": 2.154178674351585, "grad_norm": 3.3309946060180664, "learning_rate": 1.1164163768707952e-08, "logits/chosen": -1.4653676748275757, "logits/rejected": -1.454390525817871, "logps/chosen": -47.43062973022461, "logps/rejected": -51.644432067871094, "loss": 0.6824, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0007173820049501956, "rewards/margins": 0.0224156454205513, "rewards/rejected": -0.02313302643597126, "step": 2990 }, { "epoch": 2.161383285302594, "grad_norm": 3.354658365249634, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -1.4403693675994873, "logits/rejected": -1.4301955699920654, "logps/chosen": -48.77064895629883, "logps/rejected": -53.99092483520508, "loss": 0.6818, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 9.120155300479382e-05, "rewards/margins": 0.023369425907731056, "rewards/rejected": -0.02327822335064411, "step": 3000 }, { "epoch": 2.1685878962536025, "grad_norm": 3.0528011322021484, "learning_rate": 1.0816964401097739e-08, "logits/chosen": -1.482627272605896, "logits/rejected": -1.4725825786590576, "logps/chosen": -43.032142639160156, "logps/rejected": -45.872344970703125, "loss": 0.6854, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0006441779551096261, "rewards/margins": 0.016231542453169823, "rewards/rejected": -0.016875719651579857, "step": 3010 }, { "epoch": 2.175792507204611, "grad_norm": 3.9394450187683105, "learning_rate": 1.0644854146186406e-08, "logits/chosen": -1.5147243738174438, "logits/rejected": -1.4970117807388306, "logps/chosen": -48.232513427734375, "logps/rejected": -52.97943878173828, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": -0.0024471329525113106, "rewards/margins": 0.021746691316366196, "rewards/rejected": -0.02419382520020008, "step": 3020 }, { "epoch": 2.18299711815562, "grad_norm": 3.223052978515625, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -1.4945493936538696, "logits/rejected": -1.473004698753357, "logps/chosen": -48.55394744873047, "logps/rejected": -51.71314239501953, "loss": 0.6823, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0011378397466614842, "rewards/margins": 0.022417975589632988, "rewards/rejected": -0.021280135959386826, "step": 3030 }, { "epoch": 2.1902017291066285, "grad_norm": 2.9647247791290283, "learning_rate": 1.030367298650201e-08, "logits/chosen": -1.4931919574737549, "logits/rejected": -1.492661476135254, "logps/chosen": -48.77735900878906, "logps/rejected": -53.65592575073242, "loss": 0.6872, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.004096081480383873, "rewards/margins": 0.012563072144985199, "rewards/rejected": -0.016659153625369072, "step": 3040 }, { "epoch": 2.1974063400576367, "grad_norm": 3.8749027252197266, "learning_rate": 1.0134626065355675e-08, "logits/chosen": -1.5941665172576904, "logits/rejected": -1.5830225944519043, "logps/chosen": -49.36051559448242, "logps/rejected": -52.95866012573242, "loss": 0.6825, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0015227645635604858, "rewards/margins": 0.022038374096155167, "rewards/rejected": -0.02051560953259468, "step": 3050 }, { "epoch": 2.2046109510086453, "grad_norm": 3.4919955730438232, "learning_rate": 9.966624118614611e-09, "logits/chosen": -1.4911123514175415, "logits/rejected": -1.4718658924102783, "logps/chosen": -52.330223083496094, "logps/rejected": -55.529273986816406, "loss": 0.6837, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.0019868058152496815, "rewards/margins": 0.01953335851430893, "rewards/rejected": -0.017546551302075386, "step": 3060 }, { "epoch": 2.211815561959654, "grad_norm": 2.4199657440185547, "learning_rate": 9.799678956121976e-09, "logits/chosen": -1.4366905689239502, "logits/rejected": -1.420212984085083, "logps/chosen": -45.93531036376953, "logps/rejected": -48.54907989501953, "loss": 0.6871, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0024884543381631374, "rewards/margins": 0.012519553303718567, "rewards/rejected": -0.015008007176220417, "step": 3070 }, { "epoch": 2.2190201729106627, "grad_norm": 3.557464361190796, "learning_rate": 9.633802313433314e-09, "logits/chosen": -1.4143495559692383, "logits/rejected": -1.410310983657837, "logps/chosen": -48.37750244140625, "logps/rejected": -50.99727249145508, "loss": 0.6849, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0013545064721256495, "rewards/margins": 0.017067620530724525, "rewards/rejected": -0.01842212677001953, "step": 3080 }, { "epoch": 2.2262247838616713, "grad_norm": 2.7765674591064453, "learning_rate": 9.469005850991705e-09, "logits/chosen": -1.4845517873764038, "logits/rejected": -1.4717377424240112, "logps/chosen": -47.20240020751953, "logps/rejected": -48.706504821777344, "loss": 0.6842, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0018641784554347396, "rewards/margins": 0.018557867035269737, "rewards/rejected": -0.02042204514145851, "step": 3090 }, { "epoch": 2.23342939481268, "grad_norm": 3.18376088142395, "learning_rate": 9.305301153307949e-09, "logits/chosen": -1.496174693107605, "logits/rejected": -1.5003782510757446, "logps/chosen": -40.0098762512207, "logps/rejected": -44.06249237060547, "loss": 0.6833, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.005063945893198252, "rewards/margins": 0.020303303375840187, "rewards/rejected": -0.02536724880337715, "step": 3100 }, { "epoch": 2.2406340057636887, "grad_norm": 2.7682111263275146, "learning_rate": 9.142699728146336e-09, "logits/chosen": -1.43401300907135, "logits/rejected": -1.4253044128417969, "logps/chosen": -46.127769470214844, "logps/rejected": -51.10237503051758, "loss": 0.6844, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.002828083001077175, "rewards/margins": 0.018272753804922104, "rewards/rejected": -0.021100837737321854, "step": 3110 }, { "epoch": 2.2478386167146973, "grad_norm": 2.9937551021575928, "learning_rate": 8.981213005715627e-09, "logits/chosen": -1.501936435699463, "logits/rejected": -1.501483678817749, "logps/chosen": -44.192527770996094, "logps/rejected": -49.055450439453125, "loss": 0.6841, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0020972955971956253, "rewards/margins": 0.018690943717956543, "rewards/rejected": -0.020788241177797318, "step": 3120 }, { "epoch": 2.255043227665706, "grad_norm": 3.6883223056793213, "learning_rate": 8.820852337865611e-09, "logits/chosen": -1.5518832206726074, "logits/rejected": -1.5363496541976929, "logps/chosen": -45.0767822265625, "logps/rejected": -48.65938186645508, "loss": 0.6844, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0013773315586149693, "rewards/margins": 0.018128078430891037, "rewards/rejected": -0.01950540952384472, "step": 3130 }, { "epoch": 2.2622478386167146, "grad_norm": 2.8200368881225586, "learning_rate": 8.661628997289044e-09, "logits/chosen": -1.4339885711669922, "logits/rejected": -1.4207003116607666, "logps/chosen": -45.365943908691406, "logps/rejected": -49.89191818237305, "loss": 0.6838, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.000960633740760386, "rewards/margins": 0.019434962421655655, "rewards/rejected": -0.020395595580339432, "step": 3140 }, { "epoch": 2.2694524495677233, "grad_norm": 2.8255815505981445, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.4203985929489136, "logits/rejected": -1.4152114391326904, "logps/chosen": -45.45329666137695, "logps/rejected": -49.060142517089844, "loss": 0.6833, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.000864202156662941, "rewards/margins": 0.020447982475161552, "rewards/rejected": -0.01958378031849861, "step": 3150 }, { "epoch": 2.276657060518732, "grad_norm": 3.818735122680664, "learning_rate": 8.346638988193636e-09, "logits/chosen": -1.4726885557174683, "logits/rejected": -1.4705435037612915, "logps/chosen": -40.58414840698242, "logps/rejected": -46.35978317260742, "loss": 0.6836, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0010551244486123323, "rewards/margins": 0.019719591364264488, "rewards/rejected": -0.020774714648723602, "step": 3160 }, { "epoch": 2.2838616714697406, "grad_norm": 4.216102123260498, "learning_rate": 8.19089446217176e-09, "logits/chosen": -1.4285030364990234, "logits/rejected": -1.4068001508712769, "logps/chosen": -45.76138687133789, "logps/rejected": -51.217926025390625, "loss": 0.6797, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0022968396078795195, "rewards/margins": 0.027888495475053787, "rewards/rejected": -0.025591660290956497, "step": 3170 }, { "epoch": 2.2910662824207493, "grad_norm": 3.064260959625244, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.4557617902755737, "logits/rejected": -1.4532365798950195, "logps/chosen": -45.43151092529297, "logps/rejected": -48.28192901611328, "loss": 0.6875, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0039183879271149635, "rewards/margins": 0.011963925324380398, "rewards/rejected": -0.01588231325149536, "step": 3180 }, { "epoch": 2.298270893371758, "grad_norm": 3.668010950088501, "learning_rate": 7.882961107395416e-09, "logits/chosen": -1.4972290992736816, "logits/rejected": -1.4874447584152222, "logps/chosen": -52.34519577026367, "logps/rejected": -52.7003173828125, "loss": 0.6871, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.00619167648255825, "rewards/margins": 0.01287173479795456, "rewards/rejected": -0.01906341128051281, "step": 3190 }, { "epoch": 2.3054755043227666, "grad_norm": 4.626786231994629, "learning_rate": 7.73079392508428e-09, "logits/chosen": -1.4211690425872803, "logits/rejected": -1.4251266717910767, "logps/chosen": -49.76266098022461, "logps/rejected": -56.70771026611328, "loss": 0.6824, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00390449957922101, "rewards/margins": 0.022429395467042923, "rewards/rejected": -0.026333892717957497, "step": 3200 }, { "epoch": 2.3126801152737753, "grad_norm": 3.64296555519104, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.5132083892822266, "logits/rejected": -1.5068855285644531, "logps/chosen": -42.3305778503418, "logps/rejected": -45.745948791503906, "loss": 0.6841, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.004392626229673624, "rewards/margins": 0.01889212615787983, "rewards/rejected": -0.023284751921892166, "step": 3210 }, { "epoch": 2.319884726224784, "grad_norm": 4.373074054718018, "learning_rate": 7.43011203348704e-09, "logits/chosen": -1.3568141460418701, "logits/rejected": -1.351928472518921, "logps/chosen": -53.11281204223633, "logps/rejected": -53.88922882080078, "loss": 0.6857, "rewards/accuracies": 0.59375, "rewards/chosen": -0.005558110773563385, "rewards/margins": 0.015617373399436474, "rewards/rejected": -0.021175485104322433, "step": 3220 }, { "epoch": 2.3270893371757926, "grad_norm": 3.2564187049865723, "learning_rate": 7.281618460896344e-09, "logits/chosen": -1.4833844900131226, "logits/rejected": -1.4731206893920898, "logps/chosen": -46.27775192260742, "logps/rejected": -50.7646598815918, "loss": 0.6844, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0017924957210198045, "rewards/margins": 0.018207941204309464, "rewards/rejected": -0.020000439137220383, "step": 3230 }, { "epoch": 2.3342939481268012, "grad_norm": 2.9897639751434326, "learning_rate": 7.134370417364849e-09, "logits/chosen": -1.4312418699264526, "logits/rejected": -1.4230421781539917, "logps/chosen": -45.265655517578125, "logps/rejected": -48.087547302246094, "loss": 0.6868, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007034836802631617, "rewards/margins": 0.01339829433709383, "rewards/rejected": -0.020433131605386734, "step": 3240 }, { "epoch": 2.34149855907781, "grad_norm": 4.031427383422852, "learning_rate": 6.988378253821981e-09, "logits/chosen": -1.4584436416625977, "logits/rejected": -1.4505494832992554, "logps/chosen": -51.43975067138672, "logps/rejected": -54.94978713989258, "loss": 0.6869, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.00025085260858759284, "rewards/margins": 0.013079972937703133, "rewards/rejected": -0.013330824673175812, "step": 3250 }, { "epoch": 2.3487031700288186, "grad_norm": 3.0655064582824707, "learning_rate": 6.8436522329140186e-09, "logits/chosen": -1.4397895336151123, "logits/rejected": -1.4459335803985596, "logps/chosen": -46.97881317138672, "logps/rejected": -50.81465530395508, "loss": 0.6854, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0028320541605353355, "rewards/margins": 0.01652853563427925, "rewards/rejected": -0.01936059445142746, "step": 3260 }, { "epoch": 2.3559077809798272, "grad_norm": 3.536172389984131, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.41671621799469, "logits/rejected": -1.3971173763275146, "logps/chosen": -48.60463333129883, "logps/rejected": -51.607643127441406, "loss": 0.6834, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.006176457740366459, "rewards/margins": 0.020537305623292923, "rewards/rejected": -0.026713764294981956, "step": 3270 }, { "epoch": 2.363112391930836, "grad_norm": 3.716606378555298, "learning_rate": 6.558039223849668e-09, "logits/chosen": -1.5101362466812134, "logits/rejected": -1.4909931421279907, "logps/chosen": -46.293373107910156, "logps/rejected": -52.6843147277832, "loss": 0.6816, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0020285609643906355, "rewards/margins": 0.02413899265229702, "rewards/rejected": -0.026167552918195724, "step": 3280 }, { "epoch": 2.3703170028818445, "grad_norm": 2.897578001022339, "learning_rate": 6.417172313108471e-09, "logits/chosen": -1.4238040447235107, "logits/rejected": -1.4123533964157104, "logps/chosen": -44.11698913574219, "logps/rejected": -47.348690032958984, "loss": 0.6852, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.007857661694288254, "rewards/margins": 0.01665417291224003, "rewards/rejected": -0.024511834606528282, "step": 3290 }, { "epoch": 2.377521613832853, "grad_norm": 2.985238552093506, "learning_rate": 6.277611698421179e-09, "logits/chosen": -1.5537796020507812, "logits/rejected": -1.5336295366287231, "logps/chosen": -39.11783218383789, "logps/rejected": -45.03361892700195, "loss": 0.6814, "rewards/accuracies": 0.5625, "rewards/chosen": -0.004091170616447926, "rewards/margins": 0.02452995628118515, "rewards/rejected": -0.0286211259663105, "step": 3300 }, { "epoch": 2.3847262247838614, "grad_norm": 4.756281852722168, "learning_rate": 6.139367190322714e-09, "logits/chosen": -1.4921059608459473, "logits/rejected": -1.4921514987945557, "logps/chosen": -52.750709533691406, "logps/rejected": -58.163116455078125, "loss": 0.6855, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0038170956540852785, "rewards/margins": 0.01597406342625618, "rewards/rejected": -0.01979115977883339, "step": 3310 }, { "epoch": 2.39193083573487, "grad_norm": 2.5016090869903564, "learning_rate": 6.002448506831171e-09, "logits/chosen": -1.4790103435516357, "logits/rejected": -1.4744632244110107, "logps/chosen": -44.081382751464844, "logps/rejected": -49.28645706176758, "loss": 0.6842, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.003526459215208888, "rewards/margins": 0.01861676014959812, "rewards/rejected": -0.02214321866631508, "step": 3320 }, { "epoch": 2.3991354466858787, "grad_norm": 3.0572214126586914, "learning_rate": 5.866865272764607e-09, "logits/chosen": -1.4946694374084473, "logits/rejected": -1.4880764484405518, "logps/chosen": -46.4217643737793, "logps/rejected": -50.51008605957031, "loss": 0.6854, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0063838111236691475, "rewards/margins": 0.016102833673357964, "rewards/rejected": -0.022486645728349686, "step": 3330 }, { "epoch": 2.4063400576368874, "grad_norm": 4.673269271850586, "learning_rate": 5.7326270190645595e-09, "logits/chosen": -1.3282158374786377, "logits/rejected": -1.3230302333831787, "logps/chosen": -49.934425354003906, "logps/rejected": -52.02433395385742, "loss": 0.6845, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.005276781972497702, "rewards/margins": 0.018048197031021118, "rewards/rejected": -0.023324977606534958, "step": 3340 }, { "epoch": 2.413544668587896, "grad_norm": 3.670146942138672, "learning_rate": 5.599743182125938e-09, "logits/chosen": -1.5373293161392212, "logits/rejected": -1.5367281436920166, "logps/chosen": -48.803321838378906, "logps/rejected": -54.12480545043945, "loss": 0.685, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0014929801691323519, "rewards/margins": 0.01699111983180046, "rewards/rejected": -0.018484100699424744, "step": 3350 }, { "epoch": 2.4207492795389047, "grad_norm": 3.4955081939697266, "learning_rate": 5.46822310313379e-09, "logits/chosen": -1.560293436050415, "logits/rejected": -1.5650306940078735, "logps/chosen": -49.443687438964844, "logps/rejected": -52.810874938964844, "loss": 0.6873, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0048328666016459465, "rewards/margins": 0.012173362076282501, "rewards/rejected": -0.017006227746605873, "step": 3360 }, { "epoch": 2.4279538904899134, "grad_norm": 3.653897523880005, "learning_rate": 5.33807602740658e-09, "logits/chosen": -1.556183934211731, "logits/rejected": -1.5406051874160767, "logps/chosen": -41.885528564453125, "logps/rejected": -47.48136901855469, "loss": 0.6799, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.0013223844580352306, "rewards/margins": 0.027406567707657814, "rewards/rejected": -0.028728952631354332, "step": 3370 }, { "epoch": 2.435158501440922, "grad_norm": 3.8069379329681396, "learning_rate": 5.209311103746334e-09, "logits/chosen": -1.4751628637313843, "logits/rejected": -1.4709327220916748, "logps/chosen": -47.141212463378906, "logps/rejected": -52.47745895385742, "loss": 0.6836, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.003443703055381775, "rewards/margins": 0.019782431423664093, "rewards/rejected": -0.02322613261640072, "step": 3380 }, { "epoch": 2.4423631123919307, "grad_norm": 4.198550701141357, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.4638655185699463, "logits/rejected": -1.4535114765167236, "logps/chosen": -44.21641540527344, "logps/rejected": -48.895484924316406, "loss": 0.6822, "rewards/accuracies": 0.65625, "rewards/chosen": -0.001047824858687818, "rewards/margins": 0.022810544818639755, "rewards/rejected": -0.02385837212204933, "step": 3390 }, { "epoch": 2.4495677233429394, "grad_norm": 3.6953177452087402, "learning_rate": 4.955963821400599e-09, "logits/chosen": -1.5245097875595093, "logits/rejected": -1.5063731670379639, "logps/chosen": -46.904541015625, "logps/rejected": -49.754425048828125, "loss": 0.6827, "rewards/accuracies": 0.6875, "rewards/chosen": -0.00250697392039001, "rewards/margins": 0.021855643019080162, "rewards/rejected": -0.02436261810362339, "step": 3400 }, { "epoch": 2.456772334293948, "grad_norm": 2.7883760929107666, "learning_rate": 4.831399271982928e-09, "logits/chosen": -1.3962290287017822, "logits/rejected": -1.379988431930542, "logps/chosen": -49.720054626464844, "logps/rejected": -52.83693313598633, "loss": 0.683, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.0017061985563486814, "rewards/margins": 0.021188754588365555, "rewards/rejected": -0.022894952446222305, "step": 3410 }, { "epoch": 2.4639769452449567, "grad_norm": 3.981572389602661, "learning_rate": 4.708252491915951e-09, "logits/chosen": -1.4993345737457275, "logits/rejected": -1.4892202615737915, "logps/chosen": -47.0760383605957, "logps/rejected": -51.671592712402344, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.003932795021682978, "rewards/margins": 0.021549254655838013, "rewards/rejected": -0.02548205293715, "step": 3420 }, { "epoch": 2.4711815561959654, "grad_norm": 2.9065399169921875, "learning_rate": 4.58653213790981e-09, "logits/chosen": -1.4973710775375366, "logits/rejected": -1.4790763854980469, "logps/chosen": -47.4251823425293, "logps/rejected": -52.07381057739258, "loss": 0.6836, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0022236169315874577, "rewards/margins": 0.019855182617902756, "rewards/rejected": -0.022078800946474075, "step": 3430 }, { "epoch": 2.478386167146974, "grad_norm": 3.3968310356140137, "learning_rate": 4.466246766402773e-09, "logits/chosen": -1.4705581665039062, "logits/rejected": -1.4513893127441406, "logps/chosen": -48.730186462402344, "logps/rejected": -52.47394943237305, "loss": 0.6818, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0018464624881744385, "rewards/margins": 0.023552386090159416, "rewards/rejected": -0.025398846715688705, "step": 3440 }, { "epoch": 2.4855907780979827, "grad_norm": 3.7176759243011475, "learning_rate": 4.347404832959775e-09, "logits/chosen": -1.5254770517349243, "logits/rejected": -1.5139881372451782, "logps/chosen": -44.64695739746094, "logps/rejected": -48.886756896972656, "loss": 0.6838, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.004621665924787521, "rewards/margins": 0.019287504255771637, "rewards/rejected": -0.02390917018055916, "step": 3450 }, { "epoch": 2.4927953890489913, "grad_norm": 3.4857494831085205, "learning_rate": 4.230014691678016e-09, "logits/chosen": -1.477423071861267, "logits/rejected": -1.478615403175354, "logps/chosen": -49.44475173950195, "logps/rejected": -51.15663528442383, "loss": 0.6865, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.006155318580567837, "rewards/margins": 0.013897466473281384, "rewards/rejected": -0.02005278691649437, "step": 3460 }, { "epoch": 2.5, "grad_norm": 3.1407816410064697, "learning_rate": 4.114084594599707e-09, "logits/chosen": -1.4632585048675537, "logits/rejected": -1.4401248693466187, "logps/chosen": -45.59687042236328, "logps/rejected": -51.5286979675293, "loss": 0.6822, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -0.00289691099897027, "rewards/margins": 0.022817853838205338, "rewards/rejected": -0.02571476623415947, "step": 3470 }, { "epoch": 2.5072046109510087, "grad_norm": 3.1534249782562256, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -1.4795855283737183, "logits/rejected": -1.4571055173873901, "logps/chosen": -45.63375473022461, "logps/rejected": -48.75299072265625, "loss": 0.6838, "rewards/accuracies": 0.59375, "rewards/chosen": -0.0032276164274662733, "rewards/margins": 0.019503096118569374, "rewards/rejected": -0.022730711847543716, "step": 3480 }, { "epoch": 2.5144092219020173, "grad_norm": 3.362689971923828, "learning_rate": 3.886637027473949e-09, "logits/chosen": -1.5117019414901733, "logits/rejected": -1.507697343826294, "logps/chosen": -47.55299758911133, "logps/rejected": -51.62897491455078, "loss": 0.6837, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005091325379908085, "rewards/margins": 0.01942974142730236, "rewards/rejected": -0.02452106960117817, "step": 3490 }, { "epoch": 2.521613832853026, "grad_norm": 3.200064182281494, "learning_rate": 3.775135546051295e-09, "logits/chosen": -1.4044318199157715, "logits/rejected": -1.4048256874084473, "logps/chosen": -46.03704833984375, "logps/rejected": -50.541046142578125, "loss": 0.6818, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.004429998807609081, "rewards/margins": 0.02344963699579239, "rewards/rejected": -0.027879636734724045, "step": 3500 }, { "epoch": 2.5288184438040346, "grad_norm": 3.3993964195251465, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.4691989421844482, "logits/rejected": -1.4607309103012085, "logps/chosen": -50.858978271484375, "logps/rejected": -51.20890426635742, "loss": 0.6844, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.005501296371221542, "rewards/margins": 0.018329834565520287, "rewards/rejected": -0.023831134662032127, "step": 3510 }, { "epoch": 2.5360230547550433, "grad_norm": 3.145418167114258, "learning_rate": 3.556616377404101e-09, "logits/chosen": -1.5022590160369873, "logits/rejected": -1.4903608560562134, "logps/chosen": -51.8497428894043, "logps/rejected": -55.86328887939453, "loss": 0.6817, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.006599706597626209, "rewards/margins": 0.023647639900445938, "rewards/rejected": -0.030247345566749573, "step": 3520 }, { "epoch": 2.543227665706052, "grad_norm": 3.399369478225708, "learning_rate": 3.4496140511748125e-09, "logits/chosen": -1.4858022928237915, "logits/rejected": -1.4672108888626099, "logps/chosen": -48.12791061401367, "logps/rejected": -51.02872085571289, "loss": 0.6833, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.00712383771315217, "rewards/margins": 0.02042998932301998, "rewards/rejected": -0.027553830295801163, "step": 3530 }, { "epoch": 2.5504322766570606, "grad_norm": 3.965590000152588, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.4491469860076904, "logits/rejected": -1.4466187953948975, "logps/chosen": -53.73418045043945, "logps/rejected": -57.25054931640625, "loss": 0.6853, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.002359113423153758, "rewards/margins": 0.01653447560966015, "rewards/rejected": -0.018893588334321976, "step": 3540 }, { "epoch": 2.5576368876080693, "grad_norm": 3.5014843940734863, "learning_rate": 3.2401615234845693e-09, "logits/chosen": -1.492725133895874, "logits/rejected": -1.4749128818511963, "logps/chosen": -54.025245666503906, "logps/rejected": -57.38044357299805, "loss": 0.6818, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.007148324511945248, "rewards/margins": 0.02353733219206333, "rewards/rejected": -0.030685653910040855, "step": 3550 }, { "epoch": 2.564841498559078, "grad_norm": 3.1182708740234375, "learning_rate": 3.1377260456714375e-09, "logits/chosen": -1.3231797218322754, "logits/rejected": -1.3109872341156006, "logps/chosen": -49.063629150390625, "logps/rejected": -54.12776565551758, "loss": 0.6836, "rewards/accuracies": 0.625, "rewards/chosen": -0.007035274989902973, "rewards/margins": 0.020289259031414986, "rewards/rejected": -0.027324533089995384, "step": 3560 }, { "epoch": 2.5720461095100866, "grad_norm": 3.6655466556549072, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -1.5293312072753906, "logits/rejected": -1.5017873048782349, "logps/chosen": -51.08592987060547, "logps/rejected": -53.295440673828125, "loss": 0.6846, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.006225470919162035, "rewards/margins": 0.017943011596798897, "rewards/rejected": -0.024168482050299644, "step": 3570 }, { "epoch": 2.5792507204610953, "grad_norm": 3.021207094192505, "learning_rate": 2.937472665558541e-09, "logits/chosen": -1.5536901950836182, "logits/rejected": -1.5462580919265747, "logps/chosen": -45.45623016357422, "logps/rejected": -47.74097442626953, "loss": 0.6818, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.008379434235394001, "rewards/margins": 0.0236833319067955, "rewards/rejected": -0.03206276521086693, "step": 3580 }, { "epoch": 2.586455331412104, "grad_norm": 4.031035900115967, "learning_rate": 2.8396688402445053e-09, "logits/chosen": -1.574249505996704, "logits/rejected": -1.5572589635849, "logps/chosen": -45.45792007446289, "logps/rejected": -51.68840789794922, "loss": 0.6817, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.01051649171859026, "rewards/margins": 0.024006729945540428, "rewards/rejected": -0.03452322259545326, "step": 3590 }, { "epoch": 2.5936599423631126, "grad_norm": 4.006160259246826, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -1.5262694358825684, "logits/rejected": -1.5151522159576416, "logps/chosen": -53.84954071044922, "logps/rejected": -57.39601516723633, "loss": 0.6856, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005320216063410044, "rewards/margins": 0.015715904533863068, "rewards/rejected": -0.021036118268966675, "step": 3600 }, { "epoch": 2.6008645533141213, "grad_norm": 3.0315651893615723, "learning_rate": 2.6487412946432976e-09, "logits/chosen": -1.4456422328948975, "logits/rejected": -1.4327547550201416, "logps/chosen": -49.539451599121094, "logps/rejected": -52.36029052734375, "loss": 0.6821, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.012892505154013634, "rewards/margins": 0.023083876818418503, "rewards/rejected": -0.03597638010978699, "step": 3610 }, { "epoch": 2.60806916426513, "grad_norm": 3.428950786590576, "learning_rate": 2.5556309957742024e-09, "logits/chosen": -1.4444409608840942, "logits/rejected": -1.4349945783615112, "logps/chosen": -44.949119567871094, "logps/rejected": -52.14201736450195, "loss": 0.6804, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0015101671451702714, "rewards/margins": 0.026495974510908127, "rewards/rejected": -0.02498580887913704, "step": 3620 }, { "epoch": 2.6152737752161386, "grad_norm": 3.3707656860351562, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -1.5578646659851074, "logits/rejected": -1.5437214374542236, "logps/chosen": -50.96113204956055, "logps/rejected": -53.15272903442383, "loss": 0.6837, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.004595599137246609, "rewards/margins": 0.019676122814416885, "rewards/rejected": -0.02427172288298607, "step": 3630 }, { "epoch": 2.6224783861671472, "grad_norm": 3.6540372371673584, "learning_rate": 2.3741500717865987e-09, "logits/chosen": -1.4447523355484009, "logits/rejected": -1.4563525915145874, "logps/chosen": -47.37974166870117, "logps/rejected": -52.231971740722656, "loss": 0.6841, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -0.0018523692851886153, "rewards/margins": 0.019028810784220695, "rewards/rejected": -0.020881177857518196, "step": 3640 }, { "epoch": 2.629682997118156, "grad_norm": 3.127500295639038, "learning_rate": 2.285792204027678e-09, "logits/chosen": -1.4207046031951904, "logits/rejected": -1.4100974798202515, "logps/chosen": -47.52744674682617, "logps/rejected": -54.74420166015625, "loss": 0.6823, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005003909580409527, "rewards/margins": 0.022536050528287888, "rewards/rejected": -0.027539962902665138, "step": 3650 }, { "epoch": 2.636887608069164, "grad_norm": 3.715735912322998, "learning_rate": 2.199031051284972e-09, "logits/chosen": -1.4993317127227783, "logits/rejected": -1.4986127614974976, "logps/chosen": -48.38993453979492, "logps/rejected": -52.40184783935547, "loss": 0.6839, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005102119408547878, "rewards/margins": 0.019588427618145943, "rewards/rejected": -0.024690547958016396, "step": 3660 }, { "epoch": 2.6440922190201728, "grad_norm": 3.8177621364593506, "learning_rate": 2.113872712509254e-09, "logits/chosen": -1.4067778587341309, "logits/rejected": -1.397430658340454, "logps/chosen": -56.226844787597656, "logps/rejected": -59.394134521484375, "loss": 0.6831, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.009724282659590244, "rewards/margins": 0.02074650302529335, "rewards/rejected": -0.03047078475356102, "step": 3670 }, { "epoch": 2.6512968299711814, "grad_norm": 3.4949936866760254, "learning_rate": 2.0303231739801143e-09, "logits/chosen": -1.4101097583770752, "logits/rejected": -1.3969051837921143, "logps/chosen": -50.74885177612305, "logps/rejected": -55.03565216064453, "loss": 0.6843, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.009084323421120644, "rewards/margins": 0.018547767773270607, "rewards/rejected": -0.027632087469100952, "step": 3680 }, { "epoch": 2.65850144092219, "grad_norm": 3.9415016174316406, "learning_rate": 1.948388308885102e-09, "logits/chosen": -1.5741355419158936, "logits/rejected": -1.5591968297958374, "logps/chosen": -50.14144515991211, "logps/rejected": -53.07600784301758, "loss": 0.685, "rewards/accuracies": 0.59375, "rewards/chosen": -0.003897708607837558, "rewards/margins": 0.017061758786439896, "rewards/rejected": -0.02095946855843067, "step": 3690 }, { "epoch": 2.6657060518731988, "grad_norm": 3.1455366611480713, "learning_rate": 1.86807387690692e-09, "logits/chosen": -1.5526137351989746, "logits/rejected": -1.5452834367752075, "logps/chosen": -50.222904205322266, "logps/rejected": -57.684120178222656, "loss": 0.6788, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0016280229901894927, "rewards/margins": 0.029992084950208664, "rewards/rejected": -0.03162010759115219, "step": 3700 }, { "epoch": 2.6729106628242074, "grad_norm": 3.540015697479248, "learning_rate": 1.789385523818493e-09, "logits/chosen": -1.4762442111968994, "logits/rejected": -1.4785929918289185, "logps/chosen": -45.24761199951172, "logps/rejected": -51.211875915527344, "loss": 0.6822, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.0035579826217144728, "rewards/margins": 0.02260783314704895, "rewards/rejected": -0.02616581693291664, "step": 3710 }, { "epoch": 2.680115273775216, "grad_norm": 3.6030099391937256, "learning_rate": 1.712328781086131e-09, "logits/chosen": -1.5480079650878906, "logits/rejected": -1.5321252346038818, "logps/chosen": -51.021087646484375, "logps/rejected": -53.23911666870117, "loss": 0.6864, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.007240858860313892, "rewards/margins": 0.014243543148040771, "rewards/rejected": -0.02148440107703209, "step": 3720 }, { "epoch": 2.6873198847262247, "grad_norm": 3.420142650604248, "learning_rate": 1.6369090654806543e-09, "logits/chosen": -1.5726039409637451, "logits/rejected": -1.5602816343307495, "logps/chosen": -46.87481689453125, "logps/rejected": -51.70646286010742, "loss": 0.6845, "rewards/accuracies": 0.625, "rewards/chosen": -0.008368275128304958, "rewards/margins": 0.017895232886075974, "rewards/rejected": -0.026263505220413208, "step": 3730 }, { "epoch": 2.6945244956772334, "grad_norm": 3.2335476875305176, "learning_rate": 1.5631316786966498e-09, "logits/chosen": -1.4826228618621826, "logits/rejected": -1.4666416645050049, "logps/chosen": -45.12044143676758, "logps/rejected": -48.53943634033203, "loss": 0.6849, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.0067514353431761265, "rewards/margins": 0.017413515597581863, "rewards/rejected": -0.024164952337741852, "step": 3740 }, { "epoch": 2.701729106628242, "grad_norm": 4.138692378997803, "learning_rate": 1.491001806979772e-09, "logits/chosen": -1.512900710105896, "logits/rejected": -1.498417615890503, "logps/chosen": -50.178470611572266, "logps/rejected": -54.36761474609375, "loss": 0.6838, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.0021749637089669704, "rewards/margins": 0.01952570676803589, "rewards/rejected": -0.021700672805309296, "step": 3750 }, { "epoch": 2.7089337175792507, "grad_norm": 3.769171953201294, "learning_rate": 1.4205245207621508e-09, "logits/chosen": -1.4354712963104248, "logits/rejected": -1.419633388519287, "logps/chosen": -52.882362365722656, "logps/rejected": -55.71977996826172, "loss": 0.682, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0019592377357184887, "rewards/margins": 0.023377398028969765, "rewards/rejected": -0.02533663436770439, "step": 3760 }, { "epoch": 2.7161383285302594, "grad_norm": 3.8841655254364014, "learning_rate": 1.3517047743059978e-09, "logits/chosen": -1.5189292430877686, "logits/rejected": -1.5208401679992676, "logps/chosen": -49.525108337402344, "logps/rejected": -55.4781494140625, "loss": 0.6837, "rewards/accuracies": 0.6875, "rewards/chosen": -0.0053468383848667145, "rewards/margins": 0.019498584792017937, "rewards/rejected": -0.0248454250395298, "step": 3770 }, { "epoch": 2.723342939481268, "grad_norm": 3.297200918197632, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -1.517322063446045, "logits/rejected": -1.5089843273162842, "logps/chosen": -43.52055740356445, "logps/rejected": -47.09668731689453, "loss": 0.6851, "rewards/accuracies": 0.625, "rewards/chosen": -0.007344129495322704, "rewards/margins": 0.016880614683032036, "rewards/rejected": -0.024224746972322464, "step": 3780 }, { "epoch": 2.7305475504322767, "grad_norm": 2.828813314437866, "learning_rate": 1.2190571347958422e-09, "logits/chosen": -1.5427597761154175, "logits/rejected": -1.5456900596618652, "logps/chosen": -43.29767990112305, "logps/rejected": -50.125144958496094, "loss": 0.6844, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.0003991241683252156, "rewards/margins": 0.018153730779886246, "rewards/rejected": -0.018552854657173157, "step": 3790 }, { "epoch": 2.7377521613832854, "grad_norm": 2.9458444118499756, "learning_rate": 1.1552385663231634e-09, "logits/chosen": -1.479465365409851, "logits/rejected": -1.4579660892486572, "logps/chosen": -48.168670654296875, "logps/rejected": -50.11082077026367, "loss": 0.6851, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0051146079786121845, "rewards/margins": 0.016856908798217773, "rewards/rejected": -0.02197151444852352, "step": 3800 }, { "epoch": 2.744956772334294, "grad_norm": 3.110060930252075, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.4410741329193115, "logits/rejected": -1.4403388500213623, "logps/chosen": -46.421630859375, "logps/rejected": -49.944129943847656, "loss": 0.6864, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.007156413048505783, "rewards/margins": 0.014478680677711964, "rewards/rejected": -0.021635092794895172, "step": 3810 }, { "epoch": 2.7521613832853027, "grad_norm": 3.0305254459381104, "learning_rate": 1.0326343625364608e-09, "logits/chosen": -1.435240387916565, "logits/rejected": -1.419524908065796, "logps/chosen": -47.08954620361328, "logps/rejected": -52.5599250793457, "loss": 0.6807, "rewards/accuracies": 0.65625, "rewards/chosen": -0.005212406627833843, "rewards/margins": 0.026095682755112648, "rewards/rejected": -0.03130808845162392, "step": 3820 }, { "epoch": 2.7593659942363113, "grad_norm": 2.6526858806610107, "learning_rate": 9.738573457917066e-10, "logits/chosen": -1.5486071109771729, "logits/rejected": -1.542614459991455, "logps/chosen": -41.173927307128906, "logps/rejected": -47.29227828979492, "loss": 0.6824, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.005259085912257433, "rewards/margins": 0.022309530526399612, "rewards/rejected": -0.027568617835640907, "step": 3830 }, { "epoch": 2.76657060518732, "grad_norm": 3.002786874771118, "learning_rate": 9.16769267666434e-10, "logits/chosen": -1.4672725200653076, "logits/rejected": -1.4611655473709106, "logps/chosen": -46.251319885253906, "logps/rejected": -48.279781341552734, "loss": 0.6887, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.00623342115432024, "rewards/margins": 0.009535645134747028, "rewards/rejected": -0.01576906442642212, "step": 3840 }, { "epoch": 2.7737752161383287, "grad_norm": 3.2858715057373047, "learning_rate": 8.613741412168113e-10, "logits/chosen": -1.4862616062164307, "logits/rejected": -1.4810454845428467, "logps/chosen": -54.2935676574707, "logps/rejected": -58.491119384765625, "loss": 0.6831, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.003660841379314661, "rewards/margins": 0.020793884992599487, "rewards/rejected": -0.024454724043607712, "step": 3850 }, { "epoch": 2.7809798270893373, "grad_norm": 3.317065954208374, "learning_rate": 8.076758604914802e-10, "logits/chosen": -1.4456998109817505, "logits/rejected": -1.4331220388412476, "logps/chosen": -43.14230728149414, "logps/rejected": -46.72562026977539, "loss": 0.6842, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0021024621091783047, "rewards/margins": 0.01865329220890999, "rewards/rejected": -0.02075575292110443, "step": 3860 }, { "epoch": 2.7881844380403455, "grad_norm": 4.6626362800598145, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.442833662033081, "logits/rejected": -1.4301643371582031, "logps/chosen": -50.06626510620117, "logps/rejected": -55.51959991455078, "loss": 0.6827, "rewards/accuracies": 0.625, "rewards/chosen": -0.006806174758821726, "rewards/margins": 0.021879781037569046, "rewards/rejected": -0.028685953468084335, "step": 3870 }, { "epoch": 2.795389048991354, "grad_norm": 3.242816925048828, "learning_rate": 7.053848157367315e-10, "logits/chosen": -1.4660804271697998, "logits/rejected": -1.4522478580474854, "logps/chosen": -48.17863082885742, "logps/rejected": -53.19854736328125, "loss": 0.6828, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.0016250055050477386, "rewards/margins": 0.021487154066562653, "rewards/rejected": -0.023112159222364426, "step": 3880 }, { "epoch": 2.802593659942363, "grad_norm": 2.588425397872925, "learning_rate": 6.567992423453794e-10, "logits/chosen": -1.4936778545379639, "logits/rejected": -1.4873104095458984, "logps/chosen": -43.38020706176758, "logps/rejected": -46.673580169677734, "loss": 0.6838, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.0043821679428219795, "rewards/margins": 0.019423723220825195, "rewards/rejected": -0.0238058902323246, "step": 3890 }, { "epoch": 2.8097982708933715, "grad_norm": 3.189708948135376, "learning_rate": 6.099248954489794e-10, "logits/chosen": -1.4088729619979858, "logits/rejected": -1.4072296619415283, "logps/chosen": -47.93668746948242, "logps/rejected": -53.1524543762207, "loss": 0.6834, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.0071435668505728245, "rewards/margins": 0.020221812650561333, "rewards/rejected": -0.02736538089811802, "step": 3900 }, { "epoch": 2.81700288184438, "grad_norm": 3.6743202209472656, "learning_rate": 5.647650701205653e-10, "logits/chosen": -1.5016138553619385, "logits/rejected": -1.481757402420044, "logps/chosen": -54.419166564941406, "logps/rejected": -58.34161376953125, "loss": 0.6805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.0001567740982864052, "rewards/margins": 0.02638132870197296, "rewards/rejected": -0.026538103818893433, "step": 3910 }, { "epoch": 2.824207492795389, "grad_norm": 3.1479997634887695, "learning_rate": 5.213229409093856e-10, "logits/chosen": -1.5346415042877197, "logits/rejected": -1.5240830183029175, "logps/chosen": -52.7498664855957, "logps/rejected": -57.8280143737793, "loss": 0.6816, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.004968610592186451, "rewards/margins": 0.024416498839855194, "rewards/rejected": -0.02938510850071907, "step": 3920 }, { "epoch": 2.8314121037463975, "grad_norm": 4.354657173156738, "learning_rate": 4.796015616177401e-10, "logits/chosen": -1.4575870037078857, "logits/rejected": -1.4455732107162476, "logps/chosen": -51.82859420776367, "logps/rejected": -55.66108322143555, "loss": 0.6854, "rewards/accuracies": 0.53125, "rewards/chosen": -0.007370557636022568, "rewards/margins": 0.016358794644474983, "rewards/rejected": -0.0237293504178524, "step": 3930 }, { "epoch": 2.838616714697406, "grad_norm": 3.3422064781188965, "learning_rate": 4.3960386508631595e-10, "logits/chosen": -1.3845546245574951, "logits/rejected": -1.38401198387146, "logps/chosen": -42.65739059448242, "logps/rejected": -46.67231369018555, "loss": 0.6851, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.006754173897206783, "rewards/margins": 0.017140675336122513, "rewards/rejected": -0.02389485016465187, "step": 3940 }, { "epoch": 2.845821325648415, "grad_norm": 4.824497699737549, "learning_rate": 4.013326629880243e-10, "logits/chosen": -1.42984139919281, "logits/rejected": -1.4125710725784302, "logps/chosen": -50.18545150756836, "logps/rejected": -54.050750732421875, "loss": 0.6825, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.007838496938347816, "rewards/margins": 0.022195886820554733, "rewards/rejected": -0.0300343818962574, "step": 3950 }, { "epoch": 2.8530259365994235, "grad_norm": 3.4627509117126465, "learning_rate": 3.64790645630339e-10, "logits/chosen": -1.3913991451263428, "logits/rejected": -1.3863855600357056, "logps/chosen": -53.33915328979492, "logps/rejected": -55.66510009765625, "loss": 0.6873, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.0018841477576643229, "rewards/margins": 0.012181239202618599, "rewards/rejected": -0.014065387658774853, "step": 3960 }, { "epoch": 2.860230547550432, "grad_norm": 4.963607311248779, "learning_rate": 3.2998038176619e-10, "logits/chosen": -1.4526419639587402, "logits/rejected": -1.4365007877349854, "logps/chosen": -51.497032165527344, "logps/rejected": -54.97554397583008, "loss": 0.685, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.0071846963837742805, "rewards/margins": 0.01706068590283394, "rewards/rejected": -0.024245383217930794, "step": 3970 }, { "epoch": 2.867435158501441, "grad_norm": 3.4952621459960938, "learning_rate": 2.969043184133907e-10, "logits/chosen": -1.5576436519622803, "logits/rejected": -1.5564569234848022, "logps/chosen": -45.00513458251953, "logps/rejected": -53.378684997558594, "loss": 0.6817, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.00028269606991671026, "rewards/margins": 0.023711269721388817, "rewards/rejected": -0.02399396523833275, "step": 3980 }, { "epoch": 2.8746397694524495, "grad_norm": 3.8880701065063477, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -1.4495378732681274, "logits/rejected": -1.4356410503387451, "logps/chosen": -44.4900016784668, "logps/rejected": -48.011497497558594, "loss": 0.6796, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0006750145694240928, "rewards/margins": 0.028233755379915237, "rewards/rejected": -0.027558740228414536, "step": 3990 }, { "epoch": 2.881844380403458, "grad_norm": 3.490116596221924, "learning_rate": 2.3596397161395607e-10, "logits/chosen": -1.558830976486206, "logits/rejected": -1.5369431972503662, "logps/chosen": -49.584800720214844, "logps/rejected": -54.7055549621582, "loss": 0.6809, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": 0.00045850887545384467, "rewards/margins": 0.025326168164610863, "rewards/rejected": -0.02486766129732132, "step": 4000 }, { "epoch": 2.889048991354467, "grad_norm": 4.86458683013916, "learning_rate": 2.0810397202206399e-10, "logits/chosen": -1.4145355224609375, "logits/rejected": -1.4098033905029297, "logps/chosen": -49.917091369628906, "logps/rejected": -53.33013916015625, "loss": 0.6847, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": 0.0004277043044567108, "rewards/margins": 0.017619196325540543, "rewards/rejected": -0.017191490158438683, "step": 4010 }, { "epoch": 2.8962536023054755, "grad_norm": 3.27022647857666, "learning_rate": 1.819867403498737e-10, "logits/chosen": -1.5645722150802612, "logits/rejected": -1.5553191900253296, "logps/chosen": -47.87715530395508, "logps/rejected": -51.797218322753906, "loss": 0.6836, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0083371726796031, "rewards/margins": 0.020166922360658646, "rewards/rejected": -0.02850409410893917, "step": 4020 }, { "epoch": 2.903458213256484, "grad_norm": 3.4694247245788574, "learning_rate": 1.5761411253092382e-10, "logits/chosen": -1.4308217763900757, "logits/rejected": -1.4093315601348877, "logps/chosen": -46.04566192626953, "logps/rejected": -48.20244598388672, "loss": 0.6843, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0066087073646485806, "rewards/margins": 0.018396150320768356, "rewards/rejected": -0.025004858151078224, "step": 4030 }, { "epoch": 2.910662824207493, "grad_norm": 3.5978357791900635, "learning_rate": 1.3498780186031455e-10, "logits/chosen": -1.4942580461502075, "logits/rejected": -1.4852937459945679, "logps/chosen": -53.630462646484375, "logps/rejected": -57.28614044189453, "loss": 0.6847, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.006508751306682825, "rewards/margins": 0.017568571493029594, "rewards/rejected": -0.024077320471405983, "step": 4040 }, { "epoch": 2.9178674351585014, "grad_norm": 3.3043346405029297, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -1.498331904411316, "logits/rejected": -1.4901821613311768, "logps/chosen": -47.133060455322266, "logps/rejected": -49.65581130981445, "loss": 0.6861, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.010484300553798676, "rewards/margins": 0.014650911092758179, "rewards/rejected": -0.025135213509202003, "step": 4050 }, { "epoch": 2.92507204610951, "grad_norm": 2.9464681148529053, "learning_rate": 9.498037123825686e-11, "logits/chosen": -1.5104553699493408, "logits/rejected": -1.499537706375122, "logps/chosen": -45.17995834350586, "logps/rejected": -49.46979522705078, "loss": 0.6832, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.004455415066331625, "rewards/margins": 0.020533457398414612, "rewards/rejected": -0.024988874793052673, "step": 4060 }, { "epoch": 2.9322766570605188, "grad_norm": 3.2877230644226074, "learning_rate": 7.760206364398614e-11, "logits/chosen": -1.5863580703735352, "logits/rejected": -1.565198540687561, "logps/chosen": -49.96894073486328, "logps/rejected": -53.23193359375, "loss": 0.6837, "rewards/accuracies": 0.625, "rewards/chosen": -0.009586494415998459, "rewards/margins": 0.020023521035909653, "rewards/rejected": -0.029610013589262962, "step": 4070 }, { "epoch": 2.9394812680115274, "grad_norm": 3.83817720413208, "learning_rate": 6.19756977147029e-11, "logits/chosen": -1.4424731731414795, "logits/rejected": -1.4336028099060059, "logps/chosen": -47.239967346191406, "logps/rejected": -54.48072052001953, "loss": 0.6829, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.009559125639498234, "rewards/margins": 0.021477770060300827, "rewards/rejected": -0.031036894768476486, "step": 4080 }, { "epoch": 2.946685878962536, "grad_norm": 2.8576111793518066, "learning_rate": 4.810237191940625e-11, "logits/chosen": -1.4422481060028076, "logits/rejected": -1.4329513311386108, "logps/chosen": -46.956687927246094, "logps/rejected": -49.96436309814453, "loss": 0.6855, "rewards/accuracies": 0.5625, "rewards/chosen": -0.008776359260082245, "rewards/margins": 0.016020456328988075, "rewards/rejected": -0.02479681745171547, "step": 4090 }, { "epoch": 2.9538904899135447, "grad_norm": 3.3893821239471436, "learning_rate": 3.5983061495617476e-11, "logits/chosen": -1.5260568857192993, "logits/rejected": -1.5251171588897705, "logps/chosen": -51.82386016845703, "logps/rejected": -57.438377380371094, "loss": 0.6843, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.005095969419926405, "rewards/margins": 0.018611816689372063, "rewards/rejected": -0.023707788437604904, "step": 4100 }, { "epoch": 2.9610951008645534, "grad_norm": 3.1232125759124756, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -1.5207054615020752, "logits/rejected": -1.5060088634490967, "logps/chosen": -42.08940124511719, "logps/rejected": -47.49885559082031, "loss": 0.6811, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00411585858091712, "rewards/margins": 0.024918580427765846, "rewards/rejected": -0.029034441336989403, "step": 4110 }, { "epoch": 2.968299711815562, "grad_norm": 3.4026784896850586, "learning_rate": 1.700977115254576e-11, "logits/chosen": -1.463889479637146, "logits/rejected": -1.454345464706421, "logps/chosen": -46.250335693359375, "logps/rejected": -51.5218505859375, "loss": 0.6831, "rewards/accuracies": 0.625, "rewards/chosen": -0.00747289415448904, "rewards/margins": 0.02096741273999214, "rewards/rejected": -0.028440307825803757, "step": 4120 }, { "epoch": 2.9755043227665707, "grad_norm": 2.975994110107422, "learning_rate": 1.0157124977230868e-11, "logits/chosen": -1.4347963333129883, "logits/rejected": -1.4254258871078491, "logps/chosen": -43.63376998901367, "logps/rejected": -47.81342315673828, "loss": 0.6845, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -0.0023829059209674597, "rewards/margins": 0.018163811415433884, "rewards/rejected": -0.020546717569231987, "step": 4130 }, { "epoch": 2.9827089337175794, "grad_norm": 3.555697441101074, "learning_rate": 5.061161567596061e-12, "logits/chosen": -1.4684605598449707, "logits/rejected": -1.4558757543563843, "logps/chosen": -47.77406692504883, "logps/rejected": -50.39754867553711, "loss": 0.6844, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.001015470246784389, "rewards/margins": 0.018158772960305214, "rewards/rejected": -0.0191742442548275, "step": 4140 }, { "epoch": 2.989913544668588, "grad_norm": 3.382728099822998, "learning_rate": 1.7222391488297406e-12, "logits/chosen": -1.51640784740448, "logits/rejected": -1.5047814846038818, "logps/chosen": -53.642372131347656, "logps/rejected": -58.678138732910156, "loss": 0.6794, "rewards/accuracies": 0.65625, "rewards/chosen": -0.0038291483651846647, "rewards/margins": 0.02880244515836239, "rewards/rejected": -0.03263159841299057, "step": 4150 }, { "epoch": 2.9971181556195967, "grad_norm": 3.958996534347534, "learning_rate": 1.4059243338693238e-13, "logits/chosen": -1.4424545764923096, "logits/rejected": -1.431705355644226, "logps/chosen": -48.65971374511719, "logps/rejected": -53.465545654296875, "loss": 0.6825, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.0016985107213258743, "rewards/margins": 0.02206951007246971, "rewards/rejected": -0.023768020793795586, "step": 4160 }, { "epoch": 3.0, "step": 4164, "total_flos": 0.0, "train_loss": 0.6880548467553658, "train_runtime": 7425.9787, "train_samples_per_second": 8.969, "train_steps_per_second": 0.561 } ], "logging_steps": 10, "max_steps": 4164, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }