{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.6615384615384614, "eval_steps": 20, "global_step": 360, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009230769230769232, "grad_norm": 52.40730345789634, "learning_rate": 2.2727272727272725e-08, "logits/chosen": -1.2901445627212524, "logits/rejected": -1.2963205575942993, "logps/chosen": -16.113027572631836, "logps/rejected": -27.10122299194336, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.018461538461538463, "grad_norm": 64.88802449206628, "learning_rate": 4.545454545454545e-08, "logits/chosen": -1.3016295433044434, "logits/rejected": -1.3255655765533447, "logps/chosen": -20.355079650878906, "logps/rejected": -39.93232727050781, "loss": 0.6895, "rewards/accuracies": 0.5, "rewards/chosen": 0.008816350251436234, "rewards/margins": 0.0047285472974181175, "rewards/rejected": 0.004087802488356829, "step": 4 }, { "epoch": 0.027692307692307693, "grad_norm": 59.6800701771534, "learning_rate": 6.818181818181817e-08, "logits/chosen": -1.31508207321167, "logits/rejected": -1.3189733028411865, "logps/chosen": -23.069622039794922, "logps/rejected": -26.97477149963379, "loss": 0.695, "rewards/accuracies": 0.4444444477558136, "rewards/chosen": -0.007372706197202206, "rewards/margins": -0.013017671182751656, "rewards/rejected": 0.0056449659168720245, "step": 6 }, { "epoch": 0.036923076923076927, "grad_norm": 52.983511533208585, "learning_rate": 9.09090909090909e-08, "logits/chosen": -1.277503252029419, "logits/rejected": -1.3002785444259644, "logps/chosen": -20.34660530090332, "logps/rejected": -31.0557861328125, "loss": 0.6908, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.020842621102929115, "rewards/margins": 0.020597590133547783, "rewards/rejected": 0.00024503222084604204, "step": 8 }, { "epoch": 0.046153846153846156, "grad_norm": 66.2747581823961, "learning_rate": 1.1363636363636363e-07, "logits/chosen": -1.3306350708007812, "logits/rejected": -1.3309379816055298, "logps/chosen": -26.48358917236328, "logps/rejected": -30.445173263549805, "loss": 0.7046, "rewards/accuracies": 0.3888888955116272, "rewards/chosen": -0.01246996782720089, "rewards/margins": -0.032543592154979706, "rewards/rejected": 0.020073626190423965, "step": 10 }, { "epoch": 0.055384615384615386, "grad_norm": 57.271529486531605, "learning_rate": 1.3636363636363635e-07, "logits/chosen": -1.280084252357483, "logits/rejected": -1.295721411705017, "logps/chosen": -25.79343032836914, "logps/rejected": -36.58183288574219, "loss": 0.6956, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.023966560140252113, "rewards/margins": 0.030559096485376358, "rewards/rejected": -0.006592527963221073, "step": 12 }, { "epoch": 0.06461538461538462, "grad_norm": 67.94854888195144, "learning_rate": 1.5909090909090907e-07, "logits/chosen": -1.2790985107421875, "logits/rejected": -1.296931266784668, "logps/chosen": -24.833446502685547, "logps/rejected": -31.11182403564453, "loss": 0.7006, "rewards/accuracies": 0.4305555522441864, "rewards/chosen": 0.013436201959848404, "rewards/margins": 0.002752000233158469, "rewards/rejected": 0.010684202425181866, "step": 14 }, { "epoch": 0.07384615384615385, "grad_norm": 49.36191286721225, "learning_rate": 1.818181818181818e-07, "logits/chosen": -1.301368236541748, "logits/rejected": -1.3136367797851562, "logps/chosen": -26.273963928222656, "logps/rejected": -35.63306427001953, "loss": 0.6949, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.015296169556677341, "rewards/margins": 0.013788570649921894, "rewards/rejected": 0.0015075993724167347, "step": 16 }, { "epoch": 0.08307692307692308, "grad_norm": 56.43976674406361, "learning_rate": 2.0454545454545456e-07, "logits/chosen": -1.3201720714569092, "logits/rejected": -1.3183202743530273, "logps/chosen": -25.70770263671875, "logps/rejected": -26.178009033203125, "loss": 0.7006, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.0011544560547918081, "rewards/margins": 0.01863468438386917, "rewards/rejected": -0.01748022995889187, "step": 18 }, { "epoch": 0.09230769230769231, "grad_norm": 56.010590202518365, "learning_rate": 2.2727272727272726e-07, "logits/chosen": -1.2482044696807861, "logits/rejected": -1.262031078338623, "logps/chosen": -28.337791442871094, "logps/rejected": -29.38203239440918, "loss": 0.6883, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.00024333276087418199, "rewards/margins": -0.0005785864195786417, "rewards/rejected": 0.000821918249130249, "step": 20 }, { "epoch": 0.09230769230769231, "eval_logits/chosen": -1.3220677375793457, "eval_logits/rejected": -1.33245849609375, "eval_logps/chosen": -23.036666870117188, "eval_logps/rejected": -26.372356414794922, "eval_loss": 0.6916412115097046, "eval_rewards/accuracies": 0.4965437650680542, "eval_rewards/chosen": 0.00501647312194109, "eval_rewards/margins": 0.010797887109220028, "eval_rewards/rejected": -0.0057814153842628, "eval_runtime": 216.2201, "eval_samples_per_second": 8.02, "eval_steps_per_second": 2.007, "step": 20 }, { "epoch": 0.10153846153846154, "grad_norm": 67.30805212172523, "learning_rate": 2.5e-07, "logits/chosen": -1.2273086309432983, "logits/rejected": -1.2565299272537231, "logps/chosen": -21.540626525878906, "logps/rejected": -47.4769172668457, "loss": 0.6893, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": 0.007773838937282562, "rewards/margins": 0.026619136333465576, "rewards/rejected": -0.018845297396183014, "step": 22 }, { "epoch": 0.11076923076923077, "grad_norm": 51.29780655120263, "learning_rate": 2.727272727272727e-07, "logits/chosen": -1.219795823097229, "logits/rejected": -1.235877513885498, "logps/chosen": -30.82242774963379, "logps/rejected": -37.68511962890625, "loss": 0.6758, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.03086034394800663, "rewards/margins": 0.055920813232660294, "rewards/rejected": -0.025060458108782768, "step": 24 }, { "epoch": 0.12, "grad_norm": 55.0939959360046, "learning_rate": 2.9545454545454545e-07, "logits/chosen": -1.258486270904541, "logits/rejected": -1.2752680778503418, "logps/chosen": -25.136966705322266, "logps/rejected": -43.23137664794922, "loss": 0.6774, "rewards/accuracies": 0.5972222089767456, "rewards/chosen": 0.0216163769364357, "rewards/margins": 0.08480846881866455, "rewards/rejected": -0.06319208443164825, "step": 26 }, { "epoch": 0.12923076923076923, "grad_norm": 48.332663649143974, "learning_rate": 3.1818181818181815e-07, "logits/chosen": -1.320160150527954, "logits/rejected": -1.330212950706482, "logps/chosen": -19.24217414855957, "logps/rejected": -27.22931671142578, "loss": 0.6874, "rewards/accuracies": 0.4027777910232544, "rewards/chosen": 0.01321298535913229, "rewards/margins": 0.009595979005098343, "rewards/rejected": 0.0036170051898807287, "step": 28 }, { "epoch": 0.13846153846153847, "grad_norm": 49.59877928678631, "learning_rate": 3.4090909090909085e-07, "logits/chosen": -1.2795339822769165, "logits/rejected": -1.2929219007492065, "logps/chosen": -21.841049194335938, "logps/rejected": -28.89714813232422, "loss": 0.6813, "rewards/accuracies": 0.5555555820465088, "rewards/chosen": 0.013576723635196686, "rewards/margins": 0.05021868646144867, "rewards/rejected": -0.036641962826251984, "step": 30 }, { "epoch": 0.1476923076923077, "grad_norm": 51.02397460357053, "learning_rate": 3.636363636363636e-07, "logits/chosen": -1.2797447443008423, "logits/rejected": -1.3022751808166504, "logps/chosen": -24.65501594543457, "logps/rejected": -36.741573333740234, "loss": 0.6732, "rewards/accuracies": 0.6527777910232544, "rewards/chosen": 0.04290567338466644, "rewards/margins": 0.09170582890510559, "rewards/rejected": -0.04880015552043915, "step": 32 }, { "epoch": 0.15692307692307692, "grad_norm": 45.39524675384609, "learning_rate": 3.8636363636363636e-07, "logits/chosen": -1.2498574256896973, "logits/rejected": -1.2657580375671387, "logps/chosen": -21.32640838623047, "logps/rejected": -39.71310806274414, "loss": 0.6627, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.007356289308518171, "rewards/margins": 0.06605351716279984, "rewards/rejected": -0.058697231113910675, "step": 34 }, { "epoch": 0.16615384615384615, "grad_norm": 52.58099443727954, "learning_rate": 4.090909090909091e-07, "logits/chosen": -1.2139866352081299, "logits/rejected": -1.2340948581695557, "logps/chosen": -18.409015655517578, "logps/rejected": -35.20015335083008, "loss": 0.6644, "rewards/accuracies": 0.5, "rewards/chosen": 0.022290384396910667, "rewards/margins": 0.06140115484595299, "rewards/rejected": -0.03911077231168747, "step": 36 }, { "epoch": 0.1753846153846154, "grad_norm": 53.938952453151614, "learning_rate": 4.318181818181818e-07, "logits/chosen": -1.2461514472961426, "logits/rejected": -1.2598522901535034, "logps/chosen": -27.248275756835938, "logps/rejected": -32.50380325317383, "loss": 0.6545, "rewards/accuracies": 0.625, "rewards/chosen": 0.04994047060608864, "rewards/margins": 0.1001262366771698, "rewards/rejected": -0.05018576979637146, "step": 38 }, { "epoch": 0.18461538461538463, "grad_norm": 46.949545804629956, "learning_rate": 4.545454545454545e-07, "logits/chosen": -1.2425076961517334, "logits/rejected": -1.2611976861953735, "logps/chosen": -14.459053993225098, "logps/rejected": -22.981327056884766, "loss": 0.6562, "rewards/accuracies": 0.5833333134651184, "rewards/chosen": 0.06477613002061844, "rewards/margins": 0.08449113368988037, "rewards/rejected": -0.019715001806616783, "step": 40 }, { "epoch": 0.18461538461538463, "eval_logits/chosen": -1.3191018104553223, "eval_logits/rejected": -1.3294612169265747, "eval_logps/chosen": -22.93289566040039, "eval_logps/rejected": -26.52239418029785, "eval_loss": 0.6399217247962952, "eval_rewards/accuracies": 0.671658992767334, "eval_rewards/chosen": 0.05690104886889458, "eval_rewards/margins": 0.13770265877246857, "eval_rewards/rejected": -0.08080162853002548, "eval_runtime": 216.334, "eval_samples_per_second": 8.015, "eval_steps_per_second": 2.006, "step": 40 }, { "epoch": 0.19384615384615383, "grad_norm": 41.53559188167412, "learning_rate": 4.772727272727273e-07, "logits/chosen": -1.2119545936584473, "logits/rejected": -1.2175490856170654, "logps/chosen": -23.42240333557129, "logps/rejected": -29.862327575683594, "loss": 0.624, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.059619419276714325, "rewards/margins": 0.15751110017299652, "rewards/rejected": -0.09789170324802399, "step": 42 }, { "epoch": 0.20307692307692307, "grad_norm": 49.942474151893265, "learning_rate": 5e-07, "logits/chosen": -1.3206286430358887, "logits/rejected": -1.3300279378890991, "logps/chosen": -22.983713150024414, "logps/rejected": -23.000356674194336, "loss": 0.6224, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.05540511757135391, "rewards/margins": 0.1078185960650444, "rewards/rejected": -0.05241347849369049, "step": 44 }, { "epoch": 0.2123076923076923, "grad_norm": 40.96104792630147, "learning_rate": 4.99967220916408e-07, "logits/chosen": -1.2594552040100098, "logits/rejected": -1.270306944847107, "logps/chosen": -19.131641387939453, "logps/rejected": -29.00514793395996, "loss": 0.617, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.09214716404676437, "rewards/margins": 0.24131464958190918, "rewards/rejected": -0.14916746318340302, "step": 46 }, { "epoch": 0.22153846153846155, "grad_norm": 44.60792696333844, "learning_rate": 4.998688922613787e-07, "logits/chosen": -1.3020961284637451, "logits/rejected": -1.3101927042007446, "logps/chosen": -31.274911880493164, "logps/rejected": -32.11240005493164, "loss": 0.6075, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.10760927200317383, "rewards/margins": 0.294413298368454, "rewards/rejected": -0.18680399656295776, "step": 48 }, { "epoch": 0.23076923076923078, "grad_norm": 43.17860095734465, "learning_rate": 4.997050398198976e-07, "logits/chosen": -1.291076421737671, "logits/rejected": -1.2982360124588013, "logps/chosen": -22.59940528869629, "logps/rejected": -22.504961013793945, "loss": 0.5855, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.14835722744464874, "rewards/margins": 0.3006143271923065, "rewards/rejected": -0.15225709974765778, "step": 50 }, { "epoch": 0.24, "grad_norm": 40.923959372883246, "learning_rate": 4.994757065594279e-07, "logits/chosen": -1.2361193895339966, "logits/rejected": -1.2530244588851929, "logps/chosen": -19.440345764160156, "logps/rejected": -29.653764724731445, "loss": 0.58, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.12961499392986298, "rewards/margins": 0.2747644782066345, "rewards/rejected": -0.14514949917793274, "step": 52 }, { "epoch": 0.24923076923076923, "grad_norm": 40.97149688332116, "learning_rate": 4.991809526186423e-07, "logits/chosen": -1.2297606468200684, "logits/rejected": -1.25152587890625, "logps/chosen": -21.388309478759766, "logps/rejected": -44.34809112548828, "loss": 0.5456, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.151195228099823, "rewards/margins": 0.48822492361068726, "rewards/rejected": -0.33702969551086426, "step": 54 }, { "epoch": 0.25846153846153846, "grad_norm": 41.37645783028047, "learning_rate": 4.988208552916535e-07, "logits/chosen": -1.2540967464447021, "logits/rejected": -1.2566981315612793, "logps/chosen": -22.95637321472168, "logps/rejected": -23.91745376586914, "loss": 0.5722, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.20567570626735687, "rewards/margins": 0.3446711003780365, "rewards/rejected": -0.13899540901184082, "step": 56 }, { "epoch": 0.2676923076923077, "grad_norm": 37.07709893155658, "learning_rate": 4.983955090077444e-07, "logits/chosen": -1.2924391031265259, "logits/rejected": -1.2913458347320557, "logps/chosen": -18.923715591430664, "logps/rejected": -22.57257843017578, "loss": 0.5773, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.16216707229614258, "rewards/margins": 0.27626025676727295, "rewards/rejected": -0.11409316956996918, "step": 58 }, { "epoch": 0.27692307692307694, "grad_norm": 33.00415567764037, "learning_rate": 4.979050253066063e-07, "logits/chosen": -1.2263813018798828, "logits/rejected": -1.2465788125991821, "logps/chosen": -20.503381729125977, "logps/rejected": -37.98419189453125, "loss": 0.5379, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.17731823027133942, "rewards/margins": 0.593184769153595, "rewards/rejected": -0.41586652398109436, "step": 60 }, { "epoch": 0.27692307692307694, "eval_logits/chosen": -1.303908109664917, "eval_logits/rejected": -1.3140496015548706, "eval_logps/chosen": -22.596784591674805, "eval_logps/rejected": -26.880229949951172, "eval_loss": 0.5301286578178406, "eval_rewards/accuracies": 0.7718893885612488, "eval_rewards/chosen": 0.22495588660240173, "eval_rewards/margins": 0.484672486782074, "eval_rewards/rejected": -0.259716659784317, "eval_runtime": 215.7229, "eval_samples_per_second": 8.038, "eval_steps_per_second": 2.012, "step": 60 }, { "epoch": 0.28615384615384615, "grad_norm": 32.870504270075905, "learning_rate": 4.973495328090889e-07, "logits/chosen": -1.2028117179870605, "logits/rejected": -1.2163152694702148, "logps/chosen": -25.100025177001953, "logps/rejected": -35.97075653076172, "loss": 0.5245, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.20213226974010468, "rewards/margins": 0.5411441326141357, "rewards/rejected": -0.33901187777519226, "step": 62 }, { "epoch": 0.2953846153846154, "grad_norm": 38.13033333375434, "learning_rate": 4.967291771834726e-07, "logits/chosen": -1.2682946920394897, "logits/rejected": -1.2830837965011597, "logps/chosen": -22.399858474731445, "logps/rejected": -35.47315979003906, "loss": 0.4854, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.24411238729953766, "rewards/margins": 0.7097706198692322, "rewards/rejected": -0.46565818786621094, "step": 64 }, { "epoch": 0.3046153846153846, "grad_norm": 34.6917991893696, "learning_rate": 4.960441211072685e-07, "logits/chosen": -1.240267038345337, "logits/rejected": -1.2494441270828247, "logps/chosen": -16.752328872680664, "logps/rejected": -21.625200271606445, "loss": 0.52, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.2749379575252533, "rewards/margins": 0.5106962323188782, "rewards/rejected": -0.23575833439826965, "step": 66 }, { "epoch": 0.31384615384615383, "grad_norm": 32.938257449212315, "learning_rate": 4.952945442245597e-07, "logits/chosen": -1.282260775566101, "logits/rejected": -1.2961454391479492, "logps/chosen": -16.818540573120117, "logps/rejected": -31.804317474365234, "loss": 0.4986, "rewards/accuracies": 0.75, "rewards/chosen": 0.20085500180721283, "rewards/margins": 0.6287386417388916, "rewards/rejected": -0.42788365483283997, "step": 68 }, { "epoch": 0.3230769230769231, "grad_norm": 36.12880857430109, "learning_rate": 4.944806430988927e-07, "logits/chosen": -1.2567392587661743, "logits/rejected": -1.263179063796997, "logps/chosen": -23.333267211914062, "logps/rejected": -24.862985610961914, "loss": 0.5059, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.22914116084575653, "rewards/margins": 0.6000176668167114, "rewards/rejected": -0.3708764612674713, "step": 70 }, { "epoch": 0.3323076923076923, "grad_norm": 31.746333807337315, "learning_rate": 4.936026311617316e-07, "logits/chosen": -1.2413491010665894, "logits/rejected": -1.2490180730819702, "logps/chosen": -27.870990753173828, "logps/rejected": -28.86038589477539, "loss": 0.4797, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.35419517755508423, "rewards/margins": 0.7417442202568054, "rewards/rejected": -0.3875490427017212, "step": 72 }, { "epoch": 0.3415384615384615, "grad_norm": 31.965936446320438, "learning_rate": 4.926607386564898e-07, "logits/chosen": -1.3071357011795044, "logits/rejected": -1.3031624555587769, "logps/chosen": -24.66501808166504, "logps/rejected": -19.646629333496094, "loss": 0.4724, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.3141394257545471, "rewards/margins": 0.6052231788635254, "rewards/rejected": -0.29108375310897827, "step": 74 }, { "epoch": 0.3507692307692308, "grad_norm": 30.420218056003396, "learning_rate": 4.916552125781528e-07, "logits/chosen": -1.2826448678970337, "logits/rejected": -1.2921828031539917, "logps/chosen": -21.71385955810547, "logps/rejected": -26.265592575073242, "loss": 0.443, "rewards/accuracies": 0.75, "rewards/chosen": 0.3491870164871216, "rewards/margins": 0.7558759450912476, "rewards/rejected": -0.4066888988018036, "step": 76 }, { "epoch": 0.36, "grad_norm": 35.262762131347294, "learning_rate": 4.905863166085075e-07, "logits/chosen": -1.2882230281829834, "logits/rejected": -1.3004416227340698, "logps/chosen": -25.61620330810547, "logps/rejected": -26.73788833618164, "loss": 0.4682, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.29705706238746643, "rewards/margins": 0.6734262108802795, "rewards/rejected": -0.3763691484928131, "step": 78 }, { "epoch": 0.36923076923076925, "grad_norm": 32.454214562336674, "learning_rate": 4.894543310469967e-07, "logits/chosen": -1.292490839958191, "logits/rejected": -1.3075741529464722, "logps/chosen": -24.23374366760254, "logps/rejected": -27.662269592285156, "loss": 0.4233, "rewards/accuracies": 0.7222222089767456, "rewards/chosen": 0.3347330093383789, "rewards/margins": 0.7462683320045471, "rewards/rejected": -0.4115353524684906, "step": 80 }, { "epoch": 0.36923076923076925, "eval_logits/chosen": -1.2837809324264526, "eval_logits/rejected": -1.293448567390442, "eval_logps/chosen": -22.318069458007812, "eval_logps/rejected": -27.420156478881836, "eval_loss": 0.4364205598831177, "eval_rewards/accuracies": 0.7937787771224976, "eval_rewards/chosen": 0.3643138110637665, "eval_rewards/margins": 0.893993616104126, "eval_rewards/rejected": -0.5296797752380371, "eval_runtime": 215.7088, "eval_samples_per_second": 8.039, "eval_steps_per_second": 2.012, "step": 80 }, { "epoch": 0.37846153846153846, "grad_norm": 30.100728508551764, "learning_rate": 4.882595527372152e-07, "logits/chosen": -1.219198226928711, "logits/rejected": -1.2316464185714722, "logps/chosen": -21.758522033691406, "logps/rejected": -32.21995544433594, "loss": 0.4544, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.33725497126579285, "rewards/margins": 0.9134353995323181, "rewards/rejected": -0.5761803984642029, "step": 82 }, { "epoch": 0.38769230769230767, "grad_norm": 27.99260854977849, "learning_rate": 4.870022949890676e-07, "logits/chosen": -1.25475013256073, "logits/rejected": -1.258756160736084, "logps/chosen": -29.569332122802734, "logps/rejected": -32.13206481933594, "loss": 0.4048, "rewards/accuracies": 0.75, "rewards/chosen": 0.3496508300304413, "rewards/margins": 1.0080742835998535, "rewards/rejected": -0.6584234237670898, "step": 84 }, { "epoch": 0.39692307692307693, "grad_norm": 28.434505768144174, "learning_rate": 4.856828874966086e-07, "logits/chosen": -1.2163680791854858, "logits/rejected": -1.2340407371520996, "logps/chosen": -18.534114837646484, "logps/rejected": -36.619850158691406, "loss": 0.422, "rewards/accuracies": 0.6666666865348816, "rewards/chosen": 0.2995716333389282, "rewards/margins": 1.0983738899230957, "rewards/rejected": -0.7988021969795227, "step": 86 }, { "epoch": 0.40615384615384614, "grad_norm": 28.794469436567187, "learning_rate": 4.843016762515859e-07, "logits/chosen": -1.2752939462661743, "logits/rejected": -1.285552978515625, "logps/chosen": -21.55384635925293, "logps/rejected": -30.397226333618164, "loss": 0.3905, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.37557560205459595, "rewards/margins": 1.0376694202423096, "rewards/rejected": -0.6620937585830688, "step": 88 }, { "epoch": 0.4153846153846154, "grad_norm": 24.699190483704957, "learning_rate": 4.828590234527106e-07, "logits/chosen": -1.2076385021209717, "logits/rejected": -1.2378058433532715, "logps/chosen": -20.13502311706543, "logps/rejected": -49.50822067260742, "loss": 0.3616, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.29748064279556274, "rewards/margins": 1.576164722442627, "rewards/rejected": -1.2786839008331299, "step": 90 }, { "epoch": 0.4246153846153846, "grad_norm": 24.998257178693006, "learning_rate": 4.81355307410676e-07, "logits/chosen": -1.268651008605957, "logits/rejected": -1.2737505435943604, "logps/chosen": -21.684688568115234, "logps/rejected": -20.43457794189453, "loss": 0.3963, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.701554536819458, "rewards/margins": 1.2370011806488037, "rewards/rejected": -0.5354464650154114, "step": 92 }, { "epoch": 0.4338461538461538, "grad_norm": 30.39233888946852, "learning_rate": 4.79790922448953e-07, "logits/chosen": -1.2319780588150024, "logits/rejected": -1.234665870666504, "logps/chosen": -22.746065139770508, "logps/rejected": -37.10270309448242, "loss": 0.4055, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.33227479457855225, "rewards/margins": 1.4662950038909912, "rewards/rejected": -1.1340200901031494, "step": 94 }, { "epoch": 0.4430769230769231, "grad_norm": 28.351607065877335, "learning_rate": 4.78166278800385e-07, "logits/chosen": -1.2103080749511719, "logits/rejected": -1.2216867208480835, "logps/chosen": -22.36292839050293, "logps/rejected": -36.19468307495117, "loss": 0.3633, "rewards/accuracies": 0.875, "rewards/chosen": 0.46569257974624634, "rewards/margins": 1.3663029670715332, "rewards/rejected": -0.9006102681159973, "step": 96 }, { "epoch": 0.4523076923076923, "grad_norm": 27.63597035013981, "learning_rate": 4.7648180249961165e-07, "logits/chosen": -1.2609645128250122, "logits/rejected": -1.2675108909606934, "logps/chosen": -19.6772403717041, "logps/rejected": -22.703941345214844, "loss": 0.3425, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.42960312962532043, "rewards/margins": 1.3149679899215698, "rewards/rejected": -0.8853649497032166, "step": 98 }, { "epoch": 0.46153846153846156, "grad_norm": 27.095171417356656, "learning_rate": 4.747379352713488e-07, "logits/chosen": -1.2016191482543945, "logits/rejected": -1.212724208831787, "logps/chosen": -26.863676071166992, "logps/rejected": -35.31084442138672, "loss": 0.3626, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.43012529611587524, "rewards/margins": 1.344970703125, "rewards/rejected": -0.9148455858230591, "step": 100 }, { "epoch": 0.46153846153846156, "eval_logits/chosen": -1.2631281614303589, "eval_logits/rejected": -1.2726249694824219, "eval_logps/chosen": -22.157392501831055, "eval_logps/rejected": -28.169017791748047, "eval_loss": 0.3646220564842224, "eval_rewards/accuracies": 0.7972350120544434, "eval_rewards/chosen": 0.4446515440940857, "eval_rewards/margins": 1.348763346672058, "eval_rewards/rejected": -0.904111921787262, "eval_runtime": 215.7885, "eval_samples_per_second": 8.036, "eval_steps_per_second": 2.011, "step": 100 }, { "epoch": 0.4707692307692308, "grad_norm": 32.35798457566701, "learning_rate": 4.7293513441455357e-07, "logits/chosen": -1.2197188138961792, "logits/rejected": -1.2320291996002197, "logps/chosen": -19.279041290283203, "logps/rejected": -35.00586700439453, "loss": 0.3714, "rewards/accuracies": 0.75, "rewards/chosen": 0.3205001652240753, "rewards/margins": 1.3763878345489502, "rewards/rejected": -1.0558876991271973, "step": 102 }, { "epoch": 0.48, "grad_norm": 21.70119714606352, "learning_rate": 4.7107387268250586e-07, "logits/chosen": -1.1967614889144897, "logits/rejected": -1.220970630645752, "logps/chosen": -10.033695220947266, "logps/rejected": -38.51593017578125, "loss": 0.3835, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.4108971059322357, "rewards/margins": 1.6398005485534668, "rewards/rejected": -1.2289036512374878, "step": 104 }, { "epoch": 0.48923076923076925, "grad_norm": 22.839162689384967, "learning_rate": 4.691546381588369e-07, "logits/chosen": -1.2221455574035645, "logits/rejected": -1.2347490787506104, "logps/chosen": -20.123445510864258, "logps/rejected": -34.73093032836914, "loss": 0.3528, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.3299613296985626, "rewards/margins": 1.6646933555603027, "rewards/rejected": -1.3347320556640625, "step": 106 }, { "epoch": 0.49846153846153846, "grad_norm": 30.91989303041632, "learning_rate": 4.6717793412953776e-07, "logits/chosen": -1.2001112699508667, "logits/rejected": -1.2213759422302246, "logps/chosen": -18.639766693115234, "logps/rejected": -38.698211669921875, "loss": 0.3751, "rewards/accuracies": 0.7361111044883728, "rewards/chosen": 0.3170078694820404, "rewards/margins": 1.7733925580978394, "rewards/rejected": -1.456384539604187, "step": 108 }, { "epoch": 0.5076923076923077, "grad_norm": 22.21075058785491, "learning_rate": 4.651442789509813e-07, "logits/chosen": -1.172301173210144, "logits/rejected": -1.1873422861099243, "logps/chosen": -19.037778854370117, "logps/rejected": -35.6918830871582, "loss": 0.3632, "rewards/accuracies": 0.75, "rewards/chosen": 0.44801807403564453, "rewards/margins": 1.6537230014801025, "rewards/rejected": -1.2057050466537476, "step": 110 }, { "epoch": 0.5169230769230769, "grad_norm": 22.23191382020911, "learning_rate": 4.630542059139923e-07, "logits/chosen": -1.1621766090393066, "logits/rejected": -1.1781913042068481, "logps/chosen": -26.200401306152344, "logps/rejected": -28.19536590576172, "loss": 0.3117, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4852801561355591, "rewards/margins": 1.5631003379821777, "rewards/rejected": -1.0778203010559082, "step": 112 }, { "epoch": 0.5261538461538462, "grad_norm": 26.06519967082825, "learning_rate": 4.609082631040011e-07, "logits/chosen": -1.1710741519927979, "logits/rejected": -1.1770610809326172, "logps/chosen": -26.139328002929688, "logps/rejected": -38.44914627075195, "loss": 0.3191, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.42665359377861023, "rewards/margins": 1.9680951833724976, "rewards/rejected": -1.5414414405822754, "step": 114 }, { "epoch": 0.5353846153846153, "grad_norm": 23.76055177774163, "learning_rate": 4.5870701325731773e-07, "logits/chosen": -1.1841078996658325, "logits/rejected": -1.2016386985778809, "logps/chosen": -18.3129940032959, "logps/rejected": -38.7909049987793, "loss": 0.3422, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.31725624203681946, "rewards/margins": 1.8888146877288818, "rewards/rejected": -1.5715583562850952, "step": 116 }, { "epoch": 0.5446153846153846, "grad_norm": 22.451458526325442, "learning_rate": 4.5645103361356407e-07, "logits/chosen": -1.203595519065857, "logits/rejected": -1.1993364095687866, "logps/chosen": -29.456233978271484, "logps/rejected": -24.436891555786133, "loss": 0.3111, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4006561040878296, "rewards/margins": 1.460686206817627, "rewards/rejected": -1.0600301027297974, "step": 118 }, { "epoch": 0.5538461538461539, "grad_norm": 20.899441336146108, "learning_rate": 4.541409157643027e-07, "logits/chosen": -1.113027811050415, "logits/rejected": -1.1339952945709229, "logps/chosen": -22.780738830566406, "logps/rejected": -37.4469108581543, "loss": 0.263, "rewards/accuracies": 0.875, "rewards/chosen": 0.5090766549110413, "rewards/margins": 2.038201332092285, "rewards/rejected": -1.5291246175765991, "step": 120 }, { "epoch": 0.5538461538461539, "eval_logits/chosen": -1.2401551008224487, "eval_logits/rejected": -1.249323844909668, "eval_logps/chosen": -22.120243072509766, "eval_logps/rejected": -28.963603973388672, "eval_loss": 0.32304224371910095, "eval_rewards/accuracies": 0.8122119903564453, "eval_rewards/chosen": 0.46322670578956604, "eval_rewards/margins": 1.764631986618042, "eval_rewards/rejected": -1.3014051914215088, "eval_runtime": 215.8398, "eval_samples_per_second": 8.034, "eval_steps_per_second": 2.011, "step": 120 }, { "epoch": 0.563076923076923, "grad_norm": 25.722122527925197, "learning_rate": 4.517772654979023e-07, "logits/chosen": -1.1628613471984863, "logits/rejected": -1.1666890382766724, "logps/chosen": -28.28006935119629, "logps/rejected": -32.06778335571289, "loss": 0.2967, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.42497023940086365, "rewards/margins": 1.8420732021331787, "rewards/rejected": -1.4171031713485718, "step": 122 }, { "epoch": 0.5723076923076923, "grad_norm": 18.859437245079093, "learning_rate": 4.4936070264068016e-07, "logits/chosen": -1.097366452217102, "logits/rejected": -1.1257672309875488, "logps/chosen": -19.26881217956543, "logps/rejected": -50.698387145996094, "loss": 0.3122, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.3975294530391693, "rewards/margins": 2.376965045928955, "rewards/rejected": -1.9794355630874634, "step": 124 }, { "epoch": 0.5815384615384616, "grad_norm": 24.12611784808478, "learning_rate": 4.468918608943636e-07, "logits/chosen": -1.188425064086914, "logits/rejected": -1.2095468044281006, "logps/chosen": -22.594573974609375, "logps/rejected": -33.808677673339844, "loss": 0.2989, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4648338854312897, "rewards/margins": 2.128401756286621, "rewards/rejected": -1.6635680198669434, "step": 126 }, { "epoch": 0.5907692307692308, "grad_norm": 21.121113872126465, "learning_rate": 4.443713876699123e-07, "logits/chosen": -1.176856279373169, "logits/rejected": -1.175789713859558, "logps/chosen": -31.682504653930664, "logps/rejected": -26.862850189208984, "loss": 0.2881, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.47753646969795227, "rewards/margins": 1.661524772644043, "rewards/rejected": -1.183988332748413, "step": 128 }, { "epoch": 0.6, "grad_norm": 24.221092280098347, "learning_rate": 4.417999439177465e-07, "logits/chosen": -1.1786390542984009, "logits/rejected": -1.1881896257400513, "logps/chosen": -18.69803237915039, "logps/rejected": -28.687692642211914, "loss": 0.2737, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.5532296895980835, "rewards/margins": 2.0457603931427, "rewards/rejected": -1.4925308227539062, "step": 130 }, { "epoch": 0.6092307692307692, "grad_norm": 19.171893778962126, "learning_rate": 4.391782039544238e-07, "logits/chosen": -1.2097636461257935, "logits/rejected": -1.2146636247634888, "logps/chosen": -19.53115463256836, "logps/rejected": -19.350337982177734, "loss": 0.3284, "rewards/accuracies": 0.7083333134651184, "rewards/chosen": 0.28336918354034424, "rewards/margins": 1.5194146633148193, "rewards/rejected": -1.236045479774475, "step": 132 }, { "epoch": 0.6184615384615385, "grad_norm": 22.368959777821875, "learning_rate": 4.365068552858115e-07, "logits/chosen": -1.2042018175125122, "logits/rejected": -1.2163949012756348, "logps/chosen": -24.11139488220215, "logps/rejected": -33.35640335083008, "loss": 0.3137, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.1719236522912979, "rewards/margins": 1.7209672927856445, "rewards/rejected": -1.5490436553955078, "step": 134 }, { "epoch": 0.6276923076923077, "grad_norm": 17.354174303387865, "learning_rate": 4.337865984268001e-07, "logits/chosen": -1.1561534404754639, "logits/rejected": -1.1622954607009888, "logps/chosen": -15.14254093170166, "logps/rejected": -27.18238067626953, "loss": 0.2954, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.34695935249328613, "rewards/margins": 1.897645115852356, "rewards/rejected": -1.5506855249404907, "step": 136 }, { "epoch": 0.6369230769230769, "grad_norm": 14.475969356318869, "learning_rate": 4.310181467176054e-07, "logits/chosen": -1.1768825054168701, "logits/rejected": -1.1757102012634277, "logps/chosen": -25.93258285522461, "logps/rejected": -32.286590576171875, "loss": 0.2914, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.42600950598716736, "rewards/margins": 2.0175862312316895, "rewards/rejected": -1.5915768146514893, "step": 138 }, { "epoch": 0.6461538461538462, "grad_norm": 18.34569474287581, "learning_rate": 4.282022261367073e-07, "logits/chosen": -1.2166173458099365, "logits/rejected": -1.2223114967346191, "logps/chosen": -20.700721740722656, "logps/rejected": -25.006229400634766, "loss": 0.2717, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5470355749130249, "rewards/margins": 1.990134358406067, "rewards/rejected": -1.4430986642837524, "step": 140 }, { "epoch": 0.6461538461538462, "eval_logits/chosen": -1.221505880355835, "eval_logits/rejected": -1.2305463552474976, "eval_logps/chosen": -22.114253997802734, "eval_logps/rejected": -29.54737663269043, "eval_loss": 0.29700523614883423, "eval_rewards/accuracies": 0.8179723620414734, "eval_rewards/chosen": 0.46622127294540405, "eval_rewards/margins": 2.0595133304595947, "eval_rewards/rejected": -1.5932921171188354, "eval_runtime": 215.9245, "eval_samples_per_second": 8.031, "eval_steps_per_second": 2.01, "step": 140 }, { "epoch": 0.6553846153846153, "grad_norm": 24.003361700026115, "learning_rate": 4.253395751104748e-07, "logits/chosen": -1.2128342390060425, "logits/rejected": -1.2202144861221313, "logps/chosen": -20.926525115966797, "logps/rejected": -33.759159088134766, "loss": 0.2796, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4563888907432556, "rewards/margins": 2.332362413406372, "rewards/rejected": -1.8759733438491821, "step": 142 }, { "epoch": 0.6646153846153846, "grad_norm": 22.96956018291041, "learning_rate": 4.2243094431952607e-07, "logits/chosen": -1.1733120679855347, "logits/rejected": -1.1876205205917358, "logps/chosen": -20.787324905395508, "logps/rejected": -44.41487503051758, "loss": 0.2904, "rewards/accuracies": 0.7638888955116272, "rewards/chosen": 0.4227790832519531, "rewards/margins": 2.513406753540039, "rewards/rejected": -2.090627431869507, "step": 144 }, { "epoch": 0.6738461538461539, "grad_norm": 20.337910027315395, "learning_rate": 4.194770965018758e-07, "logits/chosen": -1.1829084157943726, "logits/rejected": -1.1901525259017944, "logps/chosen": -22.88217544555664, "logps/rejected": -40.51693344116211, "loss": 0.2982, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.32644984126091003, "rewards/margins": 2.2273294925689697, "rewards/rejected": -1.9008797407150269, "step": 146 }, { "epoch": 0.683076923076923, "grad_norm": 16.955507402789948, "learning_rate": 4.1647880625292027e-07, "logits/chosen": -1.1585676670074463, "logits/rejected": -1.1673483848571777, "logps/chosen": -17.565954208374023, "logps/rejected": -30.01752471923828, "loss": 0.2381, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.6770419478416443, "rewards/margins": 2.5649421215057373, "rewards/rejected": -1.8879002332687378, "step": 148 }, { "epoch": 0.6923076923076923, "grad_norm": 16.268353553690783, "learning_rate": 4.1343685982231315e-07, "logits/chosen": -1.2300368547439575, "logits/rejected": -1.2412070035934448, "logps/chosen": -19.158246994018555, "logps/rejected": -30.00787353515625, "loss": 0.2576, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.28651073575019836, "rewards/margins": 2.1342878341674805, "rewards/rejected": -1.8477774858474731, "step": 150 }, { "epoch": 0.7015384615384616, "grad_norm": 22.707867679754226, "learning_rate": 4.1035205490778496e-07, "logits/chosen": -1.1675605773925781, "logits/rejected": -1.1745511293411255, "logps/chosen": -24.983802795410156, "logps/rejected": -32.00082015991211, "loss": 0.3007, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.4517359137535095, "rewards/margins": 2.2256662845611572, "rewards/rejected": -1.7739304304122925, "step": 152 }, { "epoch": 0.7107692307692308, "grad_norm": 17.503865371681442, "learning_rate": 4.072252004459611e-07, "logits/chosen": -1.1371846199035645, "logits/rejected": -1.1358321905136108, "logps/chosen": -26.079011917114258, "logps/rejected": -27.951416015625, "loss": 0.2471, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.44966569542884827, "rewards/margins": 2.104396104812622, "rewards/rejected": -1.6547303199768066, "step": 154 }, { "epoch": 0.72, "grad_norm": 15.32657259953523, "learning_rate": 4.040571164002318e-07, "logits/chosen": -1.189456820487976, "logits/rejected": -1.1948577165603638, "logps/chosen": -20.083751678466797, "logps/rejected": -30.10634994506836, "loss": 0.2351, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.42890670895576477, "rewards/margins": 2.341860771179199, "rewards/rejected": -1.9129540920257568, "step": 156 }, { "epoch": 0.7292307692307692, "grad_norm": 17.946669808646828, "learning_rate": 4.0084863354573116e-07, "logits/chosen": -1.1215004920959473, "logits/rejected": -1.1300181150436401, "logps/chosen": -23.436655044555664, "logps/rejected": -34.97710418701172, "loss": 0.2706, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.22723568975925446, "rewards/margins": 2.1446826457977295, "rewards/rejected": -1.9174467325210571, "step": 158 }, { "epoch": 0.7384615384615385, "grad_norm": 16.72039592892195, "learning_rate": 3.9760059325148063e-07, "logits/chosen": -1.2237818241119385, "logits/rejected": -1.2211045026779175, "logps/chosen": -24.31806755065918, "logps/rejected": -25.250701904296875, "loss": 0.2351, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.4868224859237671, "rewards/margins": 2.124577522277832, "rewards/rejected": -1.637755274772644, "step": 160 }, { "epoch": 0.7384615384615385, "eval_logits/chosen": -1.2072025537490845, "eval_logits/rejected": -1.216115951538086, "eval_logps/chosen": -22.174776077270508, "eval_logps/rejected": -30.134973526000977, "eval_loss": 0.27949145436286926, "eval_rewards/accuracies": 0.8248847723007202, "eval_rewards/chosen": 0.4359608590602875, "eval_rewards/margins": 2.3230507373809814, "eval_rewards/rejected": -1.8870899677276611, "eval_runtime": 216.1181, "eval_samples_per_second": 8.023, "eval_steps_per_second": 2.008, "step": 160 }, { "epoch": 0.7476923076923077, "grad_norm": 16.877732796497064, "learning_rate": 3.9431384725975485e-07, "logits/chosen": -1.1728930473327637, "logits/rejected": -1.1828408241271973, "logps/chosen": -20.051979064941406, "logps/rejected": -30.078739166259766, "loss": 0.2806, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4627165198326111, "rewards/margins": 2.1041107177734375, "rewards/rejected": -1.641394019126892, "step": 162 }, { "epoch": 0.7569230769230769, "grad_norm": 17.236677422360824, "learning_rate": 3.909892574627266e-07, "logits/chosen": -1.1840589046478271, "logits/rejected": -1.205323338508606, "logps/chosen": -20.25952911376953, "logps/rejected": -43.16006851196289, "loss": 0.267, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.34341666102409363, "rewards/margins": 2.8926875591278076, "rewards/rejected": -2.5492708683013916, "step": 164 }, { "epoch": 0.7661538461538462, "grad_norm": 15.084626056041332, "learning_rate": 3.876276956764509e-07, "logits/chosen": -1.172157883644104, "logits/rejected": -1.1869869232177734, "logps/chosen": -20.39401626586914, "logps/rejected": -35.54499816894531, "loss": 0.2191, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.533491313457489, "rewards/margins": 3.2933194637298584, "rewards/rejected": -2.7598280906677246, "step": 166 }, { "epoch": 0.7753846153846153, "grad_norm": 16.522846792297653, "learning_rate": 3.8423004341224595e-07, "logits/chosen": -1.1675995588302612, "logits/rejected": -1.1726378202438354, "logps/chosen": -22.266756057739258, "logps/rejected": -27.90992546081543, "loss": 0.2137, "rewards/accuracies": 0.875, "rewards/chosen": 0.3478531837463379, "rewards/margins": 2.3764336109161377, "rewards/rejected": -2.028580665588379, "step": 168 }, { "epoch": 0.7846153846153846, "grad_norm": 18.709310219062342, "learning_rate": 3.807971916455325e-07, "logits/chosen": -1.1257578134536743, "logits/rejected": -1.1353437900543213, "logps/chosen": -25.48769187927246, "logps/rejected": -37.34423065185547, "loss": 0.2439, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.30796098709106445, "rewards/margins": 2.5804708003997803, "rewards/rejected": -2.2725095748901367, "step": 170 }, { "epoch": 0.7938461538461539, "grad_norm": 18.811516964897933, "learning_rate": 3.773300405821908e-07, "logits/chosen": -1.2032923698425293, "logits/rejected": -1.1944453716278076, "logps/chosen": -22.42747688293457, "logps/rejected": -24.809179306030273, "loss": 0.2706, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.4598681628704071, "rewards/margins": 2.331010103225708, "rewards/rejected": -1.871142029762268, "step": 172 }, { "epoch": 0.803076923076923, "grad_norm": 27.213611533570646, "learning_rate": 3.738294994224969e-07, "logits/chosen": -1.1406216621398926, "logits/rejected": -1.1456246376037598, "logps/chosen": -22.41916847229004, "logps/rejected": -25.79179573059082, "loss": 0.2525, "rewards/accuracies": 0.875, "rewards/chosen": 0.5410938858985901, "rewards/margins": 2.5380003452301025, "rewards/rejected": -1.9969062805175781, "step": 174 }, { "epoch": 0.8123076923076923, "grad_norm": 22.120419375719585, "learning_rate": 3.7029648612270123e-07, "logits/chosen": -1.1604636907577515, "logits/rejected": -1.166500210762024, "logps/chosen": -23.140409469604492, "logps/rejected": -32.539859771728516, "loss": 0.2445, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.5552553534507751, "rewards/margins": 2.451958656311035, "rewards/rejected": -1.8967031240463257, "step": 176 }, { "epoch": 0.8215384615384616, "grad_norm": 23.529456123726142, "learning_rate": 3.6673192715431014e-07, "logits/chosen": -1.172749638557434, "logits/rejected": -1.1873490810394287, "logps/chosen": -19.344928741455078, "logps/rejected": -46.30924987792969, "loss": 0.2576, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.3556906580924988, "rewards/margins": 3.198575973510742, "rewards/rejected": -2.8428850173950195, "step": 178 }, { "epoch": 0.8307692307692308, "grad_norm": 16.07954647927614, "learning_rate": 3.6313675726113475e-07, "logits/chosen": -1.1696263551712036, "logits/rejected": -1.1719523668289185, "logps/chosen": -24.40313148498535, "logps/rejected": -30.179893493652344, "loss": 0.2373, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.5325056314468384, "rewards/margins": 2.6024298667907715, "rewards/rejected": -2.0699243545532227, "step": 180 }, { "epoch": 0.8307692307692308, "eval_logits/chosen": -1.1957546472549438, "eval_logits/rejected": -1.2044621706008911, "eval_logps/chosen": -22.226091384887695, "eval_logps/rejected": -30.679323196411133, "eval_loss": 0.2662460505962372, "eval_rewards/accuracies": 0.8271889686584473, "eval_rewards/chosen": 0.4103015661239624, "eval_rewards/margins": 2.569566011428833, "eval_rewards/rejected": -2.15926456451416, "eval_runtime": 216.1605, "eval_samples_per_second": 8.022, "eval_steps_per_second": 2.008, "step": 180 }, { "epoch": 0.84, "grad_norm": 12.027824441881227, "learning_rate": 3.595119192141706e-07, "logits/chosen": -1.1798688173294067, "logits/rejected": -1.190478801727295, "logps/chosen": -23.84467315673828, "logps/rejected": -27.77214241027832, "loss": 0.1945, "rewards/accuracies": 0.875, "rewards/chosen": 0.5185620784759521, "rewards/margins": 2.7370386123657227, "rewards/rejected": -2.2184765338897705, "step": 182 }, { "epoch": 0.8492307692307692, "grad_norm": 21.657852790803656, "learning_rate": 3.558583635643726e-07, "logits/chosen": -1.1619257926940918, "logits/rejected": -1.1783702373504639, "logps/chosen": -20.357545852661133, "logps/rejected": -36.6799430847168, "loss": 0.2859, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.39101898670196533, "rewards/margins": 2.5226354598999023, "rewards/rejected": -2.1316165924072266, "step": 184 }, { "epoch": 0.8584615384615385, "grad_norm": 15.850729398525738, "learning_rate": 3.5217704839338905e-07, "logits/chosen": -1.2039780616760254, "logits/rejected": -1.2015321254730225, "logps/chosen": -25.71788787841797, "logps/rejected": -29.20301628112793, "loss": 0.2245, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.43592390418052673, "rewards/margins": 2.691300392150879, "rewards/rejected": -2.2553763389587402, "step": 186 }, { "epoch": 0.8676923076923077, "grad_norm": 20.33987602806827, "learning_rate": 3.484689390623218e-07, "logits/chosen": -1.173121452331543, "logits/rejected": -1.1853346824645996, "logps/chosen": -21.594472885131836, "logps/rejected": -36.92512130737305, "loss": 0.2243, "rewards/accuracies": 0.875, "rewards/chosen": 0.22467082738876343, "rewards/margins": 2.8943564891815186, "rewards/rejected": -2.6696856021881104, "step": 188 }, { "epoch": 0.8769230769230769, "grad_norm": 15.456781978721555, "learning_rate": 3.447350079585767e-07, "logits/chosen": -1.20560884475708, "logits/rejected": -1.2095773220062256, "logps/chosen": -18.067840576171875, "logps/rejected": -24.3345890045166, "loss": 0.2124, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.2674013674259186, "rewards/margins": 2.3308472633361816, "rewards/rejected": -2.063445568084717, "step": 190 }, { "epoch": 0.8861538461538462, "grad_norm": 24.575966523755373, "learning_rate": 3.409762342408719e-07, "logits/chosen": -1.1767027378082275, "logits/rejected": -1.1829452514648438, "logps/chosen": -23.147159576416016, "logps/rejected": -38.63761901855469, "loss": 0.3063, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.2949807345867157, "rewards/margins": 2.8994204998016357, "rewards/rejected": -2.6044397354125977, "step": 192 }, { "epoch": 0.8953846153846153, "grad_norm": 13.903082439233941, "learning_rate": 3.3719360358247053e-07, "logits/chosen": -1.1678471565246582, "logits/rejected": -1.1855759620666504, "logps/chosen": -19.064098358154297, "logps/rejected": -36.09113693237305, "loss": 0.288, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.4278064966201782, "rewards/margins": 2.7983500957489014, "rewards/rejected": -2.3705434799194336, "step": 194 }, { "epoch": 0.9046153846153846, "grad_norm": 16.97717210575951, "learning_rate": 3.3338810791270517e-07, "logits/chosen": -1.1488627195358276, "logits/rejected": -1.161072015762329, "logps/chosen": -16.16121482849121, "logps/rejected": -35.24711608886719, "loss": 0.2587, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.2668210566043854, "rewards/margins": 2.758829116821289, "rewards/rejected": -2.4920082092285156, "step": 196 }, { "epoch": 0.9138461538461539, "grad_norm": 21.684346277519417, "learning_rate": 3.29560745156861e-07, "logits/chosen": -1.1681840419769287, "logits/rejected": -1.1707243919372559, "logps/chosen": -27.238510131835938, "logps/rejected": -29.843427658081055, "loss": 0.2945, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.487039715051651, "rewards/margins": 2.7937545776367188, "rewards/rejected": -2.3067147731781006, "step": 198 }, { "epoch": 0.9230769230769231, "grad_norm": 15.010044100424757, "learning_rate": 3.2571251897448763e-07, "logits/chosen": -1.1483420133590698, "logits/rejected": -1.172219157218933, "logps/chosen": -20.701204299926758, "logps/rejected": -47.092777252197266, "loss": 0.2393, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.5242102742195129, "rewards/margins": 3.446150302886963, "rewards/rejected": -2.9219398498535156, "step": 200 }, { "epoch": 0.9230769230769231, "eval_logits/chosen": -1.188868761062622, "eval_logits/rejected": -1.1974678039550781, "eval_logps/chosen": -22.205198287963867, "eval_logps/rejected": -30.90268325805664, "eval_loss": 0.25766730308532715, "eval_rewards/accuracies": 0.8306451439857483, "eval_rewards/chosen": 0.42075031995773315, "eval_rewards/margins": 2.6916959285736084, "eval_rewards/rejected": -2.2709455490112305, "eval_runtime": 216.204, "eval_samples_per_second": 8.02, "eval_steps_per_second": 2.007, "step": 200 }, { "epoch": 0.9323076923076923, "grad_norm": 24.918463307740545, "learning_rate": 3.218444384962071e-07, "logits/chosen": -1.1572585105895996, "logits/rejected": -1.1649041175842285, "logps/chosen": -20.337928771972656, "logps/rejected": -25.251022338867188, "loss": 0.2872, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.051526255905628204, "rewards/margins": 2.169602155685425, "rewards/rejected": -2.1180758476257324, "step": 202 }, { "epoch": 0.9415384615384615, "grad_norm": 17.132653548760572, "learning_rate": 3.179575180590857e-07, "logits/chosen": -1.1708558797836304, "logits/rejected": -1.1774191856384277, "logps/chosen": -16.72760772705078, "logps/rejected": -29.532522201538086, "loss": 0.2703, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.3555985391139984, "rewards/margins": 2.5367255210876465, "rewards/rejected": -2.1811270713806152, "step": 204 }, { "epoch": 0.9507692307692308, "grad_norm": 18.808695685272248, "learning_rate": 3.1405277694064305e-07, "logits/chosen": -1.13996422290802, "logits/rejected": -1.1603398323059082, "logps/chosen": -20.1070613861084, "logps/rejected": -43.8044319152832, "loss": 0.2133, "rewards/accuracies": 0.875, "rewards/chosen": 0.3765062689781189, "rewards/margins": 3.3217618465423584, "rewards/rejected": -2.9452552795410156, "step": 206 }, { "epoch": 0.96, "grad_norm": 29.593271367025817, "learning_rate": 3.101312390915634e-07, "logits/chosen": -1.1117515563964844, "logits/rejected": -1.1254826784133911, "logps/chosen": -18.95772933959961, "logps/rejected": -38.70570373535156, "loss": 0.2626, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.19062408804893494, "rewards/margins": 2.819202423095703, "rewards/rejected": -2.6285784244537354, "step": 208 }, { "epoch": 0.9692307692307692, "grad_norm": 19.2158248846026, "learning_rate": 3.0619393286718237e-07, "logits/chosen": -1.1758193969726562, "logits/rejected": -1.18528413772583, "logps/chosen": -25.30388069152832, "logps/rejected": -24.64061737060547, "loss": 0.2715, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.22175876796245575, "rewards/margins": 2.10679292678833, "rewards/rejected": -1.8850340843200684, "step": 210 }, { "epoch": 0.9784615384615385, "grad_norm": 23.720067200725047, "learning_rate": 3.022418907578188e-07, "logits/chosen": -1.1191242933273315, "logits/rejected": -1.1329889297485352, "logps/chosen": -25.677099227905273, "logps/rejected": -39.06088638305664, "loss": 0.2898, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.20263215899467468, "rewards/margins": 3.0255513191223145, "rewards/rejected": -2.8229193687438965, "step": 212 }, { "epoch": 0.9876923076923076, "grad_norm": 15.354779350521344, "learning_rate": 2.98276149118022e-07, "logits/chosen": -1.1088786125183105, "logits/rejected": -1.1292033195495605, "logps/chosen": -24.54433250427246, "logps/rejected": -38.054649353027344, "loss": 0.2164, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.5917240381240845, "rewards/margins": 3.370425224304199, "rewards/rejected": -2.7787015438079834, "step": 214 }, { "epoch": 0.9969230769230769, "grad_norm": 15.922459499539187, "learning_rate": 2.942977478948057e-07, "logits/chosen": -1.134361743927002, "logits/rejected": -1.1381641626358032, "logps/chosen": -29.736419677734375, "logps/rejected": -34.28538513183594, "loss": 0.209, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.47491705417633057, "rewards/margins": 3.0054473876953125, "rewards/rejected": -2.5305304527282715, "step": 216 }, { "epoch": 1.0061538461538462, "grad_norm": 14.602088714669993, "learning_rate": 2.903077303549399e-07, "logits/chosen": -1.1926045417785645, "logits/rejected": -1.2005811929702759, "logps/chosen": -21.338937759399414, "logps/rejected": -31.98470115661621, "loss": 0.2114, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.49925586581230164, "rewards/margins": 3.034120559692383, "rewards/rejected": -2.534864664077759, "step": 218 }, { "epoch": 1.0153846153846153, "grad_norm": 12.776565445469831, "learning_rate": 2.863071428113726e-07, "logits/chosen": -1.180498719215393, "logits/rejected": -1.1876842975616455, "logps/chosen": -21.977970123291016, "logps/rejected": -26.06908416748047, "loss": 0.2223, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.23817205429077148, "rewards/margins": 2.4826109409332275, "rewards/rejected": -2.244438409805298, "step": 220 }, { "epoch": 1.0153846153846153, "eval_logits/chosen": -1.1809991598129272, "eval_logits/rejected": -1.189637303352356, "eval_logps/chosen": -22.231857299804688, "eval_logps/rejected": -31.20700454711914, "eval_loss": 0.25129908323287964, "eval_rewards/accuracies": 0.8329492807388306, "eval_rewards/chosen": 0.4074196219444275, "eval_rewards/margins": 2.8305253982543945, "eval_rewards/rejected": -2.4231057167053223, "eval_runtime": 216.0555, "eval_samples_per_second": 8.026, "eval_steps_per_second": 2.009, "step": 220 }, { "epoch": 1.0246153846153847, "grad_norm": 14.54877776678067, "learning_rate": 2.822970343488516e-07, "logits/chosen": -1.1495935916900635, "logits/rejected": -1.1574082374572754, "logps/chosen": -25.172189712524414, "logps/rejected": -33.7739372253418, "loss": 0.224, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.4095478355884552, "rewards/margins": 2.9969334602355957, "rewards/rejected": -2.587385416030884, "step": 222 }, { "epoch": 1.0338461538461539, "grad_norm": 12.987637533805088, "learning_rate": 2.782784565488211e-07, "logits/chosen": -1.09419846534729, "logits/rejected": -1.1150177717208862, "logps/chosen": -21.80037498474121, "logps/rejected": -47.742916107177734, "loss": 0.2056, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.49535179138183594, "rewards/margins": 4.081587314605713, "rewards/rejected": -3.5862362384796143, "step": 224 }, { "epoch": 1.043076923076923, "grad_norm": 12.537917774467841, "learning_rate": 2.7425246321366205e-07, "logits/chosen": -1.1532597541809082, "logits/rejected": -1.1558729410171509, "logps/chosen": -23.903770446777344, "logps/rejected": -22.89252471923828, "loss": 0.2188, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.5978649258613586, "rewards/margins": 2.4770026206970215, "rewards/rejected": -1.8791378736495972, "step": 226 }, { "epoch": 1.0523076923076924, "grad_norm": 11.390266637295149, "learning_rate": 2.7022011009035107e-07, "logits/chosen": -1.1780048608779907, "logits/rejected": -1.1780657768249512, "logps/chosen": -20.99365997314453, "logps/rejected": -35.256507873535156, "loss": 0.1785, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.25620290637016296, "rewards/margins": 3.1927871704101562, "rewards/rejected": -2.936584234237671, "step": 228 }, { "epoch": 1.0615384615384615, "grad_norm": 13.274197122497501, "learning_rate": 2.661824545936089e-07, "logits/chosen": -1.1301528215408325, "logits/rejected": -1.141854166984558, "logps/chosen": -22.90785789489746, "logps/rejected": -39.776309967041016, "loss": 0.1848, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.23726129531860352, "rewards/margins": 3.6220147609710693, "rewards/rejected": -3.3847532272338867, "step": 230 }, { "epoch": 1.0707692307692307, "grad_norm": 11.899842789993972, "learning_rate": 2.621405555286121e-07, "logits/chosen": -1.1494054794311523, "logits/rejected": -1.158327579498291, "logps/chosen": -27.49151611328125, "logps/rejected": -33.164703369140625, "loss": 0.1801, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.473955363035202, "rewards/margins": 3.1881282329559326, "rewards/rejected": -2.7141730785369873, "step": 232 }, { "epoch": 1.08, "grad_norm": 12.024964222481547, "learning_rate": 2.58095472813339e-07, "logits/chosen": -1.1302716732025146, "logits/rejected": -1.1499823331832886, "logps/chosen": -25.619178771972656, "logps/rejected": -35.781768798828125, "loss": 0.1808, "rewards/accuracies": 0.875, "rewards/chosen": 0.593082070350647, "rewards/margins": 3.549994468688965, "rewards/rejected": -2.9569127559661865, "step": 234 }, { "epoch": 1.0892307692307692, "grad_norm": 16.982420323384893, "learning_rate": 2.540482672006254e-07, "logits/chosen": -1.1983014345169067, "logits/rejected": -1.2088627815246582, "logps/chosen": -20.2447566986084, "logps/rejected": -33.8237419128418, "loss": 0.2502, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.2588607966899872, "rewards/margins": 2.6979219913482666, "rewards/rejected": -2.439061164855957, "step": 236 }, { "epoch": 1.0984615384615384, "grad_norm": 14.78335151339772, "learning_rate": 2.5e-07, "logits/chosen": -1.1217488050460815, "logits/rejected": -1.126597285270691, "logps/chosen": -24.313417434692383, "logps/rejected": -32.5634880065918, "loss": 0.1857, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.4129423499107361, "rewards/margins": 3.0672991275787354, "rewards/rejected": -2.6543567180633545, "step": 238 }, { "epoch": 1.1076923076923078, "grad_norm": 9.560418611995035, "learning_rate": 2.459517327993746e-07, "logits/chosen": -1.1439785957336426, "logits/rejected": -1.1501950025558472, "logps/chosen": -21.520601272583008, "logps/rejected": -36.128475189208984, "loss": 0.1631, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.16983138024806976, "rewards/margins": 3.335303544998169, "rewards/rejected": -3.1654722690582275, "step": 240 }, { "epoch": 1.1076923076923078, "eval_logits/chosen": -1.1771941184997559, "eval_logits/rejected": -1.1856648921966553, "eval_logps/chosen": -22.31366539001465, "eval_logps/rejected": -31.599573135375977, "eval_loss": 0.24783480167388916, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": 0.3665139377117157, "eval_rewards/margins": 2.98590350151062, "eval_rewards/rejected": -2.619389295578003, "eval_runtime": 216.1562, "eval_samples_per_second": 8.022, "eval_steps_per_second": 2.008, "step": 240 }, { "epoch": 1.116923076923077, "grad_norm": 13.013402968505392, "learning_rate": 2.4190452718666105e-07, "logits/chosen": -1.0899126529693604, "logits/rejected": -1.1027652025222778, "logps/chosen": -15.734682083129883, "logps/rejected": -27.53190803527832, "loss": 0.2287, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.5433827638626099, "rewards/margins": 3.0215795040130615, "rewards/rejected": -2.478196859359741, "step": 242 }, { "epoch": 1.126153846153846, "grad_norm": 12.301318346382136, "learning_rate": 2.37859444471388e-07, "logits/chosen": -1.1361184120178223, "logits/rejected": -1.151028037071228, "logps/chosen": -24.852954864501953, "logps/rejected": -40.693912506103516, "loss": 0.1914, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.489397794008255, "rewards/margins": 3.448162794113159, "rewards/rejected": -2.9587647914886475, "step": 244 }, { "epoch": 1.1353846153846154, "grad_norm": 13.708460236846275, "learning_rate": 2.3381754540639106e-07, "logits/chosen": -1.1237130165100098, "logits/rejected": -1.1399991512298584, "logps/chosen": -21.652952194213867, "logps/rejected": -30.665048599243164, "loss": 0.2272, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.5611749291419983, "rewards/margins": 3.155482292175293, "rewards/rejected": -2.5943074226379395, "step": 246 }, { "epoch": 1.1446153846153846, "grad_norm": 11.563478452101487, "learning_rate": 2.2977988990964896e-07, "logits/chosen": -1.0979208946228027, "logits/rejected": -1.111803650856018, "logps/chosen": -21.861614227294922, "logps/rejected": -38.676361083984375, "loss": 0.2243, "rewards/accuracies": 0.875, "rewards/chosen": 0.13799840211868286, "rewards/margins": 3.1060800552368164, "rewards/rejected": -2.968081474304199, "step": 248 }, { "epoch": 1.1538461538461537, "grad_norm": 12.63303273344697, "learning_rate": 2.2574753678633798e-07, "logits/chosen": -1.2150633335113525, "logits/rejected": -1.2195019721984863, "logps/chosen": -19.639219284057617, "logps/rejected": -22.85377311706543, "loss": 0.2111, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.478664755821228, "rewards/margins": 2.8225910663604736, "rewards/rejected": -2.343926429748535, "step": 250 }, { "epoch": 1.1630769230769231, "grad_norm": 15.55104305702512, "learning_rate": 2.2172154345117894e-07, "logits/chosen": -1.1489689350128174, "logits/rejected": -1.1607710123062134, "logps/chosen": -22.335952758789062, "logps/rejected": -43.476783752441406, "loss": 0.1866, "rewards/accuracies": 0.875, "rewards/chosen": 0.6636537909507751, "rewards/margins": 4.2835187911987305, "rewards/rejected": -3.6198649406433105, "step": 252 }, { "epoch": 1.1723076923076923, "grad_norm": 19.58611284576425, "learning_rate": 2.1770296565114846e-07, "logits/chosen": -1.174638271331787, "logits/rejected": -1.1910815238952637, "logps/chosen": -19.441059112548828, "logps/rejected": -23.29158592224121, "loss": 0.2382, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.15968316793441772, "rewards/margins": 2.543644428253174, "rewards/rejected": -2.3839612007141113, "step": 254 }, { "epoch": 1.1815384615384614, "grad_norm": 14.83480005382789, "learning_rate": 2.1369285718862748e-07, "logits/chosen": -1.0653572082519531, "logits/rejected": -1.0726639032363892, "logps/chosen": -24.378429412841797, "logps/rejected": -48.50611877441406, "loss": 0.1932, "rewards/accuracies": 0.875, "rewards/chosen": 0.2468690127134323, "rewards/margins": 4.218470096588135, "rewards/rejected": -3.9716007709503174, "step": 256 }, { "epoch": 1.1907692307692308, "grad_norm": 14.627626741140055, "learning_rate": 2.0969226964506005e-07, "logits/chosen": -1.1564842462539673, "logits/rejected": -1.1586439609527588, "logps/chosen": -25.08201789855957, "logps/rejected": -26.51468849182129, "loss": 0.2157, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.42589980363845825, "rewards/margins": 3.205916166305542, "rewards/rejected": -2.7800166606903076, "step": 258 }, { "epoch": 1.2, "grad_norm": 13.600232617567109, "learning_rate": 2.0570225210519433e-07, "logits/chosen": -1.1147321462631226, "logits/rejected": -1.1307651996612549, "logps/chosen": -22.724639892578125, "logps/rejected": -38.13914489746094, "loss": 0.1956, "rewards/accuracies": 0.875, "rewards/chosen": 0.5592103004455566, "rewards/margins": 3.5806994438171387, "rewards/rejected": -3.021489143371582, "step": 260 }, { "epoch": 1.2, "eval_logits/chosen": -1.1710869073867798, "eval_logits/rejected": -1.179579496383667, "eval_logps/chosen": -22.368024826049805, "eval_logps/rejected": -31.889461517333984, "eval_loss": 0.24438533186912537, "eval_rewards/accuracies": 0.8317972421646118, "eval_rewards/chosen": 0.33933624625205994, "eval_rewards/margins": 3.1036696434020996, "eval_rewards/rejected": -2.764333963394165, "eval_runtime": 216.3298, "eval_samples_per_second": 8.016, "eval_steps_per_second": 2.006, "step": 260 }, { "epoch": 1.209230769230769, "grad_norm": 16.513762580218792, "learning_rate": 2.0172385088197803e-07, "logits/chosen": -1.14779531955719, "logits/rejected": -1.1652312278747559, "logps/chosen": -26.26132583618164, "logps/rejected": -40.5022087097168, "loss": 0.2143, "rewards/accuracies": 0.7777777910232544, "rewards/chosen": 0.41341039538383484, "rewards/margins": 3.354189157485962, "rewards/rejected": -2.940778970718384, "step": 262 }, { "epoch": 1.2184615384615385, "grad_norm": 11.212524578416895, "learning_rate": 1.977581092421812e-07, "logits/chosen": -1.1520088911056519, "logits/rejected": -1.1642160415649414, "logps/chosen": -20.592201232910156, "logps/rejected": -30.868377685546875, "loss": 0.1657, "rewards/accuracies": 0.875, "rewards/chosen": 0.40944963693618774, "rewards/margins": 3.2444136142730713, "rewards/rejected": -2.8349640369415283, "step": 264 }, { "epoch": 1.2276923076923076, "grad_norm": 11.01146404378747, "learning_rate": 1.9380606713281772e-07, "logits/chosen": -1.1583861112594604, "logits/rejected": -1.1652624607086182, "logps/chosen": -18.12959098815918, "logps/rejected": -34.5963134765625, "loss": 0.2062, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.3663688898086548, "rewards/margins": 3.504619836807251, "rewards/rejected": -3.1382510662078857, "step": 266 }, { "epoch": 1.236923076923077, "grad_norm": 12.264405123220332, "learning_rate": 1.8986876090843664e-07, "logits/chosen": -1.13167142868042, "logits/rejected": -1.14499831199646, "logps/chosen": -20.43359375, "logps/rejected": -37.75240707397461, "loss": 0.1807, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.36272215843200684, "rewards/margins": 3.8877878189086914, "rewards/rejected": -3.5250654220581055, "step": 268 }, { "epoch": 1.2461538461538462, "grad_norm": 11.919291580876626, "learning_rate": 1.859472230593569e-07, "logits/chosen": -1.1225872039794922, "logits/rejected": -1.1367418766021729, "logps/chosen": -26.361604690551758, "logps/rejected": -43.534812927246094, "loss": 0.2145, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.43198204040527344, "rewards/margins": 3.9310781955718994, "rewards/rejected": -3.499096155166626, "step": 270 }, { "epoch": 1.2553846153846153, "grad_norm": 12.440022575260326, "learning_rate": 1.8204248194091425e-07, "logits/chosen": -1.1526453495025635, "logits/rejected": -1.1696141958236694, "logps/chosen": -23.60825538635254, "logps/rejected": -57.63713836669922, "loss": 0.1955, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.27740761637687683, "rewards/margins": 4.90004301071167, "rewards/rejected": -4.622635841369629, "step": 272 }, { "epoch": 1.2646153846153847, "grad_norm": 8.286919730890018, "learning_rate": 1.7815556150379296e-07, "logits/chosen": -1.1683982610702515, "logits/rejected": -1.169435977935791, "logps/chosen": -22.41632652282715, "logps/rejected": -32.76851272583008, "loss": 0.1885, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.5325616002082825, "rewards/margins": 3.4823427200317383, "rewards/rejected": -2.9497809410095215, "step": 274 }, { "epoch": 1.2738461538461539, "grad_norm": 11.685150583165354, "learning_rate": 1.7428748102551234e-07, "logits/chosen": -1.106712818145752, "logits/rejected": -1.1161227226257324, "logps/chosen": -20.291996002197266, "logps/rejected": -28.43364715576172, "loss": 0.1994, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5047957897186279, "rewards/margins": 3.1466941833496094, "rewards/rejected": -2.6418981552124023, "step": 276 }, { "epoch": 1.283076923076923, "grad_norm": 13.842054601252082, "learning_rate": 1.704392548431391e-07, "logits/chosen": -1.1573395729064941, "logits/rejected": -1.1763123273849487, "logps/chosen": -13.727288246154785, "logps/rejected": -40.552120208740234, "loss": 0.1992, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.0833960473537445, "rewards/margins": 3.4928784370422363, "rewards/rejected": -3.40948224067688, "step": 278 }, { "epoch": 1.2923076923076924, "grad_norm": 19.81840697060037, "learning_rate": 1.6661189208729489e-07, "logits/chosen": -1.1369847059249878, "logits/rejected": -1.1503101587295532, "logps/chosen": -29.371524810791016, "logps/rejected": -31.74928092956543, "loss": 0.174, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.46892601251602173, "rewards/margins": 3.2968459129333496, "rewards/rejected": -2.8279199600219727, "step": 280 }, { "epoch": 1.2923076923076924, "eval_logits/chosen": -1.165863275527954, "eval_logits/rejected": -1.1743441820144653, "eval_logps/chosen": -22.31157875061035, "eval_logps/rejected": -31.91876792907715, "eval_loss": 0.23967565596103668, "eval_rewards/accuracies": 0.8341013789176941, "eval_rewards/chosen": 0.3675578236579895, "eval_rewards/margins": 3.146545171737671, "eval_rewards/rejected": -2.778987407684326, "eval_runtime": 216.3352, "eval_samples_per_second": 8.015, "eval_steps_per_second": 2.006, "step": 280 }, { "epoch": 1.3015384615384615, "grad_norm": 8.930251698810418, "learning_rate": 1.6280639641752942e-07, "logits/chosen": -1.1316086053848267, "logits/rejected": -1.1440240144729614, "logps/chosen": -20.34646987915039, "logps/rejected": -49.82673645019531, "loss": 0.1765, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.23807168006896973, "rewards/margins": 4.113887310028076, "rewards/rejected": -3.8758151531219482, "step": 282 }, { "epoch": 1.3107692307692307, "grad_norm": 12.563220339411409, "learning_rate": 1.5902376575912814e-07, "logits/chosen": -1.11788809299469, "logits/rejected": -1.1216245889663696, "logps/chosen": -26.72078514099121, "logps/rejected": -35.561317443847656, "loss": 0.1887, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.3794720470905304, "rewards/margins": 3.400892734527588, "rewards/rejected": -3.021420478820801, "step": 284 }, { "epoch": 1.32, "grad_norm": 12.663334489473607, "learning_rate": 1.552649920414233e-07, "logits/chosen": -1.1346993446350098, "logits/rejected": -1.135698676109314, "logps/chosen": -30.942975997924805, "logps/rejected": -28.223663330078125, "loss": 0.209, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.1739700883626938, "rewards/margins": 2.763653039932251, "rewards/rejected": -2.5896828174591064, "step": 286 }, { "epoch": 1.3292307692307692, "grad_norm": 14.8989835155845, "learning_rate": 1.5153106093767825e-07, "logits/chosen": -1.0928491353988647, "logits/rejected": -1.115010142326355, "logps/chosen": -18.197795867919922, "logps/rejected": -37.05016326904297, "loss": 0.2571, "rewards/accuracies": 0.75, "rewards/chosen": 0.4650332033634186, "rewards/margins": 2.95278000831604, "rewards/rejected": -2.4877467155456543, "step": 288 }, { "epoch": 1.3384615384615386, "grad_norm": 7.959815386261902, "learning_rate": 1.47822951606611e-07, "logits/chosen": -1.1016626358032227, "logits/rejected": -1.1072629690170288, "logps/chosen": -27.025487899780273, "logps/rejected": -32.04999923706055, "loss": 0.1876, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.37108778953552246, "rewards/margins": 3.5628809928894043, "rewards/rejected": -3.191793441772461, "step": 290 }, { "epoch": 1.3476923076923077, "grad_norm": 9.883542506968235, "learning_rate": 1.4414163643562753e-07, "logits/chosen": -1.1510549783706665, "logits/rejected": -1.161637783050537, "logps/chosen": -26.81183433532715, "logps/rejected": -45.584022521972656, "loss": 0.1694, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.7165854573249817, "rewards/margins": 4.145462989807129, "rewards/rejected": -3.428877353668213, "step": 292 }, { "epoch": 1.356923076923077, "grad_norm": 16.819884237605038, "learning_rate": 1.4048808078582942e-07, "logits/chosen": -1.156364917755127, "logits/rejected": -1.158648133277893, "logps/chosen": -25.07522964477539, "logps/rejected": -37.01847839355469, "loss": 0.1916, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": -0.062492769211530685, "rewards/margins": 3.447725534439087, "rewards/rejected": -3.5102179050445557, "step": 294 }, { "epoch": 1.3661538461538463, "grad_norm": 9.730872259730013, "learning_rate": 1.3686324273886528e-07, "logits/chosen": -1.0902260541915894, "logits/rejected": -1.1149543523788452, "logps/chosen": -21.78764533996582, "logps/rejected": -47.82768249511719, "loss": 0.1618, "rewards/accuracies": 0.9444444179534912, "rewards/chosen": 0.330030232667923, "rewards/margins": 4.0784478187561035, "rewards/rejected": -3.748418092727661, "step": 296 }, { "epoch": 1.3753846153846154, "grad_norm": 11.017633003526004, "learning_rate": 1.3326807284568984e-07, "logits/chosen": -1.1744215488433838, "logits/rejected": -1.1781681776046753, "logps/chosen": -20.410446166992188, "logps/rejected": -33.22405242919922, "loss": 0.2013, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.266373872756958, "rewards/margins": 3.171236515045166, "rewards/rejected": -2.904862642288208, "step": 298 }, { "epoch": 1.3846153846153846, "grad_norm": 12.616723945362331, "learning_rate": 1.2970351387729872e-07, "logits/chosen": -1.1809624433517456, "logits/rejected": -1.1951857805252075, "logps/chosen": -18.240955352783203, "logps/rejected": -40.42936706542969, "loss": 0.2077, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.6317293643951416, "rewards/margins": 3.926286458969116, "rewards/rejected": -3.2945568561553955, "step": 300 }, { "epoch": 1.3846153846153846, "eval_logits/chosen": -1.1625326871871948, "eval_logits/rejected": -1.1709260940551758, "eval_logps/chosen": -22.30373764038086, "eval_logps/rejected": -32.03895568847656, "eval_loss": 0.23691046237945557, "eval_rewards/accuracies": 0.8387096524238586, "eval_rewards/chosen": 0.3714797794818878, "eval_rewards/margins": 3.2105631828308105, "eval_rewards/rejected": -2.839083194732666, "eval_runtime": 216.5842, "eval_samples_per_second": 8.006, "eval_steps_per_second": 2.004, "step": 300 }, { "epoch": 1.393846153846154, "grad_norm": 11.126146094324666, "learning_rate": 1.261705005775032e-07, "logits/chosen": -1.1696714162826538, "logits/rejected": -1.1861652135849, "logps/chosen": -22.42890167236328, "logps/rejected": -34.44594192504883, "loss": 0.1635, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.2834773361682892, "rewards/margins": 3.5443296432495117, "rewards/rejected": -3.260852813720703, "step": 302 }, { "epoch": 1.403076923076923, "grad_norm": 10.479052450533084, "learning_rate": 1.2266995941780933e-07, "logits/chosen": -1.130216121673584, "logits/rejected": -1.1414945125579834, "logps/chosen": -25.476299285888672, "logps/rejected": -40.09599304199219, "loss": 0.1598, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.3959537744522095, "rewards/margins": 3.8914499282836914, "rewards/rejected": -3.4954960346221924, "step": 304 }, { "epoch": 1.4123076923076923, "grad_norm": 15.900407241334104, "learning_rate": 1.1920280835446748e-07, "logits/chosen": -1.1561819314956665, "logits/rejected": -1.160946011543274, "logps/chosen": -26.870162963867188, "logps/rejected": -45.102787017822266, "loss": 0.1771, "rewards/accuracies": 0.875, "rewards/chosen": 0.5023772120475769, "rewards/margins": 4.30380392074585, "rewards/rejected": -3.801426887512207, "step": 306 }, { "epoch": 1.4215384615384616, "grad_norm": 10.845292151115956, "learning_rate": 1.1576995658775404e-07, "logits/chosen": -1.1523799896240234, "logits/rejected": -1.1634249687194824, "logps/chosen": -20.11031723022461, "logps/rejected": -28.449501037597656, "loss": 0.155, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.4131190776824951, "rewards/margins": 3.466240882873535, "rewards/rejected": -3.053121328353882, "step": 308 }, { "epoch": 1.4307692307692308, "grad_norm": 13.811097447536184, "learning_rate": 1.123723043235491e-07, "logits/chosen": -1.1037707328796387, "logits/rejected": -1.1196866035461426, "logps/chosen": -22.25092315673828, "logps/rejected": -41.13553237915039, "loss": 0.2394, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.44290411472320557, "rewards/margins": 3.9364805221557617, "rewards/rejected": -3.4935765266418457, "step": 310 }, { "epoch": 1.44, "grad_norm": 7.336736527232887, "learning_rate": 1.0901074253727336e-07, "logits/chosen": -1.132401943206787, "logits/rejected": -1.1375315189361572, "logps/chosen": -21.84718132019043, "logps/rejected": -32.056617736816406, "loss": 0.1639, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.543586015701294, "rewards/margins": 3.536188840866089, "rewards/rejected": -2.9926023483276367, "step": 312 }, { "epoch": 1.4492307692307693, "grad_norm": 9.238298739154985, "learning_rate": 1.056861527402452e-07, "logits/chosen": -1.1301486492156982, "logits/rejected": -1.130847454071045, "logps/chosen": -30.35249137878418, "logps/rejected": -39.42829513549805, "loss": 0.1854, "rewards/accuracies": 0.8055555820465088, "rewards/chosen": 0.6695830821990967, "rewards/margins": 3.61427903175354, "rewards/rejected": -2.9446957111358643, "step": 314 }, { "epoch": 1.4584615384615385, "grad_norm": 13.901867549459764, "learning_rate": 1.0239940674851941e-07, "logits/chosen": -1.1156858205795288, "logits/rejected": -1.114392638206482, "logps/chosen": -24.01244354248047, "logps/rejected": -34.20494842529297, "loss": 0.1866, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.37583643198013306, "rewards/margins": 3.5291662216186523, "rewards/rejected": -3.153329610824585, "step": 316 }, { "epoch": 1.4676923076923076, "grad_norm": 11.080424296345777, "learning_rate": 9.915136645426883e-08, "logits/chosen": -1.1818937063217163, "logits/rejected": -1.1808428764343262, "logps/chosen": -24.881999969482422, "logps/rejected": -28.97332763671875, "loss": 0.173, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.36953669786453247, "rewards/margins": 3.261909246444702, "rewards/rejected": -2.8923726081848145, "step": 318 }, { "epoch": 1.476923076923077, "grad_norm": 15.189646270302608, "learning_rate": 9.594288359976815e-08, "logits/chosen": -1.1282167434692383, "logits/rejected": -1.1426851749420166, "logps/chosen": -17.99266815185547, "logps/rejected": -47.12626266479492, "loss": 0.2092, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.30799973011016846, "rewards/margins": 4.037694454193115, "rewards/rejected": -3.729694366455078, "step": 320 }, { "epoch": 1.476923076923077, "eval_logits/chosen": -1.1610218286514282, "eval_logits/rejected": -1.1692686080932617, "eval_logps/chosen": -22.297130584716797, "eval_logps/rejected": -32.10142135620117, "eval_loss": 0.23491987586021423, "eval_rewards/accuracies": 0.8329492807388306, "eval_rewards/chosen": 0.3747842013835907, "eval_rewards/margins": 3.245098829269409, "eval_rewards/rejected": -2.870314836502075, "eval_runtime": 216.0919, "eval_samples_per_second": 8.024, "eval_steps_per_second": 2.008, "step": 320 }, { "epoch": 1.4861538461538462, "grad_norm": 11.193355120949441, "learning_rate": 9.277479955403886e-08, "logits/chosen": -1.147449016571045, "logits/rejected": -1.1808828115463257, "logps/chosen": -19.78190040588379, "logps/rejected": -68.74774932861328, "loss": 0.1519, "rewards/accuracies": 0.875, "rewards/chosen": 0.25842922925949097, "rewards/margins": 5.480890274047852, "rewards/rejected": -5.222461223602295, "step": 322 }, { "epoch": 1.4953846153846153, "grad_norm": 11.257040825977688, "learning_rate": 8.964794509221507e-08, "logits/chosen": -1.1383910179138184, "logits/rejected": -1.148794412612915, "logps/chosen": -25.653322219848633, "logps/rejected": -34.04636001586914, "loss": 0.1653, "rewards/accuracies": 0.9305555820465088, "rewards/chosen": 0.33952367305755615, "rewards/margins": 3.5638911724090576, "rewards/rejected": -3.224367380142212, "step": 324 }, { "epoch": 1.5046153846153847, "grad_norm": 14.248331413419937, "learning_rate": 8.656314017768693e-08, "logits/chosen": -1.1353636980056763, "logits/rejected": -1.1488914489746094, "logps/chosen": -23.45088768005371, "logps/rejected": -36.34320831298828, "loss": 0.19, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.5625240802764893, "rewards/margins": 3.636873483657837, "rewards/rejected": -3.0743494033813477, "step": 326 }, { "epoch": 1.5138461538461538, "grad_norm": 11.13430757826836, "learning_rate": 8.352119374707977e-08, "logits/chosen": -1.1736154556274414, "logits/rejected": -1.1819250583648682, "logps/chosen": -21.08655548095703, "logps/rejected": -31.81151580810547, "loss": 0.1618, "rewards/accuracies": 0.875, "rewards/chosen": 0.3814205825328827, "rewards/margins": 3.455685615539551, "rewards/rejected": -3.0742650032043457, "step": 328 }, { "epoch": 1.523076923076923, "grad_norm": 9.775792350949882, "learning_rate": 8.052290349812419e-08, "logits/chosen": -1.1424063444137573, "logits/rejected": -1.1474817991256714, "logps/chosen": -21.133007049560547, "logps/rejected": -25.102752685546875, "loss": 0.2071, "rewards/accuracies": 0.875, "rewards/chosen": 0.4940270781517029, "rewards/margins": 2.9714784622192383, "rewards/rejected": -2.4774513244628906, "step": 330 }, { "epoch": 1.5323076923076924, "grad_norm": 6.768309866947245, "learning_rate": 7.756905568047392e-08, "logits/chosen": -1.1152650117874146, "logits/rejected": -1.12236750125885, "logps/chosen": -17.50248146057129, "logps/rejected": -29.518686294555664, "loss": 0.159, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.6183215379714966, "rewards/margins": 3.7438418865203857, "rewards/rejected": -3.1255204677581787, "step": 332 }, { "epoch": 1.5415384615384615, "grad_norm": 12.853827774295516, "learning_rate": 7.46604248895252e-08, "logits/chosen": -1.1082737445831299, "logits/rejected": -1.1175150871276855, "logps/chosen": -20.219505310058594, "logps/rejected": -28.43560218811035, "loss": 0.1827, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.34581294655799866, "rewards/margins": 3.1769955158233643, "rewards/rejected": -2.8311829566955566, "step": 334 }, { "epoch": 1.5507692307692307, "grad_norm": 7.493668682648857, "learning_rate": 7.179777386329275e-08, "logits/chosen": -1.1045269966125488, "logits/rejected": -1.1183186769485474, "logps/chosen": -21.421226501464844, "logps/rejected": -39.41886901855469, "loss": 0.1748, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.5396389365196228, "rewards/margins": 3.9202401638031006, "rewards/rejected": -3.380601167678833, "step": 336 }, { "epoch": 1.56, "grad_norm": 12.452229910069226, "learning_rate": 6.898185328239467e-08, "logits/chosen": -1.145583987236023, "logits/rejected": -1.1488795280456543, "logps/chosen": -22.65854263305664, "logps/rejected": -31.751142501831055, "loss": 0.1845, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.2917179465293884, "rewards/margins": 3.111690044403076, "rewards/rejected": -2.819972038269043, "step": 338 }, { "epoch": 1.5692307692307692, "grad_norm": 10.84177308211244, "learning_rate": 6.621340157319996e-08, "logits/chosen": -1.1560921669006348, "logits/rejected": -1.1605477333068848, "logps/chosen": -16.325712203979492, "logps/rejected": -24.499792098999023, "loss": 0.2045, "rewards/accuracies": 0.9027777910232544, "rewards/chosen": 0.44531428813934326, "rewards/margins": 3.1462950706481934, "rewards/rejected": -2.7009804248809814, "step": 340 }, { "epoch": 1.5692307692307692, "eval_logits/chosen": -1.1584707498550415, "eval_logits/rejected": -1.1668710708618164, "eval_logps/chosen": -22.341110229492188, "eval_logps/rejected": -32.223533630371094, "eval_loss": 0.23495733737945557, "eval_rewards/accuracies": 0.8341013789176941, "eval_rewards/chosen": 0.35279345512390137, "eval_rewards/margins": 3.2841641902923584, "eval_rewards/rejected": -2.931370496749878, "eval_runtime": 216.2511, "eval_samples_per_second": 8.018, "eval_steps_per_second": 2.007, "step": 340 }, { "epoch": 1.5784615384615384, "grad_norm": 8.225696594197464, "learning_rate": 6.349314471418849e-08, "logits/chosen": -1.0857443809509277, "logits/rejected": -1.0922576189041138, "logps/chosen": -16.084243774414062, "logps/rejected": -30.81378173828125, "loss": 0.1803, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.5106647610664368, "rewards/margins": 3.7973814010620117, "rewards/rejected": -3.2867166996002197, "step": 342 }, { "epoch": 1.5876923076923077, "grad_norm": 15.760247716168218, "learning_rate": 6.082179604557616e-08, "logits/chosen": -1.1193811893463135, "logits/rejected": -1.121721863746643, "logps/chosen": -22.19783592224121, "logps/rejected": -28.761178970336914, "loss": 0.197, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.40734562277793884, "rewards/margins": 3.452158212661743, "rewards/rejected": -3.0448129177093506, "step": 344 }, { "epoch": 1.596923076923077, "grad_norm": 10.909974494088763, "learning_rate": 5.8200056082253453e-08, "logits/chosen": -1.125333547592163, "logits/rejected": -1.142914056777954, "logps/chosen": -19.27569007873535, "logps/rejected": -45.170040130615234, "loss": 0.1653, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.4003957509994507, "rewards/margins": 4.2396368980407715, "rewards/rejected": -3.839240550994873, "step": 346 }, { "epoch": 1.606153846153846, "grad_norm": 10.855639719670084, "learning_rate": 5.5628612330087724e-08, "logits/chosen": -1.131655216217041, "logits/rejected": -1.1401116847991943, "logps/chosen": -17.995466232299805, "logps/rejected": -32.176475524902344, "loss": 0.1826, "rewards/accuracies": 0.8472222089767456, "rewards/chosen": 0.4925755262374878, "rewards/margins": 3.6894967555999756, "rewards/rejected": -3.196920871734619, "step": 348 }, { "epoch": 1.6153846153846154, "grad_norm": 16.085282454030374, "learning_rate": 5.310813910563644e-08, "logits/chosen": -1.0810273885726929, "logits/rejected": -1.0798935890197754, "logps/chosen": -22.392784118652344, "logps/rejected": -28.961748123168945, "loss": 0.2082, "rewards/accuracies": 0.7916666865348816, "rewards/chosen": 0.39071983098983765, "rewards/margins": 2.970240592956543, "rewards/rejected": -2.5795204639434814, "step": 350 }, { "epoch": 1.6246153846153846, "grad_norm": 16.9671493136513, "learning_rate": 5.0639297359319846e-08, "logits/chosen": -1.1683417558670044, "logits/rejected": -1.1672459840774536, "logps/chosen": -24.353551864624023, "logps/rejected": -27.454164505004883, "loss": 0.2106, "rewards/accuracies": 0.8333333134651184, "rewards/chosen": 0.26455923914909363, "rewards/margins": 2.982168674468994, "rewards/rejected": -2.717609167098999, "step": 352 }, { "epoch": 1.6338461538461537, "grad_norm": 10.455898381248911, "learning_rate": 4.8222734502097655e-08, "logits/chosen": -1.1433789730072021, "logits/rejected": -1.153548240661621, "logps/chosen": -24.5914363861084, "logps/rejected": -42.36714172363281, "loss": 0.1885, "rewards/accuracies": 0.8194444179534912, "rewards/chosen": 0.36157724261283875, "rewards/margins": 3.6608800888061523, "rewards/rejected": -3.2993030548095703, "step": 354 }, { "epoch": 1.643076923076923, "grad_norm": 19.280259828969186, "learning_rate": 4.5859084235697235e-08, "logits/chosen": -1.164656639099121, "logits/rejected": -1.1599383354187012, "logps/chosen": -19.223194122314453, "logps/rejected": -24.446197509765625, "loss": 0.2371, "rewards/accuracies": 0.8611111044883728, "rewards/chosen": 0.3862743377685547, "rewards/margins": 2.9600579738616943, "rewards/rejected": -2.5737838745117188, "step": 356 }, { "epoch": 1.6523076923076923, "grad_norm": 8.14493222848995, "learning_rate": 4.35489663864359e-08, "logits/chosen": -1.0972024202346802, "logits/rejected": -1.1305886507034302, "logps/chosen": -17.79538345336914, "logps/rejected": -59.57120895385742, "loss": 0.2046, "rewards/accuracies": 0.8888888955116272, "rewards/chosen": 0.5223473310470581, "rewards/margins": 4.91096830368042, "rewards/rejected": -4.388620853424072, "step": 358 }, { "epoch": 1.6615384615384614, "grad_norm": 11.376614389062514, "learning_rate": 4.1292986742682254e-08, "logits/chosen": -1.140592098236084, "logits/rejected": -1.1457772254943848, "logps/chosen": -19.596229553222656, "logps/rejected": -32.57119369506836, "loss": 0.1368, "rewards/accuracies": 0.9166666865348816, "rewards/chosen": 0.34850603342056274, "rewards/margins": 3.6875181198120117, "rewards/rejected": -3.339012622833252, "step": 360 }, { "epoch": 1.6615384615384614, "eval_logits/chosen": -1.1585197448730469, "eval_logits/rejected": -1.1669610738754272, "eval_logps/chosen": -22.363513946533203, "eval_logps/rejected": -32.30293273925781, "eval_loss": 0.23404575884342194, "eval_rewards/accuracies": 0.8352534770965576, "eval_rewards/chosen": 0.3415912091732025, "eval_rewards/margins": 3.3126602172851562, "eval_rewards/rejected": -2.9710693359375, "eval_runtime": 216.0202, "eval_samples_per_second": 8.027, "eval_steps_per_second": 2.009, "step": 360 } ], "logging_steps": 2, "max_steps": 432, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }