zephyr-7b-MI05-SELM / trainer_state.json
Teng Xiao
TX
9c18184
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 7.631302590698798,
"learning_rate": 5.208333333333333e-08,
"logits/chosen": -2.7707834243774414,
"logits/rejected": -2.7284507751464844,
"logps/chosen": -1.027999758720398,
"logps/rejected": -1.1731758117675781,
"loss": 1.5142,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.027999758720398,
"rewards/margins": 0.145175963640213,
"rewards/rejected": -1.1731758117675781,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 6.949481214555281,
"learning_rate": 1.0416666666666667e-07,
"logits/chosen": -2.751293659210205,
"logits/rejected": -2.7253918647766113,
"logps/chosen": -0.9958034753799438,
"logps/rejected": -1.0718666315078735,
"loss": 1.5011,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9958034753799438,
"rewards/margins": 0.07606318593025208,
"rewards/rejected": -1.0718666315078735,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 6.93246149046264,
"learning_rate": 1.5624999999999999e-07,
"logits/chosen": -2.7307419776916504,
"logits/rejected": -2.6973299980163574,
"logps/chosen": -0.9896200299263,
"logps/rejected": -1.1479737758636475,
"loss": 1.4936,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -0.9896200299263,
"rewards/margins": 0.15835365653038025,
"rewards/rejected": -1.1479737758636475,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 6.847416841047133,
"learning_rate": 2.0833333333333333e-07,
"logits/chosen": -2.8032145500183105,
"logits/rejected": -2.709728717803955,
"logps/chosen": -1.023134469985962,
"logps/rejected": -1.1238592863082886,
"loss": 1.5132,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -1.023134469985962,
"rewards/margins": 0.10072481632232666,
"rewards/rejected": -1.1238592863082886,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 7.005806187796597,
"learning_rate": 2.604166666666667e-07,
"logits/chosen": -2.799395799636841,
"logits/rejected": -2.775847911834717,
"logps/chosen": -1.0309282541275024,
"logps/rejected": -1.0878545045852661,
"loss": 1.5044,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.0309282541275024,
"rewards/margins": 0.056926220655441284,
"rewards/rejected": -1.0878545045852661,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 7.350121902518634,
"learning_rate": 3.1249999999999997e-07,
"logits/chosen": -2.7731544971466064,
"logits/rejected": -2.7317252159118652,
"logps/chosen": -0.9808300733566284,
"logps/rejected": -1.0866905450820923,
"loss": 1.5044,
"rewards/accuracies": 0.5375000238418579,
"rewards/chosen": -0.9808300733566284,
"rewards/margins": 0.10586042702198029,
"rewards/rejected": -1.0866905450820923,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 7.700359687001099,
"learning_rate": 3.645833333333333e-07,
"logits/chosen": -2.7761082649230957,
"logits/rejected": -2.689614772796631,
"logps/chosen": -1.0087323188781738,
"logps/rejected": -1.151500940322876,
"loss": 1.4925,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.0087323188781738,
"rewards/margins": 0.1427687108516693,
"rewards/rejected": -1.151500940322876,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 13.186452732989158,
"learning_rate": 4.1666666666666667e-07,
"logits/chosen": -2.791355848312378,
"logits/rejected": -2.694056749343872,
"logps/chosen": -0.9970605969429016,
"logps/rejected": -1.2966585159301758,
"loss": 1.4668,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -0.9970605969429016,
"rewards/margins": 0.29959791898727417,
"rewards/rejected": -1.2966585159301758,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 7.907271081451181,
"learning_rate": 4.6874999999999996e-07,
"logits/chosen": -2.7956533432006836,
"logits/rejected": -2.7485368251800537,
"logps/chosen": -0.9793591499328613,
"logps/rejected": -1.1734071969985962,
"loss": 1.4659,
"rewards/accuracies": 0.543749988079071,
"rewards/chosen": -0.9793591499328613,
"rewards/margins": 0.1940479576587677,
"rewards/rejected": -1.1734071969985962,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 10.523506634944091,
"learning_rate": 4.999731868769026e-07,
"logits/chosen": -2.740318775177002,
"logits/rejected": -2.669680118560791,
"logps/chosen": -1.023005723953247,
"logps/rejected": -1.3500120639801025,
"loss": 1.4663,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.023005723953247,
"rewards/margins": 0.3270064890384674,
"rewards/rejected": -1.3500120639801025,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 9.075126792521907,
"learning_rate": 4.996716052911017e-07,
"logits/chosen": -2.7170307636260986,
"logits/rejected": -2.678213596343994,
"logps/chosen": -1.0640136003494263,
"logps/rejected": -1.3454773426055908,
"loss": 1.4617,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.0640136003494263,
"rewards/margins": 0.28146374225616455,
"rewards/rejected": -1.3454773426055908,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 9.439762477880253,
"learning_rate": 4.990353313429303e-07,
"logits/chosen": -2.754854917526245,
"logits/rejected": -2.7064061164855957,
"logps/chosen": -1.103760838508606,
"logps/rejected": -1.3274078369140625,
"loss": 1.4332,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.103760838508606,
"rewards/margins": 0.22364696860313416,
"rewards/rejected": -1.3274078369140625,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 9.434075067475233,
"learning_rate": 4.980652179769217e-07,
"logits/chosen": -2.7506463527679443,
"logits/rejected": -2.6285061836242676,
"logps/chosen": -1.0410432815551758,
"logps/rejected": -1.671651840209961,
"loss": 1.4215,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.0410432815551758,
"rewards/margins": 0.6306084394454956,
"rewards/rejected": -1.671651840209961,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 13.786310447291475,
"learning_rate": 4.967625656594781e-07,
"logits/chosen": -2.656280994415283,
"logits/rejected": -2.609356641769409,
"logps/chosen": -1.0239083766937256,
"logps/rejected": -1.3759796619415283,
"loss": 1.4285,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.0239083766937256,
"rewards/margins": 0.35207125544548035,
"rewards/rejected": -1.3759796619415283,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 19.70767709035438,
"learning_rate": 4.951291206355559e-07,
"logits/chosen": -2.755265712738037,
"logits/rejected": -2.683993101119995,
"logps/chosen": -1.0507580041885376,
"logps/rejected": -1.4722309112548828,
"loss": 1.4151,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.0507580041885376,
"rewards/margins": 0.42147302627563477,
"rewards/rejected": -1.4722309112548828,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 13.640696435085882,
"learning_rate": 4.93167072587771e-07,
"logits/chosen": -2.719999313354492,
"logits/rejected": -2.659780979156494,
"logps/chosen": -1.1385526657104492,
"logps/rejected": -1.6847137212753296,
"loss": 1.4244,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1385526657104492,
"rewards/margins": 0.5461611747741699,
"rewards/rejected": -1.6847137212753296,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 15.610448795236834,
"learning_rate": 4.908790517010636e-07,
"logits/chosen": -2.7185451984405518,
"logits/rejected": -2.6570563316345215,
"logps/chosen": -1.0404447317123413,
"logps/rejected": -1.5335460901260376,
"loss": 1.4359,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0404447317123413,
"rewards/margins": 0.4931013584136963,
"rewards/rejected": -1.5335460901260376,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 15.498065363599855,
"learning_rate": 4.882681251368548e-07,
"logits/chosen": -2.705440044403076,
"logits/rejected": -2.6757359504699707,
"logps/chosen": -1.0957661867141724,
"logps/rejected": -1.541998267173767,
"loss": 1.4046,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.0957661867141724,
"rewards/margins": 0.4462320804595947,
"rewards/rejected": -1.541998267173767,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 18.034112089469748,
"learning_rate": 4.853377929214243e-07,
"logits/chosen": -2.779350996017456,
"logits/rejected": -2.6815435886383057,
"logps/chosen": -1.1388349533081055,
"logps/rejected": -1.6295349597930908,
"loss": 1.4191,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1388349533081055,
"rewards/margins": 0.490699827671051,
"rewards/rejected": -1.6295349597930908,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 16.923919026891507,
"learning_rate": 4.820919832540181e-07,
"logits/chosen": -2.7381911277770996,
"logits/rejected": -2.637225389480591,
"logps/chosen": -1.1354267597198486,
"logps/rejected": -1.7485253810882568,
"loss": 1.4134,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1354267597198486,
"rewards/margins": 0.6130987405776978,
"rewards/rejected": -1.7485253810882568,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 18.172912956493622,
"learning_rate": 4.785350472409791e-07,
"logits/chosen": -2.6725711822509766,
"logits/rejected": -2.6469922065734863,
"logps/chosen": -1.0028823614120483,
"logps/rejected": -1.6847069263458252,
"loss": 1.3943,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.0028823614120483,
"rewards/margins": 0.6818245649337769,
"rewards/rejected": -1.6847069263458252,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 47.211058537190446,
"learning_rate": 4.7467175306295647e-07,
"logits/chosen": -2.722909688949585,
"logits/rejected": -2.664407730102539,
"logps/chosen": -1.1426756381988525,
"logps/rejected": -1.676659345626831,
"loss": 1.3917,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1426756381988525,
"rewards/margins": 0.5339838266372681,
"rewards/rejected": -1.676659345626831,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 24.055123672850197,
"learning_rate": 4.70507279583015e-07,
"logits/chosen": -2.680541515350342,
"logits/rejected": -2.6233205795288086,
"logps/chosen": -1.0654175281524658,
"logps/rejected": -1.791318655014038,
"loss": 1.3876,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0654175281524658,
"rewards/margins": 0.7259014248847961,
"rewards/rejected": -1.791318655014038,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 26.57015894685055,
"learning_rate": 4.6604720940421207e-07,
"logits/chosen": -2.6764163970947266,
"logits/rejected": -2.651352643966675,
"logps/chosen": -1.1118848323822021,
"logps/rejected": -1.6098695993423462,
"loss": 1.4094,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1118848323822021,
"rewards/margins": 0.49798470735549927,
"rewards/rejected": -1.6098695993423462,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 18.624223201288878,
"learning_rate": 4.612975213859487e-07,
"logits/chosen": -2.74927020072937,
"logits/rejected": -2.7295844554901123,
"logps/chosen": -1.156049132347107,
"logps/rejected": -1.56387460231781,
"loss": 1.4148,
"rewards/accuracies": 0.574999988079071,
"rewards/chosen": -1.156049132347107,
"rewards/margins": 0.4078255295753479,
"rewards/rejected": -1.56387460231781,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 27.721075966231663,
"learning_rate": 4.5626458262912735e-07,
"logits/chosen": -2.710625171661377,
"logits/rejected": -2.6833271980285645,
"logps/chosen": -1.1802499294281006,
"logps/rejected": -1.4933916330337524,
"loss": 1.4308,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1802499294281006,
"rewards/margins": 0.31314152479171753,
"rewards/rejected": -1.4933916330337524,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 25.646654099025422,
"learning_rate": 4.5095513994085974e-07,
"logits/chosen": -2.690138578414917,
"logits/rejected": -2.59498929977417,
"logps/chosen": -1.2173268795013428,
"logps/rejected": -2.0789878368377686,
"loss": 1.4068,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.2173268795013428,
"rewards/margins": 0.8616611361503601,
"rewards/rejected": -2.0789878368377686,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 15.591514935900287,
"learning_rate": 4.453763107901675e-07,
"logits/chosen": -2.715076208114624,
"logits/rejected": -2.666903018951416,
"logps/chosen": -1.191408634185791,
"logps/rejected": -1.7779796123504639,
"loss": 1.3784,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.191408634185791,
"rewards/margins": 0.5865710377693176,
"rewards/rejected": -1.7779796123504639,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 12.601893786620954,
"learning_rate": 4.395355737667985e-07,
"logits/chosen": -2.7022361755371094,
"logits/rejected": -2.650947093963623,
"logps/chosen": -1.1326510906219482,
"logps/rejected": -1.6125431060791016,
"loss": 1.3973,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1326510906219482,
"rewards/margins": 0.4798920154571533,
"rewards/rejected": -1.6125431060791016,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 15.497858847663315,
"learning_rate": 4.3344075855595097e-07,
"logits/chosen": -2.723766326904297,
"logits/rejected": -2.704010486602783,
"logps/chosen": -1.0249402523040771,
"logps/rejected": -1.5360591411590576,
"loss": 1.3712,
"rewards/accuracies": 0.581250011920929,
"rewards/chosen": -1.0249402523040771,
"rewards/margins": 0.51111900806427,
"rewards/rejected": -1.5360591411590576,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 17.077156505391155,
"learning_rate": 4.271000354423425e-07,
"logits/chosen": -2.6389071941375732,
"logits/rejected": -2.5978379249572754,
"logps/chosen": -1.1586675643920898,
"logps/rejected": -1.6560027599334717,
"loss": 1.3901,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1586675643920898,
"rewards/margins": 0.497335284948349,
"rewards/rejected": -1.6560027599334717,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 22.945534571845954,
"learning_rate": 4.2052190435769554e-07,
"logits/chosen": -2.5543599128723145,
"logits/rejected": -2.5258305072784424,
"logps/chosen": -1.284404993057251,
"logps/rejected": -1.9095481634140015,
"loss": 1.3994,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.284404993057251,
"rewards/margins": 0.62514328956604,
"rewards/rejected": -1.9095481634140015,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 22.9080407885994,
"learning_rate": 4.137151834863213e-07,
"logits/chosen": -2.6129097938537598,
"logits/rejected": -2.5123066902160645,
"logps/chosen": -1.2131001949310303,
"logps/rejected": -1.8760910034179688,
"loss": 1.3639,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.2131001949310303,
"rewards/margins": 0.6629905104637146,
"rewards/rejected": -1.8760910034179688,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 23.012826789095357,
"learning_rate": 4.0668899744407567e-07,
"logits/chosen": -2.6876988410949707,
"logits/rejected": -2.6080994606018066,
"logps/chosen": -1.1881000995635986,
"logps/rejected": -1.9103021621704102,
"loss": 1.3735,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1881000995635986,
"rewards/margins": 0.7222020626068115,
"rewards/rejected": -1.9103021621704102,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 22.848365401534277,
"learning_rate": 3.994527650465352e-07,
"logits/chosen": -2.6223888397216797,
"logits/rejected": -2.510727643966675,
"logps/chosen": -1.2682397365570068,
"logps/rejected": -1.8920142650604248,
"loss": 1.4095,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2682397365570068,
"rewards/margins": 0.623774528503418,
"rewards/rejected": -1.8920142650604248,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 16.908175946250164,
"learning_rate": 3.920161866827889e-07,
"logits/chosen": -2.576324462890625,
"logits/rejected": -2.541231632232666,
"logps/chosen": -1.1460561752319336,
"logps/rejected": -1.8999898433685303,
"loss": 1.3719,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1460561752319336,
"rewards/margins": 0.7539336085319519,
"rewards/rejected": -1.8999898433685303,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 21.471414104048915,
"learning_rate": 3.8438923131177237e-07,
"logits/chosen": -2.453334331512451,
"logits/rejected": -2.260716438293457,
"logps/chosen": -1.1970899105072021,
"logps/rejected": -2.2250547409057617,
"loss": 1.3519,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1970899105072021,
"rewards/margins": 1.0279648303985596,
"rewards/rejected": -2.2250547409057617,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 25.009337039540963,
"learning_rate": 3.765821230985757e-07,
"logits/chosen": -2.283473253250122,
"logits/rejected": -2.178825855255127,
"logps/chosen": -1.2332468032836914,
"logps/rejected": -1.9305975437164307,
"loss": 1.4198,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2332468032836914,
"rewards/margins": 0.6973507404327393,
"rewards/rejected": -1.9305975437164307,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 17.444378318038115,
"learning_rate": 3.6860532770864005e-07,
"logits/chosen": -2.3795723915100098,
"logits/rejected": -2.2288522720336914,
"logps/chosen": -1.1638405323028564,
"logps/rejected": -1.7543007135391235,
"loss": 1.3983,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1638405323028564,
"rewards/margins": 0.5904603004455566,
"rewards/rejected": -1.7543007135391235,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 15.428251714459405,
"learning_rate": 3.604695382782159e-07,
"logits/chosen": -2.31533145904541,
"logits/rejected": -2.2262110710144043,
"logps/chosen": -1.1396164894104004,
"logps/rejected": -1.730507493019104,
"loss": 1.3902,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1396164894104004,
"rewards/margins": 0.5908910036087036,
"rewards/rejected": -1.730507493019104,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 19.194522439269107,
"learning_rate": 3.5218566107988867e-07,
"logits/chosen": -1.926476240158081,
"logits/rejected": -1.6759207248687744,
"logps/chosen": -1.0454155206680298,
"logps/rejected": -1.7578277587890625,
"loss": 1.3552,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.0454155206680298,
"rewards/margins": 0.7124123573303223,
"rewards/rejected": -1.7578277587890625,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 25.69506263243762,
"learning_rate": 3.4376480090239047e-07,
"logits/chosen": -1.7014926671981812,
"logits/rejected": -1.372465968132019,
"logps/chosen": -1.1878430843353271,
"logps/rejected": -1.839996576309204,
"loss": 1.3821,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1878430843353271,
"rewards/margins": 0.6521533131599426,
"rewards/rejected": -1.839996576309204,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 19.66701196453165,
"learning_rate": 3.3521824616429284e-07,
"logits/chosen": -1.6308574676513672,
"logits/rejected": -1.2923319339752197,
"logps/chosen": -1.1453461647033691,
"logps/rejected": -1.9198291301727295,
"loss": 1.3574,
"rewards/accuracies": 0.793749988079071,
"rewards/chosen": -1.1453461647033691,
"rewards/margins": 0.774483323097229,
"rewards/rejected": -1.9198291301727295,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 36.150180674302824,
"learning_rate": 3.265574537815398e-07,
"logits/chosen": -1.6699841022491455,
"logits/rejected": -1.1387865543365479,
"logps/chosen": -1.1862000226974487,
"logps/rejected": -2.100661516189575,
"loss": 1.3618,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1862000226974487,
"rewards/margins": 0.9144614934921265,
"rewards/rejected": -2.100661516189575,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 21.20657529926906,
"learning_rate": 3.1779403380910425e-07,
"logits/chosen": -1.573144793510437,
"logits/rejected": -1.014370322227478,
"logps/chosen": -1.1252361536026,
"logps/rejected": -2.136768102645874,
"loss": 1.3361,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1252361536026,
"rewards/margins": 1.0115318298339844,
"rewards/rejected": -2.136768102645874,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 20.81339747835316,
"learning_rate": 3.0893973387735683e-07,
"logits/chosen": -1.475577473640442,
"logits/rejected": -0.8431582450866699,
"logps/chosen": -1.1428128480911255,
"logps/rejected": -2.3163764476776123,
"loss": 1.345,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1428128480911255,
"rewards/margins": 1.1735635995864868,
"rewards/rejected": -2.3163764476776123,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 25.711748027315434,
"learning_rate": 3.000064234440111e-07,
"logits/chosen": -1.3059219121932983,
"logits/rejected": -0.7800277471542358,
"logps/chosen": -1.089871883392334,
"logps/rejected": -1.943511962890625,
"loss": 1.3701,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.089871883392334,
"rewards/margins": 0.8536401987075806,
"rewards/rejected": -1.943511962890625,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 15.99544959403641,
"learning_rate": 2.910060778827554e-07,
"logits/chosen": -1.499245285987854,
"logits/rejected": -1.0100138187408447,
"logps/chosen": -1.0744446516036987,
"logps/rejected": -1.84768807888031,
"loss": 1.3668,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0744446516036987,
"rewards/margins": 0.7732433080673218,
"rewards/rejected": -1.84768807888031,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 16.81971929638616,
"learning_rate": 2.8195076242990116e-07,
"logits/chosen": -1.5549015998840332,
"logits/rejected": -1.0168583393096924,
"logps/chosen": -1.208172082901001,
"logps/rejected": -1.8721519708633423,
"loss": 1.3684,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.208172082901001,
"rewards/margins": 0.6639797687530518,
"rewards/rejected": -1.8721519708633423,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 25.668874451608698,
"learning_rate": 2.7285261601056697e-07,
"logits/chosen": -1.4319603443145752,
"logits/rejected": -0.8526409268379211,
"logps/chosen": -1.1322458982467651,
"logps/rejected": -1.981615662574768,
"loss": 1.3785,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1322458982467651,
"rewards/margins": 0.8493697047233582,
"rewards/rejected": -1.981615662574768,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 24.7591899309445,
"learning_rate": 2.6372383496608186e-07,
"logits/chosen": -0.9910232424736023,
"logits/rejected": -0.1822575181722641,
"logps/chosen": -1.297499418258667,
"logps/rejected": -2.4289941787719727,
"loss": 1.3136,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.297499418258667,
"rewards/margins": 1.1314946413040161,
"rewards/rejected": -2.4289941787719727,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 20.005217624077922,
"learning_rate": 2.5457665670441937e-07,
"logits/chosen": -1.2755597829818726,
"logits/rejected": -0.5895196199417114,
"logps/chosen": -1.0270662307739258,
"logps/rejected": -2.185426712036133,
"loss": 1.348,
"rewards/accuracies": 0.737500011920929,
"rewards/chosen": -1.0270662307739258,
"rewards/margins": 1.158360481262207,
"rewards/rejected": -2.185426712036133,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 23.193814761017656,
"learning_rate": 2.454233432955807e-07,
"logits/chosen": -1.4398943185806274,
"logits/rejected": -0.8867106437683105,
"logps/chosen": -1.1513334512710571,
"logps/rejected": -1.719251036643982,
"loss": 1.3446,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1513334512710571,
"rewards/margins": 0.5679178237915039,
"rewards/rejected": -1.719251036643982,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 34.7459644126849,
"learning_rate": 2.3627616503391812e-07,
"logits/chosen": -1.335096001625061,
"logits/rejected": -0.7483940124511719,
"logps/chosen": -1.2410709857940674,
"logps/rejected": -1.920100212097168,
"loss": 1.3635,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2410709857940674,
"rewards/margins": 0.6790293455123901,
"rewards/rejected": -1.920100212097168,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 24.36604189795506,
"learning_rate": 2.2714738398943308e-07,
"logits/chosen": -1.1961032152175903,
"logits/rejected": -0.2931798994541168,
"logps/chosen": -1.1527847051620483,
"logps/rejected": -2.2658238410949707,
"loss": 1.3351,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1527847051620483,
"rewards/margins": 1.1130387783050537,
"rewards/rejected": -2.2658238410949707,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 21.064933326366333,
"learning_rate": 2.1804923757009882e-07,
"logits/chosen": -1.2238754034042358,
"logits/rejected": -0.43058863282203674,
"logps/chosen": -1.0913419723510742,
"logps/rejected": -1.885957956314087,
"loss": 1.3521,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0913419723510742,
"rewards/margins": 0.7946157455444336,
"rewards/rejected": -1.885957956314087,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 22.882367631876512,
"learning_rate": 2.089939221172446e-07,
"logits/chosen": -1.3524372577667236,
"logits/rejected": -0.9207743406295776,
"logps/chosen": -1.1918188333511353,
"logps/rejected": -2.069408893585205,
"loss": 1.3471,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1918188333511353,
"rewards/margins": 0.8775898814201355,
"rewards/rejected": -2.069408893585205,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 25.205431418510916,
"learning_rate": 1.9999357655598891e-07,
"logits/chosen": -1.240950345993042,
"logits/rejected": -0.5687157511711121,
"logps/chosen": -1.2093303203582764,
"logps/rejected": -2.087651252746582,
"loss": 1.3354,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.2093303203582764,
"rewards/margins": 0.8783208131790161,
"rewards/rejected": -2.087651252746582,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 28.00791593703933,
"learning_rate": 1.9106026612264315e-07,
"logits/chosen": -1.1928002834320068,
"logits/rejected": -0.4924652576446533,
"logps/chosen": -1.2354944944381714,
"logps/rejected": -2.1065988540649414,
"loss": 1.3707,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.2354944944381714,
"rewards/margins": 0.8711041212081909,
"rewards/rejected": -2.1065988540649414,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 24.457847868426004,
"learning_rate": 1.8220596619089573e-07,
"logits/chosen": -1.2340965270996094,
"logits/rejected": -0.6534504294395447,
"logps/chosen": -1.186089038848877,
"logps/rejected": -2.029968738555908,
"loss": 1.3421,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.186089038848877,
"rewards/margins": 0.8438796997070312,
"rewards/rejected": -2.029968738555908,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 27.972140846529875,
"learning_rate": 1.7344254621846017e-07,
"logits/chosen": -1.0477724075317383,
"logits/rejected": -0.3579182028770447,
"logps/chosen": -1.2306467294692993,
"logps/rejected": -2.213597059249878,
"loss": 1.3325,
"rewards/accuracies": 0.7437499761581421,
"rewards/chosen": -1.2306467294692993,
"rewards/margins": 0.9829503893852234,
"rewards/rejected": -2.213597059249878,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 21.126429643053488,
"learning_rate": 1.647817538357072e-07,
"logits/chosen": -0.967767596244812,
"logits/rejected": -0.5430553555488586,
"logps/chosen": -1.1055750846862793,
"logps/rejected": -1.997175931930542,
"loss": 1.3201,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.1055750846862793,
"rewards/margins": 0.8916007280349731,
"rewards/rejected": -1.997175931930542,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 37.81135873666922,
"learning_rate": 1.562351990976095e-07,
"logits/chosen": -0.9042571783065796,
"logits/rejected": -0.11697681248188019,
"logps/chosen": -1.1413376331329346,
"logps/rejected": -2.2313427925109863,
"loss": 1.335,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1413376331329346,
"rewards/margins": 1.0900049209594727,
"rewards/rejected": -2.2313427925109863,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 22.549506139806287,
"learning_rate": 1.478143389201113e-07,
"logits/chosen": -0.7293549180030823,
"logits/rejected": -0.27204781770706177,
"logps/chosen": -1.185328722000122,
"logps/rejected": -2.089491605758667,
"loss": 1.308,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.185328722000122,
"rewards/margins": 0.9041631817817688,
"rewards/rejected": -2.089491605758667,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 22.04232630753836,
"learning_rate": 1.3953046172178413e-07,
"logits/chosen": -0.9256671071052551,
"logits/rejected": -0.02759646438062191,
"logps/chosen": -1.1488409042358398,
"logps/rejected": -2.169890880584717,
"loss": 1.3024,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1488409042358398,
"rewards/margins": 1.021050214767456,
"rewards/rejected": -2.169890880584717,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 23.07193462400401,
"learning_rate": 1.3139467229135998e-07,
"logits/chosen": -0.845870852470398,
"logits/rejected": -0.20406107604503632,
"logps/chosen": -1.1633460521697998,
"logps/rejected": -1.813515067100525,
"loss": 1.3519,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1633460521697998,
"rewards/margins": 0.6501691341400146,
"rewards/rejected": -1.813515067100525,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 23.166253111385615,
"learning_rate": 1.2341787690142435e-07,
"logits/chosen": -1.0633478164672852,
"logits/rejected": -0.322580486536026,
"logps/chosen": -1.0509016513824463,
"logps/rejected": -1.967553734779358,
"loss": 1.3448,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0509016513824463,
"rewards/margins": 0.9166520833969116,
"rewards/rejected": -1.967553734779358,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 36.55063112089001,
"learning_rate": 1.1561076868822755e-07,
"logits/chosen": -0.9787310361862183,
"logits/rejected": -0.23488755524158478,
"logps/chosen": -1.3403594493865967,
"logps/rejected": -2.003267288208008,
"loss": 1.3737,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3403594493865967,
"rewards/margins": 0.6629079580307007,
"rewards/rejected": -2.003267288208008,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 20.352017367464722,
"learning_rate": 1.0798381331721107e-07,
"logits/chosen": -0.9965354800224304,
"logits/rejected": -0.1640423834323883,
"logps/chosen": -1.143103003501892,
"logps/rejected": -2.10261869430542,
"loss": 1.3502,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.143103003501892,
"rewards/margins": 0.9595154523849487,
"rewards/rejected": -2.10261869430542,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 25.25408282143297,
"learning_rate": 1.0054723495346482e-07,
"logits/chosen": -0.7439842224121094,
"logits/rejected": -0.326732337474823,
"logps/chosen": -1.1874338388442993,
"logps/rejected": -1.9638007879257202,
"loss": 1.3647,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1874338388442993,
"rewards/margins": 0.7763670086860657,
"rewards/rejected": -1.9638007879257202,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 18.046932480362155,
"learning_rate": 9.331100255592436e-08,
"logits/chosen": -0.8028404116630554,
"logits/rejected": 0.0012960076564922929,
"logps/chosen": -1.001501441001892,
"logps/rejected": -2.1421008110046387,
"loss": 1.3092,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.001501441001892,
"rewards/margins": 1.1405994892120361,
"rewards/rejected": -2.1421008110046387,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 24.86376502778227,
"learning_rate": 8.628481651367875e-08,
"logits/chosen": -1.0760581493377686,
"logits/rejected": -0.36155739426612854,
"logps/chosen": -1.0425236225128174,
"logps/rejected": -1.7828603982925415,
"loss": 1.3051,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0425236225128174,
"rewards/margins": 0.7403370141983032,
"rewards/rejected": -1.7828603982925415,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 21.60526931897869,
"learning_rate": 7.947809564230445e-08,
"logits/chosen": -0.7209302186965942,
"logits/rejected": 0.01974741742014885,
"logps/chosen": -1.1903411149978638,
"logps/rejected": -2.1893298625946045,
"loss": 1.3752,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.1903411149978638,
"rewards/margins": 0.9989888072013855,
"rewards/rejected": -2.1893298625946045,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 22.57059768777603,
"learning_rate": 7.289996455765748e-08,
"logits/chosen": -0.7033840417861938,
"logits/rejected": 0.10633859783411026,
"logps/chosen": -1.241659164428711,
"logps/rejected": -2.4531188011169434,
"loss": 1.3325,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.241659164428711,
"rewards/margins": 1.2114596366882324,
"rewards/rejected": -2.4531188011169434,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 23.77470848150414,
"learning_rate": 6.655924144404906e-08,
"logits/chosen": -0.5643360614776611,
"logits/rejected": -0.4250950217247009,
"logps/chosen": -1.1594164371490479,
"logps/rejected": -2.0758109092712402,
"loss": 1.319,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1594164371490479,
"rewards/margins": 0.9163944125175476,
"rewards/rejected": -2.0758109092712402,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 31.139993849555335,
"learning_rate": 6.046442623320145e-08,
"logits/chosen": -0.4887496531009674,
"logits/rejected": -0.15603157877922058,
"logps/chosen": -1.2021204233169556,
"logps/rejected": -2.0649125576019287,
"loss": 1.3035,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2021204233169556,
"rewards/margins": 0.8627923130989075,
"rewards/rejected": -2.0649125576019287,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 21.401361972850502,
"learning_rate": 5.4623689209832484e-08,
"logits/chosen": -0.7876440286636353,
"logits/rejected": -0.1371804028749466,
"logps/chosen": -1.1165800094604492,
"logps/rejected": -1.9990746974945068,
"loss": 1.3148,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1165800094604492,
"rewards/margins": 0.8824948072433472,
"rewards/rejected": -1.9990746974945068,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 27.233602498877953,
"learning_rate": 4.904486005914027e-08,
"logits/chosen": -0.9208453297615051,
"logits/rejected": 0.03846040368080139,
"logps/chosen": -1.142458200454712,
"logps/rejected": -2.2543492317199707,
"loss": 1.2986,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.142458200454712,
"rewards/margins": 1.1118909120559692,
"rewards/rejected": -2.2543492317199707,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 27.619241835201624,
"learning_rate": 4.373541737087263e-08,
"logits/chosen": -0.8224166631698608,
"logits/rejected": 0.22443437576293945,
"logps/chosen": -1.1223267316818237,
"logps/rejected": -2.0982825756073,
"loss": 1.3196,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1223267316818237,
"rewards/margins": 0.9759558439254761,
"rewards/rejected": -2.0982825756073,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 23.574179623539557,
"learning_rate": 3.8702478614051345e-08,
"logits/chosen": -0.5700989961624146,
"logits/rejected": 0.09446726739406586,
"logps/chosen": -1.0945535898208618,
"logps/rejected": -2.0572116374969482,
"loss": 1.3444,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.0945535898208618,
"rewards/margins": 0.9626580476760864,
"rewards/rejected": -2.0572116374969482,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 27.16821359859113,
"learning_rate": 3.3952790595787986e-08,
"logits/chosen": -0.5356675982475281,
"logits/rejected": -0.06621427834033966,
"logps/chosen": -1.2879823446273804,
"logps/rejected": -2.23671555519104,
"loss": 1.3459,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2879823446273804,
"rewards/margins": 0.9487331509590149,
"rewards/rejected": -2.23671555519104,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 24.735499686166555,
"learning_rate": 2.9492720416985e-08,
"logits/chosen": -0.6337080597877502,
"logits/rejected": -0.15622717142105103,
"logps/chosen": -1.1116082668304443,
"logps/rejected": -1.9464092254638672,
"loss": 1.3254,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1116082668304443,
"rewards/margins": 0.8348008990287781,
"rewards/rejected": -1.9464092254638672,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 27.139165593959646,
"learning_rate": 2.5328246937043525e-08,
"logits/chosen": -0.5524105429649353,
"logits/rejected": 0.30828729271888733,
"logps/chosen": -1.1661826372146606,
"logps/rejected": -2.2065200805664062,
"loss": 1.2887,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1661826372146606,
"rewards/margins": 1.0403375625610352,
"rewards/rejected": -2.2065200805664062,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 21.202013882918227,
"learning_rate": 2.1464952759020856e-08,
"logits/chosen": -0.5271106958389282,
"logits/rejected": 0.04782446473836899,
"logps/chosen": -1.1561858654022217,
"logps/rejected": -2.1839046478271484,
"loss": 1.3408,
"rewards/accuracies": 0.6312500238418579,
"rewards/chosen": -1.1561858654022217,
"rewards/margins": 1.0277187824249268,
"rewards/rejected": -2.1839046478271484,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 21.7364989280612,
"learning_rate": 1.7908016745981856e-08,
"logits/chosen": -0.5031063556671143,
"logits/rejected": 0.18595072627067566,
"logps/chosen": -1.3663603067398071,
"logps/rejected": -2.1731326580047607,
"loss": 1.3373,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3663603067398071,
"rewards/margins": 0.8067724108695984,
"rewards/rejected": -2.1731326580047607,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 28.976870006935112,
"learning_rate": 1.4662207078575684e-08,
"logits/chosen": -0.6391962170600891,
"logits/rejected": 0.05445007234811783,
"logps/chosen": -1.2463514804840088,
"logps/rejected": -2.253527879714966,
"loss": 1.3176,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2463514804840088,
"rewards/margins": 1.007176399230957,
"rewards/rejected": -2.253527879714966,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 29.575680170367956,
"learning_rate": 1.1731874863145142e-08,
"logits/chosen": -0.534844696521759,
"logits/rejected": -0.06693878024816513,
"logps/chosen": -1.227757215499878,
"logps/rejected": -2.329684019088745,
"loss": 1.3387,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.227757215499878,
"rewards/margins": 1.1019268035888672,
"rewards/rejected": -2.329684019088745,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 26.81740086509168,
"learning_rate": 9.12094829893642e-09,
"logits/chosen": -0.4514737129211426,
"logits/rejected": 0.175465926527977,
"logps/chosen": -1.1389667987823486,
"logps/rejected": -2.100003719329834,
"loss": 1.333,
"rewards/accuracies": 0.7124999761581421,
"rewards/chosen": -1.1389667987823486,
"rewards/margins": 0.9610370397567749,
"rewards/rejected": -2.100003719329834,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 23.37770169459381,
"learning_rate": 6.832927412229017e-09,
"logits/chosen": -0.6318598985671997,
"logits/rejected": 0.05266062542796135,
"logps/chosen": -1.1773350238800049,
"logps/rejected": -2.1383471488952637,
"loss": 1.3499,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1773350238800049,
"rewards/margins": 0.9610121846199036,
"rewards/rejected": -2.1383471488952637,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 28.112053810154443,
"learning_rate": 4.8708793644441086e-09,
"logits/chosen": -0.49654707312583923,
"logits/rejected": -0.05771613121032715,
"logps/chosen": -1.1556320190429688,
"logps/rejected": -2.0353646278381348,
"loss": 1.3102,
"rewards/accuracies": 0.71875,
"rewards/chosen": -1.1556320190429688,
"rewards/margins": 0.879732608795166,
"rewards/rejected": -2.0353646278381348,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 27.512930833592254,
"learning_rate": 3.2374343405217884e-09,
"logits/chosen": -0.47800731658935547,
"logits/rejected": 0.14576876163482666,
"logps/chosen": -1.3641871213912964,
"logps/rejected": -1.9644696712493896,
"loss": 1.3398,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.3641871213912964,
"rewards/margins": 0.6002823710441589,
"rewards/rejected": -1.9644696712493896,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 21.866991790036977,
"learning_rate": 1.9347820230782295e-09,
"logits/chosen": -0.4405391216278076,
"logits/rejected": 0.19559261202812195,
"logps/chosen": -1.115911602973938,
"logps/rejected": -2.104705333709717,
"loss": 1.3035,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.115911602973938,
"rewards/margins": 0.9887935519218445,
"rewards/rejected": -2.104705333709717,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 25.927634444403026,
"learning_rate": 9.64668657069706e-10,
"logits/chosen": -0.6560274362564087,
"logits/rejected": 0.26568812131881714,
"logps/chosen": -1.2110966444015503,
"logps/rejected": -2.1687331199645996,
"loss": 1.3275,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.2110966444015503,
"rewards/margins": 0.9576365351676941,
"rewards/rejected": -2.1687331199645996,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 23.70864962079994,
"learning_rate": 3.2839470889836627e-10,
"logits/chosen": -0.6618033647537231,
"logits/rejected": -0.13337016105651855,
"logps/chosen": -1.159348726272583,
"logps/rejected": -1.9146608114242554,
"loss": 1.3688,
"rewards/accuracies": 0.706250011920929,
"rewards/chosen": -1.159348726272583,
"rewards/margins": 0.7553119659423828,
"rewards/rejected": -1.9146608114242554,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 21.250598362207704,
"learning_rate": 2.6813123097352287e-11,
"logits/chosen": -0.7468453645706177,
"logits/rejected": 0.1613294780254364,
"logps/chosen": -1.1621391773223877,
"logps/rejected": -2.1066856384277344,
"loss": 1.3287,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1621391773223877,
"rewards/margins": 0.9445463418960571,
"rewards/rejected": -2.1066856384277344,
"step": 475
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 1.3747819329707627,
"train_runtime": 7204.7044,
"train_samples_per_second": 8.485,
"train_steps_per_second": 0.066
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}