Safetensors
qwen2
trustalign_qwen2.5_3b / trainer_state.json
shanghong's picture
Upload folder using huggingface_hub
71d6c46 verified
raw
history blame
107 kB
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.6615384615384614,
"eval_steps": 20,
"global_step": 360,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.009230769230769232,
"grad_norm": 52.40730345789634,
"learning_rate": 2.2727272727272725e-08,
"logits/chosen": -1.2901445627212524,
"logits/rejected": -1.2963205575942993,
"logps/chosen": -16.113027572631836,
"logps/rejected": -27.10122299194336,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 2
},
{
"epoch": 0.018461538461538463,
"grad_norm": 64.88802449206628,
"learning_rate": 4.545454545454545e-08,
"logits/chosen": -1.3016295433044434,
"logits/rejected": -1.3255655765533447,
"logps/chosen": -20.355079650878906,
"logps/rejected": -39.93232727050781,
"loss": 0.6895,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.008816350251436234,
"rewards/margins": 0.0047285472974181175,
"rewards/rejected": 0.004087802488356829,
"step": 4
},
{
"epoch": 0.027692307692307693,
"grad_norm": 59.6800701771534,
"learning_rate": 6.818181818181817e-08,
"logits/chosen": -1.31508207321167,
"logits/rejected": -1.3189733028411865,
"logps/chosen": -23.069622039794922,
"logps/rejected": -26.97477149963379,
"loss": 0.695,
"rewards/accuracies": 0.4444444477558136,
"rewards/chosen": -0.007372706197202206,
"rewards/margins": -0.013017671182751656,
"rewards/rejected": 0.0056449659168720245,
"step": 6
},
{
"epoch": 0.036923076923076927,
"grad_norm": 52.983511533208585,
"learning_rate": 9.09090909090909e-08,
"logits/chosen": -1.277503252029419,
"logits/rejected": -1.3002785444259644,
"logps/chosen": -20.34660530090332,
"logps/rejected": -31.0557861328125,
"loss": 0.6908,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": 0.020842621102929115,
"rewards/margins": 0.020597590133547783,
"rewards/rejected": 0.00024503222084604204,
"step": 8
},
{
"epoch": 0.046153846153846156,
"grad_norm": 66.2747581823961,
"learning_rate": 1.1363636363636363e-07,
"logits/chosen": -1.3306350708007812,
"logits/rejected": -1.3309379816055298,
"logps/chosen": -26.48358917236328,
"logps/rejected": -30.445173263549805,
"loss": 0.7046,
"rewards/accuracies": 0.3888888955116272,
"rewards/chosen": -0.01246996782720089,
"rewards/margins": -0.032543592154979706,
"rewards/rejected": 0.020073626190423965,
"step": 10
},
{
"epoch": 0.055384615384615386,
"grad_norm": 57.271529486531605,
"learning_rate": 1.3636363636363635e-07,
"logits/chosen": -1.280084252357483,
"logits/rejected": -1.295721411705017,
"logps/chosen": -25.79343032836914,
"logps/rejected": -36.58183288574219,
"loss": 0.6956,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.023966560140252113,
"rewards/margins": 0.030559096485376358,
"rewards/rejected": -0.006592527963221073,
"step": 12
},
{
"epoch": 0.06461538461538462,
"grad_norm": 67.94854888195144,
"learning_rate": 1.5909090909090907e-07,
"logits/chosen": -1.2790985107421875,
"logits/rejected": -1.296931266784668,
"logps/chosen": -24.833446502685547,
"logps/rejected": -31.11182403564453,
"loss": 0.7006,
"rewards/accuracies": 0.4305555522441864,
"rewards/chosen": 0.013436201959848404,
"rewards/margins": 0.002752000233158469,
"rewards/rejected": 0.010684202425181866,
"step": 14
},
{
"epoch": 0.07384615384615385,
"grad_norm": 49.36191286721225,
"learning_rate": 1.818181818181818e-07,
"logits/chosen": -1.301368236541748,
"logits/rejected": -1.3136367797851562,
"logps/chosen": -26.273963928222656,
"logps/rejected": -35.63306427001953,
"loss": 0.6949,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.015296169556677341,
"rewards/margins": 0.013788570649921894,
"rewards/rejected": 0.0015075993724167347,
"step": 16
},
{
"epoch": 0.08307692307692308,
"grad_norm": 56.43976674406361,
"learning_rate": 2.0454545454545456e-07,
"logits/chosen": -1.3201720714569092,
"logits/rejected": -1.3183202743530273,
"logps/chosen": -25.70770263671875,
"logps/rejected": -26.178009033203125,
"loss": 0.7006,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.0011544560547918081,
"rewards/margins": 0.01863468438386917,
"rewards/rejected": -0.01748022995889187,
"step": 18
},
{
"epoch": 0.09230769230769231,
"grad_norm": 56.010590202518365,
"learning_rate": 2.2727272727272726e-07,
"logits/chosen": -1.2482044696807861,
"logits/rejected": -1.262031078338623,
"logps/chosen": -28.337791442871094,
"logps/rejected": -29.38203239440918,
"loss": 0.6883,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": 0.00024333276087418199,
"rewards/margins": -0.0005785864195786417,
"rewards/rejected": 0.000821918249130249,
"step": 20
},
{
"epoch": 0.09230769230769231,
"eval_logits/chosen": -1.3220677375793457,
"eval_logits/rejected": -1.33245849609375,
"eval_logps/chosen": -23.036666870117188,
"eval_logps/rejected": -26.372356414794922,
"eval_loss": 0.6916412115097046,
"eval_rewards/accuracies": 0.4965437650680542,
"eval_rewards/chosen": 0.00501647312194109,
"eval_rewards/margins": 0.010797887109220028,
"eval_rewards/rejected": -0.0057814153842628,
"eval_runtime": 216.2201,
"eval_samples_per_second": 8.02,
"eval_steps_per_second": 2.007,
"step": 20
},
{
"epoch": 0.10153846153846154,
"grad_norm": 67.30805212172523,
"learning_rate": 2.5e-07,
"logits/chosen": -1.2273086309432983,
"logits/rejected": -1.2565299272537231,
"logps/chosen": -21.540626525878906,
"logps/rejected": -47.4769172668457,
"loss": 0.6893,
"rewards/accuracies": 0.4861111044883728,
"rewards/chosen": 0.007773838937282562,
"rewards/margins": 0.026619136333465576,
"rewards/rejected": -0.018845297396183014,
"step": 22
},
{
"epoch": 0.11076923076923077,
"grad_norm": 51.29780655120263,
"learning_rate": 2.727272727272727e-07,
"logits/chosen": -1.219795823097229,
"logits/rejected": -1.235877513885498,
"logps/chosen": -30.82242774963379,
"logps/rejected": -37.68511962890625,
"loss": 0.6758,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.03086034394800663,
"rewards/margins": 0.055920813232660294,
"rewards/rejected": -0.025060458108782768,
"step": 24
},
{
"epoch": 0.12,
"grad_norm": 55.0939959360046,
"learning_rate": 2.9545454545454545e-07,
"logits/chosen": -1.258486270904541,
"logits/rejected": -1.2752680778503418,
"logps/chosen": -25.136966705322266,
"logps/rejected": -43.23137664794922,
"loss": 0.6774,
"rewards/accuracies": 0.5972222089767456,
"rewards/chosen": 0.0216163769364357,
"rewards/margins": 0.08480846881866455,
"rewards/rejected": -0.06319208443164825,
"step": 26
},
{
"epoch": 0.12923076923076923,
"grad_norm": 48.332663649143974,
"learning_rate": 3.1818181818181815e-07,
"logits/chosen": -1.320160150527954,
"logits/rejected": -1.330212950706482,
"logps/chosen": -19.24217414855957,
"logps/rejected": -27.22931671142578,
"loss": 0.6874,
"rewards/accuracies": 0.4027777910232544,
"rewards/chosen": 0.01321298535913229,
"rewards/margins": 0.009595979005098343,
"rewards/rejected": 0.0036170051898807287,
"step": 28
},
{
"epoch": 0.13846153846153847,
"grad_norm": 49.59877928678631,
"learning_rate": 3.4090909090909085e-07,
"logits/chosen": -1.2795339822769165,
"logits/rejected": -1.2929219007492065,
"logps/chosen": -21.841049194335938,
"logps/rejected": -28.89714813232422,
"loss": 0.6813,
"rewards/accuracies": 0.5555555820465088,
"rewards/chosen": 0.013576723635196686,
"rewards/margins": 0.05021868646144867,
"rewards/rejected": -0.036641962826251984,
"step": 30
},
{
"epoch": 0.1476923076923077,
"grad_norm": 51.02397460357053,
"learning_rate": 3.636363636363636e-07,
"logits/chosen": -1.2797447443008423,
"logits/rejected": -1.3022751808166504,
"logps/chosen": -24.65501594543457,
"logps/rejected": -36.741573333740234,
"loss": 0.6732,
"rewards/accuracies": 0.6527777910232544,
"rewards/chosen": 0.04290567338466644,
"rewards/margins": 0.09170582890510559,
"rewards/rejected": -0.04880015552043915,
"step": 32
},
{
"epoch": 0.15692307692307692,
"grad_norm": 45.39524675384609,
"learning_rate": 3.8636363636363636e-07,
"logits/chosen": -1.2498574256896973,
"logits/rejected": -1.2657580375671387,
"logps/chosen": -21.32640838623047,
"logps/rejected": -39.71310806274414,
"loss": 0.6627,
"rewards/accuracies": 0.5277777910232544,
"rewards/chosen": 0.007356289308518171,
"rewards/margins": 0.06605351716279984,
"rewards/rejected": -0.058697231113910675,
"step": 34
},
{
"epoch": 0.16615384615384615,
"grad_norm": 52.58099443727954,
"learning_rate": 4.090909090909091e-07,
"logits/chosen": -1.2139866352081299,
"logits/rejected": -1.2340948581695557,
"logps/chosen": -18.409015655517578,
"logps/rejected": -35.20015335083008,
"loss": 0.6644,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.022290384396910667,
"rewards/margins": 0.06140115484595299,
"rewards/rejected": -0.03911077231168747,
"step": 36
},
{
"epoch": 0.1753846153846154,
"grad_norm": 53.938952453151614,
"learning_rate": 4.318181818181818e-07,
"logits/chosen": -1.2461514472961426,
"logits/rejected": -1.2598522901535034,
"logps/chosen": -27.248275756835938,
"logps/rejected": -32.50380325317383,
"loss": 0.6545,
"rewards/accuracies": 0.625,
"rewards/chosen": 0.04994047060608864,
"rewards/margins": 0.1001262366771698,
"rewards/rejected": -0.05018576979637146,
"step": 38
},
{
"epoch": 0.18461538461538463,
"grad_norm": 46.949545804629956,
"learning_rate": 4.545454545454545e-07,
"logits/chosen": -1.2425076961517334,
"logits/rejected": -1.2611976861953735,
"logps/chosen": -14.459053993225098,
"logps/rejected": -22.981327056884766,
"loss": 0.6562,
"rewards/accuracies": 0.5833333134651184,
"rewards/chosen": 0.06477613002061844,
"rewards/margins": 0.08449113368988037,
"rewards/rejected": -0.019715001806616783,
"step": 40
},
{
"epoch": 0.18461538461538463,
"eval_logits/chosen": -1.3191018104553223,
"eval_logits/rejected": -1.3294612169265747,
"eval_logps/chosen": -22.93289566040039,
"eval_logps/rejected": -26.52239418029785,
"eval_loss": 0.6399217247962952,
"eval_rewards/accuracies": 0.671658992767334,
"eval_rewards/chosen": 0.05690104886889458,
"eval_rewards/margins": 0.13770265877246857,
"eval_rewards/rejected": -0.08080162853002548,
"eval_runtime": 216.334,
"eval_samples_per_second": 8.015,
"eval_steps_per_second": 2.006,
"step": 40
},
{
"epoch": 0.19384615384615383,
"grad_norm": 41.53559188167412,
"learning_rate": 4.772727272727273e-07,
"logits/chosen": -1.2119545936584473,
"logits/rejected": -1.2175490856170654,
"logps/chosen": -23.42240333557129,
"logps/rejected": -29.862327575683594,
"loss": 0.624,
"rewards/accuracies": 0.5694444179534912,
"rewards/chosen": 0.059619419276714325,
"rewards/margins": 0.15751110017299652,
"rewards/rejected": -0.09789170324802399,
"step": 42
},
{
"epoch": 0.20307692307692307,
"grad_norm": 49.942474151893265,
"learning_rate": 5e-07,
"logits/chosen": -1.3206286430358887,
"logits/rejected": -1.3300279378890991,
"logps/chosen": -22.983713150024414,
"logps/rejected": -23.000356674194336,
"loss": 0.6224,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.05540511757135391,
"rewards/margins": 0.1078185960650444,
"rewards/rejected": -0.05241347849369049,
"step": 44
},
{
"epoch": 0.2123076923076923,
"grad_norm": 40.96104792630147,
"learning_rate": 4.99967220916408e-07,
"logits/chosen": -1.2594552040100098,
"logits/rejected": -1.270306944847107,
"logps/chosen": -19.131641387939453,
"logps/rejected": -29.00514793395996,
"loss": 0.617,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.09214716404676437,
"rewards/margins": 0.24131464958190918,
"rewards/rejected": -0.14916746318340302,
"step": 46
},
{
"epoch": 0.22153846153846155,
"grad_norm": 44.60792696333844,
"learning_rate": 4.998688922613787e-07,
"logits/chosen": -1.3020961284637451,
"logits/rejected": -1.3101927042007446,
"logps/chosen": -31.274911880493164,
"logps/rejected": -32.11240005493164,
"loss": 0.6075,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.10760927200317383,
"rewards/margins": 0.294413298368454,
"rewards/rejected": -0.18680399656295776,
"step": 48
},
{
"epoch": 0.23076923076923078,
"grad_norm": 43.17860095734465,
"learning_rate": 4.997050398198976e-07,
"logits/chosen": -1.291076421737671,
"logits/rejected": -1.2982360124588013,
"logps/chosen": -22.59940528869629,
"logps/rejected": -22.504961013793945,
"loss": 0.5855,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.14835722744464874,
"rewards/margins": 0.3006143271923065,
"rewards/rejected": -0.15225709974765778,
"step": 50
},
{
"epoch": 0.24,
"grad_norm": 40.923959372883246,
"learning_rate": 4.994757065594279e-07,
"logits/chosen": -1.2361193895339966,
"logits/rejected": -1.2530244588851929,
"logps/chosen": -19.440345764160156,
"logps/rejected": -29.653764724731445,
"loss": 0.58,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.12961499392986298,
"rewards/margins": 0.2747644782066345,
"rewards/rejected": -0.14514949917793274,
"step": 52
},
{
"epoch": 0.24923076923076923,
"grad_norm": 40.97149688332116,
"learning_rate": 4.991809526186423e-07,
"logits/chosen": -1.2297606468200684,
"logits/rejected": -1.25152587890625,
"logps/chosen": -21.388309478759766,
"logps/rejected": -44.34809112548828,
"loss": 0.5456,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.151195228099823,
"rewards/margins": 0.48822492361068726,
"rewards/rejected": -0.33702969551086426,
"step": 54
},
{
"epoch": 0.25846153846153846,
"grad_norm": 41.37645783028047,
"learning_rate": 4.988208552916535e-07,
"logits/chosen": -1.2540967464447021,
"logits/rejected": -1.2566981315612793,
"logps/chosen": -22.95637321472168,
"logps/rejected": -23.91745376586914,
"loss": 0.5722,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.20567570626735687,
"rewards/margins": 0.3446711003780365,
"rewards/rejected": -0.13899540901184082,
"step": 56
},
{
"epoch": 0.2676923076923077,
"grad_norm": 37.07709893155658,
"learning_rate": 4.983955090077444e-07,
"logits/chosen": -1.2924391031265259,
"logits/rejected": -1.2913458347320557,
"logps/chosen": -18.923715591430664,
"logps/rejected": -22.57257843017578,
"loss": 0.5773,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.16216707229614258,
"rewards/margins": 0.27626025676727295,
"rewards/rejected": -0.11409316956996918,
"step": 58
},
{
"epoch": 0.27692307692307694,
"grad_norm": 33.00415567764037,
"learning_rate": 4.979050253066063e-07,
"logits/chosen": -1.2263813018798828,
"logits/rejected": -1.2465788125991821,
"logps/chosen": -20.503381729125977,
"logps/rejected": -37.98419189453125,
"loss": 0.5379,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.17731823027133942,
"rewards/margins": 0.593184769153595,
"rewards/rejected": -0.41586652398109436,
"step": 60
},
{
"epoch": 0.27692307692307694,
"eval_logits/chosen": -1.303908109664917,
"eval_logits/rejected": -1.3140496015548706,
"eval_logps/chosen": -22.596784591674805,
"eval_logps/rejected": -26.880229949951172,
"eval_loss": 0.5301286578178406,
"eval_rewards/accuracies": 0.7718893885612488,
"eval_rewards/chosen": 0.22495588660240173,
"eval_rewards/margins": 0.484672486782074,
"eval_rewards/rejected": -0.259716659784317,
"eval_runtime": 215.7229,
"eval_samples_per_second": 8.038,
"eval_steps_per_second": 2.012,
"step": 60
},
{
"epoch": 0.28615384615384615,
"grad_norm": 32.870504270075905,
"learning_rate": 4.973495328090889e-07,
"logits/chosen": -1.2028117179870605,
"logits/rejected": -1.2163152694702148,
"logps/chosen": -25.100025177001953,
"logps/rejected": -35.97075653076172,
"loss": 0.5245,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.20213226974010468,
"rewards/margins": 0.5411441326141357,
"rewards/rejected": -0.33901187777519226,
"step": 62
},
{
"epoch": 0.2953846153846154,
"grad_norm": 38.13033333375434,
"learning_rate": 4.967291771834726e-07,
"logits/chosen": -1.2682946920394897,
"logits/rejected": -1.2830837965011597,
"logps/chosen": -22.399858474731445,
"logps/rejected": -35.47315979003906,
"loss": 0.4854,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.24411238729953766,
"rewards/margins": 0.7097706198692322,
"rewards/rejected": -0.46565818786621094,
"step": 64
},
{
"epoch": 0.3046153846153846,
"grad_norm": 34.6917991893696,
"learning_rate": 4.960441211072685e-07,
"logits/chosen": -1.240267038345337,
"logits/rejected": -1.2494441270828247,
"logps/chosen": -16.752328872680664,
"logps/rejected": -21.625200271606445,
"loss": 0.52,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.2749379575252533,
"rewards/margins": 0.5106962323188782,
"rewards/rejected": -0.23575833439826965,
"step": 66
},
{
"epoch": 0.31384615384615383,
"grad_norm": 32.938257449212315,
"learning_rate": 4.952945442245597e-07,
"logits/chosen": -1.282260775566101,
"logits/rejected": -1.2961454391479492,
"logps/chosen": -16.818540573120117,
"logps/rejected": -31.804317474365234,
"loss": 0.4986,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.20085500180721283,
"rewards/margins": 0.6287386417388916,
"rewards/rejected": -0.42788365483283997,
"step": 68
},
{
"epoch": 0.3230769230769231,
"grad_norm": 36.12880857430109,
"learning_rate": 4.944806430988927e-07,
"logits/chosen": -1.2567392587661743,
"logits/rejected": -1.263179063796997,
"logps/chosen": -23.333267211914062,
"logps/rejected": -24.862985610961914,
"loss": 0.5059,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.22914116084575653,
"rewards/margins": 0.6000176668167114,
"rewards/rejected": -0.3708764612674713,
"step": 70
},
{
"epoch": 0.3323076923076923,
"grad_norm": 31.746333807337315,
"learning_rate": 4.936026311617316e-07,
"logits/chosen": -1.2413491010665894,
"logits/rejected": -1.2490180730819702,
"logps/chosen": -27.870990753173828,
"logps/rejected": -28.86038589477539,
"loss": 0.4797,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.35419517755508423,
"rewards/margins": 0.7417442202568054,
"rewards/rejected": -0.3875490427017212,
"step": 72
},
{
"epoch": 0.3415384615384615,
"grad_norm": 31.965936446320438,
"learning_rate": 4.926607386564898e-07,
"logits/chosen": -1.3071357011795044,
"logits/rejected": -1.3031624555587769,
"logps/chosen": -24.66501808166504,
"logps/rejected": -19.646629333496094,
"loss": 0.4724,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.3141394257545471,
"rewards/margins": 0.6052231788635254,
"rewards/rejected": -0.29108375310897827,
"step": 74
},
{
"epoch": 0.3507692307692308,
"grad_norm": 30.420218056003396,
"learning_rate": 4.916552125781528e-07,
"logits/chosen": -1.2826448678970337,
"logits/rejected": -1.2921828031539917,
"logps/chosen": -21.71385955810547,
"logps/rejected": -26.265592575073242,
"loss": 0.443,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3491870164871216,
"rewards/margins": 0.7558759450912476,
"rewards/rejected": -0.4066888988018036,
"step": 76
},
{
"epoch": 0.36,
"grad_norm": 35.262762131347294,
"learning_rate": 4.905863166085075e-07,
"logits/chosen": -1.2882230281829834,
"logits/rejected": -1.3004416227340698,
"logps/chosen": -25.61620330810547,
"logps/rejected": -26.73788833618164,
"loss": 0.4682,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.29705706238746643,
"rewards/margins": 0.6734262108802795,
"rewards/rejected": -0.3763691484928131,
"step": 78
},
{
"epoch": 0.36923076923076925,
"grad_norm": 32.454214562336674,
"learning_rate": 4.894543310469967e-07,
"logits/chosen": -1.292490839958191,
"logits/rejected": -1.3075741529464722,
"logps/chosen": -24.23374366760254,
"logps/rejected": -27.662269592285156,
"loss": 0.4233,
"rewards/accuracies": 0.7222222089767456,
"rewards/chosen": 0.3347330093383789,
"rewards/margins": 0.7462683320045471,
"rewards/rejected": -0.4115353524684906,
"step": 80
},
{
"epoch": 0.36923076923076925,
"eval_logits/chosen": -1.2837809324264526,
"eval_logits/rejected": -1.293448567390442,
"eval_logps/chosen": -22.318069458007812,
"eval_logps/rejected": -27.420156478881836,
"eval_loss": 0.4364205598831177,
"eval_rewards/accuracies": 0.7937787771224976,
"eval_rewards/chosen": 0.3643138110637665,
"eval_rewards/margins": 0.893993616104126,
"eval_rewards/rejected": -0.5296797752380371,
"eval_runtime": 215.7088,
"eval_samples_per_second": 8.039,
"eval_steps_per_second": 2.012,
"step": 80
},
{
"epoch": 0.37846153846153846,
"grad_norm": 30.100728508551764,
"learning_rate": 4.882595527372152e-07,
"logits/chosen": -1.219198226928711,
"logits/rejected": -1.2316464185714722,
"logps/chosen": -21.758522033691406,
"logps/rejected": -32.21995544433594,
"loss": 0.4544,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.33725497126579285,
"rewards/margins": 0.9134353995323181,
"rewards/rejected": -0.5761803984642029,
"step": 82
},
{
"epoch": 0.38769230769230767,
"grad_norm": 27.99260854977849,
"learning_rate": 4.870022949890676e-07,
"logits/chosen": -1.25475013256073,
"logits/rejected": -1.258756160736084,
"logps/chosen": -29.569332122802734,
"logps/rejected": -32.13206481933594,
"loss": 0.4048,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3496508300304413,
"rewards/margins": 1.0080742835998535,
"rewards/rejected": -0.6584234237670898,
"step": 84
},
{
"epoch": 0.39692307692307693,
"grad_norm": 28.434505768144174,
"learning_rate": 4.856828874966086e-07,
"logits/chosen": -1.2163680791854858,
"logits/rejected": -1.2340407371520996,
"logps/chosen": -18.534114837646484,
"logps/rejected": -36.619850158691406,
"loss": 0.422,
"rewards/accuracies": 0.6666666865348816,
"rewards/chosen": 0.2995716333389282,
"rewards/margins": 1.0983738899230957,
"rewards/rejected": -0.7988021969795227,
"step": 86
},
{
"epoch": 0.40615384615384614,
"grad_norm": 28.794469436567187,
"learning_rate": 4.843016762515859e-07,
"logits/chosen": -1.2752939462661743,
"logits/rejected": -1.285552978515625,
"logps/chosen": -21.55384635925293,
"logps/rejected": -30.397226333618164,
"loss": 0.3905,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.37557560205459595,
"rewards/margins": 1.0376694202423096,
"rewards/rejected": -0.6620937585830688,
"step": 88
},
{
"epoch": 0.4153846153846154,
"grad_norm": 24.699190483704957,
"learning_rate": 4.828590234527106e-07,
"logits/chosen": -1.2076385021209717,
"logits/rejected": -1.2378058433532715,
"logps/chosen": -20.13502311706543,
"logps/rejected": -49.50822067260742,
"loss": 0.3616,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.29748064279556274,
"rewards/margins": 1.576164722442627,
"rewards/rejected": -1.2786839008331299,
"step": 90
},
{
"epoch": 0.4246153846153846,
"grad_norm": 24.998257178693006,
"learning_rate": 4.81355307410676e-07,
"logits/chosen": -1.268651008605957,
"logits/rejected": -1.2737505435943604,
"logps/chosen": -21.684688568115234,
"logps/rejected": -20.43457794189453,
"loss": 0.3963,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.701554536819458,
"rewards/margins": 1.2370011806488037,
"rewards/rejected": -0.5354464650154114,
"step": 92
},
{
"epoch": 0.4338461538461538,
"grad_norm": 30.39233888946852,
"learning_rate": 4.79790922448953e-07,
"logits/chosen": -1.2319780588150024,
"logits/rejected": -1.234665870666504,
"logps/chosen": -22.746065139770508,
"logps/rejected": -37.10270309448242,
"loss": 0.4055,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.33227479457855225,
"rewards/margins": 1.4662950038909912,
"rewards/rejected": -1.1340200901031494,
"step": 94
},
{
"epoch": 0.4430769230769231,
"grad_norm": 28.351607065877335,
"learning_rate": 4.78166278800385e-07,
"logits/chosen": -1.2103080749511719,
"logits/rejected": -1.2216867208480835,
"logps/chosen": -22.36292839050293,
"logps/rejected": -36.19468307495117,
"loss": 0.3633,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.46569257974624634,
"rewards/margins": 1.3663029670715332,
"rewards/rejected": -0.9006102681159973,
"step": 96
},
{
"epoch": 0.4523076923076923,
"grad_norm": 27.63597035013981,
"learning_rate": 4.7648180249961165e-07,
"logits/chosen": -1.2609645128250122,
"logits/rejected": -1.2675108909606934,
"logps/chosen": -19.6772403717041,
"logps/rejected": -22.703941345214844,
"loss": 0.3425,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.42960312962532043,
"rewards/margins": 1.3149679899215698,
"rewards/rejected": -0.8853649497032166,
"step": 98
},
{
"epoch": 0.46153846153846156,
"grad_norm": 27.095171417356656,
"learning_rate": 4.747379352713488e-07,
"logits/chosen": -1.2016191482543945,
"logits/rejected": -1.212724208831787,
"logps/chosen": -26.863676071166992,
"logps/rejected": -35.31084442138672,
"loss": 0.3626,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.43012529611587524,
"rewards/margins": 1.344970703125,
"rewards/rejected": -0.9148455858230591,
"step": 100
},
{
"epoch": 0.46153846153846156,
"eval_logits/chosen": -1.2631281614303589,
"eval_logits/rejected": -1.2726249694824219,
"eval_logps/chosen": -22.157392501831055,
"eval_logps/rejected": -28.169017791748047,
"eval_loss": 0.3646220564842224,
"eval_rewards/accuracies": 0.7972350120544434,
"eval_rewards/chosen": 0.4446515440940857,
"eval_rewards/margins": 1.348763346672058,
"eval_rewards/rejected": -0.904111921787262,
"eval_runtime": 215.7885,
"eval_samples_per_second": 8.036,
"eval_steps_per_second": 2.011,
"step": 100
},
{
"epoch": 0.4707692307692308,
"grad_norm": 32.35798457566701,
"learning_rate": 4.7293513441455357e-07,
"logits/chosen": -1.2197188138961792,
"logits/rejected": -1.2320291996002197,
"logps/chosen": -19.279041290283203,
"logps/rejected": -35.00586700439453,
"loss": 0.3714,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.3205001652240753,
"rewards/margins": 1.3763878345489502,
"rewards/rejected": -1.0558876991271973,
"step": 102
},
{
"epoch": 0.48,
"grad_norm": 21.70119714606352,
"learning_rate": 4.7107387268250586e-07,
"logits/chosen": -1.1967614889144897,
"logits/rejected": -1.220970630645752,
"logps/chosen": -10.033695220947266,
"logps/rejected": -38.51593017578125,
"loss": 0.3835,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.4108971059322357,
"rewards/margins": 1.6398005485534668,
"rewards/rejected": -1.2289036512374878,
"step": 104
},
{
"epoch": 0.48923076923076925,
"grad_norm": 22.839162689384967,
"learning_rate": 4.691546381588369e-07,
"logits/chosen": -1.2221455574035645,
"logits/rejected": -1.2347490787506104,
"logps/chosen": -20.123445510864258,
"logps/rejected": -34.73093032836914,
"loss": 0.3528,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.3299613296985626,
"rewards/margins": 1.6646933555603027,
"rewards/rejected": -1.3347320556640625,
"step": 106
},
{
"epoch": 0.49846153846153846,
"grad_norm": 30.91989303041632,
"learning_rate": 4.6717793412953776e-07,
"logits/chosen": -1.2001112699508667,
"logits/rejected": -1.2213759422302246,
"logps/chosen": -18.639766693115234,
"logps/rejected": -38.698211669921875,
"loss": 0.3751,
"rewards/accuracies": 0.7361111044883728,
"rewards/chosen": 0.3170078694820404,
"rewards/margins": 1.7733925580978394,
"rewards/rejected": -1.456384539604187,
"step": 108
},
{
"epoch": 0.5076923076923077,
"grad_norm": 22.21075058785491,
"learning_rate": 4.651442789509813e-07,
"logits/chosen": -1.172301173210144,
"logits/rejected": -1.1873422861099243,
"logps/chosen": -19.037778854370117,
"logps/rejected": -35.6918830871582,
"loss": 0.3632,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.44801807403564453,
"rewards/margins": 1.6537230014801025,
"rewards/rejected": -1.2057050466537476,
"step": 110
},
{
"epoch": 0.5169230769230769,
"grad_norm": 22.23191382020911,
"learning_rate": 4.630542059139923e-07,
"logits/chosen": -1.1621766090393066,
"logits/rejected": -1.1781913042068481,
"logps/chosen": -26.200401306152344,
"logps/rejected": -28.19536590576172,
"loss": 0.3117,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4852801561355591,
"rewards/margins": 1.5631003379821777,
"rewards/rejected": -1.0778203010559082,
"step": 112
},
{
"epoch": 0.5261538461538462,
"grad_norm": 26.06519967082825,
"learning_rate": 4.609082631040011e-07,
"logits/chosen": -1.1710741519927979,
"logits/rejected": -1.1770610809326172,
"logps/chosen": -26.139328002929688,
"logps/rejected": -38.44914627075195,
"loss": 0.3191,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.42665359377861023,
"rewards/margins": 1.9680951833724976,
"rewards/rejected": -1.5414414405822754,
"step": 114
},
{
"epoch": 0.5353846153846153,
"grad_norm": 23.76055177774163,
"learning_rate": 4.5870701325731773e-07,
"logits/chosen": -1.1841078996658325,
"logits/rejected": -1.2016386985778809,
"logps/chosen": -18.3129940032959,
"logps/rejected": -38.7909049987793,
"loss": 0.3422,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.31725624203681946,
"rewards/margins": 1.8888146877288818,
"rewards/rejected": -1.5715583562850952,
"step": 116
},
{
"epoch": 0.5446153846153846,
"grad_norm": 22.451458526325442,
"learning_rate": 4.5645103361356407e-07,
"logits/chosen": -1.203595519065857,
"logits/rejected": -1.1993364095687866,
"logps/chosen": -29.456233978271484,
"logps/rejected": -24.436891555786133,
"loss": 0.3111,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4006561040878296,
"rewards/margins": 1.460686206817627,
"rewards/rejected": -1.0600301027297974,
"step": 118
},
{
"epoch": 0.5538461538461539,
"grad_norm": 20.899441336146108,
"learning_rate": 4.541409157643027e-07,
"logits/chosen": -1.113027811050415,
"logits/rejected": -1.1339952945709229,
"logps/chosen": -22.780738830566406,
"logps/rejected": -37.4469108581543,
"loss": 0.263,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5090766549110413,
"rewards/margins": 2.038201332092285,
"rewards/rejected": -1.5291246175765991,
"step": 120
},
{
"epoch": 0.5538461538461539,
"eval_logits/chosen": -1.2401551008224487,
"eval_logits/rejected": -1.249323844909668,
"eval_logps/chosen": -22.120243072509766,
"eval_logps/rejected": -28.963603973388672,
"eval_loss": 0.32304224371910095,
"eval_rewards/accuracies": 0.8122119903564453,
"eval_rewards/chosen": 0.46322670578956604,
"eval_rewards/margins": 1.764631986618042,
"eval_rewards/rejected": -1.3014051914215088,
"eval_runtime": 215.8398,
"eval_samples_per_second": 8.034,
"eval_steps_per_second": 2.011,
"step": 120
},
{
"epoch": 0.563076923076923,
"grad_norm": 25.722122527925197,
"learning_rate": 4.517772654979023e-07,
"logits/chosen": -1.1628613471984863,
"logits/rejected": -1.1666890382766724,
"logps/chosen": -28.28006935119629,
"logps/rejected": -32.06778335571289,
"loss": 0.2967,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.42497023940086365,
"rewards/margins": 1.8420732021331787,
"rewards/rejected": -1.4171031713485718,
"step": 122
},
{
"epoch": 0.5723076923076923,
"grad_norm": 18.859437245079093,
"learning_rate": 4.4936070264068016e-07,
"logits/chosen": -1.097366452217102,
"logits/rejected": -1.1257672309875488,
"logps/chosen": -19.26881217956543,
"logps/rejected": -50.698387145996094,
"loss": 0.3122,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.3975294530391693,
"rewards/margins": 2.376965045928955,
"rewards/rejected": -1.9794355630874634,
"step": 124
},
{
"epoch": 0.5815384615384616,
"grad_norm": 24.12611784808478,
"learning_rate": 4.468918608943636e-07,
"logits/chosen": -1.188425064086914,
"logits/rejected": -1.2095468044281006,
"logps/chosen": -22.594573974609375,
"logps/rejected": -33.808677673339844,
"loss": 0.2989,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.4648338854312897,
"rewards/margins": 2.128401756286621,
"rewards/rejected": -1.6635680198669434,
"step": 126
},
{
"epoch": 0.5907692307692308,
"grad_norm": 21.121113872126465,
"learning_rate": 4.443713876699123e-07,
"logits/chosen": -1.176856279373169,
"logits/rejected": -1.175789713859558,
"logps/chosen": -31.682504653930664,
"logps/rejected": -26.862850189208984,
"loss": 0.2881,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.47753646969795227,
"rewards/margins": 1.661524772644043,
"rewards/rejected": -1.183988332748413,
"step": 128
},
{
"epoch": 0.6,
"grad_norm": 24.221092280098347,
"learning_rate": 4.417999439177465e-07,
"logits/chosen": -1.1786390542984009,
"logits/rejected": -1.1881896257400513,
"logps/chosen": -18.69803237915039,
"logps/rejected": -28.687692642211914,
"loss": 0.2737,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.5532296895980835,
"rewards/margins": 2.0457603931427,
"rewards/rejected": -1.4925308227539062,
"step": 130
},
{
"epoch": 0.6092307692307692,
"grad_norm": 19.171893778962126,
"learning_rate": 4.391782039544238e-07,
"logits/chosen": -1.2097636461257935,
"logits/rejected": -1.2146636247634888,
"logps/chosen": -19.53115463256836,
"logps/rejected": -19.350337982177734,
"loss": 0.3284,
"rewards/accuracies": 0.7083333134651184,
"rewards/chosen": 0.28336918354034424,
"rewards/margins": 1.5194146633148193,
"rewards/rejected": -1.236045479774475,
"step": 132
},
{
"epoch": 0.6184615384615385,
"grad_norm": 22.368959777821875,
"learning_rate": 4.365068552858115e-07,
"logits/chosen": -1.2042018175125122,
"logits/rejected": -1.2163949012756348,
"logps/chosen": -24.11139488220215,
"logps/rejected": -33.35640335083008,
"loss": 0.3137,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.1719236522912979,
"rewards/margins": 1.7209672927856445,
"rewards/rejected": -1.5490436553955078,
"step": 134
},
{
"epoch": 0.6276923076923077,
"grad_norm": 17.354174303387865,
"learning_rate": 4.337865984268001e-07,
"logits/chosen": -1.1561534404754639,
"logits/rejected": -1.1622954607009888,
"logps/chosen": -15.14254093170166,
"logps/rejected": -27.18238067626953,
"loss": 0.2954,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.34695935249328613,
"rewards/margins": 1.897645115852356,
"rewards/rejected": -1.5506855249404907,
"step": 136
},
{
"epoch": 0.6369230769230769,
"grad_norm": 14.475969356318869,
"learning_rate": 4.310181467176054e-07,
"logits/chosen": -1.1768825054168701,
"logits/rejected": -1.1757102012634277,
"logps/chosen": -25.93258285522461,
"logps/rejected": -32.286590576171875,
"loss": 0.2914,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.42600950598716736,
"rewards/margins": 2.0175862312316895,
"rewards/rejected": -1.5915768146514893,
"step": 138
},
{
"epoch": 0.6461538461538462,
"grad_norm": 18.34569474287581,
"learning_rate": 4.282022261367073e-07,
"logits/chosen": -1.2166173458099365,
"logits/rejected": -1.2223114967346191,
"logps/chosen": -20.700721740722656,
"logps/rejected": -25.006229400634766,
"loss": 0.2717,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5470355749130249,
"rewards/margins": 1.990134358406067,
"rewards/rejected": -1.4430986642837524,
"step": 140
},
{
"epoch": 0.6461538461538462,
"eval_logits/chosen": -1.221505880355835,
"eval_logits/rejected": -1.2305463552474976,
"eval_logps/chosen": -22.114253997802734,
"eval_logps/rejected": -29.54737663269043,
"eval_loss": 0.29700523614883423,
"eval_rewards/accuracies": 0.8179723620414734,
"eval_rewards/chosen": 0.46622127294540405,
"eval_rewards/margins": 2.0595133304595947,
"eval_rewards/rejected": -1.5932921171188354,
"eval_runtime": 215.9245,
"eval_samples_per_second": 8.031,
"eval_steps_per_second": 2.01,
"step": 140
},
{
"epoch": 0.6553846153846153,
"grad_norm": 24.003361700026115,
"learning_rate": 4.253395751104748e-07,
"logits/chosen": -1.2128342390060425,
"logits/rejected": -1.2202144861221313,
"logps/chosen": -20.926525115966797,
"logps/rejected": -33.759159088134766,
"loss": 0.2796,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.4563888907432556,
"rewards/margins": 2.332362413406372,
"rewards/rejected": -1.8759733438491821,
"step": 142
},
{
"epoch": 0.6646153846153846,
"grad_norm": 22.96956018291041,
"learning_rate": 4.2243094431952607e-07,
"logits/chosen": -1.1733120679855347,
"logits/rejected": -1.1876205205917358,
"logps/chosen": -20.787324905395508,
"logps/rejected": -44.41487503051758,
"loss": 0.2904,
"rewards/accuracies": 0.7638888955116272,
"rewards/chosen": 0.4227790832519531,
"rewards/margins": 2.513406753540039,
"rewards/rejected": -2.090627431869507,
"step": 144
},
{
"epoch": 0.6738461538461539,
"grad_norm": 20.337910027315395,
"learning_rate": 4.194770965018758e-07,
"logits/chosen": -1.1829084157943726,
"logits/rejected": -1.1901525259017944,
"logps/chosen": -22.88217544555664,
"logps/rejected": -40.51693344116211,
"loss": 0.2982,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.32644984126091003,
"rewards/margins": 2.2273294925689697,
"rewards/rejected": -1.9008797407150269,
"step": 146
},
{
"epoch": 0.683076923076923,
"grad_norm": 16.955507402789948,
"learning_rate": 4.1647880625292027e-07,
"logits/chosen": -1.1585676670074463,
"logits/rejected": -1.1673483848571777,
"logps/chosen": -17.565954208374023,
"logps/rejected": -30.01752471923828,
"loss": 0.2381,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.6770419478416443,
"rewards/margins": 2.5649421215057373,
"rewards/rejected": -1.8879002332687378,
"step": 148
},
{
"epoch": 0.6923076923076923,
"grad_norm": 16.268353553690783,
"learning_rate": 4.1343685982231315e-07,
"logits/chosen": -1.2300368547439575,
"logits/rejected": -1.2412070035934448,
"logps/chosen": -19.158246994018555,
"logps/rejected": -30.00787353515625,
"loss": 0.2576,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.28651073575019836,
"rewards/margins": 2.1342878341674805,
"rewards/rejected": -1.8477774858474731,
"step": 150
},
{
"epoch": 0.7015384615384616,
"grad_norm": 22.707867679754226,
"learning_rate": 4.1035205490778496e-07,
"logits/chosen": -1.1675605773925781,
"logits/rejected": -1.1745511293411255,
"logps/chosen": -24.983802795410156,
"logps/rejected": -32.00082015991211,
"loss": 0.3007,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.4517359137535095,
"rewards/margins": 2.2256662845611572,
"rewards/rejected": -1.7739304304122925,
"step": 152
},
{
"epoch": 0.7107692307692308,
"grad_norm": 17.503865371681442,
"learning_rate": 4.072252004459611e-07,
"logits/chosen": -1.1371846199035645,
"logits/rejected": -1.1358321905136108,
"logps/chosen": -26.079011917114258,
"logps/rejected": -27.951416015625,
"loss": 0.2471,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.44966569542884827,
"rewards/margins": 2.104396104812622,
"rewards/rejected": -1.6547303199768066,
"step": 154
},
{
"epoch": 0.72,
"grad_norm": 15.32657259953523,
"learning_rate": 4.040571164002318e-07,
"logits/chosen": -1.189456820487976,
"logits/rejected": -1.1948577165603638,
"logps/chosen": -20.083751678466797,
"logps/rejected": -30.10634994506836,
"loss": 0.2351,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.42890670895576477,
"rewards/margins": 2.341860771179199,
"rewards/rejected": -1.9129540920257568,
"step": 156
},
{
"epoch": 0.7292307692307692,
"grad_norm": 17.946669808646828,
"learning_rate": 4.0084863354573116e-07,
"logits/chosen": -1.1215004920959473,
"logits/rejected": -1.1300181150436401,
"logps/chosen": -23.436655044555664,
"logps/rejected": -34.97710418701172,
"loss": 0.2706,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.22723568975925446,
"rewards/margins": 2.1446826457977295,
"rewards/rejected": -1.9174467325210571,
"step": 158
},
{
"epoch": 0.7384615384615385,
"grad_norm": 16.72039592892195,
"learning_rate": 3.9760059325148063e-07,
"logits/chosen": -1.2237818241119385,
"logits/rejected": -1.2211045026779175,
"logps/chosen": -24.31806755065918,
"logps/rejected": -25.250701904296875,
"loss": 0.2351,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.4868224859237671,
"rewards/margins": 2.124577522277832,
"rewards/rejected": -1.637755274772644,
"step": 160
},
{
"epoch": 0.7384615384615385,
"eval_logits/chosen": -1.2072025537490845,
"eval_logits/rejected": -1.216115951538086,
"eval_logps/chosen": -22.174776077270508,
"eval_logps/rejected": -30.134973526000977,
"eval_loss": 0.27949145436286926,
"eval_rewards/accuracies": 0.8248847723007202,
"eval_rewards/chosen": 0.4359608590602875,
"eval_rewards/margins": 2.3230507373809814,
"eval_rewards/rejected": -1.8870899677276611,
"eval_runtime": 216.1181,
"eval_samples_per_second": 8.023,
"eval_steps_per_second": 2.008,
"step": 160
},
{
"epoch": 0.7476923076923077,
"grad_norm": 16.877732796497064,
"learning_rate": 3.9431384725975485e-07,
"logits/chosen": -1.1728930473327637,
"logits/rejected": -1.1828408241271973,
"logps/chosen": -20.051979064941406,
"logps/rejected": -30.078739166259766,
"loss": 0.2806,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4627165198326111,
"rewards/margins": 2.1041107177734375,
"rewards/rejected": -1.641394019126892,
"step": 162
},
{
"epoch": 0.7569230769230769,
"grad_norm": 17.236677422360824,
"learning_rate": 3.909892574627266e-07,
"logits/chosen": -1.1840589046478271,
"logits/rejected": -1.205323338508606,
"logps/chosen": -20.25952911376953,
"logps/rejected": -43.16006851196289,
"loss": 0.267,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.34341666102409363,
"rewards/margins": 2.8926875591278076,
"rewards/rejected": -2.5492708683013916,
"step": 164
},
{
"epoch": 0.7661538461538462,
"grad_norm": 15.084626056041332,
"learning_rate": 3.876276956764509e-07,
"logits/chosen": -1.172157883644104,
"logits/rejected": -1.1869869232177734,
"logps/chosen": -20.39401626586914,
"logps/rejected": -35.54499816894531,
"loss": 0.2191,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.533491313457489,
"rewards/margins": 3.2933194637298584,
"rewards/rejected": -2.7598280906677246,
"step": 166
},
{
"epoch": 0.7753846153846153,
"grad_norm": 16.522846792297653,
"learning_rate": 3.8423004341224595e-07,
"logits/chosen": -1.1675995588302612,
"logits/rejected": -1.1726378202438354,
"logps/chosen": -22.266756057739258,
"logps/rejected": -27.90992546081543,
"loss": 0.2137,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3478531837463379,
"rewards/margins": 2.3764336109161377,
"rewards/rejected": -2.028580665588379,
"step": 168
},
{
"epoch": 0.7846153846153846,
"grad_norm": 18.709310219062342,
"learning_rate": 3.807971916455325e-07,
"logits/chosen": -1.1257578134536743,
"logits/rejected": -1.1353437900543213,
"logps/chosen": -25.48769187927246,
"logps/rejected": -37.34423065185547,
"loss": 0.2439,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.30796098709106445,
"rewards/margins": 2.5804708003997803,
"rewards/rejected": -2.2725095748901367,
"step": 170
},
{
"epoch": 0.7938461538461539,
"grad_norm": 18.811516964897933,
"learning_rate": 3.773300405821908e-07,
"logits/chosen": -1.2032923698425293,
"logits/rejected": -1.1944453716278076,
"logps/chosen": -22.42747688293457,
"logps/rejected": -24.809179306030273,
"loss": 0.2706,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.4598681628704071,
"rewards/margins": 2.331010103225708,
"rewards/rejected": -1.871142029762268,
"step": 172
},
{
"epoch": 0.803076923076923,
"grad_norm": 27.213611533570646,
"learning_rate": 3.738294994224969e-07,
"logits/chosen": -1.1406216621398926,
"logits/rejected": -1.1456246376037598,
"logps/chosen": -22.41916847229004,
"logps/rejected": -25.79179573059082,
"loss": 0.2525,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5410938858985901,
"rewards/margins": 2.5380003452301025,
"rewards/rejected": -1.9969062805175781,
"step": 174
},
{
"epoch": 0.8123076923076923,
"grad_norm": 22.120419375719585,
"learning_rate": 3.7029648612270123e-07,
"logits/chosen": -1.1604636907577515,
"logits/rejected": -1.166500210762024,
"logps/chosen": -23.140409469604492,
"logps/rejected": -32.539859771728516,
"loss": 0.2445,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.5552553534507751,
"rewards/margins": 2.451958656311035,
"rewards/rejected": -1.8967031240463257,
"step": 176
},
{
"epoch": 0.8215384615384616,
"grad_norm": 23.529456123726142,
"learning_rate": 3.6673192715431014e-07,
"logits/chosen": -1.172749638557434,
"logits/rejected": -1.1873490810394287,
"logps/chosen": -19.344928741455078,
"logps/rejected": -46.30924987792969,
"loss": 0.2576,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.3556906580924988,
"rewards/margins": 3.198575973510742,
"rewards/rejected": -2.8428850173950195,
"step": 178
},
{
"epoch": 0.8307692307692308,
"grad_norm": 16.07954647927614,
"learning_rate": 3.6313675726113475e-07,
"logits/chosen": -1.1696263551712036,
"logits/rejected": -1.1719523668289185,
"logps/chosen": -24.40313148498535,
"logps/rejected": -30.179893493652344,
"loss": 0.2373,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.5325056314468384,
"rewards/margins": 2.6024298667907715,
"rewards/rejected": -2.0699243545532227,
"step": 180
},
{
"epoch": 0.8307692307692308,
"eval_logits/chosen": -1.1957546472549438,
"eval_logits/rejected": -1.2044621706008911,
"eval_logps/chosen": -22.226091384887695,
"eval_logps/rejected": -30.679323196411133,
"eval_loss": 0.2662460505962372,
"eval_rewards/accuracies": 0.8271889686584473,
"eval_rewards/chosen": 0.4103015661239624,
"eval_rewards/margins": 2.569566011428833,
"eval_rewards/rejected": -2.15926456451416,
"eval_runtime": 216.1605,
"eval_samples_per_second": 8.022,
"eval_steps_per_second": 2.008,
"step": 180
},
{
"epoch": 0.84,
"grad_norm": 12.027824441881227,
"learning_rate": 3.595119192141706e-07,
"logits/chosen": -1.1798688173294067,
"logits/rejected": -1.190478801727295,
"logps/chosen": -23.84467315673828,
"logps/rejected": -27.77214241027832,
"loss": 0.1945,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5185620784759521,
"rewards/margins": 2.7370386123657227,
"rewards/rejected": -2.2184765338897705,
"step": 182
},
{
"epoch": 0.8492307692307692,
"grad_norm": 21.657852790803656,
"learning_rate": 3.558583635643726e-07,
"logits/chosen": -1.1619257926940918,
"logits/rejected": -1.1783702373504639,
"logps/chosen": -20.357545852661133,
"logps/rejected": -36.6799430847168,
"loss": 0.2859,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.39101898670196533,
"rewards/margins": 2.5226354598999023,
"rewards/rejected": -2.1316165924072266,
"step": 184
},
{
"epoch": 0.8584615384615385,
"grad_norm": 15.850729398525738,
"learning_rate": 3.5217704839338905e-07,
"logits/chosen": -1.2039780616760254,
"logits/rejected": -1.2015321254730225,
"logps/chosen": -25.71788787841797,
"logps/rejected": -29.20301628112793,
"loss": 0.2245,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.43592390418052673,
"rewards/margins": 2.691300392150879,
"rewards/rejected": -2.2553763389587402,
"step": 186
},
{
"epoch": 0.8676923076923077,
"grad_norm": 20.33987602806827,
"learning_rate": 3.484689390623218e-07,
"logits/chosen": -1.173121452331543,
"logits/rejected": -1.1853346824645996,
"logps/chosen": -21.594472885131836,
"logps/rejected": -36.92512130737305,
"loss": 0.2243,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.22467082738876343,
"rewards/margins": 2.8943564891815186,
"rewards/rejected": -2.6696856021881104,
"step": 188
},
{
"epoch": 0.8769230769230769,
"grad_norm": 15.456781978721555,
"learning_rate": 3.447350079585767e-07,
"logits/chosen": -1.20560884475708,
"logits/rejected": -1.2095773220062256,
"logps/chosen": -18.067840576171875,
"logps/rejected": -24.3345890045166,
"loss": 0.2124,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.2674013674259186,
"rewards/margins": 2.3308472633361816,
"rewards/rejected": -2.063445568084717,
"step": 190
},
{
"epoch": 0.8861538461538462,
"grad_norm": 24.575966523755373,
"learning_rate": 3.409762342408719e-07,
"logits/chosen": -1.1767027378082275,
"logits/rejected": -1.1829452514648438,
"logps/chosen": -23.147159576416016,
"logps/rejected": -38.63761901855469,
"loss": 0.3063,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.2949807345867157,
"rewards/margins": 2.8994204998016357,
"rewards/rejected": -2.6044397354125977,
"step": 192
},
{
"epoch": 0.8953846153846153,
"grad_norm": 13.903082439233941,
"learning_rate": 3.3719360358247053e-07,
"logits/chosen": -1.1678471565246582,
"logits/rejected": -1.1855759620666504,
"logps/chosen": -19.064098358154297,
"logps/rejected": -36.09113693237305,
"loss": 0.288,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.4278064966201782,
"rewards/margins": 2.7983500957489014,
"rewards/rejected": -2.3705434799194336,
"step": 194
},
{
"epoch": 0.9046153846153846,
"grad_norm": 16.97717210575951,
"learning_rate": 3.3338810791270517e-07,
"logits/chosen": -1.1488627195358276,
"logits/rejected": -1.161072015762329,
"logps/chosen": -16.16121482849121,
"logps/rejected": -35.24711608886719,
"loss": 0.2587,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.2668210566043854,
"rewards/margins": 2.758829116821289,
"rewards/rejected": -2.4920082092285156,
"step": 196
},
{
"epoch": 0.9138461538461539,
"grad_norm": 21.684346277519417,
"learning_rate": 3.29560745156861e-07,
"logits/chosen": -1.1681840419769287,
"logits/rejected": -1.1707243919372559,
"logps/chosen": -27.238510131835938,
"logps/rejected": -29.843427658081055,
"loss": 0.2945,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.487039715051651,
"rewards/margins": 2.7937545776367188,
"rewards/rejected": -2.3067147731781006,
"step": 198
},
{
"epoch": 0.9230769230769231,
"grad_norm": 15.010044100424757,
"learning_rate": 3.2571251897448763e-07,
"logits/chosen": -1.1483420133590698,
"logits/rejected": -1.172219157218933,
"logps/chosen": -20.701204299926758,
"logps/rejected": -47.092777252197266,
"loss": 0.2393,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.5242102742195129,
"rewards/margins": 3.446150302886963,
"rewards/rejected": -2.9219398498535156,
"step": 200
},
{
"epoch": 0.9230769230769231,
"eval_logits/chosen": -1.188868761062622,
"eval_logits/rejected": -1.1974678039550781,
"eval_logps/chosen": -22.205198287963867,
"eval_logps/rejected": -30.90268325805664,
"eval_loss": 0.25766730308532715,
"eval_rewards/accuracies": 0.8306451439857483,
"eval_rewards/chosen": 0.42075031995773315,
"eval_rewards/margins": 2.6916959285736084,
"eval_rewards/rejected": -2.2709455490112305,
"eval_runtime": 216.204,
"eval_samples_per_second": 8.02,
"eval_steps_per_second": 2.007,
"step": 200
},
{
"epoch": 0.9323076923076923,
"grad_norm": 24.918463307740545,
"learning_rate": 3.218444384962071e-07,
"logits/chosen": -1.1572585105895996,
"logits/rejected": -1.1649041175842285,
"logps/chosen": -20.337928771972656,
"logps/rejected": -25.251022338867188,
"loss": 0.2872,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.051526255905628204,
"rewards/margins": 2.169602155685425,
"rewards/rejected": -2.1180758476257324,
"step": 202
},
{
"epoch": 0.9415384615384615,
"grad_norm": 17.132653548760572,
"learning_rate": 3.179575180590857e-07,
"logits/chosen": -1.1708558797836304,
"logits/rejected": -1.1774191856384277,
"logps/chosen": -16.72760772705078,
"logps/rejected": -29.532522201538086,
"loss": 0.2703,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.3555985391139984,
"rewards/margins": 2.5367255210876465,
"rewards/rejected": -2.1811270713806152,
"step": 204
},
{
"epoch": 0.9507692307692308,
"grad_norm": 18.808695685272248,
"learning_rate": 3.1405277694064305e-07,
"logits/chosen": -1.13996422290802,
"logits/rejected": -1.1603398323059082,
"logps/chosen": -20.1070613861084,
"logps/rejected": -43.8044319152832,
"loss": 0.2133,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3765062689781189,
"rewards/margins": 3.3217618465423584,
"rewards/rejected": -2.9452552795410156,
"step": 206
},
{
"epoch": 0.96,
"grad_norm": 29.593271367025817,
"learning_rate": 3.101312390915634e-07,
"logits/chosen": -1.1117515563964844,
"logits/rejected": -1.1254826784133911,
"logps/chosen": -18.95772933959961,
"logps/rejected": -38.70570373535156,
"loss": 0.2626,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.19062408804893494,
"rewards/margins": 2.819202423095703,
"rewards/rejected": -2.6285784244537354,
"step": 208
},
{
"epoch": 0.9692307692307692,
"grad_norm": 19.2158248846026,
"learning_rate": 3.0619393286718237e-07,
"logits/chosen": -1.1758193969726562,
"logits/rejected": -1.18528413772583,
"logps/chosen": -25.30388069152832,
"logps/rejected": -24.64061737060547,
"loss": 0.2715,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.22175876796245575,
"rewards/margins": 2.10679292678833,
"rewards/rejected": -1.8850340843200684,
"step": 210
},
{
"epoch": 0.9784615384615385,
"grad_norm": 23.720067200725047,
"learning_rate": 3.022418907578188e-07,
"logits/chosen": -1.1191242933273315,
"logits/rejected": -1.1329889297485352,
"logps/chosen": -25.677099227905273,
"logps/rejected": -39.06088638305664,
"loss": 0.2898,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.20263215899467468,
"rewards/margins": 3.0255513191223145,
"rewards/rejected": -2.8229193687438965,
"step": 212
},
{
"epoch": 0.9876923076923076,
"grad_norm": 15.354779350521344,
"learning_rate": 2.98276149118022e-07,
"logits/chosen": -1.1088786125183105,
"logits/rejected": -1.1292033195495605,
"logps/chosen": -24.54433250427246,
"logps/rejected": -38.054649353027344,
"loss": 0.2164,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.5917240381240845,
"rewards/margins": 3.370425224304199,
"rewards/rejected": -2.7787015438079834,
"step": 214
},
{
"epoch": 0.9969230769230769,
"grad_norm": 15.922459499539187,
"learning_rate": 2.942977478948057e-07,
"logits/chosen": -1.134361743927002,
"logits/rejected": -1.1381641626358032,
"logps/chosen": -29.736419677734375,
"logps/rejected": -34.28538513183594,
"loss": 0.209,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.47491705417633057,
"rewards/margins": 3.0054473876953125,
"rewards/rejected": -2.5305304527282715,
"step": 216
},
{
"epoch": 1.0061538461538462,
"grad_norm": 14.602088714669993,
"learning_rate": 2.903077303549399e-07,
"logits/chosen": -1.1926045417785645,
"logits/rejected": -1.2005811929702759,
"logps/chosen": -21.338937759399414,
"logps/rejected": -31.98470115661621,
"loss": 0.2114,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.49925586581230164,
"rewards/margins": 3.034120559692383,
"rewards/rejected": -2.534864664077759,
"step": 218
},
{
"epoch": 1.0153846153846153,
"grad_norm": 12.776565445469831,
"learning_rate": 2.863071428113726e-07,
"logits/chosen": -1.180498719215393,
"logits/rejected": -1.1876842975616455,
"logps/chosen": -21.977970123291016,
"logps/rejected": -26.06908416748047,
"loss": 0.2223,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.23817205429077148,
"rewards/margins": 2.4826109409332275,
"rewards/rejected": -2.244438409805298,
"step": 220
},
{
"epoch": 1.0153846153846153,
"eval_logits/chosen": -1.1809991598129272,
"eval_logits/rejected": -1.189637303352356,
"eval_logps/chosen": -22.231857299804688,
"eval_logps/rejected": -31.20700454711914,
"eval_loss": 0.25129908323287964,
"eval_rewards/accuracies": 0.8329492807388306,
"eval_rewards/chosen": 0.4074196219444275,
"eval_rewards/margins": 2.8305253982543945,
"eval_rewards/rejected": -2.4231057167053223,
"eval_runtime": 216.0555,
"eval_samples_per_second": 8.026,
"eval_steps_per_second": 2.009,
"step": 220
},
{
"epoch": 1.0246153846153847,
"grad_norm": 14.54877776678067,
"learning_rate": 2.822970343488516e-07,
"logits/chosen": -1.1495935916900635,
"logits/rejected": -1.1574082374572754,
"logps/chosen": -25.172189712524414,
"logps/rejected": -33.7739372253418,
"loss": 0.224,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.4095478355884552,
"rewards/margins": 2.9969334602355957,
"rewards/rejected": -2.587385416030884,
"step": 222
},
{
"epoch": 1.0338461538461539,
"grad_norm": 12.987637533805088,
"learning_rate": 2.782784565488211e-07,
"logits/chosen": -1.09419846534729,
"logits/rejected": -1.1150177717208862,
"logps/chosen": -21.80037498474121,
"logps/rejected": -47.742916107177734,
"loss": 0.2056,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.49535179138183594,
"rewards/margins": 4.081587314605713,
"rewards/rejected": -3.5862362384796143,
"step": 224
},
{
"epoch": 1.043076923076923,
"grad_norm": 12.537917774467841,
"learning_rate": 2.7425246321366205e-07,
"logits/chosen": -1.1532597541809082,
"logits/rejected": -1.1558729410171509,
"logps/chosen": -23.903770446777344,
"logps/rejected": -22.89252471923828,
"loss": 0.2188,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.5978649258613586,
"rewards/margins": 2.4770026206970215,
"rewards/rejected": -1.8791378736495972,
"step": 226
},
{
"epoch": 1.0523076923076924,
"grad_norm": 11.390266637295149,
"learning_rate": 2.7022011009035107e-07,
"logits/chosen": -1.1780048608779907,
"logits/rejected": -1.1780657768249512,
"logps/chosen": -20.99365997314453,
"logps/rejected": -35.256507873535156,
"loss": 0.1785,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.25620290637016296,
"rewards/margins": 3.1927871704101562,
"rewards/rejected": -2.936584234237671,
"step": 228
},
{
"epoch": 1.0615384615384615,
"grad_norm": 13.274197122497501,
"learning_rate": 2.661824545936089e-07,
"logits/chosen": -1.1301528215408325,
"logits/rejected": -1.141854166984558,
"logps/chosen": -22.90785789489746,
"logps/rejected": -39.776309967041016,
"loss": 0.1848,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.23726129531860352,
"rewards/margins": 3.6220147609710693,
"rewards/rejected": -3.3847532272338867,
"step": 230
},
{
"epoch": 1.0707692307692307,
"grad_norm": 11.899842789993972,
"learning_rate": 2.621405555286121e-07,
"logits/chosen": -1.1494054794311523,
"logits/rejected": -1.158327579498291,
"logps/chosen": -27.49151611328125,
"logps/rejected": -33.164703369140625,
"loss": 0.1801,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.473955363035202,
"rewards/margins": 3.1881282329559326,
"rewards/rejected": -2.7141730785369873,
"step": 232
},
{
"epoch": 1.08,
"grad_norm": 12.024964222481547,
"learning_rate": 2.58095472813339e-07,
"logits/chosen": -1.1302716732025146,
"logits/rejected": -1.1499823331832886,
"logps/chosen": -25.619178771972656,
"logps/rejected": -35.781768798828125,
"loss": 0.1808,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.593082070350647,
"rewards/margins": 3.549994468688965,
"rewards/rejected": -2.9569127559661865,
"step": 234
},
{
"epoch": 1.0892307692307692,
"grad_norm": 16.982420323384893,
"learning_rate": 2.540482672006254e-07,
"logits/chosen": -1.1983014345169067,
"logits/rejected": -1.2088627815246582,
"logps/chosen": -20.2447566986084,
"logps/rejected": -33.8237419128418,
"loss": 0.2502,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.2588607966899872,
"rewards/margins": 2.6979219913482666,
"rewards/rejected": -2.439061164855957,
"step": 236
},
{
"epoch": 1.0984615384615384,
"grad_norm": 14.78335151339772,
"learning_rate": 2.5e-07,
"logits/chosen": -1.1217488050460815,
"logits/rejected": -1.126597285270691,
"logps/chosen": -24.313417434692383,
"logps/rejected": -32.5634880065918,
"loss": 0.1857,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.4129423499107361,
"rewards/margins": 3.0672991275787354,
"rewards/rejected": -2.6543567180633545,
"step": 238
},
{
"epoch": 1.1076923076923078,
"grad_norm": 9.560418611995035,
"learning_rate": 2.459517327993746e-07,
"logits/chosen": -1.1439785957336426,
"logits/rejected": -1.1501950025558472,
"logps/chosen": -21.520601272583008,
"logps/rejected": -36.128475189208984,
"loss": 0.1631,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.16983138024806976,
"rewards/margins": 3.335303544998169,
"rewards/rejected": -3.1654722690582275,
"step": 240
},
{
"epoch": 1.1076923076923078,
"eval_logits/chosen": -1.1771941184997559,
"eval_logits/rejected": -1.1856648921966553,
"eval_logps/chosen": -22.31366539001465,
"eval_logps/rejected": -31.599573135375977,
"eval_loss": 0.24783480167388916,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": 0.3665139377117157,
"eval_rewards/margins": 2.98590350151062,
"eval_rewards/rejected": -2.619389295578003,
"eval_runtime": 216.1562,
"eval_samples_per_second": 8.022,
"eval_steps_per_second": 2.008,
"step": 240
},
{
"epoch": 1.116923076923077,
"grad_norm": 13.013402968505392,
"learning_rate": 2.4190452718666105e-07,
"logits/chosen": -1.0899126529693604,
"logits/rejected": -1.1027652025222778,
"logps/chosen": -15.734682083129883,
"logps/rejected": -27.53190803527832,
"loss": 0.2287,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.5433827638626099,
"rewards/margins": 3.0215795040130615,
"rewards/rejected": -2.478196859359741,
"step": 242
},
{
"epoch": 1.126153846153846,
"grad_norm": 12.301318346382136,
"learning_rate": 2.37859444471388e-07,
"logits/chosen": -1.1361184120178223,
"logits/rejected": -1.151028037071228,
"logps/chosen": -24.852954864501953,
"logps/rejected": -40.693912506103516,
"loss": 0.1914,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.489397794008255,
"rewards/margins": 3.448162794113159,
"rewards/rejected": -2.9587647914886475,
"step": 244
},
{
"epoch": 1.1353846153846154,
"grad_norm": 13.708460236846275,
"learning_rate": 2.3381754540639106e-07,
"logits/chosen": -1.1237130165100098,
"logits/rejected": -1.1399991512298584,
"logps/chosen": -21.652952194213867,
"logps/rejected": -30.665048599243164,
"loss": 0.2272,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.5611749291419983,
"rewards/margins": 3.155482292175293,
"rewards/rejected": -2.5943074226379395,
"step": 246
},
{
"epoch": 1.1446153846153846,
"grad_norm": 11.563478452101487,
"learning_rate": 2.2977988990964896e-07,
"logits/chosen": -1.0979208946228027,
"logits/rejected": -1.111803650856018,
"logps/chosen": -21.861614227294922,
"logps/rejected": -38.676361083984375,
"loss": 0.2243,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.13799840211868286,
"rewards/margins": 3.1060800552368164,
"rewards/rejected": -2.968081474304199,
"step": 248
},
{
"epoch": 1.1538461538461537,
"grad_norm": 12.63303273344697,
"learning_rate": 2.2574753678633798e-07,
"logits/chosen": -1.2150633335113525,
"logits/rejected": -1.2195019721984863,
"logps/chosen": -19.639219284057617,
"logps/rejected": -22.85377311706543,
"loss": 0.2111,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.478664755821228,
"rewards/margins": 2.8225910663604736,
"rewards/rejected": -2.343926429748535,
"step": 250
},
{
"epoch": 1.1630769230769231,
"grad_norm": 15.55104305702512,
"learning_rate": 2.2172154345117894e-07,
"logits/chosen": -1.1489689350128174,
"logits/rejected": -1.1607710123062134,
"logps/chosen": -22.335952758789062,
"logps/rejected": -43.476783752441406,
"loss": 0.1866,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.6636537909507751,
"rewards/margins": 4.2835187911987305,
"rewards/rejected": -3.6198649406433105,
"step": 252
},
{
"epoch": 1.1723076923076923,
"grad_norm": 19.58611284576425,
"learning_rate": 2.1770296565114846e-07,
"logits/chosen": -1.174638271331787,
"logits/rejected": -1.1910815238952637,
"logps/chosen": -19.441059112548828,
"logps/rejected": -23.29158592224121,
"loss": 0.2382,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.15968316793441772,
"rewards/margins": 2.543644428253174,
"rewards/rejected": -2.3839612007141113,
"step": 254
},
{
"epoch": 1.1815384615384614,
"grad_norm": 14.83480005382789,
"learning_rate": 2.1369285718862748e-07,
"logits/chosen": -1.0653572082519531,
"logits/rejected": -1.0726639032363892,
"logps/chosen": -24.378429412841797,
"logps/rejected": -48.50611877441406,
"loss": 0.1932,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.2468690127134323,
"rewards/margins": 4.218470096588135,
"rewards/rejected": -3.9716007709503174,
"step": 256
},
{
"epoch": 1.1907692307692308,
"grad_norm": 14.627626741140055,
"learning_rate": 2.0969226964506005e-07,
"logits/chosen": -1.1564842462539673,
"logits/rejected": -1.1586439609527588,
"logps/chosen": -25.08201789855957,
"logps/rejected": -26.51468849182129,
"loss": 0.2157,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.42589980363845825,
"rewards/margins": 3.205916166305542,
"rewards/rejected": -2.7800166606903076,
"step": 258
},
{
"epoch": 1.2,
"grad_norm": 13.600232617567109,
"learning_rate": 2.0570225210519433e-07,
"logits/chosen": -1.1147321462631226,
"logits/rejected": -1.1307651996612549,
"logps/chosen": -22.724639892578125,
"logps/rejected": -38.13914489746094,
"loss": 0.1956,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5592103004455566,
"rewards/margins": 3.5806994438171387,
"rewards/rejected": -3.021489143371582,
"step": 260
},
{
"epoch": 1.2,
"eval_logits/chosen": -1.1710869073867798,
"eval_logits/rejected": -1.179579496383667,
"eval_logps/chosen": -22.368024826049805,
"eval_logps/rejected": -31.889461517333984,
"eval_loss": 0.24438533186912537,
"eval_rewards/accuracies": 0.8317972421646118,
"eval_rewards/chosen": 0.33933624625205994,
"eval_rewards/margins": 3.1036696434020996,
"eval_rewards/rejected": -2.764333963394165,
"eval_runtime": 216.3298,
"eval_samples_per_second": 8.016,
"eval_steps_per_second": 2.006,
"step": 260
},
{
"epoch": 1.209230769230769,
"grad_norm": 16.513762580218792,
"learning_rate": 2.0172385088197803e-07,
"logits/chosen": -1.14779531955719,
"logits/rejected": -1.1652312278747559,
"logps/chosen": -26.26132583618164,
"logps/rejected": -40.5022087097168,
"loss": 0.2143,
"rewards/accuracies": 0.7777777910232544,
"rewards/chosen": 0.41341039538383484,
"rewards/margins": 3.354189157485962,
"rewards/rejected": -2.940778970718384,
"step": 262
},
{
"epoch": 1.2184615384615385,
"grad_norm": 11.212524578416895,
"learning_rate": 1.977581092421812e-07,
"logits/chosen": -1.1520088911056519,
"logits/rejected": -1.1642160415649414,
"logps/chosen": -20.592201232910156,
"logps/rejected": -30.868377685546875,
"loss": 0.1657,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.40944963693618774,
"rewards/margins": 3.2444136142730713,
"rewards/rejected": -2.8349640369415283,
"step": 264
},
{
"epoch": 1.2276923076923076,
"grad_norm": 11.01146404378747,
"learning_rate": 1.9380606713281772e-07,
"logits/chosen": -1.1583861112594604,
"logits/rejected": -1.1652624607086182,
"logps/chosen": -18.12959098815918,
"logps/rejected": -34.5963134765625,
"loss": 0.2062,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.3663688898086548,
"rewards/margins": 3.504619836807251,
"rewards/rejected": -3.1382510662078857,
"step": 266
},
{
"epoch": 1.236923076923077,
"grad_norm": 12.264405123220332,
"learning_rate": 1.8986876090843664e-07,
"logits/chosen": -1.13167142868042,
"logits/rejected": -1.14499831199646,
"logps/chosen": -20.43359375,
"logps/rejected": -37.75240707397461,
"loss": 0.1807,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.36272215843200684,
"rewards/margins": 3.8877878189086914,
"rewards/rejected": -3.5250654220581055,
"step": 268
},
{
"epoch": 1.2461538461538462,
"grad_norm": 11.919291580876626,
"learning_rate": 1.859472230593569e-07,
"logits/chosen": -1.1225872039794922,
"logits/rejected": -1.1367418766021729,
"logps/chosen": -26.361604690551758,
"logps/rejected": -43.534812927246094,
"loss": 0.2145,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.43198204040527344,
"rewards/margins": 3.9310781955718994,
"rewards/rejected": -3.499096155166626,
"step": 270
},
{
"epoch": 1.2553846153846153,
"grad_norm": 12.440022575260326,
"learning_rate": 1.8204248194091425e-07,
"logits/chosen": -1.1526453495025635,
"logits/rejected": -1.1696141958236694,
"logps/chosen": -23.60825538635254,
"logps/rejected": -57.63713836669922,
"loss": 0.1955,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.27740761637687683,
"rewards/margins": 4.90004301071167,
"rewards/rejected": -4.622635841369629,
"step": 272
},
{
"epoch": 1.2646153846153847,
"grad_norm": 8.286919730890018,
"learning_rate": 1.7815556150379296e-07,
"logits/chosen": -1.1683982610702515,
"logits/rejected": -1.169435977935791,
"logps/chosen": -22.41632652282715,
"logps/rejected": -32.76851272583008,
"loss": 0.1885,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.5325616002082825,
"rewards/margins": 3.4823427200317383,
"rewards/rejected": -2.9497809410095215,
"step": 274
},
{
"epoch": 1.2738461538461539,
"grad_norm": 11.685150583165354,
"learning_rate": 1.7428748102551234e-07,
"logits/chosen": -1.106712818145752,
"logits/rejected": -1.1161227226257324,
"logps/chosen": -20.291996002197266,
"logps/rejected": -28.43364715576172,
"loss": 0.1994,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5047957897186279,
"rewards/margins": 3.1466941833496094,
"rewards/rejected": -2.6418981552124023,
"step": 276
},
{
"epoch": 1.283076923076923,
"grad_norm": 13.842054601252082,
"learning_rate": 1.704392548431391e-07,
"logits/chosen": -1.1573395729064941,
"logits/rejected": -1.1763123273849487,
"logps/chosen": -13.727288246154785,
"logps/rejected": -40.552120208740234,
"loss": 0.1992,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.0833960473537445,
"rewards/margins": 3.4928784370422363,
"rewards/rejected": -3.40948224067688,
"step": 278
},
{
"epoch": 1.2923076923076924,
"grad_norm": 19.81840697060037,
"learning_rate": 1.6661189208729489e-07,
"logits/chosen": -1.1369847059249878,
"logits/rejected": -1.1503101587295532,
"logps/chosen": -29.371524810791016,
"logps/rejected": -31.74928092956543,
"loss": 0.174,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.46892601251602173,
"rewards/margins": 3.2968459129333496,
"rewards/rejected": -2.8279199600219727,
"step": 280
},
{
"epoch": 1.2923076923076924,
"eval_logits/chosen": -1.165863275527954,
"eval_logits/rejected": -1.1743441820144653,
"eval_logps/chosen": -22.31157875061035,
"eval_logps/rejected": -31.91876792907715,
"eval_loss": 0.23967565596103668,
"eval_rewards/accuracies": 0.8341013789176941,
"eval_rewards/chosen": 0.3675578236579895,
"eval_rewards/margins": 3.146545171737671,
"eval_rewards/rejected": -2.778987407684326,
"eval_runtime": 216.3352,
"eval_samples_per_second": 8.015,
"eval_steps_per_second": 2.006,
"step": 280
},
{
"epoch": 1.3015384615384615,
"grad_norm": 8.930251698810418,
"learning_rate": 1.6280639641752942e-07,
"logits/chosen": -1.1316086053848267,
"logits/rejected": -1.1440240144729614,
"logps/chosen": -20.34646987915039,
"logps/rejected": -49.82673645019531,
"loss": 0.1765,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.23807168006896973,
"rewards/margins": 4.113887310028076,
"rewards/rejected": -3.8758151531219482,
"step": 282
},
{
"epoch": 1.3107692307692307,
"grad_norm": 12.563220339411409,
"learning_rate": 1.5902376575912814e-07,
"logits/chosen": -1.11788809299469,
"logits/rejected": -1.1216245889663696,
"logps/chosen": -26.72078514099121,
"logps/rejected": -35.561317443847656,
"loss": 0.1887,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.3794720470905304,
"rewards/margins": 3.400892734527588,
"rewards/rejected": -3.021420478820801,
"step": 284
},
{
"epoch": 1.32,
"grad_norm": 12.663334489473607,
"learning_rate": 1.552649920414233e-07,
"logits/chosen": -1.1346993446350098,
"logits/rejected": -1.135698676109314,
"logps/chosen": -30.942975997924805,
"logps/rejected": -28.223663330078125,
"loss": 0.209,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.1739700883626938,
"rewards/margins": 2.763653039932251,
"rewards/rejected": -2.5896828174591064,
"step": 286
},
{
"epoch": 1.3292307692307692,
"grad_norm": 14.8989835155845,
"learning_rate": 1.5153106093767825e-07,
"logits/chosen": -1.0928491353988647,
"logits/rejected": -1.115010142326355,
"logps/chosen": -18.197795867919922,
"logps/rejected": -37.05016326904297,
"loss": 0.2571,
"rewards/accuracies": 0.75,
"rewards/chosen": 0.4650332033634186,
"rewards/margins": 2.95278000831604,
"rewards/rejected": -2.4877467155456543,
"step": 288
},
{
"epoch": 1.3384615384615386,
"grad_norm": 7.959815386261902,
"learning_rate": 1.47822951606611e-07,
"logits/chosen": -1.1016626358032227,
"logits/rejected": -1.1072629690170288,
"logps/chosen": -27.025487899780273,
"logps/rejected": -32.04999923706055,
"loss": 0.1876,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.37108778953552246,
"rewards/margins": 3.5628809928894043,
"rewards/rejected": -3.191793441772461,
"step": 290
},
{
"epoch": 1.3476923076923077,
"grad_norm": 9.883542506968235,
"learning_rate": 1.4414163643562753e-07,
"logits/chosen": -1.1510549783706665,
"logits/rejected": -1.161637783050537,
"logps/chosen": -26.81183433532715,
"logps/rejected": -45.584022521972656,
"loss": 0.1694,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.7165854573249817,
"rewards/margins": 4.145462989807129,
"rewards/rejected": -3.428877353668213,
"step": 292
},
{
"epoch": 1.356923076923077,
"grad_norm": 16.819884237605038,
"learning_rate": 1.4048808078582942e-07,
"logits/chosen": -1.156364917755127,
"logits/rejected": -1.158648133277893,
"logps/chosen": -25.07522964477539,
"logps/rejected": -37.01847839355469,
"loss": 0.1916,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": -0.062492769211530685,
"rewards/margins": 3.447725534439087,
"rewards/rejected": -3.5102179050445557,
"step": 294
},
{
"epoch": 1.3661538461538463,
"grad_norm": 9.730872259730013,
"learning_rate": 1.3686324273886528e-07,
"logits/chosen": -1.0902260541915894,
"logits/rejected": -1.1149543523788452,
"logps/chosen": -21.78764533996582,
"logps/rejected": -47.82768249511719,
"loss": 0.1618,
"rewards/accuracies": 0.9444444179534912,
"rewards/chosen": 0.330030232667923,
"rewards/margins": 4.0784478187561035,
"rewards/rejected": -3.748418092727661,
"step": 296
},
{
"epoch": 1.3753846153846154,
"grad_norm": 11.017633003526004,
"learning_rate": 1.3326807284568984e-07,
"logits/chosen": -1.1744215488433838,
"logits/rejected": -1.1781681776046753,
"logps/chosen": -20.410446166992188,
"logps/rejected": -33.22405242919922,
"loss": 0.2013,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.266373872756958,
"rewards/margins": 3.171236515045166,
"rewards/rejected": -2.904862642288208,
"step": 298
},
{
"epoch": 1.3846153846153846,
"grad_norm": 12.616723945362331,
"learning_rate": 1.2970351387729872e-07,
"logits/chosen": -1.1809624433517456,
"logits/rejected": -1.1951857805252075,
"logps/chosen": -18.240955352783203,
"logps/rejected": -40.42936706542969,
"loss": 0.2077,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.6317293643951416,
"rewards/margins": 3.926286458969116,
"rewards/rejected": -3.2945568561553955,
"step": 300
},
{
"epoch": 1.3846153846153846,
"eval_logits/chosen": -1.1625326871871948,
"eval_logits/rejected": -1.1709260940551758,
"eval_logps/chosen": -22.30373764038086,
"eval_logps/rejected": -32.03895568847656,
"eval_loss": 0.23691046237945557,
"eval_rewards/accuracies": 0.8387096524238586,
"eval_rewards/chosen": 0.3714797794818878,
"eval_rewards/margins": 3.2105631828308105,
"eval_rewards/rejected": -2.839083194732666,
"eval_runtime": 216.5842,
"eval_samples_per_second": 8.006,
"eval_steps_per_second": 2.004,
"step": 300
},
{
"epoch": 1.393846153846154,
"grad_norm": 11.126146094324666,
"learning_rate": 1.261705005775032e-07,
"logits/chosen": -1.1696714162826538,
"logits/rejected": -1.1861652135849,
"logps/chosen": -22.42890167236328,
"logps/rejected": -34.44594192504883,
"loss": 0.1635,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.2834773361682892,
"rewards/margins": 3.5443296432495117,
"rewards/rejected": -3.260852813720703,
"step": 302
},
{
"epoch": 1.403076923076923,
"grad_norm": 10.479052450533084,
"learning_rate": 1.2266995941780933e-07,
"logits/chosen": -1.130216121673584,
"logits/rejected": -1.1414945125579834,
"logps/chosen": -25.476299285888672,
"logps/rejected": -40.09599304199219,
"loss": 0.1598,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.3959537744522095,
"rewards/margins": 3.8914499282836914,
"rewards/rejected": -3.4954960346221924,
"step": 304
},
{
"epoch": 1.4123076923076923,
"grad_norm": 15.900407241334104,
"learning_rate": 1.1920280835446748e-07,
"logits/chosen": -1.1561819314956665,
"logits/rejected": -1.160946011543274,
"logps/chosen": -26.870162963867188,
"logps/rejected": -45.102787017822266,
"loss": 0.1771,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.5023772120475769,
"rewards/margins": 4.30380392074585,
"rewards/rejected": -3.801426887512207,
"step": 306
},
{
"epoch": 1.4215384615384616,
"grad_norm": 10.845292151115956,
"learning_rate": 1.1576995658775404e-07,
"logits/chosen": -1.1523799896240234,
"logits/rejected": -1.1634249687194824,
"logps/chosen": -20.11031723022461,
"logps/rejected": -28.449501037597656,
"loss": 0.155,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.4131190776824951,
"rewards/margins": 3.466240882873535,
"rewards/rejected": -3.053121328353882,
"step": 308
},
{
"epoch": 1.4307692307692308,
"grad_norm": 13.811097447536184,
"learning_rate": 1.123723043235491e-07,
"logits/chosen": -1.1037707328796387,
"logits/rejected": -1.1196866035461426,
"logps/chosen": -22.25092315673828,
"logps/rejected": -41.13553237915039,
"loss": 0.2394,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.44290411472320557,
"rewards/margins": 3.9364805221557617,
"rewards/rejected": -3.4935765266418457,
"step": 310
},
{
"epoch": 1.44,
"grad_norm": 7.336736527232887,
"learning_rate": 1.0901074253727336e-07,
"logits/chosen": -1.132401943206787,
"logits/rejected": -1.1375315189361572,
"logps/chosen": -21.84718132019043,
"logps/rejected": -32.056617736816406,
"loss": 0.1639,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.543586015701294,
"rewards/margins": 3.536188840866089,
"rewards/rejected": -2.9926023483276367,
"step": 312
},
{
"epoch": 1.4492307692307693,
"grad_norm": 9.238298739154985,
"learning_rate": 1.056861527402452e-07,
"logits/chosen": -1.1301486492156982,
"logits/rejected": -1.130847454071045,
"logps/chosen": -30.35249137878418,
"logps/rejected": -39.42829513549805,
"loss": 0.1854,
"rewards/accuracies": 0.8055555820465088,
"rewards/chosen": 0.6695830821990967,
"rewards/margins": 3.61427903175354,
"rewards/rejected": -2.9446957111358643,
"step": 314
},
{
"epoch": 1.4584615384615385,
"grad_norm": 13.901867549459764,
"learning_rate": 1.0239940674851941e-07,
"logits/chosen": -1.1156858205795288,
"logits/rejected": -1.114392638206482,
"logps/chosen": -24.01244354248047,
"logps/rejected": -34.20494842529297,
"loss": 0.1866,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.37583643198013306,
"rewards/margins": 3.5291662216186523,
"rewards/rejected": -3.153329610824585,
"step": 316
},
{
"epoch": 1.4676923076923076,
"grad_norm": 11.080424296345777,
"learning_rate": 9.915136645426883e-08,
"logits/chosen": -1.1818937063217163,
"logits/rejected": -1.1808428764343262,
"logps/chosen": -24.881999969482422,
"logps/rejected": -28.97332763671875,
"loss": 0.173,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.36953669786453247,
"rewards/margins": 3.261909246444702,
"rewards/rejected": -2.8923726081848145,
"step": 318
},
{
"epoch": 1.476923076923077,
"grad_norm": 15.189646270302608,
"learning_rate": 9.594288359976815e-08,
"logits/chosen": -1.1282167434692383,
"logits/rejected": -1.1426851749420166,
"logps/chosen": -17.99266815185547,
"logps/rejected": -47.12626266479492,
"loss": 0.2092,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.30799973011016846,
"rewards/margins": 4.037694454193115,
"rewards/rejected": -3.729694366455078,
"step": 320
},
{
"epoch": 1.476923076923077,
"eval_logits/chosen": -1.1610218286514282,
"eval_logits/rejected": -1.1692686080932617,
"eval_logps/chosen": -22.297130584716797,
"eval_logps/rejected": -32.10142135620117,
"eval_loss": 0.23491987586021423,
"eval_rewards/accuracies": 0.8329492807388306,
"eval_rewards/chosen": 0.3747842013835907,
"eval_rewards/margins": 3.245098829269409,
"eval_rewards/rejected": -2.870314836502075,
"eval_runtime": 216.0919,
"eval_samples_per_second": 8.024,
"eval_steps_per_second": 2.008,
"step": 320
},
{
"epoch": 1.4861538461538462,
"grad_norm": 11.193355120949441,
"learning_rate": 9.277479955403886e-08,
"logits/chosen": -1.147449016571045,
"logits/rejected": -1.1808828115463257,
"logps/chosen": -19.78190040588379,
"logps/rejected": -68.74774932861328,
"loss": 0.1519,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.25842922925949097,
"rewards/margins": 5.480890274047852,
"rewards/rejected": -5.222461223602295,
"step": 322
},
{
"epoch": 1.4953846153846153,
"grad_norm": 11.257040825977688,
"learning_rate": 8.964794509221507e-08,
"logits/chosen": -1.1383910179138184,
"logits/rejected": -1.148794412612915,
"logps/chosen": -25.653322219848633,
"logps/rejected": -34.04636001586914,
"loss": 0.1653,
"rewards/accuracies": 0.9305555820465088,
"rewards/chosen": 0.33952367305755615,
"rewards/margins": 3.5638911724090576,
"rewards/rejected": -3.224367380142212,
"step": 324
},
{
"epoch": 1.5046153846153847,
"grad_norm": 14.248331413419937,
"learning_rate": 8.656314017768693e-08,
"logits/chosen": -1.1353636980056763,
"logits/rejected": -1.1488914489746094,
"logps/chosen": -23.45088768005371,
"logps/rejected": -36.34320831298828,
"loss": 0.19,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.5625240802764893,
"rewards/margins": 3.636873483657837,
"rewards/rejected": -3.0743494033813477,
"step": 326
},
{
"epoch": 1.5138461538461538,
"grad_norm": 11.13430757826836,
"learning_rate": 8.352119374707977e-08,
"logits/chosen": -1.1736154556274414,
"logits/rejected": -1.1819250583648682,
"logps/chosen": -21.08655548095703,
"logps/rejected": -31.81151580810547,
"loss": 0.1618,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.3814205825328827,
"rewards/margins": 3.455685615539551,
"rewards/rejected": -3.0742650032043457,
"step": 328
},
{
"epoch": 1.523076923076923,
"grad_norm": 9.775792350949882,
"learning_rate": 8.052290349812419e-08,
"logits/chosen": -1.1424063444137573,
"logits/rejected": -1.1474817991256714,
"logps/chosen": -21.133007049560547,
"logps/rejected": -25.102752685546875,
"loss": 0.2071,
"rewards/accuracies": 0.875,
"rewards/chosen": 0.4940270781517029,
"rewards/margins": 2.9714784622192383,
"rewards/rejected": -2.4774513244628906,
"step": 330
},
{
"epoch": 1.5323076923076924,
"grad_norm": 6.768309866947245,
"learning_rate": 7.756905568047392e-08,
"logits/chosen": -1.1152650117874146,
"logits/rejected": -1.12236750125885,
"logps/chosen": -17.50248146057129,
"logps/rejected": -29.518686294555664,
"loss": 0.159,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.6183215379714966,
"rewards/margins": 3.7438418865203857,
"rewards/rejected": -3.1255204677581787,
"step": 332
},
{
"epoch": 1.5415384615384615,
"grad_norm": 12.853827774295516,
"learning_rate": 7.46604248895252e-08,
"logits/chosen": -1.1082737445831299,
"logits/rejected": -1.1175150871276855,
"logps/chosen": -20.219505310058594,
"logps/rejected": -28.43560218811035,
"loss": 0.1827,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.34581294655799866,
"rewards/margins": 3.1769955158233643,
"rewards/rejected": -2.8311829566955566,
"step": 334
},
{
"epoch": 1.5507692307692307,
"grad_norm": 7.493668682648857,
"learning_rate": 7.179777386329275e-08,
"logits/chosen": -1.1045269966125488,
"logits/rejected": -1.1183186769485474,
"logps/chosen": -21.421226501464844,
"logps/rejected": -39.41886901855469,
"loss": 0.1748,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.5396389365196228,
"rewards/margins": 3.9202401638031006,
"rewards/rejected": -3.380601167678833,
"step": 336
},
{
"epoch": 1.56,
"grad_norm": 12.452229910069226,
"learning_rate": 6.898185328239467e-08,
"logits/chosen": -1.145583987236023,
"logits/rejected": -1.1488795280456543,
"logps/chosen": -22.65854263305664,
"logps/rejected": -31.751142501831055,
"loss": 0.1845,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.2917179465293884,
"rewards/margins": 3.111690044403076,
"rewards/rejected": -2.819972038269043,
"step": 338
},
{
"epoch": 1.5692307692307692,
"grad_norm": 10.84177308211244,
"learning_rate": 6.621340157319996e-08,
"logits/chosen": -1.1560921669006348,
"logits/rejected": -1.1605477333068848,
"logps/chosen": -16.325712203979492,
"logps/rejected": -24.499792098999023,
"loss": 0.2045,
"rewards/accuracies": 0.9027777910232544,
"rewards/chosen": 0.44531428813934326,
"rewards/margins": 3.1462950706481934,
"rewards/rejected": -2.7009804248809814,
"step": 340
},
{
"epoch": 1.5692307692307692,
"eval_logits/chosen": -1.1584707498550415,
"eval_logits/rejected": -1.1668710708618164,
"eval_logps/chosen": -22.341110229492188,
"eval_logps/rejected": -32.223533630371094,
"eval_loss": 0.23495733737945557,
"eval_rewards/accuracies": 0.8341013789176941,
"eval_rewards/chosen": 0.35279345512390137,
"eval_rewards/margins": 3.2841641902923584,
"eval_rewards/rejected": -2.931370496749878,
"eval_runtime": 216.2511,
"eval_samples_per_second": 8.018,
"eval_steps_per_second": 2.007,
"step": 340
},
{
"epoch": 1.5784615384615384,
"grad_norm": 8.225696594197464,
"learning_rate": 6.349314471418849e-08,
"logits/chosen": -1.0857443809509277,
"logits/rejected": -1.0922576189041138,
"logps/chosen": -16.084243774414062,
"logps/rejected": -30.81378173828125,
"loss": 0.1803,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.5106647610664368,
"rewards/margins": 3.7973814010620117,
"rewards/rejected": -3.2867166996002197,
"step": 342
},
{
"epoch": 1.5876923076923077,
"grad_norm": 15.760247716168218,
"learning_rate": 6.082179604557616e-08,
"logits/chosen": -1.1193811893463135,
"logits/rejected": -1.121721863746643,
"logps/chosen": -22.19783592224121,
"logps/rejected": -28.761178970336914,
"loss": 0.197,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.40734562277793884,
"rewards/margins": 3.452158212661743,
"rewards/rejected": -3.0448129177093506,
"step": 344
},
{
"epoch": 1.596923076923077,
"grad_norm": 10.909974494088763,
"learning_rate": 5.8200056082253453e-08,
"logits/chosen": -1.125333547592163,
"logits/rejected": -1.142914056777954,
"logps/chosen": -19.27569007873535,
"logps/rejected": -45.170040130615234,
"loss": 0.1653,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.4003957509994507,
"rewards/margins": 4.2396368980407715,
"rewards/rejected": -3.839240550994873,
"step": 346
},
{
"epoch": 1.606153846153846,
"grad_norm": 10.855639719670084,
"learning_rate": 5.5628612330087724e-08,
"logits/chosen": -1.131655216217041,
"logits/rejected": -1.1401116847991943,
"logps/chosen": -17.995466232299805,
"logps/rejected": -32.176475524902344,
"loss": 0.1826,
"rewards/accuracies": 0.8472222089767456,
"rewards/chosen": 0.4925755262374878,
"rewards/margins": 3.6894967555999756,
"rewards/rejected": -3.196920871734619,
"step": 348
},
{
"epoch": 1.6153846153846154,
"grad_norm": 16.085282454030374,
"learning_rate": 5.310813910563644e-08,
"logits/chosen": -1.0810273885726929,
"logits/rejected": -1.0798935890197754,
"logps/chosen": -22.392784118652344,
"logps/rejected": -28.961748123168945,
"loss": 0.2082,
"rewards/accuracies": 0.7916666865348816,
"rewards/chosen": 0.39071983098983765,
"rewards/margins": 2.970240592956543,
"rewards/rejected": -2.5795204639434814,
"step": 350
},
{
"epoch": 1.6246153846153846,
"grad_norm": 16.9671493136513,
"learning_rate": 5.0639297359319846e-08,
"logits/chosen": -1.1683417558670044,
"logits/rejected": -1.1672459840774536,
"logps/chosen": -24.353551864624023,
"logps/rejected": -27.454164505004883,
"loss": 0.2106,
"rewards/accuracies": 0.8333333134651184,
"rewards/chosen": 0.26455923914909363,
"rewards/margins": 2.982168674468994,
"rewards/rejected": -2.717609167098999,
"step": 352
},
{
"epoch": 1.6338461538461537,
"grad_norm": 10.455898381248911,
"learning_rate": 4.8222734502097655e-08,
"logits/chosen": -1.1433789730072021,
"logits/rejected": -1.153548240661621,
"logps/chosen": -24.5914363861084,
"logps/rejected": -42.36714172363281,
"loss": 0.1885,
"rewards/accuracies": 0.8194444179534912,
"rewards/chosen": 0.36157724261283875,
"rewards/margins": 3.6608800888061523,
"rewards/rejected": -3.2993030548095703,
"step": 354
},
{
"epoch": 1.643076923076923,
"grad_norm": 19.280259828969186,
"learning_rate": 4.5859084235697235e-08,
"logits/chosen": -1.164656639099121,
"logits/rejected": -1.1599383354187012,
"logps/chosen": -19.223194122314453,
"logps/rejected": -24.446197509765625,
"loss": 0.2371,
"rewards/accuracies": 0.8611111044883728,
"rewards/chosen": 0.3862743377685547,
"rewards/margins": 2.9600579738616943,
"rewards/rejected": -2.5737838745117188,
"step": 356
},
{
"epoch": 1.6523076923076923,
"grad_norm": 8.14493222848995,
"learning_rate": 4.35489663864359e-08,
"logits/chosen": -1.0972024202346802,
"logits/rejected": -1.1305886507034302,
"logps/chosen": -17.79538345336914,
"logps/rejected": -59.57120895385742,
"loss": 0.2046,
"rewards/accuracies": 0.8888888955116272,
"rewards/chosen": 0.5223473310470581,
"rewards/margins": 4.91096830368042,
"rewards/rejected": -4.388620853424072,
"step": 358
},
{
"epoch": 1.6615384615384614,
"grad_norm": 11.376614389062514,
"learning_rate": 4.1292986742682254e-08,
"logits/chosen": -1.140592098236084,
"logits/rejected": -1.1457772254943848,
"logps/chosen": -19.596229553222656,
"logps/rejected": -32.57119369506836,
"loss": 0.1368,
"rewards/accuracies": 0.9166666865348816,
"rewards/chosen": 0.34850603342056274,
"rewards/margins": 3.6875181198120117,
"rewards/rejected": -3.339012622833252,
"step": 360
},
{
"epoch": 1.6615384615384614,
"eval_logits/chosen": -1.1585197448730469,
"eval_logits/rejected": -1.1669610738754272,
"eval_logps/chosen": -22.363513946533203,
"eval_logps/rejected": -32.30293273925781,
"eval_loss": 0.23404575884342194,
"eval_rewards/accuracies": 0.8352534770965576,
"eval_rewards/chosen": 0.3415912091732025,
"eval_rewards/margins": 3.3126602172851562,
"eval_rewards/rejected": -2.9710693359375,
"eval_runtime": 216.0202,
"eval_samples_per_second": 8.027,
"eval_steps_per_second": 2.009,
"step": 360
}
],
"logging_steps": 2,
"max_steps": 432,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 20,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}