zephyr-dpo-qlora-gpt4-5e-6-epoch3 / trainer_state.json
just1nseo's picture
Model save
6f4d452 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 3.0,
"eval_steps": 100,
"global_step": 1065,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0,
"grad_norm": 2.1205010754043525,
"learning_rate": 4.672897196261682e-08,
"logits/chosen": -2.8477635383605957,
"logits/rejected": -2.8469698429107666,
"logps/chosen": -522.6112670898438,
"logps/rejected": -359.48583984375,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/margins_max": 0.0,
"rewards/margins_min": 0.0,
"rewards/margins_std": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.03,
"grad_norm": 10.218544680897951,
"learning_rate": 4.6728971962616824e-07,
"logits/chosen": -2.9212379455566406,
"logits/rejected": -2.7965469360351562,
"logps/chosen": -313.4451904296875,
"logps/rejected": -170.3771209716797,
"loss": 0.6932,
"rewards/accuracies": 0.5,
"rewards/chosen": 0.0002524534647818655,
"rewards/margins": 0.0003799269034061581,
"rewards/margins_max": 0.0016077507752925158,
"rewards/margins_min": -0.0008478969684801996,
"rewards/margins_std": 0.0017364051891490817,
"rewards/rejected": -0.0001274734386242926,
"step": 10
},
{
"epoch": 0.06,
"grad_norm": 2.0408708876984667,
"learning_rate": 9.345794392523365e-07,
"logits/chosen": -2.7633142471313477,
"logits/rejected": -2.7104804515838623,
"logps/chosen": -380.93878173828125,
"logps/rejected": -244.42214965820312,
"loss": 0.6916,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": 0.0017110242042690516,
"rewards/margins": 0.002610816154628992,
"rewards/margins_max": 0.004759171046316624,
"rewards/margins_min": 0.0004624614375643432,
"rewards/margins_std": 0.0030382319819182158,
"rewards/rejected": -0.0008997917175292969,
"step": 20
},
{
"epoch": 0.08,
"grad_norm": 2.293731718484229,
"learning_rate": 1.4018691588785047e-06,
"logits/chosen": -2.8749966621398926,
"logits/rejected": -2.8233141899108887,
"logps/chosen": -375.4239196777344,
"logps/rejected": -252.9129638671875,
"loss": 0.687,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.0067976354621350765,
"rewards/margins": 0.009298587217926979,
"rewards/margins_max": 0.015676181763410568,
"rewards/margins_min": 0.0029209901113063097,
"rewards/margins_std": 0.009019283577799797,
"rewards/rejected": -0.0025009517557919025,
"step": 30
},
{
"epoch": 0.11,
"grad_norm": 1.9265009094442067,
"learning_rate": 1.869158878504673e-06,
"logits/chosen": -2.7316184043884277,
"logits/rejected": -2.7654078006744385,
"logps/chosen": -305.0208740234375,
"logps/rejected": -318.15576171875,
"loss": 0.6783,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.01904786378145218,
"rewards/margins": 0.02529343031346798,
"rewards/margins_max": 0.03756815567612648,
"rewards/margins_min": 0.013018706813454628,
"rewards/margins_std": 0.017359081655740738,
"rewards/rejected": -0.006245566997677088,
"step": 40
},
{
"epoch": 0.14,
"grad_norm": 2.2762718753507225,
"learning_rate": 2.3364485981308413e-06,
"logits/chosen": -2.7840142250061035,
"logits/rejected": -2.695960521697998,
"logps/chosen": -241.2890167236328,
"logps/rejected": -175.4230194091797,
"loss": 0.6612,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.039340294897556305,
"rewards/margins": 0.05124547332525253,
"rewards/margins_max": 0.07519420981407166,
"rewards/margins_min": 0.027296727523207664,
"rewards/margins_std": 0.03386863321065903,
"rewards/rejected": -0.011905180290341377,
"step": 50
},
{
"epoch": 0.17,
"grad_norm": 2.278929693070735,
"learning_rate": 2.8037383177570094e-06,
"logits/chosen": -2.7337279319763184,
"logits/rejected": -2.6699888706207275,
"logps/chosen": -257.01812744140625,
"logps/rejected": -237.2047119140625,
"loss": 0.636,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.10417316108942032,
"rewards/margins": 0.12125153839588165,
"rewards/margins_max": 0.19414573907852173,
"rewards/margins_min": 0.04835732653737068,
"rewards/margins_std": 0.10308797657489777,
"rewards/rejected": -0.01707836613059044,
"step": 60
},
{
"epoch": 0.2,
"grad_norm": 1.9261684067245632,
"learning_rate": 3.2710280373831774e-06,
"logits/chosen": -2.6452136039733887,
"logits/rejected": -2.649742364883423,
"logps/chosen": -320.9119567871094,
"logps/rejected": -220.4650421142578,
"loss": 0.6066,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.1302875578403473,
"rewards/margins": 0.186918243765831,
"rewards/margins_max": 0.2680404782295227,
"rewards/margins_min": 0.10579605400562286,
"rewards/margins_std": 0.11472412198781967,
"rewards/rejected": -0.0566307008266449,
"step": 70
},
{
"epoch": 0.23,
"grad_norm": 1.899604093562728,
"learning_rate": 3.738317757009346e-06,
"logits/chosen": -2.856180191040039,
"logits/rejected": -2.781043291091919,
"logps/chosen": -324.0494079589844,
"logps/rejected": -299.65643310546875,
"loss": 0.5744,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": 0.12999968230724335,
"rewards/margins": 0.25530779361724854,
"rewards/margins_max": 0.37520045042037964,
"rewards/margins_min": 0.13541515171527863,
"rewards/margins_std": 0.16955383121967316,
"rewards/rejected": -0.12530812621116638,
"step": 80
},
{
"epoch": 0.25,
"grad_norm": 2.438635537156189,
"learning_rate": 4.205607476635514e-06,
"logits/chosen": -2.6444644927978516,
"logits/rejected": -2.6486284732818604,
"logps/chosen": -272.92718505859375,
"logps/rejected": -228.8600616455078,
"loss": 0.523,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": 0.04048062115907669,
"rewards/margins": 0.29693564772605896,
"rewards/margins_max": 0.4845455288887024,
"rewards/margins_min": 0.1093258485198021,
"rewards/margins_std": 0.265320360660553,
"rewards/rejected": -0.25645506381988525,
"step": 90
},
{
"epoch": 0.28,
"grad_norm": 2.676590355830037,
"learning_rate": 4.6728971962616825e-06,
"logits/chosen": -2.7964138984680176,
"logits/rejected": -2.735548973083496,
"logps/chosen": -437.5833435058594,
"logps/rejected": -379.58123779296875,
"loss": 0.4777,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": 0.20675165951251984,
"rewards/margins": 0.581081748008728,
"rewards/margins_max": 0.8298590779304504,
"rewards/margins_min": 0.3323042690753937,
"rewards/margins_std": 0.3518243730068207,
"rewards/rejected": -0.3743300139904022,
"step": 100
},
{
"epoch": 0.28,
"eval_logits/chosen": -2.670954704284668,
"eval_logits/rejected": -2.6312379837036133,
"eval_logps/chosen": -321.22222900390625,
"eval_logps/rejected": -301.6253967285156,
"eval_loss": 0.6754581928253174,
"eval_rewards/accuracies": 0.60317462682724,
"eval_rewards/chosen": -0.3600099980831146,
"eval_rewards/margins": 0.06441720575094223,
"eval_rewards/margins_max": 0.35590171813964844,
"eval_rewards/margins_min": -0.22098243236541748,
"eval_rewards/margins_std": 0.25287726521492004,
"eval_rewards/rejected": -0.42442721128463745,
"eval_runtime": 283.3412,
"eval_samples_per_second": 7.059,
"eval_steps_per_second": 0.222,
"step": 100
},
{
"epoch": 0.31,
"grad_norm": 2.5201742608505686,
"learning_rate": 4.999879018839288e-06,
"logits/chosen": -2.637324810028076,
"logits/rejected": -2.529784679412842,
"logps/chosen": -315.1212158203125,
"logps/rejected": -298.06903076171875,
"loss": 0.4234,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.12577927112579346,
"rewards/margins": 0.6422899961471558,
"rewards/margins_max": 0.9393427968025208,
"rewards/margins_min": 0.3452370762825012,
"rewards/margins_std": 0.42009615898132324,
"rewards/rejected": -0.5165106058120728,
"step": 110
},
{
"epoch": 0.34,
"grad_norm": 6.261552433653697,
"learning_rate": 4.99772856836941e-06,
"logits/chosen": -2.7266364097595215,
"logits/rejected": -2.7145590782165527,
"logps/chosen": -347.3783264160156,
"logps/rejected": -389.63299560546875,
"loss": 0.3956,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.24562442302703857,
"rewards/margins": 0.8258479237556458,
"rewards/margins_max": 1.141953468322754,
"rewards/margins_min": 0.5097422003746033,
"rewards/margins_std": 0.44704094529151917,
"rewards/rejected": -0.5802234411239624,
"step": 120
},
{
"epoch": 0.37,
"grad_norm": 2.5117234961196413,
"learning_rate": 4.992892309373227e-06,
"logits/chosen": -2.5119540691375732,
"logits/rejected": -2.4644391536712646,
"logps/chosen": -370.6039733886719,
"logps/rejected": -361.2594909667969,
"loss": 0.3218,
"rewards/accuracies": 1.0,
"rewards/chosen": 0.20368309319019318,
"rewards/margins": 1.2330464124679565,
"rewards/margins_max": 1.4150781631469727,
"rewards/margins_min": 1.0510146617889404,
"rewards/margins_std": 0.25743168592453003,
"rewards/rejected": -1.0293633937835693,
"step": 130
},
{
"epoch": 0.39,
"grad_norm": 5.066809244826759,
"learning_rate": 4.985375442281969e-06,
"logits/chosen": -2.325155019760132,
"logits/rejected": -2.2663826942443848,
"logps/chosen": -366.98211669921875,
"logps/rejected": -403.01495361328125,
"loss": 0.2761,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -0.1577085703611374,
"rewards/margins": 1.5553103685379028,
"rewards/margins_max": 2.037226676940918,
"rewards/margins_min": 1.0733940601348877,
"rewards/margins_std": 0.681532621383667,
"rewards/rejected": -1.7130190134048462,
"step": 140
},
{
"epoch": 0.42,
"grad_norm": 7.190427764349362,
"learning_rate": 4.9751860499858175e-06,
"logits/chosen": -2.1403324604034424,
"logits/rejected": -2.041670560836792,
"logps/chosen": -324.15667724609375,
"logps/rejected": -441.0560607910156,
"loss": 0.2399,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.27334439754486084,
"rewards/margins": 1.659519910812378,
"rewards/margins_max": 2.2249293327331543,
"rewards/margins_min": 1.0941104888916016,
"rewards/margins_std": 0.7996099591255188,
"rewards/rejected": -1.9328645467758179,
"step": 150
},
{
"epoch": 0.45,
"grad_norm": 7.116224539942571,
"learning_rate": 4.962335089142376e-06,
"logits/chosen": -1.9535696506500244,
"logits/rejected": -1.7718425989151,
"logps/chosen": -358.6165466308594,
"logps/rejected": -501.46856689453125,
"loss": 0.1556,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -0.26896899938583374,
"rewards/margins": 2.3143906593322754,
"rewards/margins_max": 2.8530867099761963,
"rewards/margins_min": 1.7756941318511963,
"rewards/margins_std": 0.7618317008018494,
"rewards/rejected": -2.5833592414855957,
"step": 160
},
{
"epoch": 0.48,
"grad_norm": 12.210481387434758,
"learning_rate": 4.946836378394967e-06,
"logits/chosen": -1.838096022605896,
"logits/rejected": -1.5799922943115234,
"logps/chosen": -445.1002502441406,
"logps/rejected": -597.6307373046875,
"loss": 0.1406,
"rewards/accuracies": 1.0,
"rewards/chosen": -0.4461892545223236,
"rewards/margins": 3.19466233253479,
"rewards/margins_max": 4.110939979553223,
"rewards/margins_min": 2.2783844470977783,
"rewards/margins_std": 1.2958126068115234,
"rewards/rejected": -3.6408514976501465,
"step": 170
},
{
"epoch": 0.51,
"grad_norm": 27.562973883397905,
"learning_rate": 4.928706583513441e-06,
"logits/chosen": -1.3463890552520752,
"logits/rejected": -1.2715332508087158,
"logps/chosen": -605.5383911132812,
"logps/rejected": -967.7098388671875,
"loss": 0.1672,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -2.81402325630188,
"rewards/margins": 3.0660033226013184,
"rewards/margins_max": 3.8246688842773438,
"rewards/margins_min": 2.307338237762451,
"rewards/margins_std": 1.072914719581604,
"rewards/rejected": -5.880026817321777,
"step": 180
},
{
"epoch": 0.54,
"grad_norm": 3.9080684244028343,
"learning_rate": 4.907965199473471e-06,
"logits/chosen": -1.3362934589385986,
"logits/rejected": -1.0377042293548584,
"logps/chosen": -732.0992431640625,
"logps/rejected": -907.0653076171875,
"loss": 0.131,
"rewards/accuracies": 1.0,
"rewards/chosen": -2.7598698139190674,
"rewards/margins": 4.08551549911499,
"rewards/margins_max": 4.806515693664551,
"rewards/margins_min": 3.3645145893096924,
"rewards/margins_std": 1.019648551940918,
"rewards/rejected": -6.8453850746154785,
"step": 190
},
{
"epoch": 0.56,
"grad_norm": 42.83035382744783,
"learning_rate": 4.884634529493591e-06,
"logits/chosen": -1.4783378839492798,
"logits/rejected": -1.2933928966522217,
"logps/chosen": -735.5909423828125,
"logps/rejected": -1023.0391845703125,
"loss": 0.1416,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.047953128814697,
"rewards/margins": 4.137004375457764,
"rewards/margins_max": 5.257144927978516,
"rewards/margins_min": 3.0168652534484863,
"rewards/margins_std": 1.5841166973114014,
"rewards/rejected": -8.184958457946777,
"step": 200
},
{
"epoch": 0.56,
"eval_logits/chosen": -1.4607926607131958,
"eval_logits/rejected": -1.4055131673812866,
"eval_logps/chosen": -955.6170043945312,
"eval_logps/rejected": -980.7882080078125,
"eval_loss": 0.9053447246551514,
"eval_rewards/accuracies": 0.6269841194152832,
"eval_rewards/chosen": -6.703957557678223,
"eval_rewards/margins": 0.5120973587036133,
"eval_rewards/margins_max": 2.7698452472686768,
"eval_rewards/margins_min": -1.7983918190002441,
"eval_rewards/margins_std": 2.0239174365997314,
"eval_rewards/rejected": -7.216055393218994,
"eval_runtime": 281.707,
"eval_samples_per_second": 7.1,
"eval_steps_per_second": 0.224,
"step": 200
},
{
"epoch": 0.59,
"grad_norm": 11.323675041923366,
"learning_rate": 4.858739661052539e-06,
"logits/chosen": -1.350990891456604,
"logits/rejected": -1.2011955976486206,
"logps/chosen": -738.5956420898438,
"logps/rejected": -1072.1134033203125,
"loss": 0.1359,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -3.817591905593872,
"rewards/margins": 4.215450286865234,
"rewards/margins_max": 6.099488735198975,
"rewards/margins_min": 2.3314108848571777,
"rewards/margins_std": 2.664433240890503,
"rewards/rejected": -8.033041000366211,
"step": 210
},
{
"epoch": 0.62,
"grad_norm": 2.145861603880887,
"learning_rate": 4.830308438912687e-06,
"logits/chosen": -1.5942816734313965,
"logits/rejected": -1.3603050708770752,
"logps/chosen": -854.7412109375,
"logps/rejected": -1243.659423828125,
"loss": 0.0774,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.454717636108398,
"rewards/margins": 5.1989240646362305,
"rewards/margins_max": 6.37256383895874,
"rewards/margins_min": 4.025284290313721,
"rewards/margins_std": 1.6597778797149658,
"rewards/rejected": -9.653641700744629,
"step": 220
},
{
"epoch": 0.65,
"grad_norm": 4.962012371252307,
"learning_rate": 4.799371435178544e-06,
"logits/chosen": -1.7452170848846436,
"logits/rejected": -1.609167456626892,
"logps/chosen": -769.598876953125,
"logps/rejected": -1189.131103515625,
"loss": 0.104,
"rewards/accuracies": 1.0,
"rewards/chosen": -3.888404130935669,
"rewards/margins": 4.6370439529418945,
"rewards/margins_max": 5.980400085449219,
"rewards/margins_min": 3.293687343597412,
"rewards/margins_std": 1.8997926712036133,
"rewards/rejected": -8.5254487991333,
"step": 230
},
{
"epoch": 0.68,
"grad_norm": 2.001005873458455,
"learning_rate": 4.765961916422575e-06,
"logits/chosen": -1.6597576141357422,
"logits/rejected": -1.444551944732666,
"logps/chosen": -838.1024169921875,
"logps/rejected": -1238.279052734375,
"loss": 0.0955,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -4.969546318054199,
"rewards/margins": 4.475127696990967,
"rewards/margins_max": 5.603785514831543,
"rewards/margins_min": 3.346471071243286,
"rewards/margins_std": 1.59616219997406,
"rewards/rejected": -9.444674491882324,
"step": 240
},
{
"epoch": 0.7,
"grad_norm": 17.06427775193877,
"learning_rate": 4.730115807913627e-06,
"logits/chosen": -1.6722052097320557,
"logits/rejected": -1.393259882926941,
"logps/chosen": -916.7503662109375,
"logps/rejected": -1274.2889404296875,
"loss": 0.0866,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.015233039855957,
"rewards/margins": 5.172359943389893,
"rewards/margins_max": 6.111589431762695,
"rewards/margins_min": 4.233129501342773,
"rewards/margins_std": 1.328271508216858,
"rewards/rejected": -10.187592506408691,
"step": 250
},
{
"epoch": 0.73,
"grad_norm": 1.9182916124757974,
"learning_rate": 4.691871654986485e-06,
"logits/chosen": -1.7107824087142944,
"logits/rejected": -1.6128714084625244,
"logps/chosen": -878.5494384765625,
"logps/rejected": -1255.8555908203125,
"loss": 0.079,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -5.5672712326049805,
"rewards/margins": 4.748871803283691,
"rewards/margins_max": 5.786838531494141,
"rewards/margins_min": 3.7109055519104004,
"rewards/margins_std": 1.4679062366485596,
"rewards/rejected": -10.316143035888672,
"step": 260
},
{
"epoch": 0.76,
"grad_norm": 14.786553042508123,
"learning_rate": 4.651270581594054e-06,
"logits/chosen": -1.8650672435760498,
"logits/rejected": -1.613443374633789,
"logps/chosen": -834.0842895507812,
"logps/rejected": -1138.3665771484375,
"loss": 0.0875,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.050877571105957,
"rewards/margins": 5.007403373718262,
"rewards/margins_max": 5.84472131729126,
"rewards/margins_min": 4.170086860656738,
"rewards/margins_std": 1.184145212173462,
"rewards/rejected": -9.058280944824219,
"step": 270
},
{
"epoch": 0.79,
"grad_norm": 5.30439894597876,
"learning_rate": 4.6083562460867545e-06,
"logits/chosen": -1.6716859340667725,
"logits/rejected": -1.5429413318634033,
"logps/chosen": -701.3162841796875,
"logps/rejected": -1120.8736572265625,
"loss": 0.0896,
"rewards/accuracies": 0.8500000238418579,
"rewards/chosen": -3.7223961353302,
"rewards/margins": 4.8294267654418945,
"rewards/margins_max": 6.9812211990356445,
"rewards/margins_min": 2.6776328086853027,
"rewards/margins_std": 3.0430965423583984,
"rewards/rejected": -8.551824569702148,
"step": 280
},
{
"epoch": 0.82,
"grad_norm": 12.724182318476426,
"learning_rate": 4.563174794266684e-06,
"logits/chosen": -1.8460794687271118,
"logits/rejected": -1.6377445459365845,
"logps/chosen": -858.4215698242188,
"logps/rejected": -1289.198974609375,
"loss": 0.0576,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.965760231018066,
"rewards/margins": 5.2121992111206055,
"rewards/margins_max": 6.927371025085449,
"rewards/margins_min": 3.49702525138855,
"rewards/margins_std": 2.4256205558776855,
"rewards/rejected": -10.177958488464355,
"step": 290
},
{
"epoch": 0.85,
"grad_norm": 5.778488241840074,
"learning_rate": 4.5157748097670125e-06,
"logits/chosen": -1.7077114582061768,
"logits/rejected": -1.5558173656463623,
"logps/chosen": -739.67333984375,
"logps/rejected": -1423.210693359375,
"loss": 0.0426,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.0192999839782715,
"rewards/margins": 7.085653781890869,
"rewards/margins_max": 7.969016075134277,
"rewards/margins_min": 6.202291488647461,
"rewards/margins_std": 1.2492637634277344,
"rewards/rejected": -11.104954719543457,
"step": 300
},
{
"epoch": 0.85,
"eval_logits/chosen": -1.7101370096206665,
"eval_logits/rejected": -1.6507517099380493,
"eval_logps/chosen": -1041.5823974609375,
"eval_logps/rejected": -1121.1776123046875,
"eval_loss": 0.9213338494300842,
"eval_rewards/accuracies": 0.6785714030265808,
"eval_rewards/chosen": -7.563611030578613,
"eval_rewards/margins": 1.0563386678695679,
"eval_rewards/margins_max": 4.265172481536865,
"eval_rewards/margins_min": -2.1614327430725098,
"eval_rewards/margins_std": 2.8564813137054443,
"eval_rewards/rejected": -8.619950294494629,
"eval_runtime": 281.7456,
"eval_samples_per_second": 7.099,
"eval_steps_per_second": 0.224,
"step": 300
},
{
"epoch": 0.87,
"grad_norm": 12.853675144552225,
"learning_rate": 4.466207261809989e-06,
"logits/chosen": -1.9336496591567993,
"logits/rejected": -1.6221659183502197,
"logps/chosen": -901.4439697265625,
"logps/rejected": -1262.938720703125,
"loss": 0.0633,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.771965980529785,
"rewards/margins": 5.247581958770752,
"rewards/margins_max": 6.526535987854004,
"rewards/margins_min": 3.968628406524658,
"rewards/margins_std": 1.8087135553359985,
"rewards/rejected": -10.019546508789062,
"step": 310
},
{
"epoch": 0.9,
"grad_norm": 12.332833632235157,
"learning_rate": 4.414525450399713e-06,
"logits/chosen": -1.6821091175079346,
"logits/rejected": -1.511785626411438,
"logps/chosen": -956.3181762695312,
"logps/rejected": -1481.1754150390625,
"loss": 0.0978,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.100653171539307,
"rewards/margins": 6.3301496505737305,
"rewards/margins_max": 8.061585426330566,
"rewards/margins_min": 4.598714828491211,
"rewards/margins_std": 2.4486188888549805,
"rewards/rejected": -12.430803298950195,
"step": 320
},
{
"epoch": 0.93,
"grad_norm": 3.9044155848949162,
"learning_rate": 4.360784949008615e-06,
"logits/chosen": -1.768561601638794,
"logits/rejected": -1.5437813997268677,
"logps/chosen": -1006.9339599609375,
"logps/rejected": -1522.902587890625,
"loss": 0.1091,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.272473335266113,
"rewards/margins": 6.482227325439453,
"rewards/margins_max": 8.401371002197266,
"rewards/margins_min": 4.563082695007324,
"rewards/margins_std": 2.7140800952911377,
"rewards/rejected": -12.754700660705566,
"step": 330
},
{
"epoch": 0.96,
"grad_norm": 4.01171637277802,
"learning_rate": 4.30504354481929e-06,
"logits/chosen": -1.7665777206420898,
"logits/rejected": -1.5484760999679565,
"logps/chosen": -942.85888671875,
"logps/rejected": -1260.244384765625,
"loss": 0.0741,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.743631839752197,
"rewards/margins": 4.815784931182861,
"rewards/margins_max": 6.530648708343506,
"rewards/margins_min": 3.1009204387664795,
"rewards/margins_std": 2.425184488296509,
"rewards/rejected": -10.559415817260742,
"step": 340
},
{
"epoch": 0.99,
"grad_norm": 12.659683176327913,
"learning_rate": 4.247361176585904e-06,
"logits/chosen": -1.831321120262146,
"logits/rejected": -1.6549314260482788,
"logps/chosen": -909.5006713867188,
"logps/rejected": -1532.635986328125,
"loss": 0.0943,
"rewards/accuracies": 1.0,
"rewards/chosen": -4.810971736907959,
"rewards/margins": 7.531504154205322,
"rewards/margins_max": 8.548044204711914,
"rewards/margins_min": 6.514962673187256,
"rewards/margins_std": 1.4376055002212524,
"rewards/rejected": -12.342476844787598,
"step": 350
},
{
"epoch": 1.01,
"grad_norm": 3.001942641389469,
"learning_rate": 4.187799870182038e-06,
"logits/chosen": -1.7835716009140015,
"logits/rejected": -1.5620241165161133,
"logps/chosen": -896.9002075195312,
"logps/rejected": -1392.6307373046875,
"loss": 0.0555,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.5069427490234375,
"rewards/margins": 6.391612529754639,
"rewards/margins_max": 7.894322872161865,
"rewards/margins_min": 4.888903617858887,
"rewards/margins_std": 2.125152349472046,
"rewards/rejected": -11.898555755615234,
"step": 360
},
{
"epoch": 1.04,
"grad_norm": 34.14422714120664,
"learning_rate": 4.1264236719042365e-06,
"logits/chosen": -1.5919651985168457,
"logits/rejected": -1.5377094745635986,
"logps/chosen": -915.7950439453125,
"logps/rejected": -1490.6865234375,
"loss": 0.0808,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.6413750648498535,
"rewards/margins": 6.627654075622559,
"rewards/margins_max": 8.43530559539795,
"rewards/margins_min": 4.820002555847168,
"rewards/margins_std": 2.5564048290252686,
"rewards/rejected": -12.26902961730957,
"step": 370
},
{
"epoch": 1.07,
"grad_norm": 2.1290534012360847,
"learning_rate": 4.063298579603001e-06,
"logits/chosen": -1.8492443561553955,
"logits/rejected": -1.5422757863998413,
"logps/chosen": -937.0126953125,
"logps/rejected": -1458.616455078125,
"loss": 0.0231,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.460320949554443,
"rewards/margins": 7.281059265136719,
"rewards/margins_max": 8.49816608428955,
"rewards/margins_min": 6.0639543533325195,
"rewards/margins_std": 1.7212467193603516,
"rewards/rejected": -12.74138069152832,
"step": 380
},
{
"epoch": 1.1,
"grad_norm": 5.584775064800199,
"learning_rate": 3.998492471715272e-06,
"logits/chosen": -1.8397998809814453,
"logits/rejected": -1.6857073307037354,
"logps/chosen": -913.9352416992188,
"logps/rejected": -1781.8939208984375,
"loss": 0.0278,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.164222717285156,
"rewards/margins": 9.338297843933105,
"rewards/margins_max": 11.463502883911133,
"rewards/margins_min": 7.2130937576293945,
"rewards/margins_std": 3.005493640899658,
"rewards/rejected": -14.502520561218262,
"step": 390
},
{
"epoch": 1.13,
"grad_norm": 0.9893449328848739,
"learning_rate": 3.932075034274723e-06,
"logits/chosen": -1.5922348499298096,
"logits/rejected": -1.4688727855682373,
"logps/chosen": -871.9650268554688,
"logps/rejected": -1526.658935546875,
"loss": 0.0537,
"rewards/accuracies": 1.0,
"rewards/chosen": -5.7322564125061035,
"rewards/margins": 7.261972904205322,
"rewards/margins_max": 8.895970344543457,
"rewards/margins_min": 5.627974510192871,
"rewards/margins_std": 2.3108224868774414,
"rewards/rejected": -12.994227409362793,
"step": 400
},
{
"epoch": 1.13,
"eval_logits/chosen": -1.6575742959976196,
"eval_logits/rejected": -1.5926053524017334,
"eval_logps/chosen": -1505.182861328125,
"eval_logps/rejected": -1577.3876953125,
"eval_loss": 1.1419050693511963,
"eval_rewards/accuracies": 0.64682537317276,
"eval_rewards/chosen": -12.199617385864258,
"eval_rewards/margins": 0.9824325442314148,
"eval_rewards/margins_max": 5.48787260055542,
"eval_rewards/margins_min": -3.0621237754821777,
"eval_rewards/margins_std": 3.7889323234558105,
"eval_rewards/rejected": -13.182049751281738,
"eval_runtime": 282.4562,
"eval_samples_per_second": 7.081,
"eval_steps_per_second": 0.223,
"step": 400
},
{
"epoch": 1.15,
"grad_norm": 0.9794540017501292,
"learning_rate": 3.864117685978339e-06,
"logits/chosen": -1.6234560012817383,
"logits/rejected": -1.4928052425384521,
"logps/chosen": -1131.8265380859375,
"logps/rejected": -1794.791015625,
"loss": 0.0776,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -8.371360778808594,
"rewards/margins": 7.494576454162598,
"rewards/margins_max": 10.048029899597168,
"rewards/margins_min": 4.941121578216553,
"rewards/margins_std": 3.61112904548645,
"rewards/rejected": -15.865939140319824,
"step": 410
},
{
"epoch": 1.18,
"grad_norm": 5.020955613205059,
"learning_rate": 3.794693501389861e-06,
"logits/chosen": -1.7987747192382812,
"logits/rejected": -1.6164734363555908,
"logps/chosen": -1037.0328369140625,
"logps/rejected": -1667.540283203125,
"loss": 0.054,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.372786045074463,
"rewards/margins": 7.646895408630371,
"rewards/margins_max": 8.891626358032227,
"rewards/margins_min": 6.402162075042725,
"rewards/margins_std": 1.7603172063827515,
"rewards/rejected": -14.019680976867676,
"step": 420
},
{
"epoch": 1.21,
"grad_norm": 15.978168852619268,
"learning_rate": 3.7238771323626822e-06,
"logits/chosen": -1.6425611972808838,
"logits/rejected": -1.4570006132125854,
"logps/chosen": -1138.6572265625,
"logps/rejected": -1780.6002197265625,
"loss": 0.044,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.515681266784668,
"rewards/margins": 7.655673027038574,
"rewards/margins_max": 9.563043594360352,
"rewards/margins_min": 5.748303413391113,
"rewards/margins_std": 2.6974284648895264,
"rewards/rejected": -15.171353340148926,
"step": 430
},
{
"epoch": 1.24,
"grad_norm": 1.4394479904186748,
"learning_rate": 3.651744727766676e-06,
"logits/chosen": -1.565843939781189,
"logits/rejected": -1.3031253814697266,
"logps/chosen": -1135.116943359375,
"logps/rejected": -1897.188232421875,
"loss": 0.0356,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.198633193969727,
"rewards/margins": 8.82483196258545,
"rewards/margins_max": 11.5381441116333,
"rewards/margins_min": 6.1115217208862305,
"rewards/margins_std": 3.8372015953063965,
"rewards/rejected": -17.023466110229492,
"step": 440
},
{
"epoch": 1.27,
"grad_norm": 2.5233082457705853,
"learning_rate": 3.57837385160529e-06,
"logits/chosen": -1.6333341598510742,
"logits/rejected": -1.419213056564331,
"logps/chosen": -991.2794799804688,
"logps/rejected": -1686.808837890625,
"loss": 0.0246,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.5310492515563965,
"rewards/margins": 7.6606926918029785,
"rewards/margins_max": 9.670614242553711,
"rewards/margins_min": 5.650770664215088,
"rewards/margins_std": 2.842459201812744,
"rewards/rejected": -14.191740036010742,
"step": 450
},
{
"epoch": 1.3,
"grad_norm": 1.432241857413985,
"learning_rate": 3.503843399610941e-06,
"logits/chosen": -1.6662094593048096,
"logits/rejected": -1.5159740447998047,
"logps/chosen": -1023.26220703125,
"logps/rejected": -1997.1787109375,
"loss": 0.0208,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.279843330383301,
"rewards/margins": 9.666014671325684,
"rewards/margins_max": 11.908063888549805,
"rewards/margins_min": 7.423966407775879,
"rewards/margins_std": 3.1707358360290527,
"rewards/rejected": -15.945857048034668,
"step": 460
},
{
"epoch": 1.32,
"grad_norm": 1.3845844015706055,
"learning_rate": 3.4282335144083985e-06,
"logits/chosen": -1.5941836833953857,
"logits/rejected": -1.34697425365448,
"logps/chosen": -1180.2171630859375,
"logps/rejected": -1964.836181640625,
"loss": 0.0304,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.229662895202637,
"rewards/margins": 9.211896896362305,
"rewards/margins_max": 11.3733549118042,
"rewards/margins_min": 7.050437927246094,
"rewards/margins_std": 3.0567641258239746,
"rewards/rejected": -17.441558837890625,
"step": 470
},
{
"epoch": 1.35,
"grad_norm": 0.25091350074864577,
"learning_rate": 3.351625499337395e-06,
"logits/chosen": -1.7405236959457397,
"logits/rejected": -1.4616386890411377,
"logps/chosen": -1157.209716796875,
"logps/rejected": -1899.130126953125,
"loss": 0.014,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.413580417633057,
"rewards/margins": 8.81358528137207,
"rewards/margins_max": 10.952999114990234,
"rewards/margins_min": 6.674172401428223,
"rewards/margins_std": 3.0255870819091797,
"rewards/rejected": -16.227169036865234,
"step": 480
},
{
"epoch": 1.38,
"grad_norm": 1.9987349085330508,
"learning_rate": 3.2741017310271056e-06,
"logits/chosen": -1.3325449228286743,
"logits/rejected": -1.044908881187439,
"logps/chosen": -1130.028076171875,
"logps/rejected": -2392.521728515625,
"loss": 0.0448,
"rewards/accuracies": 0.8999999761581421,
"rewards/chosen": -9.127466201782227,
"rewards/margins": 12.631993293762207,
"rewards/margins_max": 19.333314895629883,
"rewards/margins_min": 5.930669784545898,
"rewards/margins_std": 9.47710132598877,
"rewards/rejected": -21.759456634521484,
"step": 490
},
{
"epoch": 1.41,
"grad_norm": 1.7094204242814826,
"learning_rate": 3.195745570816532e-06,
"logits/chosen": -1.3385294675827026,
"logits/rejected": -1.144627571105957,
"logps/chosen": -1425.61474609375,
"logps/rejected": -2558.358642578125,
"loss": 0.0197,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.41409969329834,
"rewards/margins": 12.790387153625488,
"rewards/margins_max": 14.778757095336914,
"rewards/margins_min": 10.802019119262695,
"rewards/margins_std": 2.811978340148926,
"rewards/rejected": -23.204486846923828,
"step": 500
},
{
"epoch": 1.41,
"eval_logits/chosen": -1.5026105642318726,
"eval_logits/rejected": -1.4330366849899292,
"eval_logps/chosen": -2000.166259765625,
"eval_logps/rejected": -2146.479736328125,
"eval_loss": 1.684375524520874,
"eval_rewards/accuracies": 0.6666666865348816,
"eval_rewards/chosen": -17.149450302124023,
"eval_rewards/margins": 1.7235194444656372,
"eval_rewards/margins_max": 9.41946029663086,
"eval_rewards/margins_min": -5.146158218383789,
"eval_rewards/margins_std": 6.577420711517334,
"eval_rewards/rejected": -18.872970581054688,
"eval_runtime": 282.6761,
"eval_samples_per_second": 7.075,
"eval_steps_per_second": 0.223,
"step": 500
},
{
"epoch": 1.44,
"grad_norm": 19.195207569920772,
"learning_rate": 3.116641275116018e-06,
"logits/chosen": -1.2405312061309814,
"logits/rejected": -0.9798258543014526,
"logps/chosen": -1318.967041015625,
"logps/rejected": -3077.10986328125,
"loss": 0.0229,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.26286792755127,
"rewards/margins": 17.355688095092773,
"rewards/margins_max": 25.170244216918945,
"rewards/margins_min": 9.541135787963867,
"rewards/margins_std": 11.051448822021484,
"rewards/rejected": -27.618555068969727,
"step": 510
},
{
"epoch": 1.46,
"grad_norm": 18.23076880980296,
"learning_rate": 3.0368739048062956e-06,
"logits/chosen": -1.6826045513153076,
"logits/rejected": -1.4554195404052734,
"logps/chosen": -1159.925048828125,
"logps/rejected": -2069.19580078125,
"loss": 0.0355,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.183090209960938,
"rewards/margins": 10.176679611206055,
"rewards/margins_max": 13.777229309082031,
"rewards/margins_min": 6.5761308670043945,
"rewards/margins_std": 5.091946125030518,
"rewards/rejected": -18.359769821166992,
"step": 520
},
{
"epoch": 1.49,
"grad_norm": 7.345312333811953,
"learning_rate": 2.956529233772492e-06,
"logits/chosen": -1.6696984767913818,
"logits/rejected": -1.566896915435791,
"logps/chosen": -1206.398681640625,
"logps/rejected": -2070.3857421875,
"loss": 0.0184,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.273930549621582,
"rewards/margins": 9.733041763305664,
"rewards/margins_max": 12.174661636352539,
"rewards/margins_min": 7.291422367095947,
"rewards/margins_std": 3.4529712200164795,
"rewards/rejected": -18.006973266601562,
"step": 530
},
{
"epoch": 1.52,
"grad_norm": 21.78105244485373,
"learning_rate": 2.8756936566714317e-06,
"logits/chosen": -1.8572250604629517,
"logits/rejected": -1.5829768180847168,
"logps/chosen": -1132.333740234375,
"logps/rejected": -1908.844970703125,
"loss": 0.0256,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.327805519104004,
"rewards/margins": 9.385960578918457,
"rewards/margins_max": 10.629077911376953,
"rewards/margins_min": 8.142843246459961,
"rewards/margins_std": 1.7580335140228271,
"rewards/rejected": -16.713764190673828,
"step": 540
},
{
"epoch": 1.55,
"grad_norm": 0.0011589092808777935,
"learning_rate": 2.794454096031429e-06,
"logits/chosen": -1.7256653308868408,
"logits/rejected": -1.5292785167694092,
"logps/chosen": -1160.131591796875,
"logps/rejected": -2000.1337890625,
"loss": 0.0223,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.491829872131348,
"rewards/margins": 8.8389892578125,
"rewards/margins_max": 10.393911361694336,
"rewards/margins_min": 7.284067630767822,
"rewards/margins_std": 2.1989917755126953,
"rewards/rejected": -17.33081817626953,
"step": 550
},
{
"epoch": 1.58,
"grad_norm": 1.1029358007262624,
"learning_rate": 2.71289790878446e-06,
"logits/chosen": -1.5588399171829224,
"logits/rejected": -1.3718044757843018,
"logps/chosen": -1313.054443359375,
"logps/rejected": -2318.33544921875,
"loss": 0.0303,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.963714599609375,
"rewards/margins": 9.831637382507324,
"rewards/margins_max": 12.691813468933105,
"rewards/margins_min": 6.971460819244385,
"rewards/margins_std": 4.044900894165039,
"rewards/rejected": -19.795352935791016,
"step": 560
},
{
"epoch": 1.61,
"grad_norm": 0.032589510422147,
"learning_rate": 2.6311127923312156e-06,
"logits/chosen": -1.7382599115371704,
"logits/rejected": -1.5052683353424072,
"logps/chosen": -1249.270263671875,
"logps/rejected": -2084.659912109375,
"loss": 0.0177,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.170693397521973,
"rewards/margins": 9.51733684539795,
"rewards/margins_max": 11.196283340454102,
"rewards/margins_min": 7.8383917808532715,
"rewards/margins_std": 2.374387741088867,
"rewards/rejected": -17.68802833557129,
"step": 570
},
{
"epoch": 1.63,
"grad_norm": 12.99158263963332,
"learning_rate": 2.549186690240057e-06,
"logits/chosen": -1.610082983970642,
"logits/rejected": -1.3717553615570068,
"logps/chosen": -1186.931884765625,
"logps/rejected": -2215.44970703125,
"loss": 0.0096,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.884663581848145,
"rewards/margins": 11.055347442626953,
"rewards/margins_max": 13.794784545898438,
"rewards/margins_min": 8.315912246704102,
"rewards/margins_std": 3.874147891998291,
"rewards/rejected": -19.94001007080078,
"step": 580
},
{
"epoch": 1.66,
"grad_norm": 0.09893386521593805,
"learning_rate": 2.4672076976812548e-06,
"logits/chosen": -1.504370927810669,
"logits/rejected": -1.24093759059906,
"logps/chosen": -1294.6529541015625,
"logps/rejected": -2374.53271484375,
"loss": 0.0182,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.253921508789062,
"rewards/margins": 11.56922721862793,
"rewards/margins_max": 16.103586196899414,
"rewards/margins_min": 7.034867763519287,
"rewards/margins_std": 6.412552833557129,
"rewards/rejected": -20.823148727416992,
"step": 590
},
{
"epoch": 1.69,
"grad_norm": 1.4677452546622722,
"learning_rate": 2.3852639666982218e-06,
"logits/chosen": -1.5387322902679443,
"logits/rejected": -1.3424365520477295,
"logps/chosen": -1172.688232421875,
"logps/rejected": -2390.56689453125,
"loss": 0.0029,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.940356254577637,
"rewards/margins": 12.360175132751465,
"rewards/margins_max": 14.774116516113281,
"rewards/margins_min": 9.946235656738281,
"rewards/margins_std": 3.4138267040252686,
"rewards/rejected": -21.300533294677734,
"step": 600
},
{
"epoch": 1.69,
"eval_logits/chosen": -1.5330660343170166,
"eval_logits/rejected": -1.4547291994094849,
"eval_logps/chosen": -1739.8331298828125,
"eval_logps/rejected": -2005.7900390625,
"eval_loss": 1.9743393659591675,
"eval_rewards/accuracies": 0.6865079402923584,
"eval_rewards/chosen": -14.546117782592773,
"eval_rewards/margins": 2.9199535846710205,
"eval_rewards/margins_max": 12.400845527648926,
"eval_rewards/margins_min": -5.716708660125732,
"eval_rewards/margins_std": 8.164259910583496,
"eval_rewards/rejected": -17.46607208251953,
"eval_runtime": 281.995,
"eval_samples_per_second": 7.092,
"eval_steps_per_second": 0.223,
"step": 600
},
{
"epoch": 1.72,
"grad_norm": 16.662428863900104,
"learning_rate": 2.303443611417584e-06,
"logits/chosen": -1.2892029285430908,
"logits/rejected": -1.0749212503433228,
"logps/chosen": -1583.099609375,
"logps/rejected": -2742.760498046875,
"loss": 0.3581,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.176101684570312,
"rewards/margins": 12.56828498840332,
"rewards/margins_max": 17.369625091552734,
"rewards/margins_min": 7.766943454742432,
"rewards/margins_std": 6.790121555328369,
"rewards/rejected": -24.744388580322266,
"step": 610
},
{
"epoch": 1.75,
"grad_norm": 0.1502185307527533,
"learning_rate": 2.2218346133000264e-06,
"logits/chosen": -1.1851621866226196,
"logits/rejected": -0.8747516870498657,
"logps/chosen": -1684.5989990234375,
"logps/rejected": -2998.321044921875,
"loss": 0.0851,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -13.733156204223633,
"rewards/margins": 14.27801513671875,
"rewards/margins_max": 20.737751007080078,
"rewards/margins_min": 7.818281650543213,
"rewards/margins_std": 9.135442733764648,
"rewards/rejected": -28.011173248291016,
"step": 620
},
{
"epoch": 1.77,
"grad_norm": 0.608737783564001,
"learning_rate": 2.140524726533792e-06,
"logits/chosen": -1.4635207653045654,
"logits/rejected": -1.206559658050537,
"logps/chosen": -1263.6993408203125,
"logps/rejected": -2158.978759765625,
"loss": 0.0474,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.689355850219727,
"rewards/margins": 10.659037590026855,
"rewards/margins_max": 13.989839553833008,
"rewards/margins_min": 7.3282365798950195,
"rewards/margins_std": 4.710465431213379,
"rewards/rejected": -19.3483943939209,
"step": 630
},
{
"epoch": 1.8,
"grad_norm": 37.51094566818964,
"learning_rate": 2.059601383672566e-06,
"logits/chosen": -1.6980371475219727,
"logits/rejected": -1.5178521871566772,
"logps/chosen": -964.2796630859375,
"logps/rejected": -1743.4036865234375,
"loss": 0.0669,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.6180419921875,
"rewards/margins": 8.817036628723145,
"rewards/margins_max": 10.244000434875488,
"rewards/margins_min": 7.390072822570801,
"rewards/margins_std": 2.018031597137451,
"rewards/rejected": -15.435079574584961,
"step": 640
},
{
"epoch": 1.83,
"grad_norm": 0.824336798291059,
"learning_rate": 1.9791516016192214e-06,
"logits/chosen": -1.8461487293243408,
"logits/rejected": -1.5655087232589722,
"logps/chosen": -941.0548706054688,
"logps/rejected": -1621.322265625,
"loss": 0.0587,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -6.0665507316589355,
"rewards/margins": 7.918545722961426,
"rewards/margins_max": 10.15103530883789,
"rewards/margins_min": 5.6860551834106445,
"rewards/margins_std": 3.157217502593994,
"rewards/rejected": -13.985095024108887,
"step": 650
},
{
"epoch": 1.86,
"grad_norm": 0.2329366656877762,
"learning_rate": 1.8992618880565039e-06,
"logits/chosen": -1.4127376079559326,
"logits/rejected": -1.204310655593872,
"logps/chosen": -974.7972412109375,
"logps/rejected": -1706.96484375,
"loss": 0.0472,
"rewards/accuracies": 1.0,
"rewards/chosen": -6.5077385902404785,
"rewards/margins": 8.661420822143555,
"rewards/margins_max": 11.35025691986084,
"rewards/margins_min": 5.972585678100586,
"rewards/margins_std": 3.8025870323181152,
"rewards/rejected": -15.169160842895508,
"step": 660
},
{
"epoch": 1.89,
"grad_norm": 0.2766932797893532,
"learning_rate": 1.8200181484252888e-06,
"logits/chosen": -1.6775104999542236,
"logits/rejected": -1.5603760480880737,
"logps/chosen": -1146.943603515625,
"logps/rejected": -2180.825927734375,
"loss": 0.0303,
"rewards/accuracies": 1.0,
"rewards/chosen": -7.641868591308594,
"rewards/margins": 10.99293327331543,
"rewards/margins_max": 14.466341018676758,
"rewards/margins_min": 7.519525051116943,
"rewards/margins_std": 4.912140369415283,
"rewards/rejected": -18.634801864624023,
"step": 670
},
{
"epoch": 1.92,
"grad_norm": 1.9894517252535326,
"learning_rate": 1.7415055935504234e-06,
"logits/chosen": -1.6779143810272217,
"logits/rejected": -1.3088996410369873,
"logps/chosen": -1250.79345703125,
"logps/rejected": -2332.5302734375,
"loss": 0.0268,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.574339866638184,
"rewards/margins": 11.780553817749023,
"rewards/margins_max": 17.217056274414062,
"rewards/margins_min": 6.344052314758301,
"rewards/margins_std": 7.688374996185303,
"rewards/rejected": -20.35489273071289,
"step": 680
},
{
"epoch": 1.94,
"grad_norm": 1.2264882447915335,
"learning_rate": 1.6638086480134954e-06,
"logits/chosen": -1.133843183517456,
"logits/rejected": -0.9121431112289429,
"logps/chosen": -1320.951171875,
"logps/rejected": -2429.5537109375,
"loss": 0.014,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -10.83985710144043,
"rewards/margins": 12.160634994506836,
"rewards/margins_max": 17.855926513671875,
"rewards/margins_min": 6.465344429016113,
"rewards/margins_std": 8.054357528686523,
"rewards/rejected": -23.000492095947266,
"step": 690
},
{
"epoch": 1.97,
"grad_norm": 4.223913353219136,
"learning_rate": 1.5870108593710473e-06,
"logits/chosen": -1.4314680099487305,
"logits/rejected": -1.1393955945968628,
"logps/chosen": -1421.0302734375,
"logps/rejected": -2616.06005859375,
"loss": 0.018,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.940652847290039,
"rewards/margins": 14.069793701171875,
"rewards/margins_max": 18.50979995727539,
"rewards/margins_min": 9.62978744506836,
"rewards/margins_std": 6.279117584228516,
"rewards/rejected": -24.010446548461914,
"step": 700
},
{
"epoch": 1.97,
"eval_logits/chosen": -1.4977593421936035,
"eval_logits/rejected": -1.4133175611495972,
"eval_logps/chosen": -1938.2783203125,
"eval_logps/rejected": -2177.001708984375,
"eval_loss": 1.8029882907867432,
"eval_rewards/accuracies": 0.6785714030265808,
"eval_rewards/chosen": -16.53057098388672,
"eval_rewards/margins": 2.6476187705993652,
"eval_rewards/margins_max": 11.230785369873047,
"eval_rewards/margins_min": -5.27154541015625,
"eval_rewards/margins_std": 7.43382453918457,
"eval_rewards/rejected": -19.178190231323242,
"eval_runtime": 282.2867,
"eval_samples_per_second": 7.085,
"eval_steps_per_second": 0.223,
"step": 700
},
{
"epoch": 2.0,
"grad_norm": 0.027200756028801846,
"learning_rate": 1.511194808315853e-06,
"logits/chosen": -1.4225877523422241,
"logits/rejected": -1.1490380764007568,
"logps/chosen": -1361.941162109375,
"logps/rejected": -2227.452880859375,
"loss": 0.0423,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -10.556672096252441,
"rewards/margins": 9.88037109375,
"rewards/margins_max": 13.63640022277832,
"rewards/margins_min": 6.124342441558838,
"rewards/margins_std": 5.311827182769775,
"rewards/rejected": -20.437042236328125,
"step": 710
},
{
"epoch": 2.03,
"grad_norm": 0.318786591879142,
"learning_rate": 1.4364420198778662e-06,
"logits/chosen": -1.5894582271575928,
"logits/rejected": -1.3686472177505493,
"logps/chosen": -1422.156005859375,
"logps/rejected": -2683.84814453125,
"loss": 0.0033,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.745410919189453,
"rewards/margins": 12.789144515991211,
"rewards/margins_max": 16.427227020263672,
"rewards/margins_min": 9.15106201171875,
"rewards/margins_std": 5.14502477645874,
"rewards/rejected": -23.53455352783203,
"step": 720
},
{
"epoch": 2.06,
"grad_norm": 1.5807231251466567,
"learning_rate": 1.3628328757603243e-06,
"logits/chosen": -1.6512333154678345,
"logits/rejected": -1.3885473012924194,
"logps/chosen": -1368.7022705078125,
"logps/rejected": -2550.4912109375,
"loss": 0.0091,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.459519386291504,
"rewards/margins": 13.517751693725586,
"rewards/margins_max": 18.180484771728516,
"rewards/margins_min": 8.855023384094238,
"rewards/margins_std": 6.5940961837768555,
"rewards/rejected": -22.97727394104004,
"step": 730
},
{
"epoch": 2.08,
"grad_norm": 0.1516893711186873,
"learning_rate": 1.2904465279052725e-06,
"logits/chosen": -1.6209065914154053,
"logits/rejected": -1.351872444152832,
"logps/chosen": -1231.8480224609375,
"logps/rejected": -2237.622802734375,
"loss": 0.0085,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.798944473266602,
"rewards/margins": 11.324702262878418,
"rewards/margins_max": 13.88591480255127,
"rewards/margins_min": 8.763489723205566,
"rewards/margins_std": 3.6221022605895996,
"rewards/rejected": -20.123645782470703,
"step": 740
},
{
"epoch": 2.11,
"grad_norm": 0.8035507691467565,
"learning_rate": 1.219360813381446e-06,
"logits/chosen": -1.247396469116211,
"logits/rejected": -1.033151388168335,
"logps/chosen": -1316.85546875,
"logps/rejected": -2502.35400390625,
"loss": 0.0042,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.282798767089844,
"rewards/margins": 12.374329566955566,
"rewards/margins_max": 16.396432876586914,
"rewards/margins_min": 8.352226257324219,
"rewards/margins_std": 5.688112258911133,
"rewards/rejected": -23.657127380371094,
"step": 750
},
{
"epoch": 2.14,
"grad_norm": 0.10201527009610997,
"learning_rate": 1.1496521706860392e-06,
"logits/chosen": -1.5233542919158936,
"logits/rejected": -1.1838680505752563,
"logps/chosen": -1417.0087890625,
"logps/rejected": -2805.773681640625,
"loss": 0.0051,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.839475631713867,
"rewards/margins": 14.590258598327637,
"rewards/margins_max": 17.661457061767578,
"rewards/margins_min": 11.519063949584961,
"rewards/margins_std": 4.343328475952148,
"rewards/rejected": -25.429737091064453,
"step": 760
},
{
"epoch": 2.17,
"grad_norm": 0.0015806759819360625,
"learning_rate": 1.0813955575503588e-06,
"logits/chosen": -1.355691909790039,
"logits/rejected": -1.144424557685852,
"logps/chosen": -1348.842041015625,
"logps/rejected": -2898.0224609375,
"loss": 0.0066,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.152058601379395,
"rewards/margins": 16.251543045043945,
"rewards/margins_max": 22.687950134277344,
"rewards/margins_min": 9.815134048461914,
"rewards/margins_std": 9.102456092834473,
"rewards/rejected": -26.40359878540039,
"step": 770
},
{
"epoch": 2.2,
"grad_norm": 0.408380187113466,
"learning_rate": 1.0146643703377488e-06,
"logits/chosen": -1.6056991815567017,
"logits/rejected": -1.3266913890838623,
"logps/chosen": -1298.9927978515625,
"logps/rejected": -2409.390869140625,
"loss": 0.0075,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.719507217407227,
"rewards/margins": 12.09427261352539,
"rewards/margins_max": 15.695422172546387,
"rewards/margins_min": 8.493124008178711,
"rewards/margins_std": 5.092793941497803,
"rewards/rejected": -21.813779830932617,
"step": 780
},
{
"epoch": 2.23,
"grad_norm": 0.001344347508367163,
"learning_rate": 9.495303651204496e-07,
"logits/chosen": -1.563906192779541,
"logits/rejected": -1.3474560976028442,
"logps/chosen": -1254.9219970703125,
"logps/rejected": -2623.2822265625,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.889430046081543,
"rewards/margins": 14.575796127319336,
"rewards/margins_max": 18.69800567626953,
"rewards/margins_min": 10.453584671020508,
"rewards/margins_std": 5.829684734344482,
"rewards/rejected": -23.465227127075195,
"step": 790
},
{
"epoch": 2.25,
"grad_norm": 1.6920469977748351,
"learning_rate": 8.860635805202616e-07,
"logits/chosen": -1.551922082901001,
"logits/rejected": -1.2580442428588867,
"logps/chosen": -1456.9490966796875,
"logps/rejected": -2604.62744140625,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.935505867004395,
"rewards/margins": 12.657417297363281,
"rewards/margins_max": 15.51282024383545,
"rewards/margins_min": 9.802014350891113,
"rewards/margins_std": 4.038149833679199,
"rewards/rejected": -23.59292221069336,
"step": 800
},
{
"epoch": 2.25,
"eval_logits/chosen": -1.5266377925872803,
"eval_logits/rejected": -1.4433014392852783,
"eval_logps/chosen": -1957.578857421875,
"eval_logps/rejected": -2208.484375,
"eval_loss": 1.8519227504730225,
"eval_rewards/accuracies": 0.6746031641960144,
"eval_rewards/chosen": -16.72357749938965,
"eval_rewards/margins": 2.7694385051727295,
"eval_rewards/margins_max": 11.662981033325195,
"eval_rewards/margins_min": -5.304656982421875,
"eval_rewards/margins_std": 7.62367582321167,
"eval_rewards/rejected": -19.493017196655273,
"eval_runtime": 282.5434,
"eval_samples_per_second": 7.079,
"eval_steps_per_second": 0.223,
"step": 800
},
{
"epoch": 2.28,
"grad_norm": 3.2305387145726234,
"learning_rate": 8.24332262395994e-07,
"logits/chosen": -1.5742024183273315,
"logits/rejected": -1.3343318700790405,
"logps/chosen": -1459.0062255859375,
"logps/rejected": -2835.21044921875,
"loss": 0.0055,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.644388198852539,
"rewards/margins": 14.268835067749023,
"rewards/margins_max": 19.221527099609375,
"rewards/margins_min": 9.316144943237305,
"rewards/margins_std": 7.0041632652282715,
"rewards/rejected": -25.913223266601562,
"step": 810
},
{
"epoch": 2.31,
"grad_norm": 0.26542768442550385,
"learning_rate": 7.644027904586587e-07,
"logits/chosen": -1.50737726688385,
"logits/rejected": -1.2445927858352661,
"logps/chosen": -1452.3663330078125,
"logps/rejected": -2697.02880859375,
"loss": 0.0067,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.468404769897461,
"rewards/margins": 13.425836563110352,
"rewards/margins_max": 16.106616973876953,
"rewards/margins_min": 10.745055198669434,
"rewards/margins_std": 3.791196823120117,
"rewards/rejected": -24.894241333007812,
"step": 820
},
{
"epoch": 2.34,
"grad_norm": 0.8567763833713586,
"learning_rate": 7.06339606893347e-07,
"logits/chosen": -1.6803547143936157,
"logits/rejected": -1.4048993587493896,
"logps/chosen": -1588.3795166015625,
"logps/rejected": -2856.94873046875,
"loss": 0.0218,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.344830513000488,
"rewards/margins": 14.68242073059082,
"rewards/margins_max": 20.33969497680664,
"rewards/margins_min": 9.025145530700684,
"rewards/margins_std": 8.000594139099121,
"rewards/rejected": -26.02724838256836,
"step": 830
},
{
"epoch": 2.37,
"grad_norm": 0.19797390603665133,
"learning_rate": 6.502051470645149e-07,
"logits/chosen": -1.7654281854629517,
"logits/rejected": -1.40230393409729,
"logps/chosen": -1327.5189208984375,
"logps/rejected": -2276.90771484375,
"loss": 0.0218,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.360559463500977,
"rewards/margins": 10.55632495880127,
"rewards/margins_max": 12.99437141418457,
"rewards/margins_min": 8.118279457092285,
"rewards/margins_std": 3.4479167461395264,
"rewards/rejected": -19.916885375976562,
"step": 840
},
{
"epoch": 2.39,
"grad_norm": 0.0023467881665189677,
"learning_rate": 5.960597723792194e-07,
"logits/chosen": -1.5812981128692627,
"logits/rejected": -1.1608024835586548,
"logps/chosen": -1374.124267578125,
"logps/rejected": -2819.462158203125,
"loss": 0.0049,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.890588760375977,
"rewards/margins": 15.723424911499023,
"rewards/margins_max": 21.0240421295166,
"rewards/margins_min": 10.422807693481445,
"rewards/margins_std": 7.4962053298950195,
"rewards/rejected": -25.614009857177734,
"step": 850
},
{
"epoch": 2.42,
"grad_norm": 1.4084849928658003,
"learning_rate": 5.43961705380465e-07,
"logits/chosen": -1.646162986755371,
"logits/rejected": -1.4091808795928955,
"logps/chosen": -1218.2606201171875,
"logps/rejected": -2409.643798828125,
"loss": 0.0078,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -8.628401756286621,
"rewards/margins": 12.78498649597168,
"rewards/margins_max": 17.431535720825195,
"rewards/margins_min": 8.138437271118164,
"rewards/margins_std": 6.5712127685546875,
"rewards/rejected": -21.413387298583984,
"step": 860
},
{
"epoch": 2.45,
"grad_norm": 0.13595105985996128,
"learning_rate": 4.939669671404871e-07,
"logits/chosen": -1.5396533012390137,
"logits/rejected": -1.2183513641357422,
"logps/chosen": -1237.326904296875,
"logps/rejected": -3156.015380859375,
"loss": 0.0039,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.967730522155762,
"rewards/margins": 19.433839797973633,
"rewards/margins_max": 26.383316040039062,
"rewards/margins_min": 12.484365463256836,
"rewards/margins_std": 9.828042984008789,
"rewards/rejected": -28.40157127380371,
"step": 870
},
{
"epoch": 2.48,
"grad_norm": 0.012403182973777866,
"learning_rate": 4.461293170212644e-07,
"logits/chosen": -1.6268768310546875,
"logits/rejected": -1.3297674655914307,
"logps/chosen": -1231.2391357421875,
"logps/rejected": -2482.310546875,
"loss": 0.0125,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.140237808227539,
"rewards/margins": 13.229069709777832,
"rewards/margins_max": 16.058679580688477,
"rewards/margins_min": 10.399457931518555,
"rewards/margins_std": 4.001674175262451,
"rewards/rejected": -22.369308471679688,
"step": 880
},
{
"epoch": 2.51,
"grad_norm": 5.925107209728559,
"learning_rate": 4.005001948670606e-07,
"logits/chosen": -1.7953965663909912,
"logits/rejected": -1.5808696746826172,
"logps/chosen": -1377.26611328125,
"logps/rejected": -2234.20849609375,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.003216743469238,
"rewards/margins": 10.078218460083008,
"rewards/margins_max": 11.774847030639648,
"rewards/margins_min": 8.381589889526367,
"rewards/margins_std": 2.39939546585083,
"rewards/rejected": -19.08143424987793,
"step": 890
},
{
"epoch": 2.54,
"grad_norm": 0.0018034560654693567,
"learning_rate": 3.571286656911377e-07,
"logits/chosen": -1.6509956121444702,
"logits/rejected": -1.2617855072021484,
"logps/chosen": -1374.924072265625,
"logps/rejected": -2686.83154296875,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.74584674835205,
"rewards/margins": 14.469047546386719,
"rewards/margins_max": 20.866533279418945,
"rewards/margins_min": 8.071561813354492,
"rewards/margins_std": 9.04741096496582,
"rewards/rejected": -24.214895248413086,
"step": 900
},
{
"epoch": 2.54,
"eval_logits/chosen": -1.5324345827102661,
"eval_logits/rejected": -1.4488511085510254,
"eval_logps/chosen": -1899.9781494140625,
"eval_logps/rejected": -2137.156982421875,
"eval_loss": 1.6798701286315918,
"eval_rewards/accuracies": 0.6865079402923584,
"eval_rewards/chosen": -16.14756965637207,
"eval_rewards/margins": 2.632173776626587,
"eval_rewards/margins_max": 10.763092994689941,
"eval_rewards/margins_min": -4.875840663909912,
"eval_rewards/margins_std": 7.033862590789795,
"eval_rewards/rejected": -18.77974510192871,
"eval_runtime": 281.9065,
"eval_samples_per_second": 7.095,
"eval_steps_per_second": 0.223,
"step": 900
},
{
"epoch": 2.56,
"grad_norm": 0.39851941407344293,
"learning_rate": 3.1606136691612555e-07,
"logits/chosen": -1.7041774988174438,
"logits/rejected": -1.4187756776809692,
"logps/chosen": -1301.1878662109375,
"logps/rejected": -2172.826904296875,
"loss": 0.001,
"rewards/accuracies": 1.0,
"rewards/chosen": -8.820059776306152,
"rewards/margins": 10.524114608764648,
"rewards/margins_max": 12.688272476196289,
"rewards/margins_min": 8.359955787658691,
"rewards/margins_std": 3.060582160949707,
"rewards/rejected": -19.344173431396484,
"step": 910
},
{
"epoch": 2.59,
"grad_norm": 0.0005374838985619683,
"learning_rate": 2.773424582247844e-07,
"logits/chosen": -1.5690796375274658,
"logits/rejected": -1.2215526103973389,
"logps/chosen": -1358.075927734375,
"logps/rejected": -2381.899169921875,
"loss": 0.0024,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.972057342529297,
"rewards/margins": 11.921777725219727,
"rewards/margins_max": 14.729642868041992,
"rewards/margins_min": 9.113912582397461,
"rewards/margins_std": 3.970921754837036,
"rewards/rejected": -21.893835067749023,
"step": 920
},
{
"epoch": 2.62,
"grad_norm": 0.8257494267996711,
"learning_rate": 2.410135740750821e-07,
"logits/chosen": -1.5338929891586304,
"logits/rejected": -1.259865164756775,
"logps/chosen": -1410.4990234375,
"logps/rejected": -2998.914794921875,
"loss": 0.0099,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.63892936706543,
"rewards/margins": 16.653705596923828,
"rewards/margins_max": 21.365177154541016,
"rewards/margins_min": 11.942238807678223,
"rewards/margins_std": 6.663023471832275,
"rewards/rejected": -27.29263687133789,
"step": 930
},
{
"epoch": 2.65,
"grad_norm": 0.06916221157748438,
"learning_rate": 2.0711377893064182e-07,
"logits/chosen": -1.5516988039016724,
"logits/rejected": -1.2729582786560059,
"logps/chosen": -1308.211669921875,
"logps/rejected": -2490.35693359375,
"loss": 0.0053,
"rewards/accuracies": 1.0,
"rewards/chosen": -9.68997573852539,
"rewards/margins": 13.111665725708008,
"rewards/margins_max": 18.273632049560547,
"rewards/margins_min": 7.9496965408325195,
"rewards/margins_std": 7.300126075744629,
"rewards/rejected": -22.801639556884766,
"step": 940
},
{
"epoch": 2.68,
"grad_norm": 2.498417925921994,
"learning_rate": 1.756795252547111e-07,
"logits/chosen": -1.4785737991333008,
"logits/rejected": -1.2068592309951782,
"logps/chosen": -1470.0135498046875,
"logps/rejected": -2859.243408203125,
"loss": 0.0078,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.678686141967773,
"rewards/margins": 14.885587692260742,
"rewards/margins_max": 18.92436981201172,
"rewards/margins_min": 10.846805572509766,
"rewards/margins_std": 5.7117018699646,
"rewards/rejected": -26.564273834228516,
"step": 950
},
{
"epoch": 2.7,
"grad_norm": 0.30835027385045066,
"learning_rate": 1.4674461431281013e-07,
"logits/chosen": -1.6750847101211548,
"logits/rejected": -1.3757655620574951,
"logps/chosen": -1276.86669921875,
"logps/rejected": -2703.418701171875,
"loss": 0.0151,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -9.499726295471191,
"rewards/margins": 15.09521198272705,
"rewards/margins_max": 21.079849243164062,
"rewards/margins_min": 9.11056900024414,
"rewards/margins_std": 8.463561058044434,
"rewards/rejected": -24.59493637084961,
"step": 960
},
{
"epoch": 2.73,
"grad_norm": 0.23235990194938522,
"learning_rate": 1.2034015982622243e-07,
"logits/chosen": -1.5666346549987793,
"logits/rejected": -1.2590982913970947,
"logps/chosen": -1482.5379638671875,
"logps/rejected": -2852.9375,
"loss": 0.0003,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.074012756347656,
"rewards/margins": 14.420585632324219,
"rewards/margins_max": 18.83799934387207,
"rewards/margins_min": 10.003174781799316,
"rewards/margins_std": 6.24716329574585,
"rewards/rejected": -25.494598388671875,
"step": 970
},
{
"epoch": 2.76,
"grad_norm": 0.003130078676672441,
"learning_rate": 9.649455451539419e-08,
"logits/chosen": -1.2376658916473389,
"logits/rejected": -0.9727104306221008,
"logps/chosen": -1320.026123046875,
"logps/rejected": -2890.248291015625,
"loss": 0.0043,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.823871612548828,
"rewards/margins": 16.33503532409668,
"rewards/margins_max": 22.118406295776367,
"rewards/margins_min": 10.551666259765625,
"rewards/margins_std": 8.178921699523926,
"rewards/rejected": -27.15890884399414,
"step": 980
},
{
"epoch": 2.79,
"grad_norm": 0.01106748013868886,
"learning_rate": 7.523343956923196e-08,
"logits/chosen": -1.6014173030853271,
"logits/rejected": -1.3725566864013672,
"logps/chosen": -1455.7508544921875,
"logps/rejected": -2784.856201171875,
"loss": 0.0014,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.036726951599121,
"rewards/margins": 13.958398818969727,
"rewards/margins_max": 18.721614837646484,
"rewards/margins_min": 9.19517993927002,
"rewards/margins_std": 6.736205101013184,
"rewards/rejected": -24.995126724243164,
"step": 990
},
{
"epoch": 2.82,
"grad_norm": 0.21777107682252947,
"learning_rate": 5.657967707312195e-08,
"logits/chosen": -1.4147546291351318,
"logits/rejected": -1.2533682584762573,
"logps/chosen": -1340.80859375,
"logps/rejected": -2710.937255859375,
"loss": 0.0118,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.588825225830078,
"rewards/margins": 13.658398628234863,
"rewards/margins_max": 17.033788681030273,
"rewards/margins_min": 10.28300666809082,
"rewards/margins_std": 4.773523807525635,
"rewards/rejected": -24.247220993041992,
"step": 1000
},
{
"epoch": 2.82,
"eval_logits/chosen": -1.51563560962677,
"eval_logits/rejected": -1.4296027421951294,
"eval_logps/chosen": -1952.324462890625,
"eval_logps/rejected": -2219.474609375,
"eval_loss": 1.8351484537124634,
"eval_rewards/accuracies": 0.682539701461792,
"eval_rewards/chosen": -16.671031951904297,
"eval_rewards/margins": 2.931889057159424,
"eval_rewards/margins_max": 11.962862014770508,
"eval_rewards/margins_min": -5.289890766143799,
"eval_rewards/margins_std": 7.766205787658691,
"eval_rewards/rejected": -19.602922439575195,
"eval_runtime": 281.5027,
"eval_samples_per_second": 7.105,
"eval_steps_per_second": 0.224,
"step": 1000
},
{
"epoch": 2.85,
"grad_norm": 0.4419550733032763,
"learning_rate": 4.055332542531959e-08,
"logits/chosen": -1.5433815717697144,
"logits/rejected": -1.295972228050232,
"logps/chosen": -1293.6630859375,
"logps/rejected": -2648.736572265625,
"loss": 0.0096,
"rewards/accuracies": 0.949999988079071,
"rewards/chosen": -9.779963493347168,
"rewards/margins": 14.112527847290039,
"rewards/margins_max": 18.39639663696289,
"rewards/margins_min": 9.828656196594238,
"rewards/margins_std": 6.058306694030762,
"rewards/rejected": -23.89249038696289,
"step": 1010
},
{
"epoch": 2.87,
"grad_norm": 0.14005943320430667,
"learning_rate": 2.7171617768147472e-08,
"logits/chosen": -1.398990273475647,
"logits/rejected": -1.063157320022583,
"logps/chosen": -1454.0186767578125,
"logps/rejected": -2948.3251953125,
"loss": 0.0009,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.425373077392578,
"rewards/margins": 15.727473258972168,
"rewards/margins_max": 20.60434341430664,
"rewards/margins_min": 10.850606918334961,
"rewards/margins_std": 6.896933078765869,
"rewards/rejected": -27.152847290039062,
"step": 1020
},
{
"epoch": 2.9,
"grad_norm": 0.2626213621970617,
"learning_rate": 1.6448943457189616e-08,
"logits/chosen": -1.5582804679870605,
"logits/rejected": -1.3218994140625,
"logps/chosen": -1478.698974609375,
"logps/rejected": -2884.353271484375,
"loss": 0.0057,
"rewards/accuracies": 1.0,
"rewards/chosen": -11.135309219360352,
"rewards/margins": 14.943025588989258,
"rewards/margins_max": 20.703128814697266,
"rewards/margins_min": 9.1829195022583,
"rewards/margins_std": 8.146018981933594,
"rewards/rejected": -26.07833480834961,
"step": 1030
},
{
"epoch": 2.93,
"grad_norm": 2.8326701528782565,
"learning_rate": 8.39683258841123e-09,
"logits/chosen": -1.5044890642166138,
"logits/rejected": -1.2109694480895996,
"logps/chosen": -1402.8773193359375,
"logps/rejected": -2849.219970703125,
"loss": 0.0062,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.566572189331055,
"rewards/margins": 15.638870239257812,
"rewards/margins_max": 20.092174530029297,
"rewards/margins_min": 11.185564041137695,
"rewards/margins_std": 6.297926425933838,
"rewards/rejected": -26.2054443359375,
"step": 1040
},
{
"epoch": 2.96,
"grad_norm": 0.3213477153635432,
"learning_rate": 3.0239435998430376e-09,
"logits/chosen": -1.4634066820144653,
"logits/rejected": -1.1483074426651,
"logps/chosen": -1369.406494140625,
"logps/rejected": -2688.2548828125,
"loss": 0.0034,
"rewards/accuracies": 1.0,
"rewards/chosen": -10.378218650817871,
"rewards/margins": 13.92640495300293,
"rewards/margins_max": 18.696613311767578,
"rewards/margins_min": 9.156195640563965,
"rewards/margins_std": 6.746094703674316,
"rewards/rejected": -24.304622650146484,
"step": 1050
},
{
"epoch": 2.99,
"grad_norm": 0.31694097428400714,
"learning_rate": 3.3605396115826695e-10,
"logits/chosen": -1.4050662517547607,
"logits/rejected": -1.1527583599090576,
"logps/chosen": -1549.754150390625,
"logps/rejected": -2639.6474609375,
"loss": 0.0027,
"rewards/accuracies": 1.0,
"rewards/chosen": -12.238971710205078,
"rewards/margins": 12.063154220581055,
"rewards/margins_max": 15.284955978393555,
"rewards/margins_min": 8.841352462768555,
"rewards/margins_std": 4.5563154220581055,
"rewards/rejected": -24.302127838134766,
"step": 1060
},
{
"epoch": 3.0,
"step": 1065,
"total_flos": 0.0,
"train_loss": 0.1103198329137612,
"train_runtime": 9245.0119,
"train_samples_per_second": 1.843,
"train_steps_per_second": 0.115
}
],
"logging_steps": 10,
"max_steps": 1065,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}