Meta-Llama-3-8B-Base-MI-6e-7 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.998691442030882,
"eval_steps": 500,
"global_step": 477,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.010468463752944255,
"grad_norm": 34.35367990587695,
"learning_rate": 6.25e-08,
"logits/chosen": -0.49797338247299194,
"logits/rejected": -0.5135231018066406,
"logps/chosen": -1.1745355129241943,
"logps/rejected": -1.3596293926239014,
"loss": 2.1735,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1745355129241943,
"rewards/margins": 0.18509384989738464,
"rewards/rejected": -1.3596293926239014,
"step": 5
},
{
"epoch": 0.02093692750588851,
"grad_norm": 16.378216146989434,
"learning_rate": 1.25e-07,
"logits/chosen": -0.521752655506134,
"logits/rejected": -0.4988512396812439,
"logps/chosen": -1.1591465473175049,
"logps/rejected": -1.2624419927597046,
"loss": 2.1407,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1591465473175049,
"rewards/margins": 0.10329560935497284,
"rewards/rejected": -1.2624419927597046,
"step": 10
},
{
"epoch": 0.031405391258832765,
"grad_norm": 22.732280640563598,
"learning_rate": 1.875e-07,
"logits/chosen": -0.46235981583595276,
"logits/rejected": -0.4507545530796051,
"logps/chosen": -1.1068508625030518,
"logps/rejected": -1.361823558807373,
"loss": 2.1077,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.1068508625030518,
"rewards/margins": 0.2549726366996765,
"rewards/rejected": -1.361823558807373,
"step": 15
},
{
"epoch": 0.04187385501177702,
"grad_norm": 22.78289950681047,
"learning_rate": 2.5e-07,
"logits/chosen": -0.4406924247741699,
"logits/rejected": -0.4528113007545471,
"logps/chosen": -1.161055564880371,
"logps/rejected": -1.2642455101013184,
"loss": 2.167,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.161055564880371,
"rewards/margins": 0.10318990051746368,
"rewards/rejected": -1.2642455101013184,
"step": 20
},
{
"epoch": 0.05234231876472128,
"grad_norm": 13.313947854898018,
"learning_rate": 3.125e-07,
"logits/chosen": -0.5021263360977173,
"logits/rejected": -0.47814303636550903,
"logps/chosen": -1.1769291162490845,
"logps/rejected": -1.2403558492660522,
"loss": 2.1418,
"rewards/accuracies": 0.5249999761581421,
"rewards/chosen": -1.1769291162490845,
"rewards/margins": 0.06342674791812897,
"rewards/rejected": -1.2403558492660522,
"step": 25
},
{
"epoch": 0.06281078251766553,
"grad_norm": 21.17299953179056,
"learning_rate": 3.75e-07,
"logits/chosen": -0.4899294972419739,
"logits/rejected": -0.49411076307296753,
"logps/chosen": -1.1576581001281738,
"logps/rejected": -1.280582070350647,
"loss": 2.1692,
"rewards/accuracies": 0.512499988079071,
"rewards/chosen": -1.1576581001281738,
"rewards/margins": 0.12292404472827911,
"rewards/rejected": -1.280582070350647,
"step": 30
},
{
"epoch": 0.07327924627060979,
"grad_norm": 21.15597636264796,
"learning_rate": 4.3749999999999994e-07,
"logits/chosen": -0.4856337904930115,
"logits/rejected": -0.4433709979057312,
"logps/chosen": -1.1427704095840454,
"logps/rejected": -1.2787848711013794,
"loss": 2.134,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1427704095840454,
"rewards/margins": 0.13601449131965637,
"rewards/rejected": -1.2787848711013794,
"step": 35
},
{
"epoch": 0.08374771002355404,
"grad_norm": 29.038714028264685,
"learning_rate": 5e-07,
"logits/chosen": -0.4945921301841736,
"logits/rejected": -0.4987305998802185,
"logps/chosen": -1.0738334655761719,
"logps/rejected": -1.39645516872406,
"loss": 2.0884,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.0738334655761719,
"rewards/margins": 0.3226216435432434,
"rewards/rejected": -1.39645516872406,
"step": 40
},
{
"epoch": 0.0942161737764983,
"grad_norm": 30.530804134708777,
"learning_rate": 5.625e-07,
"logits/chosen": -0.45864447951316833,
"logits/rejected": -0.4690025746822357,
"logps/chosen": -1.090343952178955,
"logps/rejected": -1.317134976387024,
"loss": 2.1074,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.090343952178955,
"rewards/margins": 0.22679109871387482,
"rewards/rejected": -1.317134976387024,
"step": 45
},
{
"epoch": 0.10468463752944256,
"grad_norm": 21.789634951246953,
"learning_rate": 5.999678242522831e-07,
"logits/chosen": -0.4777728021144867,
"logits/rejected": -0.49264296889305115,
"logps/chosen": -1.1642675399780273,
"logps/rejected": -1.4595439434051514,
"loss": 2.1327,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1642675399780273,
"rewards/margins": 0.2952764332294464,
"rewards/rejected": -1.4595439434051514,
"step": 50
},
{
"epoch": 0.11515310128238682,
"grad_norm": 175.8483724549221,
"learning_rate": 5.996059263493219e-07,
"logits/chosen": -0.4417606294155121,
"logits/rejected": -0.4309404492378235,
"logps/chosen": -1.1178853511810303,
"logps/rejected": -1.343202829360962,
"loss": 2.106,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.1178853511810303,
"rewards/margins": 0.22531744837760925,
"rewards/rejected": -1.343202829360962,
"step": 55
},
{
"epoch": 0.12562156503533106,
"grad_norm": 18.6141832401793,
"learning_rate": 5.988423976115163e-07,
"logits/chosen": -0.48267728090286255,
"logits/rejected": -0.47969865798950195,
"logps/chosen": -1.2083253860473633,
"logps/rejected": -1.33084237575531,
"loss": 2.0877,
"rewards/accuracies": 0.5562499761581421,
"rewards/chosen": -1.2083253860473633,
"rewards/margins": 0.12251707166433334,
"rewards/rejected": -1.33084237575531,
"step": 60
},
{
"epoch": 0.1360900287882753,
"grad_norm": 32.36335168763983,
"learning_rate": 5.976782615723061e-07,
"logits/chosen": -0.44251489639282227,
"logits/rejected": -0.40677833557128906,
"logps/chosen": -1.102402925491333,
"logps/rejected": -1.6289558410644531,
"loss": 2.0703,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.102402925491333,
"rewards/margins": 0.5265528559684753,
"rewards/rejected": -1.6289558410644531,
"step": 65
},
{
"epoch": 0.14655849254121958,
"grad_norm": 48.08034129839621,
"learning_rate": 5.961150787913738e-07,
"logits/chosen": -0.3722413182258606,
"logits/rejected": -0.3657040297985077,
"logps/chosen": -1.1620112657546997,
"logps/rejected": -1.426941990852356,
"loss": 2.0511,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.1620112657546997,
"rewards/margins": 0.2649305760860443,
"rewards/rejected": -1.426941990852356,
"step": 70
},
{
"epoch": 0.15702695629416383,
"grad_norm": 59.84416296380324,
"learning_rate": 5.941549447626671e-07,
"logits/chosen": -0.37373992800712585,
"logits/rejected": -0.3564635217189789,
"logps/chosen": -1.1539630889892578,
"logps/rejected": -1.505507230758667,
"loss": 2.0661,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1539630889892578,
"rewards/margins": 0.3515442907810211,
"rewards/rejected": -1.505507230758667,
"step": 75
},
{
"epoch": 0.16749542004710807,
"grad_norm": 25.67817680738418,
"learning_rate": 5.918004871053251e-07,
"logits/chosen": -0.4051768183708191,
"logits/rejected": -0.38645023107528687,
"logps/chosen": -1.1455223560333252,
"logps/rejected": -1.5474026203155518,
"loss": 2.0988,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1455223560333252,
"rewards/margins": 0.40188026428222656,
"rewards/rejected": -1.5474026203155518,
"step": 80
},
{
"epoch": 0.17796388380005235,
"grad_norm": 34.17749794765947,
"learning_rate": 5.890548620412763e-07,
"logits/chosen": -0.45642200112342834,
"logits/rejected": -0.42394012212753296,
"logps/chosen": -1.1208285093307495,
"logps/rejected": -1.4425103664398193,
"loss": 2.0996,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1208285093307495,
"rewards/margins": 0.3216818571090698,
"rewards/rejected": -1.4425103664398193,
"step": 85
},
{
"epoch": 0.1884323475529966,
"grad_norm": 10.063019778918575,
"learning_rate": 5.859217501642258e-07,
"logits/chosen": -0.4428345561027527,
"logits/rejected": -0.4144333004951477,
"logps/chosen": -1.1374626159667969,
"logps/rejected": -1.4302767515182495,
"loss": 2.0461,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1374626159667969,
"rewards/margins": 0.2928140461444855,
"rewards/rejected": -1.4302767515182495,
"step": 90
},
{
"epoch": 0.19890081130594087,
"grad_norm": 20.74891634135048,
"learning_rate": 5.824053515057091e-07,
"logits/chosen": -0.479747474193573,
"logits/rejected": -0.39375734329223633,
"logps/chosen": -1.159339189529419,
"logps/rejected": -1.4628812074661255,
"loss": 2.0797,
"rewards/accuracies": 0.5687500238418579,
"rewards/chosen": -1.159339189529419,
"rewards/margins": 0.30354195833206177,
"rewards/rejected": -1.4628812074661255,
"step": 95
},
{
"epoch": 0.2093692750588851,
"grad_norm": 33.44699335449945,
"learning_rate": 5.785103799048218e-07,
"logits/chosen": -0.4181644022464752,
"logits/rejected": -0.38857489824295044,
"logps/chosen": -1.1860034465789795,
"logps/rejected": -1.7242858409881592,
"loss": 2.0744,
"rewards/accuracies": 0.53125,
"rewards/chosen": -1.1860034465789795,
"rewards/margins": 0.5382825136184692,
"rewards/rejected": -1.7242858409881592,
"step": 100
},
{
"epoch": 0.21983773881182936,
"grad_norm": 59.5883485681765,
"learning_rate": 5.742420566891749e-07,
"logits/chosen": -0.4103716015815735,
"logits/rejected": -0.3968586027622223,
"logps/chosen": -1.076053261756897,
"logps/rejected": -1.4917399883270264,
"loss": 2.0443,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.076053261756897,
"rewards/margins": 0.4156867563724518,
"rewards/rejected": -1.4917399883270264,
"step": 105
},
{
"epoch": 0.23030620256477363,
"grad_norm": 80.31752384431189,
"learning_rate": 5.696061036755478e-07,
"logits/chosen": -0.4146521985530853,
"logits/rejected": -0.38292473554611206,
"logps/chosen": -1.0837781429290771,
"logps/rejected": -1.5503333806991577,
"loss": 2.0282,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0837781429290771,
"rewards/margins": 0.4665554165840149,
"rewards/rejected": -1.5503333806991577,
"step": 110
},
{
"epoch": 0.24077466631771788,
"grad_norm": 98.56244903302768,
"learning_rate": 5.64608735499618e-07,
"logits/chosen": -0.28679025173187256,
"logits/rejected": -0.2377845048904419,
"logps/chosen": -1.111647367477417,
"logps/rejected": -1.687975287437439,
"loss": 2.0185,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.111647367477417,
"rewards/margins": 0.576327919960022,
"rewards/rejected": -1.687975287437439,
"step": 115
},
{
"epoch": 0.2512431300706621,
"grad_norm": 20.27186258736798,
"learning_rate": 5.592566512850545e-07,
"logits/chosen": -0.27318406105041504,
"logits/rejected": -0.2487379014492035,
"logps/chosen": -1.2702689170837402,
"logps/rejected": -1.556516408920288,
"loss": 2.0975,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.2702689170837402,
"rewards/margins": 0.2862473428249359,
"rewards/rejected": -1.556516408920288,
"step": 120
},
{
"epoch": 0.26171159382360637,
"grad_norm": 30.996972753737214,
"learning_rate": 5.535570256631384e-07,
"logits/chosen": -0.32224705815315247,
"logits/rejected": -0.2949572503566742,
"logps/chosen": -1.0966984033584595,
"logps/rejected": -1.425978660583496,
"loss": 2.0513,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0966984033584595,
"rewards/margins": 0.3292803168296814,
"rewards/rejected": -1.425978660583496,
"step": 125
},
{
"epoch": 0.2721800575765506,
"grad_norm": 106.95100145667757,
"learning_rate": 5.475174991549528e-07,
"logits/chosen": -0.2838582396507263,
"logits/rejected": -0.27573472261428833,
"logps/chosen": -1.065918207168579,
"logps/rejected": -1.2746469974517822,
"loss": 2.0692,
"rewards/accuracies": 0.5874999761581421,
"rewards/chosen": -1.065918207168579,
"rewards/margins": 0.2087288349866867,
"rewards/rejected": -1.2746469974517822,
"step": 130
},
{
"epoch": 0.2826485213294949,
"grad_norm": 18.450255655703543,
"learning_rate": 5.411461679290317e-07,
"logits/chosen": -0.33663275837898254,
"logits/rejected": -0.24262118339538574,
"logps/chosen": -1.1360746622085571,
"logps/rejected": -1.793914556503296,
"loss": 2.0112,
"rewards/accuracies": 0.7562500238418579,
"rewards/chosen": -1.1360746622085571,
"rewards/margins": 0.6578398942947388,
"rewards/rejected": -1.793914556503296,
"step": 135
},
{
"epoch": 0.29311698508243916,
"grad_norm": 20.28586481649801,
"learning_rate": 5.34451572948201e-07,
"logits/chosen": -0.25463438034057617,
"logits/rejected": -0.18874910473823547,
"logps/chosen": -1.2043225765228271,
"logps/rejected": -1.7759593725204468,
"loss": 1.9956,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2043225765228271,
"rewards/margins": 0.5716367959976196,
"rewards/rejected": -1.7759593725204468,
"step": 140
},
{
"epoch": 0.3035854488353834,
"grad_norm": 16.12998962894494,
"learning_rate": 5.274426885201582e-07,
"logits/chosen": -0.3120715320110321,
"logits/rejected": -0.28251615166664124,
"logps/chosen": -1.1517010927200317,
"logps/rejected": -1.5577958822250366,
"loss": 2.04,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.1517010927200317,
"rewards/margins": 0.4060949683189392,
"rewards/rejected": -1.5577958822250366,
"step": 145
},
{
"epoch": 0.31405391258832765,
"grad_norm": 18.374398699221082,
"learning_rate": 5.201289102671411e-07,
"logits/chosen": -0.29038459062576294,
"logits/rejected": -0.25689178705215454,
"logps/chosen": -1.0535448789596558,
"logps/rejected": -1.4643501043319702,
"loss": 2.0025,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.0535448789596558,
"rewards/margins": 0.41080522537231445,
"rewards/rejected": -1.4643501043319702,
"step": 150
},
{
"epoch": 0.3245223763412719,
"grad_norm": 21.42424041369703,
"learning_rate": 5.12520042530811e-07,
"logits/chosen": -0.33090248703956604,
"logits/rejected": -0.26949256658554077,
"logps/chosen": -1.1385178565979004,
"logps/rejected": -1.5117579698562622,
"loss": 2.0025,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1385178565979004,
"rewards/margins": 0.3732401728630066,
"rewards/rejected": -1.5117579698562622,
"step": 155
},
{
"epoch": 0.33499084009421615,
"grad_norm": 17.090434114730737,
"learning_rate": 5.046262852292346e-07,
"logits/chosen": -0.2471882402896881,
"logits/rejected": -0.1910603940486908,
"logps/chosen": -1.179958701133728,
"logps/rejected": -1.6289422512054443,
"loss": 2.0344,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.179958701133728,
"rewards/margins": 0.4489835202693939,
"rewards/rejected": -1.6289422512054443,
"step": 160
},
{
"epoch": 0.34545930384716045,
"grad_norm": 13.121179286172973,
"learning_rate": 4.964582201835856e-07,
"logits/chosen": -0.2789207696914673,
"logits/rejected": -0.2165801227092743,
"logps/chosen": -1.1143217086791992,
"logps/rejected": -1.6779086589813232,
"loss": 2.012,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1143217086791992,
"rewards/margins": 0.5635868906974792,
"rewards/rejected": -1.6779086589813232,
"step": 165
},
{
"epoch": 0.3559277676001047,
"grad_norm": 13.822111204791623,
"learning_rate": 4.880267969328908e-07,
"logits/chosen": -0.26305219531059265,
"logits/rejected": -0.16298075020313263,
"logps/chosen": -1.2098379135131836,
"logps/rejected": -1.6500240564346313,
"loss": 2.0205,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2098379135131836,
"rewards/margins": 0.44018617272377014,
"rewards/rejected": -1.6500240564346313,
"step": 170
},
{
"epoch": 0.36639623135304894,
"grad_norm": 18.589343338653535,
"learning_rate": 4.793433180558423e-07,
"logits/chosen": -0.26549848914146423,
"logits/rejected": -0.13549579679965973,
"logps/chosen": -1.1815907955169678,
"logps/rejected": -1.6547002792358398,
"loss": 2.0137,
"rewards/accuracies": 0.606249988079071,
"rewards/chosen": -1.1815907955169678,
"rewards/margins": 0.4731093943119049,
"rewards/rejected": -1.6547002792358398,
"step": 175
},
{
"epoch": 0.3768646951059932,
"grad_norm": 24.192793785270847,
"learning_rate": 4.704194240193467e-07,
"logits/chosen": -0.20712879300117493,
"logits/rejected": -0.14872625470161438,
"logps/chosen": -1.2047398090362549,
"logps/rejected": -1.6949182748794556,
"loss": 2.0473,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2047398090362549,
"rewards/margins": 0.49017828702926636,
"rewards/rejected": -1.6949182748794556,
"step": 180
},
{
"epoch": 0.38733315885893743,
"grad_norm": 21.72801779202531,
"learning_rate": 4.6126707757412686e-07,
"logits/chosen": -0.19427147507667542,
"logits/rejected": -0.08655323088169098,
"logps/chosen": -1.2102793455123901,
"logps/rejected": -1.956017255783081,
"loss": 1.9488,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2102793455123901,
"rewards/margins": 0.745737612247467,
"rewards/rejected": -1.956017255783081,
"step": 185
},
{
"epoch": 0.39780162261188173,
"grad_norm": 52.99611536180731,
"learning_rate": 4.5189854771829086e-07,
"logits/chosen": -0.27349403500556946,
"logits/rejected": -0.19519653916358948,
"logps/chosen": -1.2331929206848145,
"logps/rejected": -1.693256139755249,
"loss": 2.0896,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.2331929206848145,
"rewards/margins": 0.4600633680820465,
"rewards/rejected": -1.693256139755249,
"step": 190
},
{
"epoch": 0.408270086364826,
"grad_norm": 23.313047834378704,
"learning_rate": 4.4232639325036807e-07,
"logits/chosen": -0.2565682530403137,
"logits/rejected": -0.20012107491493225,
"logps/chosen": -1.2263553142547607,
"logps/rejected": -1.6081535816192627,
"loss": 2.0274,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2263553142547607,
"rewards/margins": 0.38179832696914673,
"rewards/rejected": -1.6081535816192627,
"step": 195
},
{
"epoch": 0.4187385501177702,
"grad_norm": 28.77315959004056,
"learning_rate": 4.32563445933859e-07,
"logits/chosen": -0.2804745137691498,
"logits/rejected": -0.2464865893125534,
"logps/chosen": -1.2270160913467407,
"logps/rejected": -1.6264985799789429,
"loss": 2.046,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.2270160913467407,
"rewards/margins": 0.3994825482368469,
"rewards/rejected": -1.6264985799789429,
"step": 200
},
{
"epoch": 0.42920701387071447,
"grad_norm": 24.629126590195398,
"learning_rate": 4.226227932958664e-07,
"logits/chosen": -0.21598652005195618,
"logits/rejected": -0.16539430618286133,
"logps/chosen": -1.0299084186553955,
"logps/rejected": -1.6446430683135986,
"loss": 1.9694,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.0299084186553955,
"rewards/margins": 0.6147347688674927,
"rewards/rejected": -1.6446430683135986,
"step": 205
},
{
"epoch": 0.4396754776236587,
"grad_norm": 18.99935065324488,
"learning_rate": 4.1251776108286854e-07,
"logits/chosen": -0.24660630524158478,
"logits/rejected": -0.19936877489089966,
"logps/chosen": -1.2271109819412231,
"logps/rejected": -1.5406509637832642,
"loss": 2.0403,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2271109819412231,
"rewards/margins": 0.31353995203971863,
"rewards/rejected": -1.5406509637832642,
"step": 210
},
{
"epoch": 0.45014394137660296,
"grad_norm": 28.14089359211311,
"learning_rate": 4.022618953971514e-07,
"logits/chosen": -0.28811579942703247,
"logits/rejected": -0.23570355772972107,
"logps/chosen": -1.1424353122711182,
"logps/rejected": -1.6885287761688232,
"loss": 2.0049,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1424353122711182,
"rewards/margins": 0.5460935831069946,
"rewards/rejected": -1.6885287761688232,
"step": 215
},
{
"epoch": 0.46061240512954726,
"grad_norm": 32.89974992330637,
"learning_rate": 3.918689445378477e-07,
"logits/chosen": -0.30860984325408936,
"logits/rejected": -0.18307650089263916,
"logps/chosen": -1.2112575769424438,
"logps/rejected": -1.7577863931655884,
"loss": 2.0035,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2112575769424438,
"rewards/margins": 0.5465287566184998,
"rewards/rejected": -1.7577863931655884,
"step": 220
},
{
"epoch": 0.4710808688824915,
"grad_norm": 20.40577743783828,
"learning_rate": 3.813528405709251e-07,
"logits/chosen": -0.29336491227149963,
"logits/rejected": -0.19095419347286224,
"logps/chosen": -1.1172130107879639,
"logps/rejected": -1.7578521966934204,
"loss": 1.949,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.1172130107879639,
"rewards/margins": 0.6406393051147461,
"rewards/rejected": -1.7578521966934204,
"step": 225
},
{
"epoch": 0.48154933263543576,
"grad_norm": 22.351884239960583,
"learning_rate": 3.707276806528282e-07,
"logits/chosen": -0.3431912362575531,
"logits/rejected": -0.21362292766571045,
"logps/chosen": -1.175959587097168,
"logps/rejected": -1.914841890335083,
"loss": 1.9597,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.175959587097168,
"rewards/margins": 0.7388821840286255,
"rewards/rejected": -1.914841890335083,
"step": 230
},
{
"epoch": 0.49201779638838,
"grad_norm": 32.41861896385678,
"learning_rate": 3.6000770813281334e-07,
"logits/chosen": -0.281482458114624,
"logits/rejected": -0.2136625498533249,
"logps/chosen": -1.1437963247299194,
"logps/rejected": -1.6638180017471313,
"loss": 1.9987,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.1437963247299194,
"rewards/margins": 0.5200216770172119,
"rewards/rejected": -1.6638180017471313,
"step": 235
},
{
"epoch": 0.5024862601413242,
"grad_norm": 25.062414755869742,
"learning_rate": 3.4920729345930654e-07,
"logits/chosen": -0.31132057309150696,
"logits/rejected": -0.2606331408023834,
"logps/chosen": -1.1404446363449097,
"logps/rejected": -1.7029993534088135,
"loss": 2.0234,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.1404446363449097,
"rewards/margins": 0.5625545978546143,
"rewards/rejected": -1.7029993534088135,
"step": 240
},
{
"epoch": 0.5129547238942685,
"grad_norm": 23.400197287310235,
"learning_rate": 3.383409149158814e-07,
"logits/chosen": -0.34879469871520996,
"logits/rejected": -0.27785512804985046,
"logps/chosen": -1.2463206052780151,
"logps/rejected": -1.6453937292099,
"loss": 2.0244,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2463206052780151,
"rewards/margins": 0.39907318353652954,
"rewards/rejected": -1.6453937292099,
"step": 245
},
{
"epoch": 0.5234231876472127,
"grad_norm": 30.06789343033768,
"learning_rate": 3.2742313921268035e-07,
"logits/chosen": -0.2991330623626709,
"logits/rejected": -0.24600133299827576,
"logps/chosen": -1.1448876857757568,
"logps/rejected": -1.7555782794952393,
"loss": 2.0331,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1448876857757568,
"rewards/margins": 0.6106906533241272,
"rewards/rejected": -1.7555782794952393,
"step": 250
},
{
"epoch": 0.533891651400157,
"grad_norm": 26.26439332360777,
"learning_rate": 3.1646860195929825e-07,
"logits/chosen": -0.26393812894821167,
"logits/rejected": -0.15189418196678162,
"logps/chosen": -1.2170337438583374,
"logps/rejected": -1.7952607870101929,
"loss": 1.9685,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.2170337438583374,
"rewards/margins": 0.5782270431518555,
"rewards/rejected": -1.7952607870101929,
"step": 255
},
{
"epoch": 0.5443601151531012,
"grad_norm": 30.83419650992516,
"learning_rate": 3.054919880453032e-07,
"logits/chosen": -0.24465498328208923,
"logits/rejected": -0.1657981127500534,
"logps/chosen": -1.1195638179779053,
"logps/rejected": -1.8283071517944336,
"loss": 2.0058,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1195638179779053,
"rewards/margins": 0.7087433934211731,
"rewards/rejected": -1.8283071517944336,
"step": 260
},
{
"epoch": 0.5548285789060455,
"grad_norm": 26.538461186209517,
"learning_rate": 2.9450801195469686e-07,
"logits/chosen": -0.2653834819793701,
"logits/rejected": -0.21019785106182098,
"logps/chosen": -1.2122917175292969,
"logps/rejected": -1.5932838916778564,
"loss": 1.9747,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2122917175292969,
"rewards/margins": 0.38099202513694763,
"rewards/rejected": -1.5932838916778564,
"step": 265
},
{
"epoch": 0.5652970426589898,
"grad_norm": 30.571225220792357,
"learning_rate": 2.835313980407017e-07,
"logits/chosen": -0.2485169917345047,
"logits/rejected": -0.1672702431678772,
"logps/chosen": -1.2699711322784424,
"logps/rejected": -1.6770769357681274,
"loss": 2.0095,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.2699711322784424,
"rewards/margins": 0.40710583329200745,
"rewards/rejected": -1.6770769357681274,
"step": 270
},
{
"epoch": 0.575765506411934,
"grad_norm": 15.142492975343366,
"learning_rate": 2.7257686078731973e-07,
"logits/chosen": -0.2676723599433899,
"logits/rejected": -0.14488348364830017,
"logps/chosen": -1.1701332330703735,
"logps/rejected": -1.882002830505371,
"loss": 1.9973,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.1701332330703735,
"rewards/margins": 0.7118695974349976,
"rewards/rejected": -1.882002830505371,
"step": 275
},
{
"epoch": 0.5862339701648783,
"grad_norm": 17.861104389604897,
"learning_rate": 2.6165908508411857e-07,
"logits/chosen": -0.27734139561653137,
"logits/rejected": -0.1873548924922943,
"logps/chosen": -1.096847653388977,
"logps/rejected": -1.556873083114624,
"loss": 1.979,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.096847653388977,
"rewards/margins": 0.46002545952796936,
"rewards/rejected": -1.556873083114624,
"step": 280
},
{
"epoch": 0.5967024339178225,
"grad_norm": 29.049210332966997,
"learning_rate": 2.5079270654069354e-07,
"logits/chosen": -0.22700035572052002,
"logits/rejected": -0.20169806480407715,
"logps/chosen": -1.1871209144592285,
"logps/rejected": -1.729832410812378,
"loss": 1.9858,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1871209144592285,
"rewards/margins": 0.5427114367485046,
"rewards/rejected": -1.729832410812378,
"step": 285
},
{
"epoch": 0.6071708976707668,
"grad_norm": 19.177779527300032,
"learning_rate": 2.399922918671867e-07,
"logits/chosen": -0.27257853746414185,
"logits/rejected": -0.20175373554229736,
"logps/chosen": -1.1817193031311035,
"logps/rejected": -1.81912362575531,
"loss": 1.9728,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.1817193031311035,
"rewards/margins": 0.6374045610427856,
"rewards/rejected": -1.81912362575531,
"step": 290
},
{
"epoch": 0.6176393614237111,
"grad_norm": 39.27420791348495,
"learning_rate": 2.2927231934717176e-07,
"logits/chosen": -0.2650902271270752,
"logits/rejected": -0.204110786318779,
"logps/chosen": -1.1773895025253296,
"logps/rejected": -1.8753960132598877,
"loss": 2.0113,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1773895025253296,
"rewards/margins": 0.6980065107345581,
"rewards/rejected": -1.8753960132598877,
"step": 295
},
{
"epoch": 0.6281078251766553,
"grad_norm": 19.055812699519027,
"learning_rate": 2.1864715942907487e-07,
"logits/chosen": -0.31268611550331116,
"logits/rejected": -0.26396140456199646,
"logps/chosen": -1.2095041275024414,
"logps/rejected": -1.6991554498672485,
"loss": 1.9925,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.2095041275024414,
"rewards/margins": 0.4896513819694519,
"rewards/rejected": -1.6991554498672485,
"step": 300
},
{
"epoch": 0.6385762889295996,
"grad_norm": 24.000715730668365,
"learning_rate": 2.081310554621522e-07,
"logits/chosen": -0.2226269692182541,
"logits/rejected": -0.17259590327739716,
"logps/chosen": -1.2206140756607056,
"logps/rejected": -1.8647050857543945,
"loss": 1.9507,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2206140756607056,
"rewards/margins": 0.6440912485122681,
"rewards/rejected": -1.8647050857543945,
"step": 305
},
{
"epoch": 0.6490447526825438,
"grad_norm": 24.680587241234605,
"learning_rate": 1.9773810460284862e-07,
"logits/chosen": -0.21720829606056213,
"logits/rejected": -0.22596517205238342,
"logps/chosen": -1.121246576309204,
"logps/rejected": -1.6572654247283936,
"loss": 1.9548,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.121246576309204,
"rewards/margins": 0.536018967628479,
"rewards/rejected": -1.6572654247283936,
"step": 310
},
{
"epoch": 0.6595132164354881,
"grad_norm": 191.6202253290827,
"learning_rate": 1.874822389171314e-07,
"logits/chosen": -0.23975515365600586,
"logits/rejected": -0.13572129607200623,
"logps/chosen": -1.1023863554000854,
"logps/rejected": -1.9153015613555908,
"loss": 1.9671,
"rewards/accuracies": 0.7250000238418579,
"rewards/chosen": -1.1023863554000854,
"rewards/margins": 0.8129149675369263,
"rewards/rejected": -1.9153015613555908,
"step": 315
},
{
"epoch": 0.6699816801884323,
"grad_norm": 29.251518787297723,
"learning_rate": 1.7737720670413356e-07,
"logits/chosen": -0.19940456748008728,
"logits/rejected": -0.15788142383098602,
"logps/chosen": -1.2180979251861572,
"logps/rejected": -1.7693755626678467,
"loss": 1.9257,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2180979251861572,
"rewards/margins": 0.5512775182723999,
"rewards/rejected": -1.7693755626678467,
"step": 320
},
{
"epoch": 0.6804501439413766,
"grad_norm": 38.573296633637,
"learning_rate": 1.6743655406614095e-07,
"logits/chosen": -0.2212747037410736,
"logits/rejected": -0.13944557309150696,
"logps/chosen": -1.165976881980896,
"logps/rejected": -1.8154420852661133,
"loss": 1.9466,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.165976881980896,
"rewards/margins": 0.6494652032852173,
"rewards/rejected": -1.8154420852661133,
"step": 325
},
{
"epoch": 0.6909186076943209,
"grad_norm": 29.264456688369886,
"learning_rate": 1.5767360674963198e-07,
"logits/chosen": -0.21240024268627167,
"logits/rejected": -0.13640090823173523,
"logps/chosen": -1.1329911947250366,
"logps/rejected": -1.5841938257217407,
"loss": 1.9788,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1329911947250366,
"rewards/margins": 0.45120254158973694,
"rewards/rejected": -1.5841938257217407,
"step": 330
},
{
"epoch": 0.7013870714472651,
"grad_norm": 21.84656926034516,
"learning_rate": 1.4810145228170922e-07,
"logits/chosen": -0.2895652651786804,
"logits/rejected": -0.20374973118305206,
"logps/chosen": -1.0948199033737183,
"logps/rejected": -1.5535070896148682,
"loss": 1.9921,
"rewards/accuracies": 0.699999988079071,
"rewards/chosen": -1.0948199033737183,
"rewards/margins": 0.4586872160434723,
"rewards/rejected": -1.5535070896148682,
"step": 335
},
{
"epoch": 0.7118555352002094,
"grad_norm": 60.16510624521895,
"learning_rate": 1.3873292242587306e-07,
"logits/chosen": -0.2519373893737793,
"logits/rejected": -0.1798809915781021,
"logps/chosen": -1.3221559524536133,
"logps/rejected": -1.7844518423080444,
"loss": 2.0219,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.3221559524536133,
"rewards/margins": 0.4622960686683655,
"rewards/rejected": -1.7844518423080444,
"step": 340
},
{
"epoch": 0.7223239989531536,
"grad_norm": 31.10556234832626,
"learning_rate": 1.295805759806533e-07,
"logits/chosen": -0.28459057211875916,
"logits/rejected": -0.1859065145254135,
"logps/chosen": -1.1906864643096924,
"logps/rejected": -1.7581002712249756,
"loss": 1.9961,
"rewards/accuracies": 0.643750011920929,
"rewards/chosen": -1.1906864643096924,
"rewards/margins": 0.567413866519928,
"rewards/rejected": -1.7581002712249756,
"step": 345
},
{
"epoch": 0.7327924627060979,
"grad_norm": 21.145366603131734,
"learning_rate": 1.2065668194415777e-07,
"logits/chosen": -0.19020649790763855,
"logits/rejected": -0.15534143149852753,
"logps/chosen": -1.1794466972351074,
"logps/rejected": -1.6421940326690674,
"loss": 2.0254,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.1794466972351074,
"rewards/margins": 0.4627472758293152,
"rewards/rejected": -1.6421940326690674,
"step": 350
},
{
"epoch": 0.7432609264590422,
"grad_norm": 29.29098473580897,
"learning_rate": 1.1197320306710923e-07,
"logits/chosen": -0.19632667303085327,
"logits/rejected": -0.12513799965381622,
"logps/chosen": -1.0603824853897095,
"logps/rejected": -1.7353988885879517,
"loss": 1.9288,
"rewards/accuracies": 0.731249988079071,
"rewards/chosen": -1.0603824853897095,
"rewards/margins": 0.6750164031982422,
"rewards/rejected": -1.7353988885879517,
"step": 355
},
{
"epoch": 0.7537293902119864,
"grad_norm": 24.609476201697003,
"learning_rate": 1.035417798164145e-07,
"logits/chosen": -0.2547205984592438,
"logits/rejected": -0.17141126096248627,
"logps/chosen": -1.0777822732925415,
"logps/rejected": -1.6378345489501953,
"loss": 1.9002,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0777822732925415,
"rewards/margins": 0.5600521564483643,
"rewards/rejected": -1.6378345489501953,
"step": 360
},
{
"epoch": 0.7641978539649307,
"grad_norm": 44.888955546131726,
"learning_rate": 9.537371477076535e-08,
"logits/chosen": -0.24833258986473083,
"logits/rejected": -0.14280755817890167,
"logps/chosen": -1.2661911249160767,
"logps/rejected": -1.8641719818115234,
"loss": 2.0054,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2661911249160767,
"rewards/margins": 0.5979806184768677,
"rewards/rejected": -1.8641719818115234,
"step": 365
},
{
"epoch": 0.7746663177178749,
"grad_norm": 29.759713813955717,
"learning_rate": 8.747995746918898e-08,
"logits/chosen": -0.18603017926216125,
"logits/rejected": -0.11623908579349518,
"logps/chosen": -1.2385270595550537,
"logps/rejected": -1.8711185455322266,
"loss": 1.9635,
"rewards/accuracies": 0.6812499761581421,
"rewards/chosen": -1.2385270595550537,
"rewards/margins": 0.6325916051864624,
"rewards/rejected": -1.8711185455322266,
"step": 370
},
{
"epoch": 0.7851347814708192,
"grad_norm": 18.386697317487712,
"learning_rate": 7.987108973285888e-08,
"logits/chosen": -0.2141023874282837,
"logits/rejected": -0.24412047863006592,
"logps/chosen": -1.2150145769119263,
"logps/rejected": -1.7247231006622314,
"loss": 1.9899,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2150145769119263,
"rewards/margins": 0.5097082853317261,
"rewards/rejected": -1.7247231006622314,
"step": 375
},
{
"epoch": 0.7956032452237635,
"grad_norm": 24.333148504874714,
"learning_rate": 7.255731147984174e-08,
"logits/chosen": -0.24650990962982178,
"logits/rejected": -0.18783050775527954,
"logps/chosen": -1.2390451431274414,
"logps/rejected": -1.6897704601287842,
"loss": 1.9425,
"rewards/accuracies": 0.612500011920929,
"rewards/chosen": -1.2390451431274414,
"rewards/margins": 0.45072537660598755,
"rewards/rejected": -1.6897704601287842,
"step": 380
},
{
"epoch": 0.8060717089767077,
"grad_norm": 22.373563868926805,
"learning_rate": 6.554842705179898e-08,
"logits/chosen": -0.2532094120979309,
"logits/rejected": -0.2002202570438385,
"logps/chosen": -1.157409429550171,
"logps/rejected": -1.7103229761123657,
"loss": 1.9743,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.157409429550171,
"rewards/margins": 0.55291348695755,
"rewards/rejected": -1.7103229761123657,
"step": 385
},
{
"epoch": 0.816540172729652,
"grad_norm": 27.22781403012628,
"learning_rate": 5.885383207096832e-08,
"logits/chosen": -0.25118163228034973,
"logits/rejected": -0.17533348500728607,
"logps/chosen": -1.158809781074524,
"logps/rejected": -1.7991712093353271,
"loss": 1.9567,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.158809781074524,
"rewards/margins": 0.6403613090515137,
"rewards/rejected": -1.7991712093353271,
"step": 390
},
{
"epoch": 0.8270086364825961,
"grad_norm": 41.14703828256896,
"learning_rate": 5.2482500845047165e-08,
"logits/chosen": -0.2559366524219513,
"logits/rejected": -0.13099372386932373,
"logps/chosen": -1.1643617153167725,
"logps/rejected": -1.7700135707855225,
"loss": 1.9559,
"rewards/accuracies": 0.6937500238418579,
"rewards/chosen": -1.1643617153167725,
"rewards/margins": 0.60565185546875,
"rewards/rejected": -1.7700135707855225,
"step": 395
},
{
"epoch": 0.8374771002355405,
"grad_norm": 24.3693128346703,
"learning_rate": 4.644297433686162e-08,
"logits/chosen": -0.19337531924247742,
"logits/rejected": -0.14422497153282166,
"logps/chosen": -1.0990240573883057,
"logps/rejected": -1.7358309030532837,
"loss": 1.9382,
"rewards/accuracies": 0.65625,
"rewards/chosen": -1.0990240573883057,
"rewards/margins": 0.6368069648742676,
"rewards/rejected": -1.7358309030532837,
"step": 400
},
{
"epoch": 0.8479455639884846,
"grad_norm": 27.686221570305076,
"learning_rate": 4.074334871494558e-08,
"logits/chosen": -0.27724021673202515,
"logits/rejected": -0.21434080600738525,
"logps/chosen": -1.2541942596435547,
"logps/rejected": -1.889288306236267,
"loss": 1.981,
"rewards/accuracies": 0.6499999761581421,
"rewards/chosen": -1.2541942596435547,
"rewards/margins": 0.6350940465927124,
"rewards/rejected": -1.889288306236267,
"step": 405
},
{
"epoch": 0.8584140277414289,
"grad_norm": 27.73558369495071,
"learning_rate": 3.5391264500382e-08,
"logits/chosen": -0.22901353240013123,
"logits/rejected": -0.1688699871301651,
"logps/chosen": -1.0957512855529785,
"logps/rejected": -1.5964380502700806,
"loss": 1.9786,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.0957512855529785,
"rewards/margins": 0.500686526298523,
"rewards/rejected": -1.5964380502700806,
"step": 410
},
{
"epoch": 0.8688824914943732,
"grad_norm": 25.24757762967279,
"learning_rate": 3.0393896324452226e-08,
"logits/chosen": -0.23829559981822968,
"logits/rejected": -0.14140795171260834,
"logps/chosen": -1.1398870944976807,
"logps/rejected": -1.7605117559432983,
"loss": 1.9365,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1398870944976807,
"rewards/margins": 0.6206245422363281,
"rewards/rejected": -1.7605117559432983,
"step": 415
},
{
"epoch": 0.8793509552473174,
"grad_norm": 24.11409806888563,
"learning_rate": 2.5757943310825026e-08,
"logits/chosen": -0.21624989807605743,
"logits/rejected": -0.15116100013256073,
"logps/chosen": -1.1420520544052124,
"logps/rejected": -1.7557262182235718,
"loss": 1.9847,
"rewards/accuracies": 0.6000000238418579,
"rewards/chosen": -1.1420520544052124,
"rewards/margins": 0.6136741042137146,
"rewards/rejected": -1.7557262182235718,
"step": 420
},
{
"epoch": 0.8898194190002617,
"grad_norm": 31.071203059945226,
"learning_rate": 2.148962009517823e-08,
"logits/chosen": -0.1776510775089264,
"logits/rejected": -0.11379513889551163,
"logps/chosen": -1.2811336517333984,
"logps/rejected": -1.7914276123046875,
"loss": 1.9474,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2811336517333984,
"rewards/margins": 0.5102939605712891,
"rewards/rejected": -1.7914276123046875,
"step": 425
},
{
"epoch": 0.9002878827532059,
"grad_norm": 22.593650984666684,
"learning_rate": 1.759464849429082e-08,
"logits/chosen": -0.2011154592037201,
"logits/rejected": -0.14906981587409973,
"logps/chosen": -1.2140544652938843,
"logps/rejected": -1.8150146007537842,
"loss": 1.9428,
"rewards/accuracies": 0.637499988079071,
"rewards/chosen": -1.2140544652938843,
"rewards/margins": 0.6009601354598999,
"rewards/rejected": -1.8150146007537842,
"step": 430
},
{
"epoch": 0.9107563465061502,
"grad_norm": 20.886785791055413,
"learning_rate": 1.4078249835774169e-08,
"logits/chosen": -0.2394082248210907,
"logits/rejected": -0.1875392496585846,
"logps/chosen": -1.15377676486969,
"logps/rejected": -1.962689995765686,
"loss": 1.931,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.15377676486969,
"rewards/margins": 0.8089130520820618,
"rewards/rejected": -1.962689995765686,
"step": 435
},
{
"epoch": 0.9212248102590945,
"grad_norm": 24.47138715404228,
"learning_rate": 1.0945137958723705e-08,
"logits/chosen": -0.0983675867319107,
"logits/rejected": -0.08297935873270035,
"logps/chosen": -1.2065281867980957,
"logps/rejected": -1.7344862222671509,
"loss": 2.0038,
"rewards/accuracies": 0.6187499761581421,
"rewards/chosen": -1.2065281867980957,
"rewards/margins": 0.5279580354690552,
"rewards/rejected": -1.7344862222671509,
"step": 440
},
{
"epoch": 0.9316932740120387,
"grad_norm": 24.937539236981156,
"learning_rate": 8.19951289467482e-09,
"logits/chosen": -0.2168574333190918,
"logits/rejected": -0.1608891487121582,
"logps/chosen": -1.1793944835662842,
"logps/rejected": -1.7278966903686523,
"loss": 2.0001,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.1793944835662842,
"rewards/margins": 0.5485021471977234,
"rewards/rejected": -1.7278966903686523,
"step": 445
},
{
"epoch": 0.942161737764983,
"grad_norm": 25.949713098879222,
"learning_rate": 5.84505523733293e-09,
"logits/chosen": -0.13714662194252014,
"logits/rejected": -0.09888915717601776,
"logps/chosen": -1.2185251712799072,
"logps/rejected": -1.736971139907837,
"loss": 1.9405,
"rewards/accuracies": 0.675000011920929,
"rewards/chosen": -1.2185251712799072,
"rewards/margins": 0.5184457302093506,
"rewards/rejected": -1.736971139907837,
"step": 450
},
{
"epoch": 0.9526302015179272,
"grad_norm": 48.46134541581324,
"learning_rate": 3.8849212086261466e-09,
"logits/chosen": -0.17583271861076355,
"logits/rejected": -0.12143261730670929,
"logps/chosen": -1.3378379344940186,
"logps/rejected": -1.6454353332519531,
"loss": 1.997,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3378379344940186,
"rewards/margins": 0.3075973391532898,
"rewards/rejected": -1.6454353332519531,
"step": 455
},
{
"epoch": 0.9630986652708715,
"grad_norm": 17.18309794816949,
"learning_rate": 2.3217384276938756e-09,
"logits/chosen": -0.15463075041770935,
"logits/rejected": -0.10197849571704865,
"logps/chosen": -1.0989463329315186,
"logps/rejected": -1.7458966970443726,
"loss": 1.932,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.0989463329315186,
"rewards/margins": 0.6469505429267883,
"rewards/rejected": -1.7458966970443726,
"step": 460
},
{
"epoch": 0.9735671290238157,
"grad_norm": 23.454717675856944,
"learning_rate": 1.1576023884836472e-09,
"logits/chosen": -0.25512656569480896,
"logits/rejected": -0.15672564506530762,
"logps/chosen": -1.2135329246520996,
"logps/rejected": -1.7796437740325928,
"loss": 1.9646,
"rewards/accuracies": 0.668749988079071,
"rewards/chosen": -1.2135329246520996,
"rewards/margins": 0.5661108493804932,
"rewards/rejected": -1.7796437740325928,
"step": 465
},
{
"epoch": 0.98403559277676,
"grad_norm": 38.4586695917519,
"learning_rate": 3.940736506780395e-10,
"logits/chosen": -0.2280760258436203,
"logits/rejected": -0.14886632561683655,
"logps/chosen": -1.1674778461456299,
"logps/rejected": -1.6563600301742554,
"loss": 2.0158,
"rewards/accuracies": 0.6625000238418579,
"rewards/chosen": -1.1674778461456299,
"rewards/margins": 0.4888822138309479,
"rewards/rejected": -1.6563600301742554,
"step": 470
},
{
"epoch": 0.9945040565297043,
"grad_norm": 20.371210573575656,
"learning_rate": 3.2175747716822744e-11,
"logits/chosen": -0.268063485622406,
"logits/rejected": -0.13501767814159393,
"logps/chosen": -1.2155284881591797,
"logps/rejected": -1.7273550033569336,
"loss": 1.9722,
"rewards/accuracies": 0.59375,
"rewards/chosen": -1.2155284881591797,
"rewards/margins": 0.5118265151977539,
"rewards/rejected": -1.7273550033569336,
"step": 475
},
{
"epoch": 0.998691442030882,
"step": 477,
"total_flos": 0.0,
"train_loss": 2.0149041866606385,
"train_runtime": 17499.3556,
"train_samples_per_second": 3.494,
"train_steps_per_second": 0.027
}
],
"logging_steps": 5,
"max_steps": 477,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 1000000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}