{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 50, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028409090909090908, "grad_norm": 90.01349453953418, "learning_rate": 1.3888888888888888e-07, "logits/chosen": -2.8592569828033447, "logits/rejected": -2.642957925796509, "logps/chosen": -390.56573486328125, "logps/rejected": -607.8802490234375, "loss": 1.3762, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.001940618036314845, "rewards/margins": 0.013666175305843353, "rewards/rejected": -0.011725558899343014, "step": 10 }, { "epoch": 0.056818181818181816, "grad_norm": 38.16702665689489, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -2.842255115509033, "logits/rejected": -2.694774627685547, "logps/chosen": -328.5555419921875, "logps/rejected": -774.9117431640625, "loss": 1.0262, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": 0.08589581400156021, "rewards/margins": 0.660616934299469, "rewards/rejected": -0.5747210383415222, "step": 20 }, { "epoch": 0.08522727272727272, "grad_norm": 9.474354712422207, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.857499837875366, "logits/rejected": -2.675124406814575, "logps/chosen": -295.38397216796875, "logps/rejected": -1112.8095703125, "loss": 0.3585, "rewards/accuracies": 1.0, "rewards/chosen": 0.36898040771484375, "rewards/margins": 4.717532157897949, "rewards/rejected": -4.3485517501831055, "step": 30 }, { "epoch": 0.11363636363636363, "grad_norm": 2.844664039016391, "learning_rate": 4.998023493068254e-07, "logits/chosen": -2.8356635570526123, "logits/rejected": -2.645285129547119, "logps/chosen": -314.98876953125, "logps/rejected": -2130.987060546875, "loss": 0.083, "rewards/accuracies": 1.0, "rewards/chosen": 0.3176669478416443, "rewards/margins": 14.54858112335205, "rewards/rejected": -14.230911254882812, "step": 40 }, { "epoch": 0.14204545454545456, "grad_norm": 0.18581946577481065, "learning_rate": 4.975823666181255e-07, "logits/chosen": -2.8282175064086914, "logits/rejected": -2.554896831512451, "logps/chosen": -441.171875, "logps/rejected": -4335.5703125, "loss": 0.011, "rewards/accuracies": 1.0, "rewards/chosen": -0.785366415977478, "rewards/margins": 36.03093338012695, "rewards/rejected": -36.8162956237793, "step": 50 }, { "epoch": 0.14204545454545456, "eval_logits/chosen": -2.965341806411743, "eval_logits/rejected": -2.5409843921661377, "eval_logps/chosen": -459.0153503417969, "eval_logps/rejected": -4976.9013671875, "eval_loss": 0.009123104624450207, "eval_rewards/accuracies": 0.9959677457809448, "eval_rewards/chosen": -0.9203360676765442, "eval_rewards/margins": 42.44078063964844, "eval_rewards/rejected": -43.361122131347656, "eval_runtime": 195.4208, "eval_samples_per_second": 19.983, "eval_steps_per_second": 0.317, "step": 50 }, { "epoch": 0.17045454545454544, "grad_norm": 1.8496563592278583, "learning_rate": 4.929173350101024e-07, "logits/chosen": -3.018855571746826, "logits/rejected": -2.489605188369751, "logps/chosen": -478.2201232910156, "logps/rejected": -5343.02783203125, "loss": 0.0066, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.1832506656646729, "rewards/margins": 45.8184814453125, "rewards/rejected": -47.00173568725586, "step": 60 }, { "epoch": 0.19886363636363635, "grad_norm": 0.42838497413063575, "learning_rate": 4.858533249305336e-07, "logits/chosen": -2.8961918354034424, "logits/rejected": -1.8984692096710205, "logps/chosen": -537.8681640625, "logps/rejected": -5729.29150390625, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -1.587074875831604, "rewards/margins": 49.57848358154297, "rewards/rejected": -51.165557861328125, "step": 70 }, { "epoch": 0.22727272727272727, "grad_norm": 23.183269598091908, "learning_rate": 4.764600984163808e-07, "logits/chosen": -3.015650510787964, "logits/rejected": -1.4822012186050415, "logps/chosen": -574.1959838867188, "logps/rejected": -7196.5283203125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -1.6787515878677368, "rewards/margins": 63.5491943359375, "rewards/rejected": -65.22795104980469, "step": 80 }, { "epoch": 0.2556818181818182, "grad_norm": 0.2590854602849931, "learning_rate": 4.6483042014491527e-07, "logits/chosen": -2.880042314529419, "logits/rejected": -1.109546422958374, "logps/chosen": -574.42529296875, "logps/rejected": -6483.9189453125, "loss": 0.0194, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.123119831085205, "rewards/margins": 56.34221267700195, "rewards/rejected": -58.4653434753418, "step": 90 }, { "epoch": 0.2840909090909091, "grad_norm": 0.2975727339596996, "learning_rate": 4.510791413176912e-07, "logits/chosen": -2.7033655643463135, "logits/rejected": -0.46150803565979004, "logps/chosen": -574.6309814453125, "logps/rejected": -7050.77099609375, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -2.0516088008880615, "rewards/margins": 61.398292541503906, "rewards/rejected": -63.44989776611328, "step": 100 }, { "epoch": 0.2840909090909091, "eval_logits/chosen": -2.5256500244140625, "eval_logits/rejected": 0.035765551030635834, "eval_logps/chosen": -558.7725830078125, "eval_logps/rejected": -6818.2158203125, "eval_loss": 0.0016336780972778797, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.9179083108901978, "eval_rewards/margins": 59.856361389160156, "eval_rewards/rejected": -61.77427673339844, "eval_runtime": 192.2507, "eval_samples_per_second": 20.312, "eval_steps_per_second": 0.322, "step": 100 }, { "epoch": 0.3125, "grad_norm": 0.04984017585029643, "learning_rate": 4.353420654246546e-07, "logits/chosen": -2.2970995903015137, "logits/rejected": 0.2104659080505371, "logps/chosen": -568.5081787109375, "logps/rejected": -6664.22412109375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -1.930869460105896, "rewards/margins": 58.71869659423828, "rewards/rejected": -60.649559020996094, "step": 110 }, { "epoch": 0.3409090909090909, "grad_norm": 0.03797940797804543, "learning_rate": 4.177746070897592e-07, "logits/chosen": -1.7653045654296875, "logits/rejected": 1.1280748844146729, "logps/chosen": -600.4093017578125, "logps/rejected": -7361.03662109375, "loss": 0.0084, "rewards/accuracies": 1.0, "rewards/chosen": -2.188924789428711, "rewards/margins": 65.04915618896484, "rewards/rejected": -67.23808288574219, "step": 120 }, { "epoch": 0.3693181818181818, "grad_norm": 0.47736957873502217, "learning_rate": 3.9855025724292763e-07, "logits/chosen": -2.10666823387146, "logits/rejected": 1.7087266445159912, "logps/chosen": -594.9124755859375, "logps/rejected": -7779.0634765625, "loss": 0.003, "rewards/accuracies": 1.0, "rewards/chosen": -2.2446084022521973, "rewards/margins": 69.14064025878906, "rewards/rejected": -71.38525390625, "step": 130 }, { "epoch": 0.3977272727272727, "grad_norm": 1.020605288874105, "learning_rate": 3.7785886977585555e-07, "logits/chosen": -2.2910099029541016, "logits/rejected": 1.9655479192733765, "logps/chosen": -569.71533203125, "logps/rejected": -8695.2900390625, "loss": 0.0045, "rewards/accuracies": 1.0, "rewards/chosen": -2.0367894172668457, "rewards/margins": 78.4962158203125, "rewards/rejected": -80.53299713134766, "step": 140 }, { "epoch": 0.42613636363636365, "grad_norm": 0.5904857854916888, "learning_rate": 3.5590478660213206e-07, "logits/chosen": -2.2953405380249023, "logits/rejected": 1.9216325283050537, "logps/chosen": -595.8374633789062, "logps/rejected": -8074.73974609375, "loss": 0.0023, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.1777424812316895, "rewards/margins": 71.44985961914062, "rewards/rejected": -73.62760925292969, "step": 150 }, { "epoch": 0.42613636363636365, "eval_logits/chosen": -2.4309346675872803, "eval_logits/rejected": 1.8614341020584106, "eval_logps/chosen": -558.0396728515625, "eval_logps/rejected": -7546.6005859375, "eval_loss": 0.001163232373073697, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.9105795621871948, "eval_rewards/margins": 67.14754486083984, "eval_rewards/rejected": -69.05812072753906, "eval_runtime": 193.8607, "eval_samples_per_second": 20.143, "eval_steps_per_second": 0.32, "step": 150 }, { "epoch": 0.45454545454545453, "grad_norm": 5.391869442805235, "learning_rate": 3.3290481963801696e-07, "logits/chosen": -2.435987949371338, "logits/rejected": 2.048819065093994, "logps/chosen": -532.9867553710938, "logps/rejected": -7208.0419921875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.628488540649414, "rewards/margins": 64.28703308105469, "rewards/rejected": -65.91552734375, "step": 160 }, { "epoch": 0.48295454545454547, "grad_norm": 0.031936510635288025, "learning_rate": 3.0908610963322626e-07, "logits/chosen": -2.173886775970459, "logits/rejected": 2.0431816577911377, "logps/chosen": -577.22314453125, "logps/rejected": -7725.6796875, "loss": 0.0074, "rewards/accuracies": 1.0, "rewards/chosen": -1.896048903465271, "rewards/margins": 68.07765197753906, "rewards/rejected": -69.97369384765625, "step": 170 }, { "epoch": 0.5113636363636364, "grad_norm": 0.018094668159337083, "learning_rate": 2.846838829972671e-07, "logits/chosen": -1.9897123575210571, "logits/rejected": 2.31811785697937, "logps/chosen": -567.0270385742188, "logps/rejected": -7191.4697265625, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.061227798461914, "rewards/margins": 63.72364044189453, "rewards/rejected": -65.78486633300781, "step": 180 }, { "epoch": 0.5397727272727273, "grad_norm": 0.3615128328570267, "learning_rate": 2.5993912877423147e-07, "logits/chosen": -1.5569745302200317, "logits/rejected": 1.8910541534423828, "logps/chosen": -519.6112670898438, "logps/rejected": -8060.75390625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -1.7399555444717407, "rewards/margins": 71.95148468017578, "rewards/rejected": -73.69144439697266, "step": 190 }, { "epoch": 0.5681818181818182, "grad_norm": 1.460001191569848, "learning_rate": 2.3509621870754504e-07, "logits/chosen": -1.1832640171051025, "logits/rejected": 2.8146424293518066, "logps/chosen": -540.25, "logps/rejected": -7328.6376953125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -1.751201868057251, "rewards/margins": 65.22674560546875, "rewards/rejected": -66.97795104980469, "step": 200 }, { "epoch": 0.5681818181818182, "eval_logits/chosen": -1.0991721153259277, "eval_logits/rejected": 2.4905478954315186, "eval_logps/chosen": -560.5505981445312, "eval_logps/rejected": -7817.95947265625, "eval_loss": 0.0009270149166695774, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.9356889724731445, "eval_rewards/margins": 69.83601379394531, "eval_rewards/rejected": -71.7717056274414, "eval_runtime": 195.666, "eval_samples_per_second": 19.957, "eval_steps_per_second": 0.317, "step": 200 }, { "epoch": 0.5965909090909091, "grad_norm": 0.6173080616307016, "learning_rate": 2.1040049389819624e-07, "logits/chosen": -1.0591375827789307, "logits/rejected": 2.6843762397766113, "logps/chosen": -585.5335083007812, "logps/rejected": -7571.32666015625, "loss": 0.0065, "rewards/accuracies": 1.0, "rewards/chosen": -1.9489023685455322, "rewards/margins": 67.44532775878906, "rewards/rejected": -69.39422607421875, "step": 210 }, { "epoch": 0.625, "grad_norm": 3.3870532652701733, "learning_rate": 1.8609584188988133e-07, "logits/chosen": -0.8726997375488281, "logits/rejected": 2.3560879230499268, "logps/chosen": -635.2283935546875, "logps/rejected": -6522.76708984375, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.706278085708618, "rewards/margins": 56.02503204345703, "rewards/rejected": -58.73130416870117, "step": 220 }, { "epoch": 0.6534090909090909, "grad_norm": 84.74544834753664, "learning_rate": 1.624222881090439e-07, "logits/chosen": -1.068495512008667, "logits/rejected": 2.1958107948303223, "logps/chosen": -655.9327392578125, "logps/rejected": -7632.9365234375, "loss": 0.0224, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -2.6126813888549805, "rewards/margins": 66.86088562011719, "rewards/rejected": -69.47355651855469, "step": 230 }, { "epoch": 0.6818181818181818, "grad_norm": 0.655326329468716, "learning_rate": 1.3961362544602212e-07, "logits/chosen": -0.8508334159851074, "logits/rejected": 2.6375107765197754, "logps/chosen": -604.7547607421875, "logps/rejected": -6603.0595703125, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -2.287423849105835, "rewards/margins": 57.80244827270508, "rewards/rejected": -60.08986282348633, "step": 240 }, { "epoch": 0.7102272727272727, "grad_norm": 0.8566182395838629, "learning_rate": 1.1789510538684522e-07, "logits/chosen": -0.9583337903022766, "logits/rejected": 2.481625556945801, "logps/chosen": -564.140869140625, "logps/rejected": -8373.404296875, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -2.12107253074646, "rewards/margins": 74.31715393066406, "rewards/rejected": -76.43821716308594, "step": 250 }, { "epoch": 0.7102272727272727, "eval_logits/chosen": -0.9364973902702332, "eval_logits/rejected": 2.6537041664123535, "eval_logps/chosen": -579.6390991210938, "eval_logps/rejected": -7301.04736328125, "eval_loss": 0.0008862165850587189, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.126574754714966, "eval_rewards/margins": 64.47601318359375, "eval_rewards/rejected": -66.60258483886719, "eval_runtime": 193.2705, "eval_samples_per_second": 20.205, "eval_steps_per_second": 0.321, "step": 250 }, { "epoch": 0.7386363636363636, "grad_norm": 0.3669759262567033, "learning_rate": 9.748121349736891e-08, "logits/chosen": -0.815521240234375, "logits/rejected": 2.5474696159362793, "logps/chosen": -614.1104736328125, "logps/rejected": -7755.7451171875, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.302823305130005, "rewards/margins": 68.47334289550781, "rewards/rejected": -70.77616882324219, "step": 260 }, { "epoch": 0.7670454545454546, "grad_norm": 0.14376906522497312, "learning_rate": 7.857355122839673e-08, "logits/chosen": -0.8737711906433105, "logits/rejected": 2.419881582260132, "logps/chosen": -592.2354125976562, "logps/rejected": -7475.37109375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -2.113858461380005, "rewards/margins": 65.95025634765625, "rewards/rejected": -68.0641098022461, "step": 270 }, { "epoch": 0.7954545454545454, "grad_norm": 0.17275320368872452, "learning_rate": 6.135884496044244e-08, "logits/chosen": -0.8250367045402527, "logits/rejected": 2.680527687072754, "logps/chosen": -574.1306762695312, "logps/rejected": -7469.40087890625, "loss": 0.0022, "rewards/accuracies": 1.0, "rewards/chosen": -1.9495769739151, "rewards/margins": 66.2955322265625, "rewards/rejected": -68.24510955810547, "step": 280 }, { "epoch": 0.8238636363636364, "grad_norm": 0.036939229346128824, "learning_rate": 4.600710195020982e-08, "logits/chosen": -0.6854007244110107, "logits/rejected": 2.711479902267456, "logps/chosen": -595.4219970703125, "logps/rejected": -7571.99853515625, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -2.0627870559692383, "rewards/margins": 67.24461364746094, "rewards/rejected": -69.30741119384766, "step": 290 }, { "epoch": 0.8522727272727273, "grad_norm": 1.0005598289334474, "learning_rate": 3.2669931390104374e-08, "logits/chosen": -0.7712670564651489, "logits/rejected": 2.537261724472046, "logps/chosen": -546.4788818359375, "logps/rejected": -8221.009765625, "loss": 0.0017, "rewards/accuracies": 1.0, "rewards/chosen": -1.9022438526153564, "rewards/margins": 74.09959411621094, "rewards/rejected": -76.00184631347656, "step": 300 }, { "epoch": 0.8522727272727273, "eval_logits/chosen": -0.6079820394515991, "eval_logits/rejected": 2.6897590160369873, "eval_logps/chosen": -575.6599731445312, "eval_logps/rejected": -7868.724609375, "eval_loss": 0.0006528676021844149, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.086782693862915, "eval_rewards/margins": 70.19257354736328, "eval_rewards/rejected": -72.27935791015625, "eval_runtime": 193.3061, "eval_samples_per_second": 20.201, "eval_steps_per_second": 0.321, "step": 300 }, { "epoch": 0.8806818181818182, "grad_norm": 0.0660494270632579, "learning_rate": 2.147904716149135e-08, "logits/chosen": -0.5389717817306519, "logits/rejected": 2.602092981338501, "logps/chosen": -555.8554077148438, "logps/rejected": -7988.98974609375, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.9378883838653564, "rewards/margins": 70.9469985961914, "rewards/rejected": -72.8848876953125, "step": 310 }, { "epoch": 0.9090909090909091, "grad_norm": 0.0488754281750263, "learning_rate": 1.254496706805433e-08, "logits/chosen": -0.7335542440414429, "logits/rejected": 2.5863664150238037, "logps/chosen": -590.0091552734375, "logps/rejected": -8039.3125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -2.0733609199523926, "rewards/margins": 71.79994201660156, "rewards/rejected": -73.87330627441406, "step": 320 }, { "epoch": 0.9375, "grad_norm": 0.2519430705585286, "learning_rate": 5.955921395237318e-09, "logits/chosen": -0.6468337178230286, "logits/rejected": 2.546152353286743, "logps/chosen": -548.1500244140625, "logps/rejected": -8072.88037109375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.928818941116333, "rewards/margins": 72.32246398925781, "rewards/rejected": -74.25126647949219, "step": 330 }, { "epoch": 0.9659090909090909, "grad_norm": 0.8725515919272617, "learning_rate": 1.7769815745066474e-09, "logits/chosen": -0.8968937993049622, "logits/rejected": 2.3741860389709473, "logps/chosen": -561.4879760742188, "logps/rejected": -7435.28125, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.9250719547271729, "rewards/margins": 66.34008026123047, "rewards/rejected": -68.26514434814453, "step": 340 }, { "epoch": 0.9943181818181818, "grad_norm": 3.1933280173345713, "learning_rate": 4.9417557483610875e-11, "logits/chosen": -0.6227324604988098, "logits/rejected": 2.5037405490875244, "logps/chosen": -589.6453247070312, "logps/rejected": -7948.50927734375, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.153775215148926, "rewards/margins": 70.8557357788086, "rewards/rejected": -73.00951385498047, "step": 350 }, { "epoch": 0.9943181818181818, "eval_logits/chosen": -0.5257502198219299, "eval_logits/rejected": 2.7151966094970703, "eval_logps/chosen": -577.6112670898438, "eval_logps/rejected": -7961.02392578125, "eval_loss": 0.000620901002548635, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -2.106295585632324, "eval_rewards/margins": 71.09603881835938, "eval_rewards/rejected": -73.20234680175781, "eval_runtime": 193.4431, "eval_samples_per_second": 20.187, "eval_steps_per_second": 0.321, "step": 350 }, { "epoch": 1.0, "step": 352, "total_flos": 0.0, "train_loss": 0.08502711300074721, "train_runtime": 9968.7204, "train_samples_per_second": 4.514, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 352, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }