simple_online_epoch_2_dpo_iter_6 / trainer_state.json
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9962157048249763,
"eval_steps": 500,
"global_step": 162,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01,
"grad_norm": 44.790242854161896,
"learning_rate": 2.941176470588235e-08,
"logits/chosen": 0.4138435125350952,
"logits/rejected": 0.3073309361934662,
"logps/chosen": -238.74684143066406,
"logps/rejected": -277.3367919921875,
"loss": 0.6931,
"rewards/accuracies": 0.0,
"rewards/chosen": 0.0,
"rewards/margins": 0.0,
"rewards/rejected": 0.0,
"step": 1
},
{
"epoch": 0.06,
"grad_norm": 32.66922851838542,
"learning_rate": 2.941176470588235e-07,
"logits/chosen": 0.19251321256160736,
"logits/rejected": 0.15595921874046326,
"logps/chosen": -266.190673828125,
"logps/rejected": -288.5094299316406,
"loss": 0.6912,
"rewards/accuracies": 0.5384615659713745,
"rewards/chosen": -0.04362406209111214,
"rewards/margins": 0.010117193683981895,
"rewards/rejected": -0.05374125763773918,
"step": 10
},
{
"epoch": 0.12,
"grad_norm": 34.2665441677138,
"learning_rate": 4.99472085783721e-07,
"logits/chosen": 0.4772653877735138,
"logits/rejected": 0.34988027811050415,
"logps/chosen": -288.5440368652344,
"logps/rejected": -317.422607421875,
"loss": 0.6966,
"rewards/accuracies": 0.48846152424812317,
"rewards/chosen": -0.29237106442451477,
"rewards/margins": 0.041345465928316116,
"rewards/rejected": -0.3337165117263794,
"step": 20
},
{
"epoch": 0.18,
"grad_norm": 51.989784907303765,
"learning_rate": 4.901488388458247e-07,
"logits/chosen": 0.08469453454017639,
"logits/rejected": 0.02361711673438549,
"logps/chosen": -257.55908203125,
"logps/rejected": -290.6418151855469,
"loss": 0.6904,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": 0.0216812863945961,
"rewards/margins": 0.06838896870613098,
"rewards/rejected": -0.04670768231153488,
"step": 30
},
{
"epoch": 0.25,
"grad_norm": 48.07967092856285,
"learning_rate": 4.695964991097616e-07,
"logits/chosen": 0.5012978315353394,
"logits/rejected": 0.3107348382472992,
"logps/chosen": -293.4065856933594,
"logps/rejected": -318.2868347167969,
"loss": 0.6923,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": -0.3082655370235443,
"rewards/margins": 0.04468757286667824,
"rewards/rejected": -0.35295310616493225,
"step": 40
},
{
"epoch": 0.31,
"grad_norm": 50.8642865419175,
"learning_rate": 4.3877607113930516e-07,
"logits/chosen": 0.5789575576782227,
"logits/rejected": 0.747968852519989,
"logps/chosen": -291.36279296875,
"logps/rejected": -306.2597961425781,
"loss": 0.696,
"rewards/accuracies": 0.5269230604171753,
"rewards/chosen": -0.29860785603523254,
"rewards/margins": 0.016954706981778145,
"rewards/rejected": -0.31556254625320435,
"step": 50
},
{
"epoch": 0.37,
"grad_norm": 42.68053454562829,
"learning_rate": 3.991286838919086e-07,
"logits/chosen": 0.5176121592521667,
"logits/rejected": 0.48673737049102783,
"logps/chosen": -285.60284423828125,
"logps/rejected": -301.834228515625,
"loss": 0.6896,
"rewards/accuracies": 0.4961538314819336,
"rewards/chosen": -0.20814552903175354,
"rewards/margins": 0.02328580990433693,
"rewards/rejected": -0.23143133521080017,
"step": 60
},
{
"epoch": 0.43,
"grad_norm": 37.59954622299739,
"learning_rate": 3.52508205130354e-07,
"logits/chosen": 0.47016510367393494,
"logits/rejected": 0.6350060105323792,
"logps/chosen": -298.3149719238281,
"logps/rejected": -311.8184509277344,
"loss": 0.6954,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.33209383487701416,
"rewards/margins": 0.01967799849808216,
"rewards/rejected": -0.35177183151245117,
"step": 70
},
{
"epoch": 0.49,
"grad_norm": 40.19774755409481,
"learning_rate": 3.010945566265912e-07,
"logits/chosen": 0.8041943311691284,
"logits/rejected": 0.9286781549453735,
"logps/chosen": -320.7852783203125,
"logps/rejected": -339.48193359375,
"loss": 0.6856,
"rewards/accuracies": 0.557692289352417,
"rewards/chosen": -0.5020374655723572,
"rewards/margins": 0.013489325530827045,
"rewards/rejected": -0.5155267715454102,
"step": 80
},
{
"epoch": 0.55,
"grad_norm": 40.3298911710128,
"learning_rate": 2.4729178344249006e-07,
"logits/chosen": 0.526244044303894,
"logits/rejected": 0.5612362027168274,
"logps/chosen": -289.9747009277344,
"logps/rejected": -304.9601135253906,
"loss": 0.6927,
"rewards/accuracies": 0.5230769515037537,
"rewards/chosen": -0.2381971925497055,
"rewards/margins": 0.04978089779615402,
"rewards/rejected": -0.2879781126976013,
"step": 90
},
{
"epoch": 0.61,
"grad_norm": 35.40524465285595,
"learning_rate": 1.9361564345465145e-07,
"logits/chosen": 0.3361697196960449,
"logits/rejected": 0.5479218363761902,
"logps/chosen": -272.99285888671875,
"logps/rejected": -300.7643737792969,
"loss": 0.6878,
"rewards/accuracies": 0.5153846144676208,
"rewards/chosen": -0.1911364644765854,
"rewards/margins": 0.03422596678137779,
"rewards/rejected": -0.22536242008209229,
"step": 100
},
{
"epoch": 0.68,
"grad_norm": 38.98659700871813,
"learning_rate": 1.4257597331216208e-07,
"logits/chosen": 0.6074225902557373,
"logits/rejected": 0.7285165786743164,
"logps/chosen": -311.04827880859375,
"logps/rejected": -335.56396484375,
"loss": 0.685,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.4663804769515991,
"rewards/margins": 0.05671105906367302,
"rewards/rejected": -0.5230914950370789,
"step": 110
},
{
"epoch": 0.74,
"grad_norm": 50.7969521151244,
"learning_rate": 9.655933126436563e-08,
"logits/chosen": 0.5686596035957336,
"logits/rejected": 0.6905936002731323,
"logps/chosen": -276.1257629394531,
"logps/rejected": -292.55120849609375,
"loss": 0.7065,
"rewards/accuracies": 0.5538461804389954,
"rewards/chosen": -0.14772899448871613,
"rewards/margins": 0.05048359930515289,
"rewards/rejected": -0.19821257889270782,
"step": 120
},
{
"epoch": 0.8,
"grad_norm": 44.99508540990744,
"learning_rate": 5.771740434959277e-08,
"logits/chosen": 0.7891207337379456,
"logits/rejected": 0.6963477730751038,
"logps/chosen": -289.924072265625,
"logps/rejected": -315.6805419921875,
"loss": 0.6929,
"rewards/accuracies": 0.550000011920929,
"rewards/chosen": -0.2571752965450287,
"rewards/margins": 0.07165656983852386,
"rewards/rejected": -0.32883188128471375,
"step": 130
},
{
"epoch": 0.86,
"grad_norm": 41.07192587696782,
"learning_rate": 2.7866397900677185e-08,
"logits/chosen": 0.728725016117096,
"logits/rejected": 0.6798302531242371,
"logps/chosen": -313.1536865234375,
"logps/rejected": -325.9017639160156,
"loss": 0.686,
"rewards/accuracies": 0.5423076748847961,
"rewards/chosen": -0.5450281500816345,
"rewards/margins": 0.03127431869506836,
"rewards/rejected": -0.5763024687767029,
"step": 140
},
{
"epoch": 0.92,
"grad_norm": 51.88495068918687,
"learning_rate": 8.402111802159412e-09,
"logits/chosen": 0.7722111344337463,
"logits/rejected": 0.8273798227310181,
"logps/chosen": -296.1044921875,
"logps/rejected": -326.81695556640625,
"loss": 0.6812,
"rewards/accuracies": 0.607692301273346,
"rewards/chosen": -0.4110731780529022,
"rewards/margins": 0.07831522077322006,
"rewards/rejected": -0.4893884062767029,
"step": 150
},
{
"epoch": 0.98,
"grad_norm": 43.53404843175348,
"learning_rate": 2.3467443900582197e-10,
"logits/chosen": 0.9644160866737366,
"logits/rejected": 1.0424695014953613,
"logps/chosen": -288.78082275390625,
"logps/rejected": -310.8374328613281,
"loss": 0.685,
"rewards/accuracies": 0.5346153974533081,
"rewards/chosen": -0.3611108064651489,
"rewards/margins": 0.10043878108263016,
"rewards/rejected": -0.4615496098995209,
"step": 160
},
{
"epoch": 1.0,
"step": 162,
"total_flos": 0.0,
"train_loss": 0.6911558170377472,
"train_runtime": 23474.6733,
"train_samples_per_second": 0.9,
"train_steps_per_second": 0.007
}
],
"logging_steps": 10,
"max_steps": 162,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 200,
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}
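For reference, below is a minimal sketch of how this trainer state might be inspected, assuming Python 3 with matplotlib installed; the file path and output filename are placeholders, not part of the original checkpoint. It loads the JSON above, keeps the per-step entries of log_history (the final entry is the run summary and carries train_loss/train_runtime instead of per-step metrics), and plots the DPO loss, reward margins, and pairwise accuracy over the 162 steps. As a sanity check, the step-1 loss of 0.6931 is the ln 2 value the DPO objective gives when chosen and rejected rewards are still equal.

import json

import matplotlib.pyplot as plt

# Path to this file; adjust to wherever the checkpoint lives (placeholder).
STATE_PATH = "trainer_state.json"

with open(STATE_PATH) as f:
    state = json.load(f)

# Keep only the per-step logging entries; the last log_history item is the
# run summary (train_loss, train_runtime, ...) and has no "loss" key.
logs = [e for e in state["log_history"] if "loss" in e and "learning_rate" in e]

steps = [e["step"] for e in logs]
loss = [e["loss"] for e in logs]
margins = [e["rewards/margins"] for e in logs]
accuracies = [e["rewards/accuracies"] for e in logs]

fig, axes = plt.subplots(1, 3, figsize=(12, 3), sharex=True)
axes[0].plot(steps, loss)
axes[0].set_title("DPO loss")
axes[1].plot(steps, margins)
axes[1].set_title("rewards/margins")
axes[2].plot(steps, accuracies)
axes[2].set_title("rewards/accuracies")
for ax in axes:
    ax.set_xlabel("step")
fig.tight_layout()
fig.savefig("dpo_iter_6_training_curves.png", dpi=150)  # placeholder output name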