{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 478, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020920502092050207, "grad_norm": 9.710838317871094, "learning_rate": 1.0416666666666666e-08, "logits/chosen": -3.161454916000366, "logits/rejected": -3.0680501461029053, "logps/chosen": -437.20782470703125, "logps/rejected": -343.8380432128906, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.05230125523012552, "grad_norm": 8.52570629119873, "learning_rate": 2.604166666666667e-07, "logits/chosen": -3.30265212059021, "logits/rejected": -3.2195777893066406, "logps/chosen": -432.7400207519531, "logps/rejected": -391.2707824707031, "loss": 0.6931, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.0006945470231585205, "rewards/margins": 0.0007254942320287228, "rewards/rejected": -3.094731437158771e-05, "step": 25 }, { "epoch": 0.10460251046025104, "grad_norm": 9.973247528076172, "learning_rate": 4.999733114418725e-07, "logits/chosen": -3.299149751663208, "logits/rejected": -3.2438504695892334, "logps/chosen": -446.8968811035156, "logps/rejected": -404.587158203125, "loss": 0.6914, "rewards/accuracies": 0.5799999833106995, "rewards/chosen": 0.005268932785838842, "rewards/margins": 0.002896952675655484, "rewards/rejected": 0.002371980343014002, "step": 50 }, { "epoch": 0.15690376569037656, "grad_norm": 9.357086181640625, "learning_rate": 4.951516761176343e-07, "logits/chosen": -3.302896738052368, "logits/rejected": -3.225714683532715, "logps/chosen": -480.6836853027344, "logps/rejected": -436.4853515625, "loss": 0.6871, "rewards/accuracies": 0.5975000262260437, "rewards/chosen": 0.013074235059320927, "rewards/margins": 0.012715624645352364, "rewards/rejected": 0.0003586093371268362, "step": 75 }, { "epoch": 0.20920502092050208, "grad_norm": 8.75236988067627, "learning_rate": 4.821741763807186e-07, "logits/chosen": -3.2598507404327393, "logits/rejected": -3.2092175483703613, "logps/chosen": -426.1629638671875, "logps/rejected": -389.50421142578125, "loss": 0.6795, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.003723819274455309, "rewards/margins": 0.02804265171289444, "rewards/rejected": -0.02431883104145527, "step": 100 }, { "epoch": 0.20920502092050208, "eval_logits/chosen": -3.313567638397217, "eval_logits/rejected": -3.2565112113952637, "eval_logps/chosen": -423.0196533203125, "eval_logps/rejected": -407.8036804199219, "eval_loss": 0.6759119629859924, "eval_rewards/accuracies": 0.62890625, "eval_rewards/chosen": 0.0017230990342795849, "eval_rewards/margins": 0.0345395989716053, "eval_rewards/rejected": -0.03281649947166443, "eval_runtime": 8.2733, "eval_samples_per_second": 241.742, "eval_steps_per_second": 3.868, "step": 100 }, { "epoch": 0.2615062761506276, "grad_norm": 8.889654159545898, "learning_rate": 4.614725560802639e-07, "logits/chosen": -3.2867064476013184, "logits/rejected": -3.2023117542266846, "logps/chosen": -430.924560546875, "logps/rejected": -382.1640319824219, "loss": 0.6731, "rewards/accuracies": 0.6474999785423279, "rewards/chosen": -0.01004675030708313, "rewards/margins": 0.04363078624010086, "rewards/rejected": -0.05367753654718399, "step": 125 }, { "epoch": 0.3138075313807531, "grad_norm": 9.882610321044922, "learning_rate": 4.337355301007335e-07, "logits/chosen": -3.222916841506958, "logits/rejected": -3.1899585723876953, "logps/chosen": -443.3583679199219, "logps/rejected": -414.344970703125, "loss": 0.667, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.017106110230088234, "rewards/margins": 0.06052257865667343, "rewards/rejected": -0.07762870192527771, "step": 150 }, { "epoch": 0.36610878661087864, "grad_norm": 9.968969345092773, "learning_rate": 3.9988587174999306e-07, "logits/chosen": -3.2201168537139893, "logits/rejected": -3.131910800933838, "logps/chosen": -477.7035217285156, "logps/rejected": -405.7104797363281, "loss": 0.6574, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": -0.03587143123149872, "rewards/margins": 0.09045815467834473, "rewards/rejected": -0.12632958590984344, "step": 175 }, { "epoch": 0.41841004184100417, "grad_norm": 9.230700492858887, "learning_rate": 3.610497133404795e-07, "logits/chosen": -3.23995304107666, "logits/rejected": -3.1550867557525635, "logps/chosen": -431.6617126464844, "logps/rejected": -396.62774658203125, "loss": 0.6584, "rewards/accuracies": 0.6349999904632568, "rewards/chosen": -0.06991340965032578, "rewards/margins": 0.08443903923034668, "rewards/rejected": -0.15435244143009186, "step": 200 }, { "epoch": 0.41841004184100417, "eval_logits/chosen": -3.2767982482910156, "eval_logits/rejected": -3.2240023612976074, "eval_logps/chosen": -429.85614013671875, "eval_logps/rejected": -420.6952209472656, "eval_loss": 0.6533502340316772, "eval_rewards/accuracies": 0.64453125, "eval_rewards/chosen": -0.06664139777421951, "eval_rewards/margins": 0.09509073942899704, "eval_rewards/rejected": -0.16173213720321655, "eval_runtime": 8.2763, "eval_samples_per_second": 241.653, "eval_steps_per_second": 3.866, "step": 200 }, { "epoch": 0.4707112970711297, "grad_norm": 10.79430103302002, "learning_rate": 3.185190812915646e-07, "logits/chosen": -3.1671783924102783, "logits/rejected": -3.118861436843872, "logps/chosen": -446.6968994140625, "logps/rejected": -410.1864013671875, "loss": 0.6518, "rewards/accuracies": 0.6449999809265137, "rewards/chosen": -0.08074235171079636, "rewards/margins": 0.10100732743740082, "rewards/rejected": -0.18174967169761658, "step": 225 }, { "epoch": 0.5230125523012552, "grad_norm": 10.086767196655273, "learning_rate": 2.7370891215954565e-07, "logits/chosen": -3.1980080604553223, "logits/rejected": -3.1623446941375732, "logps/chosen": -440.4261474609375, "logps/rejected": -439.9627685546875, "loss": 0.6476, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.07635506987571716, "rewards/margins": 0.11709018051624298, "rewards/rejected": -0.19344525039196014, "step": 250 }, { "epoch": 0.5753138075313807, "grad_norm": 9.709197044372559, "learning_rate": 2.2810997961375938e-07, "logits/chosen": -3.216128349304199, "logits/rejected": -3.1448960304260254, "logps/chosen": -425.5062561035156, "logps/rejected": -391.5084228515625, "loss": 0.6461, "rewards/accuracies": 0.6800000071525574, "rewards/chosen": -0.072402223944664, "rewards/margins": 0.1306913197040558, "rewards/rejected": -0.2030935436487198, "step": 275 }, { "epoch": 0.6276150627615062, "grad_norm": 10.980072975158691, "learning_rate": 1.8323929841460178e-07, "logits/chosen": -3.2006824016571045, "logits/rejected": -3.1383109092712402, "logps/chosen": -443.0799865722656, "logps/rejected": -436.2774658203125, "loss": 0.6494, "rewards/accuracies": 0.625, "rewards/chosen": -0.1245601624250412, "rewards/margins": 0.10581608861684799, "rewards/rejected": -0.2303762584924698, "step": 300 }, { "epoch": 0.6276150627615062, "eval_logits/chosen": -3.2553329467773438, "eval_logits/rejected": -3.2049574851989746, "eval_logps/chosen": -433.9639892578125, "eval_logps/rejected": -428.6236572265625, "eval_loss": 0.6438009142875671, "eval_rewards/accuracies": 0.62109375, "eval_rewards/chosen": -0.10771973431110382, "eval_rewards/margins": 0.13329659402370453, "eval_rewards/rejected": -0.24101632833480835, "eval_runtime": 8.3559, "eval_samples_per_second": 239.351, "eval_steps_per_second": 3.83, "step": 300 }, { "epoch": 0.6799163179916318, "grad_norm": 9.95361614227295, "learning_rate": 1.4058965538597032e-07, "logits/chosen": -3.2326276302337646, "logits/rejected": -3.198971748352051, "logps/chosen": -443.37371826171875, "logps/rejected": -433.8561706542969, "loss": 0.6392, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": -0.11981040984392166, "rewards/margins": 0.12150833755731583, "rewards/rejected": -0.2413187474012375, "step": 325 }, { "epoch": 0.7322175732217573, "grad_norm": 10.580938339233398, "learning_rate": 1.0157994641835734e-07, "logits/chosen": -3.1840951442718506, "logits/rejected": -3.0981223583221436, "logps/chosen": -430.7939453125, "logps/rejected": -403.5060119628906, "loss": 0.6447, "rewards/accuracies": 0.6225000023841858, "rewards/chosen": -0.15974169969558716, "rewards/margins": 0.09658970683813095, "rewards/rejected": -0.2563314139842987, "step": 350 }, { "epoch": 0.7845188284518828, "grad_norm": 8.326279640197754, "learning_rate": 6.75079717232744e-08, "logits/chosen": -3.2011187076568604, "logits/rejected": -3.0969460010528564, "logps/chosen": -465.22344970703125, "logps/rejected": -392.5793762207031, "loss": 0.6411, "rewards/accuracies": 0.6549999713897705, "rewards/chosen": -0.11683624982833862, "rewards/margins": 0.14286838471889496, "rewards/rejected": -0.2597046196460724, "step": 375 }, { "epoch": 0.8368200836820083, "grad_norm": 8.851872444152832, "learning_rate": 3.9507259776993954e-08, "logits/chosen": -3.221069097518921, "logits/rejected": -3.1744155883789062, "logps/chosen": -449.8016052246094, "logps/rejected": -409.0178527832031, "loss": 0.6428, "rewards/accuracies": 0.6524999737739563, "rewards/chosen": -0.08884063363075256, "rewards/margins": 0.13940051198005676, "rewards/rejected": -0.22824116051197052, "step": 400 }, { "epoch": 0.8368200836820083, "eval_logits/chosen": -3.254263401031494, "eval_logits/rejected": -3.204622507095337, "eval_logps/chosen": -433.20001220703125, "eval_logps/rejected": -428.88836669921875, "eval_loss": 0.6415477395057678, "eval_rewards/accuracies": 0.62109375, "eval_rewards/chosen": -0.10007989406585693, "eval_rewards/margins": 0.14358317852020264, "eval_rewards/rejected": -0.24366310238838196, "eval_runtime": 8.7287, "eval_samples_per_second": 229.13, "eval_steps_per_second": 3.666, "step": 400 }, { "epoch": 0.8891213389121339, "grad_norm": 10.138018608093262, "learning_rate": 1.850935636255496e-08, "logits/chosen": -3.1941397190093994, "logits/rejected": -3.1335413455963135, "logps/chosen": -463.3765869140625, "logps/rejected": -418.0171203613281, "loss": 0.6393, "rewards/accuracies": 0.6399999856948853, "rewards/chosen": -0.11999661475419998, "rewards/margins": 0.13007262349128723, "rewards/rejected": -0.2500692307949066, "step": 425 }, { "epoch": 0.9414225941422594, "grad_norm": 9.687002182006836, "learning_rate": 5.212833302556258e-09, "logits/chosen": -3.202920913696289, "logits/rejected": -3.14504337310791, "logps/chosen": -442.259765625, "logps/rejected": -406.7275085449219, "loss": 0.6419, "rewards/accuracies": 0.6050000190734863, "rewards/chosen": -0.13094988465309143, "rewards/margins": 0.1017264574766159, "rewards/rejected": -0.23267632722854614, "step": 450 }, { "epoch": 0.9937238493723849, "grad_norm": 8.862220764160156, "learning_rate": 6.004792024680294e-11, "logits/chosen": -3.1527392864227295, "logits/rejected": -3.1062843799591064, "logps/chosen": -445.8639831542969, "logps/rejected": -418.4268493652344, "loss": 0.6377, "rewards/accuracies": 0.6575000286102295, "rewards/chosen": -0.11145105212926865, "rewards/margins": 0.15232053399085999, "rewards/rejected": -0.26377159357070923, "step": 475 }, { "epoch": 1.0, "step": 478, "total_flos": 0.0, "train_loss": 0.6571792745689967, "train_runtime": 784.6622, "train_samples_per_second": 77.913, "train_steps_per_second": 0.609 } ], "logging_steps": 25, "max_steps": 478, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }