{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9984431759211209, "eval_steps": 1000, "global_step": 481, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0020757654385054488, "grad_norm": 7.453312579539728, "learning_rate": 1.020408163265306e-08, "logits/chosen": -2.730942726135254, "logits/rejected": -2.654609203338623, "logps/chosen": -350.489990234375, "logps/rejected": -325.546875, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02075765438505449, "grad_norm": 7.878888995232912, "learning_rate": 1.0204081632653061e-07, "logits/chosen": -2.7330236434936523, "logits/rejected": -2.735116720199585, "logps/chosen": -366.51531982421875, "logps/rejected": -412.2677001953125, "loss": 0.6931, "rewards/accuracies": 0.4270833432674408, "rewards/chosen": -0.000205132644623518, "rewards/margins": 4.354613702162169e-05, "rewards/rejected": -0.00024867875617928803, "step": 10 }, { "epoch": 0.04151530877010898, "grad_norm": 7.53294584904676, "learning_rate": 2.0408163265306121e-07, "logits/chosen": -2.7173304557800293, "logits/rejected": -2.693912982940674, "logps/chosen": -378.73748779296875, "logps/rejected": -404.47003173828125, "loss": 0.6892, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.0025507560931146145, "rewards/margins": 0.008084132336080074, "rewards/rejected": -0.005533376708626747, "step": 20 }, { "epoch": 0.062272963155163466, "grad_norm": 8.007471678418003, "learning_rate": 3.0612244897959183e-07, "logits/chosen": -2.716646194458008, "logits/rejected": -2.700786590576172, "logps/chosen": -363.6639709472656, "logps/rejected": -390.54083251953125, "loss": 0.6692, "rewards/accuracies": 0.903124988079071, "rewards/chosen": 0.02478734776377678, "rewards/margins": 0.051134396344423294, "rewards/rejected": -0.026347041130065918, "step": 30 }, { "epoch": 0.08303061754021795, "grad_norm": 9.3622528637074, "learning_rate": 4.0816326530612243e-07, "logits/chosen": -2.7087109088897705, "logits/rejected": -2.669712543487549, "logps/chosen": -347.83538818359375, "logps/rejected": -376.85260009765625, "loss": 0.6044, "rewards/accuracies": 0.875, "rewards/chosen": 0.05488457530736923, "rewards/margins": 0.19498120248317719, "rewards/rejected": -0.14009663462638855, "step": 40 }, { "epoch": 0.10378827192527244, "grad_norm": 11.969966849217528, "learning_rate": 4.999933894080444e-07, "logits/chosen": -2.7135281562805176, "logits/rejected": -2.6938090324401855, "logps/chosen": -403.2617492675781, "logps/rejected": -495.21270751953125, "loss": 0.4674, "rewards/accuracies": 0.871874988079071, "rewards/chosen": -0.37011662125587463, "rewards/margins": 0.6785963177680969, "rewards/rejected": -1.048712968826294, "step": 50 }, { "epoch": 0.12454592631032693, "grad_norm": 19.176201042712012, "learning_rate": 4.992005413014143e-07, "logits/chosen": -2.7302985191345215, "logits/rejected": -2.7273764610290527, "logps/chosen": -528.5646362304688, "logps/rejected": -741.4615478515625, "loss": 0.3523, "rewards/accuracies": 0.84375, "rewards/chosen": -1.8138965368270874, "rewards/margins": 1.7118844985961914, "rewards/rejected": -3.5257811546325684, "step": 60 }, { "epoch": 0.14530358069538143, "grad_norm": 20.310776567548704, "learning_rate": 4.970903776169402e-07, "logits/chosen": -2.7460341453552246, "logits/rejected": -2.7275753021240234, "logps/chosen": -634.8268432617188, "logps/rejected": -865.6439208984375, "loss": 0.3052, "rewards/accuracies": 0.84375, "rewards/chosen": -2.382263422012329, "rewards/margins": 2.2324626445770264, "rewards/rejected": -4.6147260665893555, "step": 70 }, { "epoch": 0.1660612350804359, "grad_norm": 19.886871444649117, "learning_rate": 4.936740530314087e-07, "logits/chosen": -2.3413853645324707, "logits/rejected": -2.102804660797119, "logps/chosen": -591.3840942382812, "logps/rejected": -896.90625, "loss": 0.25, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -2.003624200820923, "rewards/margins": 2.942108631134033, "rewards/rejected": -4.945733070373535, "step": 80 }, { "epoch": 0.1868188894654904, "grad_norm": 16.07417524586734, "learning_rate": 4.889696268057348e-07, "logits/chosen": -1.8468377590179443, "logits/rejected": -1.1863611936569214, "logps/chosen": -567.8921508789062, "logps/rejected": -934.1871337890625, "loss": 0.2254, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -2.0263195037841797, "rewards/margins": 3.4611504077911377, "rewards/rejected": -5.487469673156738, "step": 90 }, { "epoch": 0.2075765438505449, "grad_norm": 15.952464662940557, "learning_rate": 4.830019673206996e-07, "logits/chosen": -1.3128455877304077, "logits/rejected": -0.37191733717918396, "logps/chosen": -637.7520751953125, "logps/rejected": -1123.6968994140625, "loss": 0.2034, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.622469425201416, "rewards/margins": 4.605704307556152, "rewards/rejected": -7.22817325592041, "step": 100 }, { "epoch": 0.2283341982355994, "grad_norm": 20.61911089038355, "learning_rate": 4.7580262061854606e-07, "logits/chosen": -0.8984780311584473, "logits/rejected": 0.00587611785158515, "logps/chosen": -629.09521484375, "logps/rejected": -1123.9976806640625, "loss": 0.2043, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -2.5760746002197266, "rewards/margins": 4.712790012359619, "rewards/rejected": -7.288865089416504, "step": 110 }, { "epoch": 0.24909185262065386, "grad_norm": 13.519087980837655, "learning_rate": 4.674096436453447e-07, "logits/chosen": -0.8746647834777832, "logits/rejected": 0.03333142027258873, "logps/chosen": -662.9947509765625, "logps/rejected": -1107.119140625, "loss": 0.1947, "rewards/accuracies": 0.90625, "rewards/chosen": -2.6802916526794434, "rewards/margins": 4.257796287536621, "rewards/rejected": -6.938088417053223, "step": 120 }, { "epoch": 0.26984950700570837, "grad_norm": 15.179032922942401, "learning_rate": 4.578674030756363e-07, "logits/chosen": -0.5216516256332397, "logits/rejected": 0.797188401222229, "logps/chosen": -672.4591064453125, "logps/rejected": -1201.9649658203125, "loss": 0.1766, "rewards/accuracies": 0.90625, "rewards/chosen": -2.92866849899292, "rewards/margins": 5.020692348480225, "rewards/rejected": -7.949362277984619, "step": 130 }, { "epoch": 0.29060716139076287, "grad_norm": 19.394840025974254, "learning_rate": 4.4722634078279865e-07, "logits/chosen": 0.05672640725970268, "logits/rejected": 1.2894923686981201, "logps/chosen": -632.3123779296875, "logps/rejected": -1134.365478515625, "loss": 0.1989, "rewards/accuracies": 0.8843749761581421, "rewards/chosen": -2.6184778213500977, "rewards/margins": 4.792341709136963, "rewards/rejected": -7.410820007324219, "step": 140 }, { "epoch": 0.3113648157758173, "grad_norm": 15.545837682982413, "learning_rate": 4.355427071949004e-07, "logits/chosen": -0.034926723688840866, "logits/rejected": 1.3178081512451172, "logps/chosen": -625.6254272460938, "logps/rejected": -1133.8699951171875, "loss": 0.1657, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -2.654139757156372, "rewards/margins": 4.885876655578613, "rewards/rejected": -7.540017127990723, "step": 150 }, { "epoch": 0.3321224701608718, "grad_norm": 21.099719863638594, "learning_rate": 4.228782639455674e-07, "logits/chosen": -0.2264009416103363, "logits/rejected": 1.3679448366165161, "logps/chosen": -684.0534057617188, "logps/rejected": -1281.2177734375, "loss": 0.1738, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -3.0014939308166504, "rewards/margins": 5.716488838195801, "rewards/rejected": -8.717982292175293, "step": 160 }, { "epoch": 0.3528801245459263, "grad_norm": 14.4755316075222, "learning_rate": 4.092999573916971e-07, "logits/chosen": 0.14696760475635529, "logits/rejected": 1.6602694988250732, "logps/chosen": -664.4630126953125, "logps/rejected": -1225.433837890625, "loss": 0.1804, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.056241273880005, "rewards/margins": 5.417787551879883, "rewards/rejected": -8.474028587341309, "step": 170 }, { "epoch": 0.3736377789309808, "grad_norm": 15.213191213812825, "learning_rate": 3.948795647238637e-07, "logits/chosen": -0.7323606014251709, "logits/rejected": 1.1011723279953003, "logps/chosen": -630.0668334960938, "logps/rejected": -1234.099609375, "loss": 0.1783, "rewards/accuracies": 0.921875, "rewards/chosen": -2.6342709064483643, "rewards/margins": 5.844083786010742, "rewards/rejected": -8.478353500366211, "step": 180 }, { "epoch": 0.39439543331603527, "grad_norm": 16.709219865079046, "learning_rate": 3.796933145401304e-07, "logits/chosen": -0.12861236929893494, "logits/rejected": 1.5260117053985596, "logps/chosen": -731.4463500976562, "logps/rejected": -1386.452880859375, "loss": 0.1646, "rewards/accuracies": 0.9375, "rewards/chosen": -3.487692356109619, "rewards/margins": 6.185595989227295, "rewards/rejected": -9.673288345336914, "step": 190 }, { "epoch": 0.4151530877010898, "grad_norm": 16.46330948268556, "learning_rate": 3.638214838889801e-07, "logits/chosen": 0.014425823464989662, "logits/rejected": 1.5777640342712402, "logps/chosen": -647.3234252929688, "logps/rejected": -1219.0548095703125, "loss": 0.1669, "rewards/accuracies": 0.909375011920929, "rewards/chosen": -2.797513484954834, "rewards/margins": 5.429152011871338, "rewards/rejected": -8.226665496826172, "step": 200 }, { "epoch": 0.4359107420861443, "grad_norm": 19.39279809291309, "learning_rate": 3.4734797391146383e-07, "logits/chosen": -0.12084762752056122, "logits/rejected": 1.6122562885284424, "logps/chosen": -656.5711059570312, "logps/rejected": -1250.8482666015625, "loss": 0.1594, "rewards/accuracies": 0.9281250238418579, "rewards/chosen": -2.874763011932373, "rewards/margins": 5.74980354309082, "rewards/rejected": -8.624567031860352, "step": 210 }, { "epoch": 0.4566683964711988, "grad_norm": 23.517592451341034, "learning_rate": 3.3035986632579036e-07, "logits/chosen": -1.0772771835327148, "logits/rejected": 0.6209205389022827, "logps/chosen": -622.2130126953125, "logps/rejected": -1293.0042724609375, "loss": 0.1678, "rewards/accuracies": 0.921875, "rewards/chosen": -2.5699634552001953, "rewards/margins": 6.527965545654297, "rewards/rejected": -9.097929000854492, "step": 220 }, { "epoch": 0.4774260508562532, "grad_norm": 20.580951166094668, "learning_rate": 3.1294696309885716e-07, "logits/chosen": -1.1179264783859253, "logits/rejected": 0.7691652178764343, "logps/chosen": -666.9544677734375, "logps/rejected": -1416.962158203125, "loss": 0.1673, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.058974027633667, "rewards/margins": 7.343924522399902, "rewards/rejected": -10.402898788452148, "step": 230 }, { "epoch": 0.49818370524130773, "grad_norm": 14.600883148928759, "learning_rate": 2.952013117380913e-07, "logits/chosen": -0.9207614660263062, "logits/rejected": 1.1953575611114502, "logps/chosen": -662.4710693359375, "logps/rejected": -1418.23291015625, "loss": 0.1614, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -3.0431346893310547, "rewards/margins": 7.319464206695557, "rewards/rejected": -10.362597465515137, "step": 240 }, { "epoch": 0.5189413596263622, "grad_norm": 11.446046359523642, "learning_rate": 2.7721671871299114e-07, "logits/chosen": -0.7357327938079834, "logits/rejected": 1.461576223373413, "logps/chosen": -672.9193725585938, "logps/rejected": -1338.949462890625, "loss": 0.1602, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -2.9312338829040527, "rewards/margins": 6.449375152587891, "rewards/rejected": -9.380608558654785, "step": 250 }, { "epoch": 0.5396990140114167, "grad_norm": 14.757287503572078, "learning_rate": 2.5908825357849993e-07, "logits/chosen": -0.8231679797172546, "logits/rejected": 1.1155385971069336, "logps/chosen": -656.1690063476562, "logps/rejected": -1271.033447265625, "loss": 0.1622, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -2.8299801349639893, "rewards/margins": 5.925788402557373, "rewards/rejected": -8.755769729614258, "step": 260 }, { "epoch": 0.5604566683964712, "grad_norm": 17.896435569322648, "learning_rate": 2.409117464215001e-07, "logits/chosen": -0.4632663130760193, "logits/rejected": 1.819011926651001, "logps/chosen": -664.986572265625, "logps/rejected": -1455.3441162109375, "loss": 0.1471, "rewards/accuracies": 0.940625011920929, "rewards/chosen": -3.113548517227173, "rewards/margins": 7.589502811431885, "rewards/rejected": -10.70305061340332, "step": 270 }, { "epoch": 0.5812143227815257, "grad_norm": 18.08668716790038, "learning_rate": 2.227832812870089e-07, "logits/chosen": -0.542155385017395, "logits/rejected": 1.9537347555160522, "logps/chosen": -686.6569213867188, "logps/rejected": -1460.84619140625, "loss": 0.1557, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -3.106727123260498, "rewards/margins": 7.513753414154053, "rewards/rejected": -10.620479583740234, "step": 280 }, { "epoch": 0.6019719771665801, "grad_norm": 20.414881165009998, "learning_rate": 2.0479868826190871e-07, "logits/chosen": -0.437448650598526, "logits/rejected": 1.7888593673706055, "logps/chosen": -709.882568359375, "logps/rejected": -1392.702392578125, "loss": 0.1617, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.35121488571167, "rewards/margins": 6.753846645355225, "rewards/rejected": -10.105062484741211, "step": 290 }, { "epoch": 0.6227296315516346, "grad_norm": 16.484968508125828, "learning_rate": 1.8705303690114287e-07, "logits/chosen": -0.2719939947128296, "logits/rejected": 1.811428427696228, "logps/chosen": -710.2088012695312, "logps/rejected": -1419.313720703125, "loss": 0.1478, "rewards/accuracies": 0.9468749761581421, "rewards/chosen": -3.3416149616241455, "rewards/margins": 6.840612888336182, "rewards/rejected": -10.182229042053223, "step": 300 }, { "epoch": 0.6434872859366891, "grad_norm": 15.309368142287557, "learning_rate": 1.6964013367420965e-07, "logits/chosen": -0.341867595911026, "logits/rejected": 1.7422988414764404, "logps/chosen": -686.5745849609375, "logps/rejected": -1366.7637939453125, "loss": 0.1534, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.167562961578369, "rewards/margins": 6.726889133453369, "rewards/rejected": -9.894452095031738, "step": 310 }, { "epoch": 0.6642449403217436, "grad_norm": 11.4821386278306, "learning_rate": 1.5265202608853628e-07, "logits/chosen": -0.17296895384788513, "logits/rejected": 1.9412486553192139, "logps/chosen": -665.36376953125, "logps/rejected": -1430.9146728515625, "loss": 0.148, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -3.0024094581604004, "rewards/margins": 7.437635898590088, "rewards/rejected": -10.440046310424805, "step": 320 }, { "epoch": 0.6850025947067981, "grad_norm": 10.552793459289758, "learning_rate": 1.3617851611101993e-07, "logits/chosen": -0.5047305226325989, "logits/rejected": 1.5836106538772583, "logps/chosen": -686.6402587890625, "logps/rejected": -1443.398193359375, "loss": 0.1519, "rewards/accuracies": 0.8968750238418579, "rewards/chosen": -3.1413583755493164, "rewards/margins": 7.2722015380859375, "rewards/rejected": -10.413559913635254, "step": 330 }, { "epoch": 0.7057602490918526, "grad_norm": 17.65296980495948, "learning_rate": 1.2030668545986958e-07, "logits/chosen": -0.569928765296936, "logits/rejected": 1.6948425769805908, "logps/chosen": -716.9683837890625, "logps/rejected": -1518.8372802734375, "loss": 0.1463, "rewards/accuracies": 0.921875, "rewards/chosen": -3.4350712299346924, "rewards/margins": 7.720976829528809, "rewards/rejected": -11.156047821044922, "step": 340 }, { "epoch": 0.7265179034769071, "grad_norm": 14.715250965523067, "learning_rate": 1.0512043527613623e-07, "logits/chosen": -0.7549006342887878, "logits/rejected": 1.3407833576202393, "logps/chosen": -695.8525390625, "logps/rejected": -1478.35546875, "loss": 0.1559, "rewards/accuracies": 0.903124988079071, "rewards/chosen": -3.347827911376953, "rewards/margins": 7.52248477935791, "rewards/rejected": -10.870311737060547, "step": 350 }, { "epoch": 0.7472755578619616, "grad_norm": 18.014896980938854, "learning_rate": 9.070004260830294e-08, "logits/chosen": -0.9223737716674805, "logits/rejected": 1.4507310390472412, "logps/chosen": -692.382080078125, "logps/rejected": -1453.919921875, "loss": 0.1437, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.2790751457214355, "rewards/margins": 7.526673316955566, "rewards/rejected": -10.80574893951416, "step": 360 }, { "epoch": 0.768033212247016, "grad_norm": 14.401106731580096, "learning_rate": 7.712173605443267e-08, "logits/chosen": -0.9376351237297058, "logits/rejected": 1.583579421043396, "logps/chosen": -700.8660888671875, "logps/rejected": -1505.467529296875, "loss": 0.1494, "rewards/accuracies": 0.90625, "rewards/chosen": -3.340554714202881, "rewards/margins": 7.916815280914307, "rewards/rejected": -11.257369995117188, "step": 370 }, { "epoch": 0.7887908666320705, "grad_norm": 11.438125899827915, "learning_rate": 6.445729280509957e-08, "logits/chosen": -0.8953694105148315, "logits/rejected": 1.4685465097427368, "logps/chosen": -690.8338623046875, "logps/rejected": -1588.8245849609375, "loss": 0.1468, "rewards/accuracies": 0.921875, "rewards/chosen": -3.3841090202331543, "rewards/margins": 8.392255783081055, "rewards/rejected": -11.77636432647705, "step": 380 }, { "epoch": 0.809548521017125, "grad_norm": 13.207322793003062, "learning_rate": 5.2773659217201364e-08, "logits/chosen": -0.9152933359146118, "logits/rejected": 1.6746854782104492, "logps/chosen": -707.5098266601562, "logps/rejected": -1445.887939453125, "loss": 0.1555, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -3.3184616565704346, "rewards/margins": 7.17882776260376, "rewards/rejected": -10.497289657592773, "step": 390 }, { "epoch": 0.8303061754021795, "grad_norm": 11.811171239173802, "learning_rate": 4.213259692436366e-08, "logits/chosen": -0.807452380657196, "logits/rejected": 1.5794246196746826, "logps/chosen": -692.3187255859375, "logps/rejected": -1477.223388671875, "loss": 0.1455, "rewards/accuracies": 0.934374988079071, "rewards/chosen": -3.2713589668273926, "rewards/margins": 7.557108402252197, "rewards/rejected": -10.82846736907959, "step": 400 }, { "epoch": 0.851063829787234, "grad_norm": 13.900287224699019, "learning_rate": 3.259035635465529e-08, "logits/chosen": -0.6530941128730774, "logits/rejected": 1.565045714378357, "logps/chosen": -704.8355712890625, "logps/rejected": -1375.3914794921875, "loss": 0.1474, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -3.393775463104248, "rewards/margins": 6.558309078216553, "rewards/rejected": -9.952085494995117, "step": 410 }, { "epoch": 0.8718214841722886, "grad_norm": 25.604242461790278, "learning_rate": 2.4197379381453942e-08, "logits/chosen": -0.6059257388114929, "logits/rejected": 1.747667670249939, "logps/chosen": -710.49365234375, "logps/rejected": -1475.5013427734375, "loss": 0.1551, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.3643269538879395, "rewards/margins": 7.329138278961182, "rewards/rejected": -10.693464279174805, "step": 420 }, { "epoch": 0.892579138557343, "grad_norm": 17.02837493916288, "learning_rate": 1.699803267930039e-08, "logits/chosen": -0.6381738781929016, "logits/rejected": 1.581946611404419, "logps/chosen": -700.8256225585938, "logps/rejected": -1424.5767822265625, "loss": 0.14, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.278334140777588, "rewards/margins": 7.011561393737793, "rewards/rejected": -10.289896965026855, "step": 430 }, { "epoch": 0.9133367929423976, "grad_norm": 12.115208573188879, "learning_rate": 1.1030373194265114e-08, "logits/chosen": -0.6104884743690491, "logits/rejected": 1.8680555820465088, "logps/chosen": -696.623046875, "logps/rejected": -1484.712646484375, "loss": 0.1555, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -3.214094638824463, "rewards/margins": 7.734494686126709, "rewards/rejected": -10.948590278625488, "step": 440 }, { "epoch": 0.934094447327452, "grad_norm": 16.53240755993655, "learning_rate": 6.325946968591317e-09, "logits/chosen": -0.5625468492507935, "logits/rejected": 2.0248143672943115, "logps/chosen": -701.9017333984375, "logps/rejected": -1457.185791015625, "loss": 0.1506, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -3.3476765155792236, "rewards/margins": 7.459791660308838, "rewards/rejected": -10.807466506958008, "step": 450 }, { "epoch": 0.9548521017125065, "grad_norm": 13.032337936414088, "learning_rate": 2.909622383059834e-09, "logits/chosen": -0.5611749291419983, "logits/rejected": 1.8944803476333618, "logps/chosen": -692.7264404296875, "logps/rejected": -1460.9708251953125, "loss": 0.1466, "rewards/accuracies": 0.9375, "rewards/chosen": -3.228968858718872, "rewards/margins": 7.5441155433654785, "rewards/rejected": -10.77308464050293, "step": 460 }, { "epoch": 0.975609756097561, "grad_norm": 17.252869051917436, "learning_rate": 7.994586985856089e-10, "logits/chosen": -0.6213638782501221, "logits/rejected": 1.7617113590240479, "logps/chosen": -707.3765869140625, "logps/rejected": -1481.257080078125, "loss": 0.1374, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -3.3621768951416016, "rewards/margins": 7.499251365661621, "rewards/rejected": -10.861429214477539, "step": 470 }, { "epoch": 0.9963674104826155, "grad_norm": 12.988129191271089, "learning_rate": 6.610591955641398e-12, "logits/chosen": -0.5171535015106201, "logits/rejected": 1.7204986810684204, "logps/chosen": -687.4800415039062, "logps/rejected": -1466.64501953125, "loss": 0.1487, "rewards/accuracies": 0.9375, "rewards/chosen": -3.355544328689575, "rewards/margins": 7.49337911605835, "rewards/rejected": -10.848923683166504, "step": 480 }, { "epoch": 0.9984431759211209, "step": 481, "total_flos": 0.0, "train_loss": 0.21933725711709495, "train_runtime": 13808.8776, "train_samples_per_second": 8.93, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 481, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }