|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 5.0, |
|
"eval_steps": 500, |
|
"global_step": 625, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 76.32773843110532, |
|
"learning_rate": 1.5873015873015874e-07, |
|
"loss": 4.4795, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 71.81150350946773, |
|
"learning_rate": 7.936507936507937e-07, |
|
"loss": 4.307, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 42.999152941442446, |
|
"learning_rate": 1.5873015873015873e-06, |
|
"loss": 3.4849, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 46.44864615552445, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 1.5332, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 5.770504948547854, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.3264, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 3.383097613641565, |
|
"learning_rate": 3.968253968253968e-06, |
|
"loss": 0.195, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 2.543807204293022, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.152, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.28, |
|
"grad_norm": 2.071517230467083, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.1464, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 3.1880730134020405, |
|
"learning_rate": 6.349206349206349e-06, |
|
"loss": 0.1532, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.36, |
|
"grad_norm": 2.273945566819242, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.1477, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 2.8908492033026, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.1655, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.44, |
|
"grad_norm": 1.408828965452355, |
|
"learning_rate": 8.730158730158731e-06, |
|
"loss": 0.163, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 1.7147558780209895, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 0.1581, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.52, |
|
"grad_norm": 2.3248153460703316, |
|
"learning_rate": 9.999687519737639e-06, |
|
"loss": 0.1587, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 3.19958218992742, |
|
"learning_rate": 9.996172565322375e-06, |
|
"loss": 0.1782, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 2.3598525259461014, |
|
"learning_rate": 9.988754811052616e-06, |
|
"loss": 0.1708, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 2.350384753086168, |
|
"learning_rate": 9.97744005136599e-06, |
|
"loss": 0.1676, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.68, |
|
"grad_norm": 1.8952757818976034, |
|
"learning_rate": 9.962237124876828e-06, |
|
"loss": 0.1709, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 1.7959529824065594, |
|
"learning_rate": 9.943157907471825e-06, |
|
"loss": 0.1818, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.76, |
|
"grad_norm": 3.148673436638757, |
|
"learning_rate": 9.920217303033091e-06, |
|
"loss": 0.1868, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 2.6291492953956035, |
|
"learning_rate": 9.893433231795864e-06, |
|
"loss": 0.1802, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.84, |
|
"grad_norm": 2.116086213452932, |
|
"learning_rate": 9.862826616349981e-06, |
|
"loss": 0.1963, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.590351057974938, |
|
"learning_rate": 9.828421365296023e-06, |
|
"loss": 0.1884, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 2.2826587299690297, |
|
"learning_rate": 9.79024435456893e-06, |
|
"loss": 0.194, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.7617227651937795, |
|
"learning_rate": 9.748325406443647e-06, |
|
"loss": 0.1814, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 2.7640313184450886, |
|
"learning_rate": 9.702697266239211e-06, |
|
"loss": 0.18, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 13.383134400696948, |
|
"learning_rate": 9.653395576739504e-06, |
|
"loss": 0.1596, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.08, |
|
"grad_norm": 1.960084286202522, |
|
"learning_rate": 9.600458850350588e-06, |
|
"loss": 0.163, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 1.6888079694486844, |
|
"learning_rate": 9.543928439016445e-06, |
|
"loss": 0.1712, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.16, |
|
"grad_norm": 2.5101725396253265, |
|
"learning_rate": 9.483848501916578e-06, |
|
"loss": 0.1667, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 1.7853092667274246, |
|
"learning_rate": 9.42026597097071e-06, |
|
"loss": 0.17, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.24, |
|
"grad_norm": 1.9741842768861118, |
|
"learning_rate": 9.353230514177553e-06, |
|
"loss": 0.1679, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 2.4658963041094615, |
|
"learning_rate": 9.282794496816244e-06, |
|
"loss": 0.1731, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.32, |
|
"grad_norm": 2.923478442896321, |
|
"learning_rate": 9.209012940540806e-06, |
|
"loss": 0.1708, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 3.19217673963713, |
|
"learning_rate": 9.131943480399531e-06, |
|
"loss": 0.1752, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 2.2076768543351575, |
|
"learning_rate": 9.05164631981292e-06, |
|
"loss": 0.1646, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 1.5911362670253353, |
|
"learning_rate": 8.968184183545285e-06, |
|
"loss": 0.1558, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.48, |
|
"grad_norm": 2.438333228986101, |
|
"learning_rate": 8.881622268706825e-06, |
|
"loss": 0.1639, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 1.8709425955134034, |
|
"learning_rate": 8.792028193824364e-06, |
|
"loss": 0.1566, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.56, |
|
"grad_norm": 2.273561130366255, |
|
"learning_rate": 8.699471946020612e-06, |
|
"loss": 0.1743, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 2.2532334985776963, |
|
"learning_rate": 8.604025826343167e-06, |
|
"loss": 0.1614, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.6400000000000001, |
|
"grad_norm": 2.8000291867854368, |
|
"learning_rate": 8.505764393285985e-06, |
|
"loss": 0.1688, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 3.5386862233237553, |
|
"learning_rate": 8.404764404547404e-06, |
|
"loss": 0.1736, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.72, |
|
"grad_norm": 2.1915281083973475, |
|
"learning_rate": 8.301104757070276e-06, |
|
"loss": 0.1655, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 1.735218330228595, |
|
"learning_rate": 8.194866425410984e-06, |
|
"loss": 0.1511, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 2.0251426912636514, |
|
"learning_rate": 8.086132398485525e-06, |
|
"loss": 0.1499, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.8399999999999999, |
|
"grad_norm": 2.595962962052259, |
|
"learning_rate": 7.974987614742066e-06, |
|
"loss": 0.1696, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.88, |
|
"grad_norm": 1.8465107375877872, |
|
"learning_rate": 7.861518895810597e-06, |
|
"loss": 0.1645, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.92, |
|
"grad_norm": 1.7181169308567998, |
|
"learning_rate": 7.745814878681516e-06, |
|
"loss": 0.1628, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.96, |
|
"grad_norm": 1.914333923678904, |
|
"learning_rate": 7.627965946466167e-06, |
|
"loss": 0.1453, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 1.6246422019481361, |
|
"learning_rate": 7.50806415779332e-06, |
|
"loss": 0.1489, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.04, |
|
"grad_norm": 1.5381806636260356, |
|
"learning_rate": 7.386203174896872e-06, |
|
"loss": 0.1361, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.08, |
|
"grad_norm": 1.6845512041994002, |
|
"learning_rate": 7.262478190450834e-06, |
|
"loss": 0.1332, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.12, |
|
"grad_norm": 2.367964043590009, |
|
"learning_rate": 7.136985853208824e-06, |
|
"loss": 0.1426, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.16, |
|
"grad_norm": 1.7353039468487315, |
|
"learning_rate": 7.0098241925061215e-06, |
|
"loss": 0.1348, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 1.8745427410009357, |
|
"learning_rate": 6.881092541683279e-06, |
|
"loss": 0.1367, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.24, |
|
"grad_norm": 1.7486606607428097, |
|
"learning_rate": 6.750891460491093e-06, |
|
"loss": 0.1432, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.2800000000000002, |
|
"grad_norm": 2.7256153449292957, |
|
"learning_rate": 6.619322656537552e-06, |
|
"loss": 0.1327, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.32, |
|
"grad_norm": 2.011781932169801, |
|
"learning_rate": 6.486488905838143e-06, |
|
"loss": 0.1471, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.36, |
|
"grad_norm": 1.958955631206453, |
|
"learning_rate": 6.352493972531535e-06, |
|
"loss": 0.146, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 2.125212212501986, |
|
"learning_rate": 6.2174425278234115e-06, |
|
"loss": 0.1439, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.44, |
|
"grad_norm": 1.7405119151293693, |
|
"learning_rate": 6.0814400682217236e-06, |
|
"loss": 0.1345, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.48, |
|
"grad_norm": 1.307904513904714, |
|
"learning_rate": 5.944592833127253e-06, |
|
"loss": 0.1412, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.52, |
|
"grad_norm": 1.5618909410769424, |
|
"learning_rate": 5.807007721843862e-06, |
|
"loss": 0.1396, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.56, |
|
"grad_norm": 1.6001114294946008, |
|
"learning_rate": 5.668792210073255e-06, |
|
"loss": 0.1381, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 2.0825376870245433, |
|
"learning_rate": 5.530054265959486e-06, |
|
"loss": 0.1279, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.64, |
|
"grad_norm": 1.4899012611449984, |
|
"learning_rate": 5.39090226574877e-06, |
|
"loss": 0.1273, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.68, |
|
"grad_norm": 1.3587693779949943, |
|
"learning_rate": 5.2514449091305375e-06, |
|
"loss": 0.139, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.7199999999999998, |
|
"grad_norm": 1.3905248184940833, |
|
"learning_rate": 5.111791134325793e-06, |
|
"loss": 0.1337, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.76, |
|
"grad_norm": 1.7261472057985348, |
|
"learning_rate": 4.9720500329891755e-06, |
|
"loss": 0.1419, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 1.5882220461692291, |
|
"learning_rate": 4.832330764991131e-06, |
|
"loss": 0.129, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.84, |
|
"grad_norm": 33.74153723265867, |
|
"learning_rate": 4.692742473146818e-06, |
|
"loss": 0.182, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.88, |
|
"grad_norm": 1.2964424054104968, |
|
"learning_rate": 4.553394197958339e-06, |
|
"loss": 0.1353, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.92, |
|
"grad_norm": 1.203808822414652, |
|
"learning_rate": 4.414394792436877e-06, |
|
"loss": 0.1394, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.96, |
|
"grad_norm": 1.2806702384421942, |
|
"learning_rate": 4.275852837071309e-06, |
|
"loss": 0.1251, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 1.337023723464791, |
|
"learning_rate": 4.137876555009684e-06, |
|
"loss": 0.1376, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.04, |
|
"grad_norm": 1.12357649083518, |
|
"learning_rate": 4.000573727519868e-06, |
|
"loss": 0.12, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.08, |
|
"grad_norm": 1.1261752050608114, |
|
"learning_rate": 3.86405160979534e-06, |
|
"loss": 0.12, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.12, |
|
"grad_norm": 1.31280326818583, |
|
"learning_rate": 3.7284168471719527e-06, |
|
"loss": 0.1153, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.16, |
|
"grad_norm": 1.3340360591201343, |
|
"learning_rate": 3.5937753918210705e-06, |
|
"loss": 0.125, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.2, |
|
"grad_norm": 1.074093734063287, |
|
"learning_rate": 3.4602324199842026e-06, |
|
"loss": 0.1201, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.24, |
|
"grad_norm": 1.3868574560188593, |
|
"learning_rate": 3.3278922498137455e-06, |
|
"loss": 0.1165, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.2800000000000002, |
|
"grad_norm": 1.5476889671334009, |
|
"learning_rate": 3.1968582598840234e-06, |
|
"loss": 0.1348, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.32, |
|
"grad_norm": 2.2098782619120945, |
|
"learning_rate": 3.067232808436299e-06, |
|
"loss": 0.1264, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.36, |
|
"grad_norm": 1.4042367103273385, |
|
"learning_rate": 2.9391171534208185e-06, |
|
"loss": 0.1254, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.4, |
|
"grad_norm": 0.9517570079120965, |
|
"learning_rate": 2.812611373398365e-06, |
|
"loss": 0.1196, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.44, |
|
"grad_norm": 1.411957760122345, |
|
"learning_rate": 2.6878142893630904e-06, |
|
"loss": 0.1237, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.48, |
|
"grad_norm": 1.0469263122557906, |
|
"learning_rate": 2.564823387547716e-06, |
|
"loss": 0.1234, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.52, |
|
"grad_norm": 1.0880117862903047, |
|
"learning_rate": 2.4437347432713838e-06, |
|
"loss": 0.1281, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 3.56, |
|
"grad_norm": 1.4980302601520634, |
|
"learning_rate": 2.3246429458896637e-06, |
|
"loss": 0.1196, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 3.6, |
|
"grad_norm": 1.0217971404002908, |
|
"learning_rate": 2.207641024905322e-06, |
|
"loss": 0.1224, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 3.64, |
|
"grad_norm": 1.1671521628697192, |
|
"learning_rate": 2.0928203772975917e-06, |
|
"loss": 0.119, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 3.68, |
|
"grad_norm": 1.107501819947676, |
|
"learning_rate": 1.9802706961266936e-06, |
|
"loss": 0.1201, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.7199999999999998, |
|
"grad_norm": 0.9124789903191979, |
|
"learning_rate": 1.870079900469392e-06, |
|
"loss": 0.1204, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.76, |
|
"grad_norm": 1.4405450223921223, |
|
"learning_rate": 1.7623340667403089e-06, |
|
"loss": 0.1173, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.8, |
|
"grad_norm": 1.2383539174028362, |
|
"learning_rate": 1.657117361452651e-06, |
|
"loss": 0.123, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 3.84, |
|
"grad_norm": 1.5492361002489776, |
|
"learning_rate": 1.5545119754708682e-06, |
|
"loss": 0.1213, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.88, |
|
"grad_norm": 0.9688910602285367, |
|
"learning_rate": 1.454598059806609e-06, |
|
"loss": 0.1244, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 3.92, |
|
"grad_norm": 1.2497049971642828, |
|
"learning_rate": 1.3574536630081208e-06, |
|
"loss": 0.1266, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.96, |
|
"grad_norm": 1.4359900705563882, |
|
"learning_rate": 1.2631546701920073e-06, |
|
"loss": 0.1196, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"grad_norm": 1.013179470712264, |
|
"learning_rate": 1.1717747437649657e-06, |
|
"loss": 0.1186, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.04, |
|
"grad_norm": 1.128971555703373, |
|
"learning_rate": 1.0833852658818167e-06, |
|
"loss": 0.1077, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.08, |
|
"grad_norm": 0.8064418108816587, |
|
"learning_rate": 9.980552826847635e-07, |
|
"loss": 0.1099, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.12, |
|
"grad_norm": 0.8053995942309401, |
|
"learning_rate": 9.158514503674543e-07, |
|
"loss": 0.1122, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.16, |
|
"grad_norm": 1.0137589959248894, |
|
"learning_rate": 8.368379831059592e-07, |
|
"loss": 0.1108, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.2, |
|
"grad_norm": 1.179886916033213, |
|
"learning_rate": 7.61076602897371e-07, |
|
"loss": 0.1168, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.24, |
|
"grad_norm": 1.1023307485760356, |
|
"learning_rate": 6.886264913451635e-07, |
|
"loss": 0.111, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.28, |
|
"grad_norm": 0.8911248731367788, |
|
"learning_rate": 6.1954424342902e-07, |
|
"loss": 0.114, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.32, |
|
"grad_norm": 0.9031682796275837, |
|
"learning_rate": 5.538838232952104e-07, |
|
"loss": 0.1099, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.36, |
|
"grad_norm": 0.9777576873311943, |
|
"learning_rate": 4.916965221020753e-07, |
|
"loss": 0.1149, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 4.4, |
|
"grad_norm": 0.794466996633541, |
|
"learning_rate": 4.3303091795353024e-07, |
|
"loss": 0.109, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 4.44, |
|
"grad_norm": 1.1715360106229433, |
|
"learning_rate": 3.779328379518898e-07, |
|
"loss": 0.119, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 4.48, |
|
"grad_norm": 0.9196816880764788, |
|
"learning_rate": 3.2644532239966444e-07, |
|
"loss": 0.1107, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 4.52, |
|
"grad_norm": 0.8511671131233757, |
|
"learning_rate": 2.7860859117828985e-07, |
|
"loss": 0.1106, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 4.5600000000000005, |
|
"grad_norm": 1.0226582223355896, |
|
"learning_rate": 2.3446001233004333e-07, |
|
"loss": 0.1176, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 4.6, |
|
"grad_norm": 0.8883730009317985, |
|
"learning_rate": 1.9403407286770592e-07, |
|
"loss": 0.1122, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 4.64, |
|
"grad_norm": 0.9879760794050276, |
|
"learning_rate": 1.573623518347517e-07, |
|
"loss": 0.1122, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 4.68, |
|
"grad_norm": 0.8638827655369075, |
|
"learning_rate": 1.2447349563713186e-07, |
|
"loss": 0.1144, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 4.72, |
|
"grad_norm": 0.8402422894300282, |
|
"learning_rate": 9.539319566590766e-08, |
|
"loss": 0.1108, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 4.76, |
|
"grad_norm": 1.173156707092929, |
|
"learning_rate": 7.014416822821557e-08, |
|
"loss": 0.1186, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 4.8, |
|
"grad_norm": 1.0955817536201944, |
|
"learning_rate": 4.8746136802240716e-08, |
|
"loss": 0.1115, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 4.84, |
|
"grad_norm": 0.9330940788218187, |
|
"learning_rate": 3.121581663007134e-08, |
|
"loss": 0.1157, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 4.88, |
|
"grad_norm": 1.1560897368741774, |
|
"learning_rate": 1.75669016604485e-08, |
|
"loss": 0.1035, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 4.92, |
|
"grad_norm": 1.0534577947172947, |
|
"learning_rate": 7.81005385163458e-09, |
|
"loss": 0.1134, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 4.96, |
|
"grad_norm": 0.893194596809323, |
|
"learning_rate": 1.952894842735531e-09, |
|
"loss": 0.1145, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.84002664295751, |
|
"learning_rate": 0.0, |
|
"loss": 0.1091, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"step": 625, |
|
"total_flos": 1839624683520.0, |
|
"train_loss": 0.2137949773788452, |
|
"train_runtime": 9889.1554, |
|
"train_samples_per_second": 1.011, |
|
"train_steps_per_second": 0.063 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 625, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1839624683520.0, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|