{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 10.752688172043011,
  "eval_steps": 1000,
  "global_step": 18000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0005973715651135006,
      "grad_norm": 8.9375,
      "learning_rate": 2e-06,
      "loss": 1.3168,
      "step": 1
    },
    {
      "epoch": 0.05973715651135006,
      "grad_norm": 0.61328125,
      "learning_rate": 0.0002,
      "loss": 0.5503,
      "step": 100
    },
    {
      "epoch": 0.11947431302270012,
      "grad_norm": 0.314453125,
      "learning_rate": 0.0004,
      "loss": 0.3885,
      "step": 200
    },
    {
      "epoch": 0.17921146953405018,
      "grad_norm": 0.298828125,
      "learning_rate": 0.0006,
      "loss": 0.3758,
      "step": 300
    },
    {
      "epoch": 0.23894862604540024,
      "grad_norm": 0.12890625,
      "learning_rate": 0.0008,
      "loss": 0.363,
      "step": 400
    },
    {
      "epoch": 0.2986857825567503,
      "grad_norm": 0.1943359375,
      "learning_rate": 0.001,
      "loss": 0.3567,
      "step": 500
    },
    {
      "epoch": 0.35842293906810035,
      "grad_norm": 0.11181640625,
      "learning_rate": 0.0012,
      "loss": 0.3435,
      "step": 600
    },
    {
      "epoch": 0.41816009557945044,
      "grad_norm": 0.12109375,
      "learning_rate": 0.0014,
      "loss": 0.3415,
      "step": 700
    },
    {
      "epoch": 0.4778972520908005,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.0016,
      "loss": 0.3394,
      "step": 800
    },
    {
      "epoch": 0.5376344086021505,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.0018000000000000002,
      "loss": 0.345,
      "step": 900
    },
    {
      "epoch": 0.5973715651135006,
      "grad_norm": 0.07421875,
      "learning_rate": 0.002,
      "loss": 0.3177,
      "step": 1000
    },
    {
      "epoch": 0.6571087216248507,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0019998292504580526,
      "loss": 0.3318,
      "step": 1100
    },
    {
      "epoch": 0.7168458781362007,
      "grad_norm": 0.1455078125,
      "learning_rate": 0.001999317060143023,
      "loss": 0.3229,
      "step": 1200
    },
    {
      "epoch": 0.7765830346475507,
      "grad_norm": 0.1298828125,
      "learning_rate": 0.001998463603967434,
      "loss": 0.3145,
      "step": 1300
    },
    {
      "epoch": 0.8363201911589009,
      "grad_norm": 0.0732421875,
      "learning_rate": 0.0019972691733857882,
      "loss": 0.3158,
      "step": 1400
    },
    {
      "epoch": 0.8960573476702509,
      "grad_norm": 0.13671875,
      "learning_rate": 0.0019957341762950344,
      "loss": 0.3132,
      "step": 1500
    },
    {
      "epoch": 0.955794504181601,
      "grad_norm": 0.11572265625,
      "learning_rate": 0.001993859136895274,
      "loss": 0.3011,
      "step": 1600
    },
    {
      "epoch": 1.015531660692951,
      "grad_norm": 0.125,
      "learning_rate": 0.0019916446955107426,
      "loss": 0.3014,
      "step": 1700
    },
    {
      "epoch": 1.075268817204301,
      "grad_norm": 0.11279296875,
      "learning_rate": 0.001989091608371146,
      "loss": 0.3,
      "step": 1800
    },
    {
      "epoch": 1.135005973715651,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.0019862007473534027,
      "loss": 0.3016,
      "step": 1900
    },
    {
      "epoch": 1.1947431302270013,
      "grad_norm": 0.10791015625,
      "learning_rate": 0.001982973099683902,
      "loss": 0.2991,
      "step": 2000
    },
    {
      "epoch": 1.2544802867383513,
      "grad_norm": 0.103515625,
      "learning_rate": 0.001979409767601366,
      "loss": 0.2974,
      "step": 2100
    },
    {
      "epoch": 1.3142174432497014,
      "grad_norm": 0.08056640625,
      "learning_rate": 0.001975511967980437,
      "loss": 0.2903,
      "step": 2200
    },
    {
      "epoch": 1.3739545997610514,
      "grad_norm": 0.109375,
      "learning_rate": 0.001971281031916114,
      "loss": 0.2796,
      "step": 2300
    },
    {
      "epoch": 1.4336917562724014,
      "grad_norm": 0.125,
      "learning_rate": 0.0019667184042691877,
      "loss": 0.2746,
      "step": 2400
    },
    {
      "epoch": 1.4934289127837514,
      "grad_norm": 0.10400390625,
      "learning_rate": 0.001961825643172819,
      "loss": 0.2766,
      "step": 2500
    },
    {
      "epoch": 1.5531660692951015,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0019566044195004407,
      "loss": 0.2739,
      "step": 2600
    },
    {
      "epoch": 1.6129032258064515,
      "grad_norm": 0.12255859375,
      "learning_rate": 0.0019510565162951536,
      "loss": 0.2677,
      "step": 2700
    },
    {
      "epoch": 1.6726403823178018,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.0019451838281608197,
      "loss": 0.2605,
      "step": 2800
    },
    {
      "epoch": 1.7323775388291518,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0019389883606150567,
      "loss": 0.259,
      "step": 2900
    },
    {
      "epoch": 1.7921146953405018,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.0019324722294043557,
      "loss": 0.256,
      "step": 3000
    },
    {
      "epoch": 1.8518518518518519,
      "grad_norm": 0.1279296875,
      "learning_rate": 0.0019256376597815564,
      "loss": 0.246,
      "step": 3100
    },
    {
      "epoch": 1.911589008363202,
      "grad_norm": 0.12890625,
      "learning_rate": 0.001918486985745923,
      "loss": 0.2431,
      "step": 3200
    },
    {
      "epoch": 1.971326164874552,
      "grad_norm": 0.083984375,
      "learning_rate": 0.0019110226492460884,
      "loss": 0.2364,
      "step": 3300
    },
    {
      "epoch": 2.031063321385902,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.0019032471993461289,
      "loss": 0.2306,
      "step": 3400
    },
    {
      "epoch": 2.090800477897252,
      "grad_norm": 0.087890625,
      "learning_rate": 0.0018951632913550625,
      "loss": 0.2331,
      "step": 3500
    },
    {
      "epoch": 2.150537634408602,
      "grad_norm": 0.091796875,
      "learning_rate": 0.0018867736859200619,
      "loss": 0.2291,
      "step": 3600
    },
    {
      "epoch": 2.2102747909199523,
      "grad_norm": 0.0849609375,
      "learning_rate": 0.0018780812480836979,
      "loss": 0.227,
      "step": 3700
    },
    {
      "epoch": 2.270011947431302,
      "grad_norm": 0.1015625,
      "learning_rate": 0.0018690889463055284,
      "loss": 0.2255,
      "step": 3800
    },
    {
      "epoch": 2.3297491039426523,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0018597998514483724,
      "loss": 0.2165,
      "step": 3900
    },
    {
      "epoch": 2.3894862604540026,
      "grad_norm": 0.091796875,
      "learning_rate": 0.0018502171357296143,
      "loss": 0.2157,
      "step": 4000
    },
    {
      "epoch": 2.4492234169653524,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.0018403440716378927,
      "loss": 0.2086,
      "step": 4100
    },
    {
      "epoch": 2.5089605734767026,
      "grad_norm": 0.111328125,
      "learning_rate": 0.0018301840308155505,
      "loss": 0.2077,
      "step": 4200
    },
    {
      "epoch": 2.5686977299880525,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.0018197404829072212,
      "loss": 0.201,
      "step": 4300
    },
    {
      "epoch": 2.6284348864994027,
      "grad_norm": 0.11083984375,
      "learning_rate": 0.0018090169943749475,
      "loss": 0.1991,
      "step": 4400
    },
    {
      "epoch": 2.6881720430107525,
      "grad_norm": 0.0830078125,
      "learning_rate": 0.0017980172272802398,
      "loss": 0.1961,
      "step": 4500
    },
    {
      "epoch": 2.7479091995221028,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.0017867449380334832,
      "loss": 0.1934,
      "step": 4600
    },
    {
      "epoch": 2.807646356033453,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.0017752039761111298,
      "loss": 0.1862,
      "step": 4700
    },
    {
      "epoch": 2.867383512544803,
      "grad_norm": 0.08154296875,
      "learning_rate": 0.001763398282741103,
      "loss": 0.1849,
      "step": 4800
    },
    {
      "epoch": 2.927120669056153,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.0017513318895568735,
      "loss": 0.1828,
      "step": 4900
    },
    {
      "epoch": 2.986857825567503,
      "grad_norm": 0.09619140625,
      "learning_rate": 0.001739008917220659,
      "loss": 0.1769,
      "step": 5000
    },
    {
      "epoch": 3.046594982078853,
      "grad_norm": 0.107421875,
      "learning_rate": 0.0017264335740162242,
      "loss": 0.1729,
      "step": 5100
    },
    {
      "epoch": 3.106332138590203,
      "grad_norm": 0.0859375,
      "learning_rate": 0.0017136101544117524,
      "loss": 0.1745,
      "step": 5200
    },
    {
      "epoch": 3.166069295101553,
      "grad_norm": 0.107421875,
      "learning_rate": 0.0017005430375932908,
      "loss": 0.1742,
      "step": 5300
    },
    {
      "epoch": 3.225806451612903,
      "grad_norm": 0.08837890625,
      "learning_rate": 0.0016872366859692627,
      "loss": 0.175,
      "step": 5400
    },
    {
      "epoch": 3.2855436081242533,
      "grad_norm": 0.126953125,
      "learning_rate": 0.0016736956436465573,
      "loss": 0.1643,
      "step": 5500
    },
    {
      "epoch": 3.3452807646356035,
      "grad_norm": 0.07666015625,
      "learning_rate": 0.0016599245348787228,
      "loss": 0.1651,
      "step": 5600
    },
    {
      "epoch": 3.4050179211469533,
      "grad_norm": 0.12158203125,
      "learning_rate": 0.0016459280624867873,
      "loss": 0.1613,
      "step": 5700
    },
    {
      "epoch": 3.4647550776583036,
      "grad_norm": 0.1396484375,
      "learning_rate": 0.001631711006253251,
      "loss": 0.1586,
      "step": 5800
    },
    {
      "epoch": 3.5244922341696534,
      "grad_norm": 0.09228515625,
      "learning_rate": 0.001617278221289793,
      "loss": 0.1601,
      "step": 5900
    },
    {
      "epoch": 3.5842293906810037,
      "grad_norm": 0.0888671875,
      "learning_rate": 0.0016026346363792565,
      "loss": 0.1515,
      "step": 6000
    },
    {
      "epoch": 3.6439665471923535,
      "grad_norm": 0.1259765625,
      "learning_rate": 0.0015877852522924731,
      "loss": 0.1492,
      "step": 6100
    },
    {
      "epoch": 3.7037037037037037,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0015727351400805052,
      "loss": 0.1514,
      "step": 6200
    },
    {
      "epoch": 3.763440860215054,
      "grad_norm": 0.0771484375,
      "learning_rate": 0.0015574894393428856,
      "loss": 0.1462,
      "step": 6300
    },
    {
      "epoch": 3.823178016726404,
      "grad_norm": 0.078125,
      "learning_rate": 0.0015420533564724495,
      "loss": 0.143,
      "step": 6400
    },
    {
      "epoch": 3.882915173237754,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0015264321628773558,
      "loss": 0.1426,
      "step": 6500
    },
    {
      "epoch": 3.942652329749104,
      "grad_norm": 0.07421875,
      "learning_rate": 0.001510631193180907,
      "loss": 0.1391,
      "step": 6600
    },
    {
      "epoch": 4.002389486260454,
      "grad_norm": 0.10107421875,
      "learning_rate": 0.001494655843399779,
      "loss": 0.1355,
      "step": 6700
    },
    {
      "epoch": 4.062126642771804,
      "grad_norm": 0.0791015625,
      "learning_rate": 0.0014785115691012866,
      "loss": 0.1335,
      "step": 6800
    },
    {
      "epoch": 4.121863799283154,
      "grad_norm": 0.09375,
      "learning_rate": 0.0014622038835403132,
      "loss": 0.1398,
      "step": 6900
    },
    {
      "epoch": 4.181600955794504,
      "grad_norm": 0.08203125,
      "learning_rate": 0.0014457383557765385,
      "loss": 0.1342,
      "step": 7000
    },
    {
      "epoch": 4.241338112305854,
      "grad_norm": 0.1064453125,
      "learning_rate": 0.001429120608772609,
      "loss": 0.135,
      "step": 7100
    },
    {
      "epoch": 4.301075268817204,
      "grad_norm": 0.07958984375,
      "learning_rate": 0.0014123563174739035,
      "loss": 0.1297,
      "step": 7200
    },
    {
      "epoch": 4.360812425328555,
      "grad_norm": 0.0927734375,
      "learning_rate": 0.0013954512068705424,
      "loss": 0.1294,
      "step": 7300
    },
    {
      "epoch": 4.4205495818399045,
      "grad_norm": 0.09423828125,
      "learning_rate": 0.0013784110500423103,
      "loss": 0.1257,
      "step": 7400
    },
    {
      "epoch": 4.480286738351254,
      "grad_norm": 0.076171875,
      "learning_rate": 0.0013612416661871532,
      "loss": 0.1249,
      "step": 7500
    },
    {
      "epoch": 4.540023894862604,
      "grad_norm": 0.064453125,
      "learning_rate": 0.0013439489186339282,
      "loss": 0.1243,
      "step": 7600
    },
    {
      "epoch": 4.599761051373955,
      "grad_norm": 0.07275390625,
      "learning_rate": 0.0013265387128400831,
      "loss": 0.1191,
      "step": 7700
    },
    {
      "epoch": 4.659498207885305,
      "grad_norm": 0.06591796875,
      "learning_rate": 0.0013090169943749475,
      "loss": 0.1188,
      "step": 7800
    },
    {
      "epoch": 4.7192353643966545,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0012913897468893247,
      "loss": 0.1174,
      "step": 7900
    },
    {
      "epoch": 4.778972520908005,
      "grad_norm": 0.061279296875,
      "learning_rate": 0.0012736629900720832,
      "loss": 0.1168,
      "step": 8000
    },
    {
      "epoch": 4.838709677419355,
      "grad_norm": 0.06640625,
      "learning_rate": 0.0012558427775944357,
      "loss": 0.1135,
      "step": 8100
    },
    {
      "epoch": 4.898446833930705,
      "grad_norm": 0.09912109375,
      "learning_rate": 0.0012379351950426187,
      "loss": 0.1117,
      "step": 8200
    },
    {
      "epoch": 4.958183990442055,
      "grad_norm": 0.05712890625,
      "learning_rate": 0.0012199463578396689,
      "loss": 0.1092,
      "step": 8300
    },
    {
      "epoch": 5.017921146953405,
      "grad_norm": 0.07861328125,
      "learning_rate": 0.0012018824091570102,
      "loss": 0.1091,
      "step": 8400
    },
    {
      "epoch": 5.077658303464755,
      "grad_norm": 0.0859375,
      "learning_rate": 0.0011837495178165704,
      "loss": 0.1086,
      "step": 8500
    },
    {
      "epoch": 5.137395459976105,
      "grad_norm": 0.06103515625,
      "learning_rate": 0.00116555387618413,
      "loss": 0.109,
      "step": 8600
    },
    {
      "epoch": 5.197132616487456,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.0011473016980546376,
      "loss": 0.1071,
      "step": 8700
    },
    {
      "epoch": 5.256869772998805,
      "grad_norm": 0.0703125,
      "learning_rate": 0.0011289992165302034,
      "loss": 0.1081,
      "step": 8800
    },
    {
      "epoch": 5.316606929510155,
      "grad_norm": 0.0869140625,
      "learning_rate": 0.001110652681891501,
      "loss": 0.1061,
      "step": 8900
    },
    {
      "epoch": 5.376344086021505,
      "grad_norm": 0.123046875,
      "learning_rate": 0.001092268359463302,
      "loss": 0.1044,
      "step": 9000
    },
    {
      "epoch": 5.436081242532856,
      "grad_norm": 0.07177734375,
      "learning_rate": 0.001073852527474874,
      "loss": 0.1004,
      "step": 9100
    },
    {
      "epoch": 5.4958183990442055,
      "grad_norm": 0.09130859375,
      "learning_rate": 0.00105541147491597,
      "loss": 0.1004,
      "step": 9200
    },
    {
      "epoch": 5.555555555555555,
      "grad_norm": 0.058837890625,
      "learning_rate": 0.0010369514993891452,
      "loss": 0.098,
      "step": 9300
    },
    {
      "epoch": 5.615292712066906,
      "grad_norm": 0.057861328125,
      "learning_rate": 0.00101847890495913,
      "loss": 0.0989,
      "step": 9400
    },
    {
      "epoch": 5.675029868578256,
      "grad_norm": 0.07421875,
      "learning_rate": 0.001,
      "loss": 0.0954,
      "step": 9500
    },
    {
      "epoch": 5.734767025089606,
      "grad_norm": 0.09375,
      "learning_rate": 0.0009815210950408703,
      "loss": 0.0955,
      "step": 9600
    },
    {
      "epoch": 5.7945041816009555,
      "grad_norm": 0.0654296875,
      "learning_rate": 0.0009630485006108553,
      "loss": 0.094,
      "step": 9700
    },
    {
      "epoch": 5.854241338112306,
      "grad_norm": 0.06787109375,
      "learning_rate": 0.0009445885250840301,
      "loss": 0.0924,
      "step": 9800
    },
    {
      "epoch": 5.913978494623656,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.0009261474725251261,
      "loss": 0.0911,
      "step": 9900
    },
    {
      "epoch": 5.973715651135006,
      "grad_norm": 0.054443359375,
      "learning_rate": 0.0009077316405366981,
      "loss": 0.0902,
      "step": 10000
    },
    {
      "epoch": 6.033452807646356,
      "grad_norm": 0.0673828125,
      "learning_rate": 0.0008893473181084994,
      "loss": 0.0891,
      "step": 10100
    },
    {
      "epoch": 6.093189964157706,
      "grad_norm": 0.06103515625,
      "learning_rate": 0.000871000783469797,
      "loss": 0.0914,
      "step": 10200
    },
    {
      "epoch": 6.152927120669056,
      "grad_norm": 0.053466796875,
      "learning_rate": 0.0008526983019453623,
      "loss": 0.0872,
      "step": 10300
    },
    {
      "epoch": 6.212664277180406,
      "grad_norm": 0.08740234375,
      "learning_rate": 0.00083444612381587,
      "loss": 0.0903,
      "step": 10400
    },
    {
      "epoch": 6.272401433691757,
      "grad_norm": 0.0712890625,
      "learning_rate": 0.0008162504821834296,
      "loss": 0.0858,
      "step": 10500
    },
    {
      "epoch": 6.332138590203106,
      "grad_norm": 0.051513671875,
      "learning_rate": 0.00079811759084299,
      "loss": 0.0876,
      "step": 10600
    },
    {
      "epoch": 6.391875746714456,
      "grad_norm": 0.05810546875,
      "learning_rate": 0.0007800536421603317,
      "loss": 0.0874,
      "step": 10700
    },
    {
      "epoch": 6.451612903225806,
      "grad_norm": 0.060546875,
      "learning_rate": 0.0007620648049573815,
      "loss": 0.0834,
      "step": 10800
    },
    {
      "epoch": 6.511350059737157,
      "grad_norm": 0.05859375,
      "learning_rate": 0.0007441572224055644,
      "loss": 0.0814,
      "step": 10900
    },
    {
      "epoch": 6.571087216248507,
      "grad_norm": 0.06494140625,
      "learning_rate": 0.0007263370099279172,
      "loss": 0.0822,
      "step": 11000
    },
    {
      "epoch": 6.630824372759856,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.0007086102531106754,
      "loss": 0.0814,
      "step": 11100
    },
    {
      "epoch": 6.690561529271207,
      "grad_norm": 0.072265625,
      "learning_rate": 0.0006909830056250527,
      "loss": 0.0803,
      "step": 11200
    },
    {
      "epoch": 6.750298685782557,
      "grad_norm": 0.052734375,
      "learning_rate": 0.0006734612871599168,
      "loss": 0.0802,
      "step": 11300
    },
    {
      "epoch": 6.810035842293907,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0006560510813660718,
      "loss": 0.0781,
      "step": 11400
    },
    {
      "epoch": 6.8697729988052565,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.0006387583338128471,
      "loss": 0.0779,
      "step": 11500
    },
    {
      "epoch": 6.929510155316607,
      "grad_norm": 0.08544921875,
      "learning_rate": 0.0006215889499576897,
      "loss": 0.0783,
      "step": 11600
    },
    {
      "epoch": 6.989247311827957,
      "grad_norm": 0.0732421875,
      "learning_rate": 0.0006045487931294575,
      "loss": 0.0765,
      "step": 11700
    },
    {
      "epoch": 7.048984468339307,
      "grad_norm": 0.062255859375,
      "learning_rate": 0.0005876436825260967,
      "loss": 0.0764,
      "step": 11800
    },
    {
      "epoch": 7.1087216248506575,
      "grad_norm": 0.06298828125,
      "learning_rate": 0.000570879391227391,
      "loss": 0.0773,
      "step": 11900
    },
    {
      "epoch": 7.168458781362007,
      "grad_norm": 0.056640625,
      "learning_rate": 0.0005542616442234618,
      "loss": 0.0759,
      "step": 12000
    },
    {
      "epoch": 7.228195937873357,
      "grad_norm": 0.042236328125,
      "learning_rate": 0.0005377961164596869,
      "loss": 0.0783,
      "step": 12100
    },
    {
      "epoch": 7.287933094384707,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0005214884308987136,
      "loss": 0.0757,
      "step": 12200
    },
    {
      "epoch": 7.347670250896058,
      "grad_norm": 0.053955078125,
      "learning_rate": 0.0005053441566002214,
      "loss": 0.0768,
      "step": 12300
    },
    {
      "epoch": 7.407407407407407,
      "grad_norm": 0.0556640625,
      "learning_rate": 0.0004893688068190932,
      "loss": 0.0746,
      "step": 12400
    },
    {
      "epoch": 7.467144563918757,
      "grad_norm": 0.0537109375,
      "learning_rate": 0.0004735678371226441,
      "loss": 0.0749,
      "step": 12500
    },
    {
      "epoch": 7.526881720430108,
      "grad_norm": 0.057861328125,
      "learning_rate": 0.00045794664352755057,
      "loss": 0.0728,
      "step": 12600
    },
    {
      "epoch": 7.586618876941458,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.0004425105606571145,
      "loss": 0.0732,
      "step": 12700
    },
    {
      "epoch": 7.646356033452808,
      "grad_norm": 0.0498046875,
      "learning_rate": 0.00042726485991949483,
      "loss": 0.0731,
      "step": 12800
    },
    {
      "epoch": 7.706093189964157,
      "grad_norm": 0.05859375,
      "learning_rate": 0.00041221474770752696,
      "loss": 0.0728,
      "step": 12900
    },
    {
      "epoch": 7.765830346475508,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.0003973653636207437,
      "loss": 0.072,
      "step": 13000
    },
    {
      "epoch": 7.825567502986858,
      "grad_norm": 0.043212890625,
      "learning_rate": 0.0003827217787102072,
      "loss": 0.0718,
      "step": 13100
    },
    {
      "epoch": 7.885304659498208,
      "grad_norm": 0.0400390625,
      "learning_rate": 0.0003682889937467493,
      "loss": 0.0721,
      "step": 13200
    },
    {
      "epoch": 7.945041816009558,
      "grad_norm": 0.047607421875,
      "learning_rate": 0.00035407193751321286,
      "loss": 0.0703,
      "step": 13300
    },
    {
      "epoch": 8.004778972520908,
      "grad_norm": 0.057861328125,
      "learning_rate": 0.0003400754651212776,
      "loss": 0.0716,
      "step": 13400
    },
    {
      "epoch": 8.064516129032258,
      "grad_norm": 0.049072265625,
      "learning_rate": 0.0003263043563534428,
      "loss": 0.0711,
      "step": 13500
    },
    {
      "epoch": 8.124253285543608,
      "grad_norm": 0.039794921875,
      "learning_rate": 0.0003127633140307373,
      "loss": 0.0714,
      "step": 13600
    },
    {
      "epoch": 8.183990442054958,
      "grad_norm": 0.046630859375,
      "learning_rate": 0.00029945696240670904,
      "loss": 0.0728,
      "step": 13700
    },
    {
      "epoch": 8.2437275985663083,
      "grad_norm": 0.06005859375,
      "learning_rate": 0.00028638984558824776,
      "loss": 0.0733,
      "step": 13800
    },
    {
      "epoch": 8.3034647550776584,
      "grad_norm": 0.04833984375,
      "learning_rate": 0.000273566425983776,
      "loss": 0.071,
      "step": 13900
    },
    {
      "epoch": 8.3632019115890084,
      "grad_norm": 0.042724609375,
      "learning_rate": 0.000260991082779341,
      "loss": 0.0729,
      "step": 14000
    },
    {
      "epoch": 8.4229390681003584,
      "grad_norm": 0.0380859375,
      "learning_rate": 0.00024866811044312666,
      "loss": 0.0711,
      "step": 14100
    },
    {
      "epoch": 8.4826762246117084,
      "grad_norm": 0.043701171875,
      "learning_rate": 0.00023660171725889702,
      "loss": 0.0704,
      "step": 14200
    },
    {
      "epoch": 8.5424133811230587,
      "grad_norm": 0.04931640625,
      "learning_rate": 0.0002247960238888701,
      "loss": 0.0705,
      "step": 14300
    },
    {
      "epoch": 8.6021505376344085,
      "grad_norm": 0.04638671875,
      "learning_rate": 0.00021325506196651677,
      "loss": 0.0709,
      "step": 14400
    },
    {
      "epoch": 8.6618876941457588,
      "grad_norm": 0.034423828125,
      "learning_rate": 0.0002019827727197605,
      "loss": 0.0708,
      "step": 14500
    },
    {
      "epoch": 8.7216248506571086,
      "grad_norm": 0.04541015625,
      "learning_rate": 0.00019098300562505265,
      "loss": 0.0692,
      "step": 14600
    },
    {
      "epoch": 8.7813620071684588,
      "grad_norm": 0.056396484375,
      "learning_rate": 0.000180259517092779,
      "loss": 0.0702,
      "step": 14700
    },
    {
      "epoch": 8.8410991636798089,
      "grad_norm": 0.050048828125,
      "learning_rate": 0.00016981596918444952,
      "loss": 0.0701,
      "step": 14800
    },
    {
      "epoch": 8.900836320191159,
      "grad_norm": 0.049560546875,
      "learning_rate": 0.0001596559283621074,
      "loss": 0.0695,
      "step": 14900
    },
    {
      "epoch": 8.960573476702509,
      "grad_norm": 0.05224609375,
      "learning_rate": 0.00014978286427038602,
      "loss": 0.0701,
      "step": 15000
    },
    {
      "epoch": 9.020310633213859,
      "grad_norm": 0.05419921875,
      "learning_rate": 0.00014020014855162756,
      "loss": 0.0684,
      "step": 15100
    },
    {
      "epoch": 9.080047789725209,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.00013091105369447166,
      "loss": 0.072,
      "step": 15200
    },
    {
      "epoch": 9.139784946236559,
      "grad_norm": 0.040283203125,
      "learning_rate": 0.00012191875191630208,
      "loss": 0.0695,
      "step": 15300
    },
    {
      "epoch": 9.1995221027479093,
      "grad_norm": 0.051025390625,
      "learning_rate": 0.00011322631407993811,
      "loss": 0.0717,
      "step": 15400
    },
    {
      "epoch": 9.2592592592592595,
      "grad_norm": 0.046142578125,
      "learning_rate": 0.00010483670864493777,
      "loss": 0.0721,
      "step": 15500
    },
    {
      "epoch": 9.3189964157706093,
      "grad_norm": 0.036865234375,
      "learning_rate": 9.675280065387115e-05,
      "loss": 0.0704,
      "step": 15600
    },
    {
      "epoch": 9.378733572281959,
      "grad_norm": 0.040771484375,
      "learning_rate": 8.897735075391155e-05,
      "loss": 0.0711,
      "step": 15700
    },
    {
      "epoch": 9.4384707287933094,
      "grad_norm": 0.0322265625,
      "learning_rate": 8.151301425407698e-05,
      "loss": 0.0698,
      "step": 15800
    },
    {
      "epoch": 9.4982078853046596,
      "grad_norm": 0.043701171875,
      "learning_rate": 7.43623402184438e-05,
      "loss": 0.07,
      "step": 15900
    },
    {
      "epoch": 9.5579450418160095,
      "grad_norm": 0.041015625,
      "learning_rate": 6.75277705956443e-05,
      "loss": 0.0694,
      "step": 16000
    },
    {
      "epoch": 9.6176821983273597,
      "grad_norm": 0.046630859375,
      "learning_rate": 6.1011639384943586e-05,
      "loss": 0.0713,
      "step": 16100
    },
    {
      "epoch": 9.67741935483871,
      "grad_norm": 0.037353515625,
      "learning_rate": 5.481617183918053e-05,
      "loss": 0.069,
      "step": 16200
    },
    {
      "epoch": 9.7371565113500598,
      "grad_norm": 0.0380859375,
      "learning_rate": 4.894348370484647e-05,
      "loss": 0.0701,
      "step": 16300
    },
    {
      "epoch": 9.7968936678614096,
      "grad_norm": 0.04296875,
      "learning_rate": 4.339558049955927e-05,
      "loss": 0.0692,
      "step": 16400
    },
    {
      "epoch": 9.85663082437276,
      "grad_norm": 0.043212890625,
      "learning_rate": 3.817435682718096e-05,
      "loss": 0.0692,
      "step": 16500
    },
    {
      "epoch": 9.91636798088411,
      "grad_norm": 0.05078125,
      "learning_rate": 3.3281595730812576e-05,
      "loss": 0.0696,
      "step": 16600
    },
    {
      "epoch": 9.97610513739546,
      "grad_norm": 0.041015625,
      "learning_rate": 2.8718968083886077e-05,
      "loss": 0.0695,
      "step": 16700
    },
    {
      "epoch": 10.03584229390681,
      "grad_norm": 0.053955078125,
      "learning_rate": 2.44880320195634e-05,
      "loss": 0.0686,
      "step": 16800
    },
    {
      "epoch": 10.09557945041816,
      "grad_norm": 0.041015625,
      "learning_rate": 2.059023239863411e-05,
      "loss": 0.0713,
      "step": 16900
    },
    {
      "epoch": 10.15531660692951,
      "grad_norm": 0.04248046875,
      "learning_rate": 1.7026900316098216e-05,
      "loss": 0.069,
      "step": 17000
    },
    {
      "epoch": 10.21505376344086,
      "grad_norm": 0.0498046875,
      "learning_rate": 1.3799252646597427e-05,
      "loss": 0.0726,
      "step": 17100
    },
    {
      "epoch": 10.2747909199522103,
      "grad_norm": 0.04248046875,
      "learning_rate": 1.0908391628854042e-05,
      "loss": 0.0703,
      "step": 17200
    },
    {
      "epoch": 10.3345280764635605,
      "grad_norm": 0.049560546875,
      "learning_rate": 8.355304489257254e-06,
      "loss": 0.0711,
      "step": 17300
    },
    {
      "epoch": 10.3942652329749103,
      "grad_norm": 0.059814453125,
      "learning_rate": 6.140863104726391e-06,
      "loss": 0.072,
      "step": 17400
    },
    {
      "epoch": 10.4540023894862606,
      "grad_norm": 0.046142578125,
      "learning_rate": 4.265823704965532e-06,
      "loss": 0.0696,
      "step": 17500
    },
    {
      "epoch": 10.5137395459976104,
      "grad_norm": 0.0693359375,
      "learning_rate": 2.730826614211979e-06,
      "loss": 0.0685,
      "step": 17600
    },
    {
      "epoch": 10.5734767025089607,
      "grad_norm": 0.055419921875,
      "learning_rate": 1.5363960325660565e-06,
      "loss": 0.07,
      "step": 17700
    },
    {
      "epoch": 10.6332138590203105,
      "grad_norm": 0.046630859375,
      "learning_rate": 6.829398569770939e-07,
      "loss": 0.0708,
      "step": 17800
    },
    {
      "epoch": 10.6929510155316607,
      "grad_norm": 0.049560546875,
      "learning_rate": 1.7074954194729043e-07,
      "loss": 0.0694,
      "step": 17900
    },
    {
      "epoch": 10.752688172043011,
      "grad_norm": 0.036865234375,
      "learning_rate": 0.0,
      "loss": 0.0694,
      "step": 18000
    }
  ],
"logging_steps": 100, |
|
"max_steps": 18000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 11, |
|
"save_steps": 3000, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": true, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 3.467487355285709e+17, |
|
"train_batch_size": 6, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|