{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.0,
  "eval_steps": 500,
  "global_step": 2122,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.02,
      "grad_norm": 1.5460187196731567,
      "learning_rate": 3.5211267605633804e-06,
      "loss": 2.7305,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.3390659093856812,
      "learning_rate": 7.042253521126761e-06,
      "loss": 2.7192,
      "step": 50
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.4170739650726318,
      "learning_rate": 1.056338028169014e-05,
      "loss": 2.6137,
      "step": 75
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.9075584411621094,
      "learning_rate": 1.4084507042253522e-05,
      "loss": 2.3702,
      "step": 100
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.8087872862815857,
      "learning_rate": 1.7605633802816902e-05,
      "loss": 2.0872,
      "step": 125
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.6687940359115601,
      "learning_rate": 2.112676056338028e-05,
      "loss": 1.9849,
      "step": 150
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.48965755105018616,
      "learning_rate": 2.464788732394366e-05,
      "loss": 1.8513,
      "step": 175
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.44189536571502686,
      "learning_rate": 2.8169014084507043e-05,
      "loss": 1.7901,
      "step": 200
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.4670378863811493,
      "learning_rate": 2.9827134625458355e-05,
      "loss": 1.7426,
      "step": 225
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5341612100601196,
      "learning_rate": 2.9434258774227347e-05,
      "loss": 1.663,
      "step": 250
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.32935911417007446,
      "learning_rate": 2.9041382922996336e-05,
      "loss": 1.6262,
      "step": 275
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.49150750041007996,
      "learning_rate": 2.864850707176532e-05,
      "loss": 1.5616,
      "step": 300
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.4343280792236328,
      "learning_rate": 2.825563122053431e-05,
      "loss": 1.554,
      "step": 325
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.46223270893096924,
      "learning_rate": 2.78627553693033e-05,
      "loss": 1.4944,
      "step": 350
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.5165560245513916,
      "learning_rate": 2.7469879518072288e-05,
      "loss": 1.4857,
      "step": 375
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4836352467536926,
      "learning_rate": 2.707700366684128e-05,
      "loss": 1.4545,
      "step": 400
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.5089160799980164,
      "learning_rate": 2.668412781561027e-05,
      "loss": 1.3965,
      "step": 425
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.4400924742221832,
      "learning_rate": 2.6291251964379258e-05,
      "loss": 1.3703,
      "step": 450
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.520064115524292,
      "learning_rate": 2.5898376113148247e-05,
      "loss": 1.3869,
      "step": 475
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.4993634819984436,
      "learning_rate": 2.5505500261917232e-05,
      "loss": 1.3827,
      "step": 500
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.4747565686702728,
      "learning_rate": 2.511262441068622e-05,
      "loss": 1.3494,
      "step": 525
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.45222511887550354,
      "learning_rate": 2.4719748559455214e-05,
      "loss": 1.3295,
      "step": 550
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.5314898490905762,
      "learning_rate": 2.4326872708224202e-05,
      "loss": 1.2971,
      "step": 575
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.4976591467857361,
      "learning_rate": 2.393399685699319e-05,
      "loss": 1.2882,
      "step": 600
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.5847007632255554,
      "learning_rate": 2.354112100576218e-05,
      "loss": 1.3001,
      "step": 625
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.4775733947753906,
      "learning_rate": 2.314824515453117e-05,
      "loss": 1.3185,
      "step": 650
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.4997495710849762,
      "learning_rate": 2.2755369303300158e-05,
      "loss": 1.31,
      "step": 675
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.46225643157958984,
      "learning_rate": 2.2362493452069147e-05,
      "loss": 1.2577,
      "step": 700
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.5184119939804077,
      "learning_rate": 2.1969617600838136e-05,
      "loss": 1.2846,
      "step": 725
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.5487437844276428,
      "learning_rate": 2.1576741749607125e-05,
      "loss": 1.234,
      "step": 750
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.44502514600753784,
      "learning_rate": 2.1183865898376113e-05,
      "loss": 1.2652,
      "step": 775
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.5340321660041809,
      "learning_rate": 2.0790990047145102e-05,
      "loss": 1.2302,
      "step": 800
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.46381640434265137,
      "learning_rate": 2.039811419591409e-05,
      "loss": 1.2402,
      "step": 825
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.5519339442253113,
      "learning_rate": 2.0005238344683083e-05,
      "loss": 1.2924,
      "step": 850
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.47584566473960876,
      "learning_rate": 1.9612362493452072e-05,
      "loss": 1.2526,
      "step": 875
    },
    {
      "epoch": 0.85,
      "grad_norm": 0.5393884778022766,
      "learning_rate": 1.9219486642221058e-05,
      "loss": 1.188,
      "step": 900
    },
    {
      "epoch": 0.87,
      "grad_norm": 0.5071331858634949,
      "learning_rate": 1.8826610790990047e-05,
      "loss": 1.228,
      "step": 925
    },
    {
      "epoch": 0.9,
      "grad_norm": 0.5538953542709351,
      "learning_rate": 1.8433734939759036e-05,
      "loss": 1.238,
      "step": 950
    },
    {
      "epoch": 0.92,
      "grad_norm": 0.4920578598976135,
      "learning_rate": 1.8040859088528024e-05,
      "loss": 1.1884,
      "step": 975
    },
    {
      "epoch": 0.94,
      "grad_norm": 0.5614542961120605,
      "learning_rate": 1.7647983237297017e-05,
      "loss": 1.1734,
      "step": 1000
    },
    {
      "epoch": 0.97,
      "grad_norm": 0.514637291431427,
      "learning_rate": 1.7255107386066006e-05,
      "loss": 1.1938,
      "step": 1025
    },
    {
      "epoch": 0.99,
      "grad_norm": 0.6818568110466003,
      "learning_rate": 1.6862231534834994e-05,
      "loss": 1.2151,
      "step": 1050
    },
    {
      "epoch": 1.01,
      "grad_norm": 0.5652730464935303,
      "learning_rate": 1.646935568360398e-05,
      "loss": 1.1932,
      "step": 1075
    },
    {
      "epoch": 1.04,
      "grad_norm": 0.46096867322921753,
      "learning_rate": 1.607647983237297e-05,
      "loss": 1.1895,
      "step": 1100
    },
    {
      "epoch": 1.06,
      "grad_norm": 0.5606017112731934,
      "learning_rate": 1.5683603981141958e-05,
      "loss": 1.1812,
      "step": 1125
    },
    {
      "epoch": 1.08,
      "grad_norm": 0.6677452325820923,
      "learning_rate": 1.5290728129910947e-05,
      "loss": 1.2344,
      "step": 1150
    },
    {
      "epoch": 1.11,
      "grad_norm": 0.4959929585456848,
      "learning_rate": 1.4897852278679937e-05,
      "loss": 1.1922,
      "step": 1175
    },
    {
      "epoch": 1.13,
      "grad_norm": 0.5288898348808289,
      "learning_rate": 1.4504976427448928e-05,
      "loss": 1.1824,
      "step": 1200
    },
    {
      "epoch": 1.15,
      "grad_norm": 0.49359196424484253,
      "learning_rate": 1.4112100576217915e-05,
      "loss": 1.2163,
      "step": 1225
    },
    {
      "epoch": 1.18,
      "grad_norm": 0.5195351839065552,
      "learning_rate": 1.3719224724986904e-05,
      "loss": 1.1688,
      "step": 1250
    },
    {
      "epoch": 1.2,
      "grad_norm": 0.5838152766227722,
      "learning_rate": 1.3326348873755894e-05,
      "loss": 1.1566,
      "step": 1275
    },
    {
      "epoch": 1.23,
      "grad_norm": 0.5872088074684143,
      "learning_rate": 1.2933473022524883e-05,
      "loss": 1.1685,
      "step": 1300
    },
    {
      "epoch": 1.25,
      "grad_norm": 0.5522224307060242,
      "learning_rate": 1.254059717129387e-05,
      "loss": 1.1745,
      "step": 1325
    },
    {
      "epoch": 1.27,
      "grad_norm": 0.56575608253479,
      "learning_rate": 1.2147721320062861e-05,
      "loss": 1.1895,
      "step": 1350
    },
    {
      "epoch": 1.3,
      "grad_norm": 0.5466055870056152,
      "learning_rate": 1.175484546883185e-05,
      "loss": 1.1445,
      "step": 1375
    },
    {
      "epoch": 1.32,
      "grad_norm": 0.5417299866676331,
      "learning_rate": 1.1361969617600839e-05,
      "loss": 1.2091,
      "step": 1400
    },
    {
      "epoch": 1.34,
      "grad_norm": 0.6165823936462402,
      "learning_rate": 1.0969093766369827e-05,
      "loss": 1.1533,
      "step": 1425
    },
    {
      "epoch": 1.37,
      "grad_norm": 0.6337828636169434,
      "learning_rate": 1.0576217915138816e-05,
      "loss": 1.1721,
      "step": 1450
    },
    {
      "epoch": 1.39,
      "grad_norm": 0.5324290990829468,
      "learning_rate": 1.0183342063907805e-05,
      "loss": 1.2054,
      "step": 1475
    },
    {
      "epoch": 1.41,
      "grad_norm": 0.5414568185806274,
      "learning_rate": 9.790466212676796e-06,
      "loss": 1.164,
      "step": 1500
    },
    {
      "epoch": 1.44,
      "grad_norm": 0.5330114364624023,
      "learning_rate": 9.397590361445783e-06,
      "loss": 1.1632,
      "step": 1525
    },
    {
      "epoch": 1.46,
      "grad_norm": 0.4752555787563324,
      "learning_rate": 9.004714510214772e-06,
      "loss": 1.123,
      "step": 1550
    },
    {
      "epoch": 1.48,
      "grad_norm": 0.5737998485565186,
      "learning_rate": 8.61183865898376e-06,
      "loss": 1.168,
      "step": 1575
    },
    {
      "epoch": 1.51,
      "grad_norm": 0.5172936320304871,
      "learning_rate": 8.218962807752751e-06,
      "loss": 1.1856,
      "step": 1600
    },
    {
      "epoch": 1.53,
      "grad_norm": 0.6017385721206665,
      "learning_rate": 7.826086956521738e-06,
      "loss": 1.1571,
      "step": 1625
    },
    {
      "epoch": 1.56,
      "grad_norm": 0.6177796125411987,
      "learning_rate": 7.433211105290728e-06,
      "loss": 1.166,
      "step": 1650
    },
    {
      "epoch": 1.58,
      "grad_norm": 0.5164004564285278,
      "learning_rate": 7.040335254059718e-06,
      "loss": 1.1979,
      "step": 1675
    },
    {
      "epoch": 1.6,
      "grad_norm": 0.6410110592842102,
      "learning_rate": 6.647459402828706e-06,
      "loss": 1.156,
      "step": 1700
    },
    {
      "epoch": 1.63,
      "grad_norm": 0.5372936129570007,
      "learning_rate": 6.254583551597696e-06,
      "loss": 1.1532,
      "step": 1725
    },
    {
      "epoch": 1.65,
      "grad_norm": 0.6036843657493591,
      "learning_rate": 5.8617077003666845e-06,
      "loss": 1.1793,
      "step": 1750
    },
    {
      "epoch": 1.67,
      "grad_norm": 0.6881660223007202,
      "learning_rate": 5.468831849135673e-06,
      "loss": 1.1709,
      "step": 1775
    },
    {
      "epoch": 1.7,
      "grad_norm": 0.5717695951461792,
      "learning_rate": 5.075955997904662e-06,
      "loss": 1.1441,
      "step": 1800
    },
    {
      "epoch": 1.72,
      "grad_norm": 0.5951740145683289,
      "learning_rate": 4.683080146673651e-06,
      "loss": 1.1855,
      "step": 1825
    },
    {
      "epoch": 1.74,
      "grad_norm": 0.628299355506897,
      "learning_rate": 4.29020429544264e-06,
      "loss": 1.1888,
      "step": 1850
    },
    {
      "epoch": 1.77,
      "grad_norm": 0.5050541758537292,
      "learning_rate": 3.89732844421163e-06,
      "loss": 1.1396,
      "step": 1875
    },
    {
      "epoch": 1.79,
      "grad_norm": 0.5191190838813782,
      "learning_rate": 3.504452592980618e-06,
      "loss": 1.1955,
      "step": 1900
    },
    {
      "epoch": 1.81,
      "grad_norm": 0.5869964957237244,
      "learning_rate": 3.111576741749607e-06,
      "loss": 1.1314,
      "step": 1925
    },
    {
      "epoch": 1.84,
      "grad_norm": 0.5007169842720032,
      "learning_rate": 2.718700890518596e-06,
      "loss": 1.1516,
      "step": 1950
    },
    {
      "epoch": 1.86,
      "grad_norm": 0.6014929413795471,
      "learning_rate": 2.325825039287585e-06,
      "loss": 1.1901,
      "step": 1975
    },
    {
      "epoch": 1.89,
      "grad_norm": 0.8301869034767151,
      "learning_rate": 1.932949188056574e-06,
      "loss": 1.1197,
      "step": 2000
    },
    {
      "epoch": 1.91,
      "grad_norm": 0.5901632905006409,
      "learning_rate": 1.5400733368255633e-06,
      "loss": 1.1843,
      "step": 2025
    },
    {
      "epoch": 1.93,
      "grad_norm": 0.580962598323822,
      "learning_rate": 1.1471974855945522e-06,
      "loss": 1.1456,
      "step": 2050
    },
    {
      "epoch": 1.96,
      "grad_norm": 0.5481582283973694,
      "learning_rate": 7.543216343635412e-07,
      "loss": 1.1644,
      "step": 2075
    },
    {
      "epoch": 1.98,
      "grad_norm": 0.5194025039672852,
      "learning_rate": 3.6144578313253016e-07,
      "loss": 1.1581,
      "step": 2100
    }
  ],
  "logging_steps": 25,
  "max_steps": 2122,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 2,
  "save_steps": 500,
  "total_flos": 4558731463360512.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}