{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 9.91616766467066,
  "eval_steps": 500,
  "global_step": 1110,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.08982035928143713,
      "grad_norm": 49.68221664428711,
      "learning_rate": 1.4414414414414416e-06,
      "loss": 8.7467,
      "step": 10
    },
    {
      "epoch": 0.17964071856287425,
      "grad_norm": 22.61308479309082,
      "learning_rate": 3.063063063063063e-06,
      "loss": 5.5972,
      "step": 20
    },
    {
      "epoch": 0.2694610778443114,
      "grad_norm": 11.650931358337402,
      "learning_rate": 4.864864864864866e-06,
      "loss": 4.1239,
      "step": 30
    },
    {
      "epoch": 0.3592814371257485,
      "grad_norm": 11.040465354919434,
      "learning_rate": 6.666666666666667e-06,
      "loss": 3.3473,
      "step": 40
    },
    {
      "epoch": 0.4491017964071856,
      "grad_norm": 21.185564041137695,
      "learning_rate": 8.46846846846847e-06,
      "loss": 2.843,
      "step": 50
    },
    {
      "epoch": 0.5389221556886228,
      "grad_norm": 6.222358226776123,
      "learning_rate": 1.027027027027027e-05,
      "loss": 2.5158,
      "step": 60
    },
    {
      "epoch": 0.6287425149700598,
      "grad_norm": 12.189018249511719,
      "learning_rate": 1.2072072072072074e-05,
      "loss": 2.1371,
      "step": 70
    },
    {
      "epoch": 0.718562874251497,
      "grad_norm": 17.56637191772461,
      "learning_rate": 1.3873873873873875e-05,
      "loss": 1.9438,
      "step": 80
    },
    {
      "epoch": 0.8083832335329342,
      "grad_norm": 5.583638668060303,
      "learning_rate": 1.5675675675675676e-05,
      "loss": 1.7556,
      "step": 90
    },
    {
      "epoch": 0.8982035928143712,
      "grad_norm": 3.642561197280884,
      "learning_rate": 1.7477477477477477e-05,
      "loss": 1.6005,
      "step": 100
    },
    {
      "epoch": 0.9880239520958084,
      "grad_norm": 2.80586314201355,
      "learning_rate": 1.927927927927928e-05,
      "loss": 1.5588,
      "step": 110
    },
    {
      "epoch": 1.0718562874251496,
      "grad_norm": 2.6551783084869385,
      "learning_rate": 1.9998219965624736e-05,
      "loss": 1.4294,
      "step": 120
    },
    {
      "epoch": 1.1616766467065869,
      "grad_norm": 2.895826578140259,
      "learning_rate": 1.9987344272588007e-05,
      "loss": 1.4837,
      "step": 130
    },
    {
      "epoch": 1.251497005988024,
      "grad_norm": 2.8669326305389404,
      "learning_rate": 1.9966592535953532e-05,
      "loss": 1.453,
      "step": 140
    },
    {
      "epoch": 1.341317365269461,
      "grad_norm": 2.1909446716308594,
      "learning_rate": 1.9935985276197033e-05,
      "loss": 1.3837,
      "step": 150
    },
    {
      "epoch": 1.4311377245508983,
      "grad_norm": 3.2299187183380127,
      "learning_rate": 1.989555275948572e-05,
      "loss": 1.3152,
      "step": 160
    },
    {
      "epoch": 1.5209580838323353,
      "grad_norm": 1.9108384847640991,
      "learning_rate": 1.984533496774942e-05,
      "loss": 1.1961,
      "step": 170
    },
    {
      "epoch": 1.6107784431137726,
      "grad_norm": 1.1921658515930176,
      "learning_rate": 1.9785381559144196e-05,
      "loss": 1.1209,
      "step": 180
    },
    {
      "epoch": 1.7005988023952097,
      "grad_norm": 0.8914986848831177,
      "learning_rate": 1.9715751818947603e-05,
      "loss": 1.1056,
      "step": 190
    },
    {
      "epoch": 1.7904191616766467,
      "grad_norm": 0.9408266544342041,
      "learning_rate": 1.963651460093409e-05,
      "loss": 1.0827,
      "step": 200
    },
    {
      "epoch": 1.8802395209580838,
      "grad_norm": 0.8248458504676819,
      "learning_rate": 1.9547748259288536e-05,
      "loss": 1.0787,
      "step": 210
    },
    {
      "epoch": 1.9700598802395208,
      "grad_norm": 0.9938483238220215,
      "learning_rate": 1.9449540571125284e-05,
      "loss": 1.0597,
      "step": 220
    },
    {
      "epoch": 2.053892215568862,
      "grad_norm": 0.9142336249351501,
      "learning_rate": 1.93419886496892e-05,
      "loss": 0.9906,
      "step": 230
    },
    {
      "epoch": 2.143712574850299,
      "grad_norm": 0.8745118379592896,
      "learning_rate": 1.9225198848324687e-05,
      "loss": 1.05,
      "step": 240
    },
    {
      "epoch": 2.2335329341317367,
      "grad_norm": 0.7089764475822449,
      "learning_rate": 1.909928665530757e-05,
      "loss": 1.043,
      "step": 250
    },
    {
      "epoch": 2.3233532934131738,
      "grad_norm": 0.4864867031574249,
      "learning_rate": 1.896437657964382e-05,
      "loss": 1.0463,
      "step": 260
    },
    {
      "epoch": 2.413173652694611,
      "grad_norm": 0.971626341342926,
      "learning_rate": 1.8820602027948112e-05,
      "loss": 1.0418,
      "step": 270
    },
    {
      "epoch": 2.502994011976048,
      "grad_norm": 0.699500322341919,
      "learning_rate": 1.866810517252393e-05,
      "loss": 1.038,
      "step": 280
    },
    {
      "epoch": 2.592814371257485,
      "grad_norm": 0.9161490797996521,
      "learning_rate": 1.8507036810775617e-05,
      "loss": 1.0338,
      "step": 290
    },
    {
      "epoch": 2.682634730538922,
      "grad_norm": 0.6857994198799133,
      "learning_rate": 1.833755621609152e-05,
      "loss": 1.0231,
      "step": 300
    },
    {
      "epoch": 2.772455089820359,
      "grad_norm": 0.7041919827461243,
      "learning_rate": 1.815983098034555e-05,
      "loss": 1.0218,
      "step": 310
    },
    {
      "epoch": 2.8622754491017965,
      "grad_norm": 0.9268773794174194,
      "learning_rate": 1.7974036848172992e-05,
      "loss": 1.0229,
      "step": 320
    },
    {
      "epoch": 2.9520958083832336,
      "grad_norm": 0.7882820963859558,
      "learning_rate": 1.7780357543184396e-05,
      "loss": 1.0199,
      "step": 330
    },
    {
      "epoch": 3.035928143712575,
      "grad_norm": 0.7048642635345459,
      "learning_rate": 1.757898458628941e-05,
      "loss": 0.9551,
      "step": 340
    },
    {
      "epoch": 3.125748502994012,
      "grad_norm": 0.5380260348320007,
      "learning_rate": 1.7370117106310216e-05,
      "loss": 1.0078,
      "step": 350
    },
    {
      "epoch": 3.215568862275449,
      "grad_norm": 0.9133287668228149,
      "learning_rate": 1.715396164307182e-05,
      "loss": 1.0026,
      "step": 360
    },
    {
      "epoch": 3.305389221556886,
      "grad_norm": 0.8086357712745667,
      "learning_rate": 1.6930731943163975e-05,
      "loss": 1.0082,
      "step": 370
    },
    {
      "epoch": 3.3952095808383236,
      "grad_norm": 0.7032626271247864,
      "learning_rate": 1.6700648748576577e-05,
      "loss": 1.0025,
      "step": 380
    },
    {
      "epoch": 3.4850299401197606,
      "grad_norm": 0.7610228657722473,
      "learning_rate": 1.6463939578417695e-05,
      "loss": 0.9943,
      "step": 390
    },
    {
      "epoch": 3.5748502994011977,
      "grad_norm": 0.7735270261764526,
      "learning_rate": 1.622083850392996e-05,
      "loss": 1.0014,
      "step": 400
    },
    {
      "epoch": 3.6646706586826348,
      "grad_norm": 0.5382483005523682,
      "learning_rate": 1.5971585917027864e-05,
      "loss": 0.9961,
      "step": 410
    },
    {
      "epoch": 3.754491017964072,
      "grad_norm": 0.6216614246368408,
      "learning_rate": 1.5716428292584788e-05,
      "loss": 0.992,
      "step": 420
    },
    {
      "epoch": 3.844311377245509,
      "grad_norm": 0.5743616819381714,
      "learning_rate": 1.545561794470492e-05,
      "loss": 0.9861,
      "step": 430
    },
    {
      "epoch": 3.934131736526946,
      "grad_norm": 0.7634334564208984,
      "learning_rate": 1.518941277722096e-05,
      "loss": 0.9881,
      "step": 440
    },
    {
      "epoch": 4.017964071856287,
      "grad_norm": 0.7453029751777649,
      "learning_rate": 1.491807602866442e-05,
      "loss": 0.9172,
      "step": 450
    },
    {
      "epoch": 4.107784431137724,
      "grad_norm": 1.0462350845336914,
      "learning_rate": 1.4641876011960661e-05,
      "loss": 0.9798,
      "step": 460
    },
    {
      "epoch": 4.197604790419161,
      "grad_norm": 0.8113179206848145,
      "learning_rate": 1.436108584910611e-05,
      "loss": 0.9799,
      "step": 470
    },
    {
      "epoch": 4.287425149700598,
      "grad_norm": 0.6361674070358276,
      "learning_rate": 1.4075983201089964e-05,
      "loss": 0.9654,
      "step": 480
    },
    {
      "epoch": 4.3772455089820355,
      "grad_norm": 0.6291260123252869,
      "learning_rate": 1.3786849993327503e-05,
      "loss": 0.9631,
      "step": 490
    },
    {
      "epoch": 4.467065868263473,
      "grad_norm": 0.988298773765564,
      "learning_rate": 1.349397213687651e-05,
      "loss": 0.9729,
      "step": 500
    },
    {
      "epoch": 4.5568862275449105,
      "grad_norm": 1.0734843015670776,
      "learning_rate": 1.3197639245712454e-05,
      "loss": 0.963,
      "step": 510
    },
    {
      "epoch": 4.6467065868263475,
      "grad_norm": 0.834682047367096,
      "learning_rate": 1.2898144350342015e-05,
      "loss": 0.9538,
      "step": 520
    },
    {
      "epoch": 4.736526946107785,
      "grad_norm": 0.5939741134643555,
      "learning_rate": 1.2595783608038157e-05,
      "loss": 0.9623,
      "step": 530
    },
    {
      "epoch": 4.826347305389222,
      "grad_norm": 0.8625423908233643,
      "learning_rate": 1.22908560099833e-05,
      "loss": 0.9557,
      "step": 540
    },
    {
      "epoch": 4.916167664670659,
      "grad_norm": 0.8925888538360596,
      "learning_rate": 1.198366308561013e-05,
      "loss": 0.9419,
      "step": 550
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.3238130807876587,
      "learning_rate": 1.1674508604432464e-05,
      "loss": 0.8782,
      "step": 560
    },
    {
      "epoch": 5.089820359281437,
      "grad_norm": 0.5171638131141663,
      "learning_rate": 1.1363698275661002e-05,
      "loss": 0.9256,
      "step": 570
    },
    {
      "epoch": 5.179640718562874,
      "grad_norm": 1.136551022529602,
      "learning_rate": 1.1051539445900982e-05,
      "loss": 0.9282,
      "step": 580
    },
    {
      "epoch": 5.269461077844311,
      "grad_norm": 0.9350560307502747,
      "learning_rate": 1.0738340795230722e-05,
      "loss": 0.9316,
      "step": 590
    },
    {
      "epoch": 5.359281437125748,
      "grad_norm": 1.0158970355987549,
      "learning_rate": 1.0424412031961485e-05,
      "loss": 0.9381,
      "step": 600
    },
    {
      "epoch": 5.449101796407185,
      "grad_norm": 1.0037592649459839,
      "learning_rate": 1.0110063586380647e-05,
      "loss": 0.9288,
      "step": 610
    },
    {
      "epoch": 5.538922155688622,
      "grad_norm": 0.6334489583969116,
      "learning_rate": 9.795606303780885e-06,
      "loss": 0.9272,
      "step": 620
    },
    {
      "epoch": 5.62874251497006,
      "grad_norm": 0.7240671515464783,
      "learning_rate": 9.48135113707899e-06,
      "loss": 0.9318,
      "step": 630
    },
    {
      "epoch": 5.718562874251497,
      "grad_norm": 0.9315741658210754,
      "learning_rate": 9.167608839328273e-06,
      "loss": 0.9297,
      "step": 640
    },
    {
      "epoch": 5.808383233532934,
      "grad_norm": 0.7542333602905273,
      "learning_rate": 8.854689656428591e-06,
      "loss": 0.9119,
      "step": 650
    },
    {
      "epoch": 5.8982035928143715,
      "grad_norm": 0.864987313747406,
      "learning_rate": 8.542903020337887e-06,
      "loss": 0.919,
      "step": 660
    },
    {
      "epoch": 5.9880239520958085,
      "grad_norm": 0.5868083834648132,
      "learning_rate": 8.232557243088585e-06,
      "loss": 0.9136,
      "step": 670
    },
    {
      "epoch": 6.07185628742515,
      "grad_norm": 0.8045146465301514,
      "learning_rate": 7.923959211911448e-06,
      "loss": 0.8452,
      "step": 680
    },
    {
      "epoch": 6.161676646706587,
      "grad_norm": 0.9145955443382263,
      "learning_rate": 7.617414085768352e-06,
      "loss": 0.9,
      "step": 690
    },
    {
      "epoch": 6.251497005988024,
      "grad_norm": 0.6515654921531677,
      "learning_rate": 7.313224993594057e-06,
      "loss": 0.8946,
      "step": 700
    },
    {
      "epoch": 6.341317365269461,
      "grad_norm": 0.6843001246452332,
      "learning_rate": 7.011692734545403e-06,
      "loss": 0.8994,
      "step": 710
    },
    {
      "epoch": 6.431137724550898,
      "grad_norm": 0.8299969434738159,
      "learning_rate": 6.713115480554313e-06,
      "loss": 0.894,
      "step": 720
    },
    {
      "epoch": 6.520958083832335,
      "grad_norm": 1.121025800704956,
      "learning_rate": 6.4177884814787284e-06,
      "loss": 0.8982,
      "step": 730
    },
    {
      "epoch": 6.610778443113772,
      "grad_norm": 1.1040045022964478,
      "learning_rate": 6.126003773143072e-06,
      "loss": 0.8985,
      "step": 740
    },
    {
      "epoch": 6.700598802395209,
      "grad_norm": 0.711075484752655,
      "learning_rate": 5.8380498885569246e-06,
      "loss": 0.8923,
      "step": 750
    },
    {
      "epoch": 6.790419161676647,
      "grad_norm": 0.8444594740867615,
      "learning_rate": 5.554211572597477e-06,
      "loss": 0.8989,
      "step": 760
    },
    {
      "epoch": 6.880239520958084,
      "grad_norm": 0.9748265743255615,
      "learning_rate": 5.274769500437882e-06,
      "loss": 0.8956,
      "step": 770
    },
    {
      "epoch": 6.970059880239521,
      "grad_norm": 0.8513533473014832,
      "learning_rate": 5.000000000000003e-06,
      "loss": 0.8906,
      "step": 780
    },
    {
      "epoch": 7.053892215568863,
      "grad_norm": 1.3123410940170288,
      "learning_rate": 4.730174778705909e-06,
      "loss": 0.8249,
      "step": 790
    },
    {
      "epoch": 7.1437125748503,
      "grad_norm": 1.0771753787994385,
      "learning_rate": 4.4655606547984165e-06,
      "loss": 0.8777,
      "step": 800
    },
    {
      "epoch": 7.233532934131737,
      "grad_norm": 0.8778141736984253,
      "learning_rate": 4.206419293496333e-06,
      "loss": 0.8832,
      "step": 810
    },
    {
      "epoch": 7.323353293413174,
      "grad_norm": 1.0552406311035156,
      "learning_rate": 3.953006948245247e-06,
      "loss": 0.872,
      "step": 820
    },
    {
      "epoch": 7.413173652694611,
      "grad_norm": 0.6986867785453796,
      "learning_rate": 3.705574207319844e-06,
      "loss": 0.8807,
      "step": 830
    },
    {
      "epoch": 7.502994011976048,
      "grad_norm": 0.7952435612678528,
      "learning_rate": 3.4643657460282078e-06,
      "loss": 0.8793,
      "step": 840
    },
    {
      "epoch": 7.592814371257485,
      "grad_norm": 0.9549069404602051,
      "learning_rate": 3.2296200847632376e-06,
      "loss": 0.8751,
      "step": 850
    },
    {
      "epoch": 7.682634730538922,
      "grad_norm": 0.6770684719085693,
      "learning_rate": 3.001569353140347e-06,
      "loss": 0.8772,
      "step": 860
    },
    {
      "epoch": 7.772455089820359,
      "grad_norm": 0.8119450211524963,
      "learning_rate": 2.780439060454756e-06,
      "loss": 0.8721,
      "step": 870
    },
    {
      "epoch": 7.862275449101796,
      "grad_norm": 0.7255963087081909,
      "learning_rate": 2.566447872685298e-06,
      "loss": 0.8734,
      "step": 880
    },
    {
      "epoch": 7.952095808383233,
      "grad_norm": 0.6024225950241089,
      "learning_rate": 2.359807396265307e-06,
      "loss": 0.8779,
      "step": 890
    },
    {
      "epoch": 8.035928143712574,
      "grad_norm": 0.6504734754562378,
      "learning_rate": 2.160721968834344e-06,
      "loss": 0.8174,
      "step": 900
    },
    {
      "epoch": 8.125748502994012,
      "grad_norm": 0.8142725825309753,
      "learning_rate": 1.969388457177743e-06,
      "loss": 0.863,
      "step": 910
    },
    {
      "epoch": 8.215568862275449,
      "grad_norm": 0.8136929869651794,
      "learning_rate": 1.7859960625537476e-06,
      "loss": 0.8677,
      "step": 920
    },
    {
      "epoch": 8.305389221556887,
      "grad_norm": 0.6553688049316406,
      "learning_rate": 1.6107261336007284e-06,
      "loss": 0.853,
      "step": 930
    },
    {
      "epoch": 8.395209580838323,
      "grad_norm": 0.8645008206367493,
      "learning_rate": 1.443751987009533e-06,
      "loss": 0.8569,
      "step": 940
    },
    {
      "epoch": 8.48502994011976,
      "grad_norm": 0.7802151441574097,
      "learning_rate": 1.2852387361382768e-06,
      "loss": 0.8608,
      "step": 950
    },
    {
      "epoch": 8.574850299401197,
      "grad_norm": 0.8659719824790955,
      "learning_rate": 1.1353431277390125e-06,
      "loss": 0.8723,
      "step": 960
    },
    {
      "epoch": 8.664670658682635,
      "grad_norm": 1.073027491569519,
      "learning_rate": 9.942133869578164e-07,
      "loss": 0.8538,
      "step": 970
    },
    {
      "epoch": 8.754491017964071,
      "grad_norm": 0.8132328391075134,
      "learning_rate": 8.619890707614687e-07,
      "loss": 0.8552,
      "step": 980
    },
    {
      "epoch": 8.844311377245509,
      "grad_norm": 0.959922730922699,
      "learning_rate": 7.388009299357412e-07,
      "loss": 0.8658,
      "step": 990
    },
    {
      "epoch": 8.934131736526947,
      "grad_norm": 0.8655376434326172,
      "learning_rate": 6.247707797917257e-07,
      "loss": 0.8522,
      "step": 1000
    },
    {
      "epoch": 9.017964071856287,
      "grad_norm": 0.8419204950332642,
      "learning_rate": 5.200113797080464e-07,
      "loss": 0.7919,
      "step": 1010
    },
    {
      "epoch": 9.107784431137725,
      "grad_norm": 0.9591242074966431,
      "learning_rate": 4.2462632162809103e-07,
      "loss": 0.857,
      "step": 1020
    },
    {
      "epoch": 9.197604790419161,
      "grad_norm": 0.4990997016429901,
      "learning_rate": 3.387099276225214e-07,
      "loss": 0.8503,
      "step": 1030
    },
    {
      "epoch": 9.2874251497006,
      "grad_norm": 0.9034556150436401,
      "learning_rate": 2.623471566183322e-07,
      "loss": 0.8465,
      "step": 1040
    },
    {
      "epoch": 9.377245508982035,
      "grad_norm": 0.8334706425666809,
      "learning_rate": 1.9561352038673264e-07,
      "loss": 0.8576,
      "step": 1050
    },
    {
      "epoch": 9.467065868263473,
      "grad_norm": 0.6150113344192505,
      "learning_rate": 1.3857500887288544e-07,
      "loss": 0.8544,
      "step": 1060
    },
    {
      "epoch": 9.55688622754491,
      "grad_norm": 0.7956252098083496,
      "learning_rate": 9.12880249413628e-08,
      "loss": 0.8488,
      "step": 1070
    },
    {
      "epoch": 9.646706586826348,
      "grad_norm": 0.755624532699585,
      "learning_rate": 5.379932860185122e-08,
      "loss": 0.8512,
      "step": 1080
    },
    {
      "epoch": 9.736526946107784,
      "grad_norm": 0.6398297548294067,
      "learning_rate": 2.6145990770238827e-08,
      "loss": 0.8559,
      "step": 1090
    },
    {
      "epoch": 9.826347305389222,
      "grad_norm": 0.9212434887886047,
      "learning_rate": 8.355356610822984e-09,
      "loss": 0.8499,
      "step": 1100
    },
    {
      "epoch": 9.91616766467066,
      "grad_norm": 0.868816614151001,
      "learning_rate": 4.4501849589040357e-10,
      "loss": 0.8618,
      "step": 1110
    },
    {
      "epoch": 9.91616766467066,
      "step": 1110,
      "total_flos": 3.0002693815874355e+17,
      "train_loss": 1.1883513411960087,
      "train_runtime": 36141.346,
      "train_samples_per_second": 0.553,
      "train_steps_per_second": 0.031
    }
  ],
  "logging_steps": 10,
  "max_steps": 1110,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 10,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 3.0002693815874355e+17,
  "train_batch_size": 3,
  "trial_name": null,
  "trial_params": null
}