|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.9987515605493134, |
|
"eval_steps": 500, |
|
"global_step": 400, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0024968789013732834, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5487, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.004993757802746567, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5726, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.00749063670411985, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4761, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.009987515605493134, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5582, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.012484394506866416, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5577, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.0149812734082397, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5281, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.017478152309612985, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5385, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.019975031210986267, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4239, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.02247191011235955, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.558, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.024968789013732832, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4404, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.02746566791510612, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5149, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.0299625468164794, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5407, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.03245942571785269, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5754, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.03495630461922597, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5222, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.03745318352059925, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5261, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.039950062421972535, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5199, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.04244694132334582, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5327, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.0449438202247191, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4925, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.04744069912609238, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5452, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.049937578027465665, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.4986, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.052434456928838954, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.5148, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.05493133583021224, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.6131, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.05742821473158552, |
|
"grad_norm": null, |
|
"learning_rate": 5e-05, |
|
"loss": 2.534, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.0599250936329588, |
|
"grad_norm": 3.498404026031494, |
|
"learning_rate": 4.9875000000000006e-05, |
|
"loss": 2.5259, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.062421972534332085, |
|
"grad_norm": 3.7281525135040283, |
|
"learning_rate": 4.975e-05, |
|
"loss": 2.5641, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.06491885143570537, |
|
"grad_norm": 3.4878628253936768, |
|
"learning_rate": 4.962500000000001e-05, |
|
"loss": 2.5001, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.06741573033707865, |
|
"grad_norm": 3.498518228530884, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 2.4923, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.06991260923845194, |
|
"grad_norm": 3.5697684288024902, |
|
"learning_rate": 4.937500000000001e-05, |
|
"loss": 2.4743, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.07240948813982521, |
|
"grad_norm": 3.58793044090271, |
|
"learning_rate": 4.9250000000000004e-05, |
|
"loss": 2.4403, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.0749063670411985, |
|
"grad_norm": 3.395378828048706, |
|
"learning_rate": 4.9125e-05, |
|
"loss": 2.4839, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.07740324594257178, |
|
"grad_norm": 3.4821157455444336, |
|
"learning_rate": 4.9e-05, |
|
"loss": 2.4865, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.07990012484394507, |
|
"grad_norm": 3.5381717681884766, |
|
"learning_rate": 4.8875e-05, |
|
"loss": 2.3919, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.08239700374531835, |
|
"grad_norm": 3.7879738807678223, |
|
"learning_rate": 4.875e-05, |
|
"loss": 2.5253, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.08489388264669163, |
|
"grad_norm": 3.9536428451538086, |
|
"learning_rate": 4.8625e-05, |
|
"loss": 2.557, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.08739076154806492, |
|
"grad_norm": 3.5281002521514893, |
|
"learning_rate": 4.85e-05, |
|
"loss": 2.419, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.0898876404494382, |
|
"grad_norm": 3.8065526485443115, |
|
"learning_rate": 4.8375000000000004e-05, |
|
"loss": 2.4839, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.09238451935081149, |
|
"grad_norm": 3.48492431640625, |
|
"learning_rate": 4.825e-05, |
|
"loss": 2.3443, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.09488139825218476, |
|
"grad_norm": 3.5769503116607666, |
|
"learning_rate": 4.8125000000000004e-05, |
|
"loss": 2.5158, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.09737827715355805, |
|
"grad_norm": 3.5777034759521484, |
|
"learning_rate": 4.8e-05, |
|
"loss": 2.403, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.09987515605493133, |
|
"grad_norm": 3.471339225769043, |
|
"learning_rate": 4.7875000000000005e-05, |
|
"loss": 2.3998, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.10237203495630462, |
|
"grad_norm": 3.6594278812408447, |
|
"learning_rate": 4.775e-05, |
|
"loss": 2.4084, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.10486891385767791, |
|
"grad_norm": 3.5339417457580566, |
|
"learning_rate": 4.7625000000000006e-05, |
|
"loss": 2.4, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.10736579275905118, |
|
"grad_norm": 3.4090983867645264, |
|
"learning_rate": 4.75e-05, |
|
"loss": 2.367, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.10986267166042447, |
|
"grad_norm": 3.769968032836914, |
|
"learning_rate": 4.7375e-05, |
|
"loss": 2.3624, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.11235955056179775, |
|
"grad_norm": 3.788815975189209, |
|
"learning_rate": 4.7249999999999997e-05, |
|
"loss": 2.4006, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.11485642946317104, |
|
"grad_norm": 3.7179384231567383, |
|
"learning_rate": 4.7125e-05, |
|
"loss": 2.376, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.11735330836454431, |
|
"grad_norm": 3.4289679527282715, |
|
"learning_rate": 4.7e-05, |
|
"loss": 2.3354, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.1198501872659176, |
|
"grad_norm": 3.5412650108337402, |
|
"learning_rate": 4.6875e-05, |
|
"loss": 2.3317, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.12234706616729088, |
|
"grad_norm": 3.5559499263763428, |
|
"learning_rate": 4.6750000000000005e-05, |
|
"loss": 2.3663, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.12484394506866417, |
|
"grad_norm": 3.6865787506103516, |
|
"learning_rate": 4.6625e-05, |
|
"loss": 2.4062, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.12734082397003746, |
|
"grad_norm": 3.417897939682007, |
|
"learning_rate": 4.6500000000000005e-05, |
|
"loss": 2.3314, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.12983770287141075, |
|
"grad_norm": 3.5215134620666504, |
|
"learning_rate": 4.6375e-05, |
|
"loss": 2.367, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.132334581772784, |
|
"grad_norm": 3.478131055831909, |
|
"learning_rate": 4.6250000000000006e-05, |
|
"loss": 2.285, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.1348314606741573, |
|
"grad_norm": 3.8263564109802246, |
|
"learning_rate": 4.6125e-05, |
|
"loss": 2.402, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.1373283395755306, |
|
"grad_norm": 3.4977402687072754, |
|
"learning_rate": 4.600000000000001e-05, |
|
"loss": 2.317, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.13982521847690388, |
|
"grad_norm": 3.5797126293182373, |
|
"learning_rate": 4.5875000000000004e-05, |
|
"loss": 2.3097, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.14232209737827714, |
|
"grad_norm": 3.6681716442108154, |
|
"learning_rate": 4.575e-05, |
|
"loss": 2.3766, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.14481897627965043, |
|
"grad_norm": 3.6064581871032715, |
|
"learning_rate": 4.5625e-05, |
|
"loss": 2.2548, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.14731585518102372, |
|
"grad_norm": 3.680893659591675, |
|
"learning_rate": 4.55e-05, |
|
"loss": 2.4249, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.149812734082397, |
|
"grad_norm": 3.5588841438293457, |
|
"learning_rate": 4.5375e-05, |
|
"loss": 2.3109, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.1523096129837703, |
|
"grad_norm": 3.4901130199432373, |
|
"learning_rate": 4.525e-05, |
|
"loss": 2.2421, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.15480649188514356, |
|
"grad_norm": 3.5400235652923584, |
|
"learning_rate": 4.5125e-05, |
|
"loss": 2.3226, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.15730337078651685, |
|
"grad_norm": 3.578420877456665, |
|
"learning_rate": 4.5e-05, |
|
"loss": 2.2904, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.15980024968789014, |
|
"grad_norm": 3.583136558532715, |
|
"learning_rate": 4.4875e-05, |
|
"loss": 2.3255, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.16229712858926343, |
|
"grad_norm": 3.35066819190979, |
|
"learning_rate": 4.4750000000000004e-05, |
|
"loss": 2.2183, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.1647940074906367, |
|
"grad_norm": 3.4735934734344482, |
|
"learning_rate": 4.4625e-05, |
|
"loss": 2.2754, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.16729088639200998, |
|
"grad_norm": 3.3307180404663086, |
|
"learning_rate": 4.4500000000000004e-05, |
|
"loss": 2.2964, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.16978776529338327, |
|
"grad_norm": 3.516148567199707, |
|
"learning_rate": 4.4375e-05, |
|
"loss": 2.3107, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.17228464419475656, |
|
"grad_norm": 3.4239909648895264, |
|
"learning_rate": 4.4250000000000005e-05, |
|
"loss": 2.2945, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.17478152309612985, |
|
"grad_norm": 3.507624626159668, |
|
"learning_rate": 4.4125e-05, |
|
"loss": 2.2859, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.1772784019975031, |
|
"grad_norm": 3.6821298599243164, |
|
"learning_rate": 4.4000000000000006e-05, |
|
"loss": 2.3248, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.1797752808988764, |
|
"grad_norm": 3.3753108978271484, |
|
"learning_rate": 4.3875e-05, |
|
"loss": 2.2001, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.1822721598002497, |
|
"grad_norm": 3.4027931690216064, |
|
"learning_rate": 4.375e-05, |
|
"loss": 2.2668, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.18476903870162298, |
|
"grad_norm": 3.524763584136963, |
|
"learning_rate": 4.3625e-05, |
|
"loss": 2.2686, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.18726591760299627, |
|
"grad_norm": 3.5769248008728027, |
|
"learning_rate": 4.35e-05, |
|
"loss": 2.2757, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.18976279650436953, |
|
"grad_norm": 3.5016684532165527, |
|
"learning_rate": 4.3375000000000004e-05, |
|
"loss": 2.2218, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.19225967540574282, |
|
"grad_norm": 3.45440411567688, |
|
"learning_rate": 4.325e-05, |
|
"loss": 2.2405, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.1947565543071161, |
|
"grad_norm": 3.5838842391967773, |
|
"learning_rate": 4.3125000000000005e-05, |
|
"loss": 2.2426, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.1972534332084894, |
|
"grad_norm": 3.716230869293213, |
|
"learning_rate": 4.3e-05, |
|
"loss": 2.2819, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.19975031210986266, |
|
"grad_norm": 3.258978843688965, |
|
"learning_rate": 4.2875000000000005e-05, |
|
"loss": 2.2049, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.20224719101123595, |
|
"grad_norm": 3.575076103210449, |
|
"learning_rate": 4.275e-05, |
|
"loss": 2.2412, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.20474406991260924, |
|
"grad_norm": 3.5371599197387695, |
|
"learning_rate": 4.2625000000000006e-05, |
|
"loss": 2.2206, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.20724094881398253, |
|
"grad_norm": 3.6981263160705566, |
|
"learning_rate": 4.25e-05, |
|
"loss": 2.266, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.20973782771535582, |
|
"grad_norm": 3.451237678527832, |
|
"learning_rate": 4.237500000000001e-05, |
|
"loss": 2.2564, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.21223470661672908, |
|
"grad_norm": 3.4362707138061523, |
|
"learning_rate": 4.2250000000000004e-05, |
|
"loss": 2.1987, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.21473158551810237, |
|
"grad_norm": 3.45649790763855, |
|
"learning_rate": 4.2125e-05, |
|
"loss": 2.2001, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.21722846441947566, |
|
"grad_norm": 3.476231575012207, |
|
"learning_rate": 4.2e-05, |
|
"loss": 2.1378, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.21972534332084895, |
|
"grad_norm": 3.5050230026245117, |
|
"learning_rate": 4.1875e-05, |
|
"loss": 2.1478, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 3.455080509185791, |
|
"learning_rate": 4.175e-05, |
|
"loss": 2.2395, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.2247191011235955, |
|
"grad_norm": 3.500718832015991, |
|
"learning_rate": 4.1625e-05, |
|
"loss": 2.1794, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.2272159800249688, |
|
"grad_norm": 3.53946852684021, |
|
"learning_rate": 4.15e-05, |
|
"loss": 2.2581, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.22971285892634208, |
|
"grad_norm": 3.506479024887085, |
|
"learning_rate": 4.1375e-05, |
|
"loss": 2.2028, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.23220973782771537, |
|
"grad_norm": 3.4707953929901123, |
|
"learning_rate": 4.125e-05, |
|
"loss": 2.1435, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.23470661672908863, |
|
"grad_norm": 3.4014856815338135, |
|
"learning_rate": 4.1125000000000004e-05, |
|
"loss": 2.2146, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.23720349563046192, |
|
"grad_norm": 3.42364764213562, |
|
"learning_rate": 4.1e-05, |
|
"loss": 2.1824, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.2397003745318352, |
|
"grad_norm": 3.480703115463257, |
|
"learning_rate": 4.0875000000000004e-05, |
|
"loss": 2.2208, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.2421972534332085, |
|
"grad_norm": 3.6021103858947754, |
|
"learning_rate": 4.075e-05, |
|
"loss": 2.1939, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.24469413233458176, |
|
"grad_norm": 3.673717498779297, |
|
"learning_rate": 4.0625000000000005e-05, |
|
"loss": 2.234, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.24719101123595505, |
|
"grad_norm": 3.511613368988037, |
|
"learning_rate": 4.05e-05, |
|
"loss": 2.2176, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.24968789013732834, |
|
"grad_norm": 3.5998849868774414, |
|
"learning_rate": 4.0375e-05, |
|
"loss": 2.2259, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.25218476903870163, |
|
"grad_norm": 3.4577770233154297, |
|
"learning_rate": 4.025e-05, |
|
"loss": 2.2238, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.2546816479400749, |
|
"grad_norm": 3.612593173980713, |
|
"learning_rate": 4.0125e-05, |
|
"loss": 2.161, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.2571785268414482, |
|
"grad_norm": 3.477313280105591, |
|
"learning_rate": 4e-05, |
|
"loss": 2.1839, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.2596754057428215, |
|
"grad_norm": 3.553819417953491, |
|
"learning_rate": 3.9875e-05, |
|
"loss": 2.2249, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.26217228464419473, |
|
"grad_norm": 3.412799596786499, |
|
"learning_rate": 3.9750000000000004e-05, |
|
"loss": 2.1834, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.264669163545568, |
|
"grad_norm": 3.4956414699554443, |
|
"learning_rate": 3.9625e-05, |
|
"loss": 2.1456, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.2671660424469413, |
|
"grad_norm": 3.6447088718414307, |
|
"learning_rate": 3.9500000000000005e-05, |
|
"loss": 2.166, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.2696629213483146, |
|
"grad_norm": 3.6700491905212402, |
|
"learning_rate": 3.9375e-05, |
|
"loss": 2.2626, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.2721598002496879, |
|
"grad_norm": 3.570895195007324, |
|
"learning_rate": 3.9250000000000005e-05, |
|
"loss": 2.1324, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.2746566791510612, |
|
"grad_norm": 3.456331968307495, |
|
"learning_rate": 3.9125e-05, |
|
"loss": 2.1244, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.27715355805243447, |
|
"grad_norm": 3.4046988487243652, |
|
"learning_rate": 3.9000000000000006e-05, |
|
"loss": 2.1525, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.27965043695380776, |
|
"grad_norm": 3.423374652862549, |
|
"learning_rate": 3.8875e-05, |
|
"loss": 2.1114, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.28214731585518105, |
|
"grad_norm": 3.3426084518432617, |
|
"learning_rate": 3.875e-05, |
|
"loss": 2.1863, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.2846441947565543, |
|
"grad_norm": 3.2898874282836914, |
|
"learning_rate": 3.8625e-05, |
|
"loss": 2.2121, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.28714107365792757, |
|
"grad_norm": 3.395939826965332, |
|
"learning_rate": 3.85e-05, |
|
"loss": 2.1401, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.28963795255930086, |
|
"grad_norm": 3.3747150897979736, |
|
"learning_rate": 3.8375e-05, |
|
"loss": 2.0959, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.29213483146067415, |
|
"grad_norm": 3.5609467029571533, |
|
"learning_rate": 3.825e-05, |
|
"loss": 2.1268, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.29463171036204744, |
|
"grad_norm": 3.6404829025268555, |
|
"learning_rate": 3.8125e-05, |
|
"loss": 2.184, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.29712858926342073, |
|
"grad_norm": 3.50461745262146, |
|
"learning_rate": 3.8e-05, |
|
"loss": 2.168, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.299625468164794, |
|
"grad_norm": 3.4648733139038086, |
|
"learning_rate": 3.7875e-05, |
|
"loss": 2.1526, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.3021223470661673, |
|
"grad_norm": 3.696610450744629, |
|
"learning_rate": 3.775e-05, |
|
"loss": 2.204, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.3046192259675406, |
|
"grad_norm": 3.4781978130340576, |
|
"learning_rate": 3.7625e-05, |
|
"loss": 2.1915, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.30711610486891383, |
|
"grad_norm": 3.490053176879883, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 2.1437, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.3096129837702871, |
|
"grad_norm": 3.452174186706543, |
|
"learning_rate": 3.737500000000001e-05, |
|
"loss": 2.1535, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.3121098626716604, |
|
"grad_norm": 3.5188889503479004, |
|
"learning_rate": 3.7250000000000004e-05, |
|
"loss": 2.1696, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.3146067415730337, |
|
"grad_norm": 3.452965021133423, |
|
"learning_rate": 3.7125e-05, |
|
"loss": 2.1077, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.317103620474407, |
|
"grad_norm": 3.4781739711761475, |
|
"learning_rate": 3.7e-05, |
|
"loss": 2.1442, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.3196004993757803, |
|
"grad_norm": 3.455982208251953, |
|
"learning_rate": 3.6875e-05, |
|
"loss": 2.2422, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.32209737827715357, |
|
"grad_norm": 3.735812187194824, |
|
"learning_rate": 3.675e-05, |
|
"loss": 2.211, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.32459425717852686, |
|
"grad_norm": 3.7122321128845215, |
|
"learning_rate": 3.6625e-05, |
|
"loss": 2.1534, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.32709113607990015, |
|
"grad_norm": 3.407907724380493, |
|
"learning_rate": 3.65e-05, |
|
"loss": 2.135, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.3295880149812734, |
|
"grad_norm": 3.610645055770874, |
|
"learning_rate": 3.6375e-05, |
|
"loss": 2.1437, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.33208489388264667, |
|
"grad_norm": 3.316847085952759, |
|
"learning_rate": 3.625e-05, |
|
"loss": 2.1594, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.33458177278401996, |
|
"grad_norm": 3.504835367202759, |
|
"learning_rate": 3.6125000000000004e-05, |
|
"loss": 2.1369, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.33707865168539325, |
|
"grad_norm": 3.4886667728424072, |
|
"learning_rate": 3.6e-05, |
|
"loss": 2.1854, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.33957553058676654, |
|
"grad_norm": 3.4116599559783936, |
|
"learning_rate": 3.5875000000000005e-05, |
|
"loss": 2.0741, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.34207240948813983, |
|
"grad_norm": 3.4653735160827637, |
|
"learning_rate": 3.575e-05, |
|
"loss": 2.1779, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.3445692883895131, |
|
"grad_norm": 3.496469497680664, |
|
"learning_rate": 3.5625000000000005e-05, |
|
"loss": 2.1337, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.3470661672908864, |
|
"grad_norm": 3.4037115573883057, |
|
"learning_rate": 3.55e-05, |
|
"loss": 2.1346, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.3495630461922597, |
|
"grad_norm": 3.6134746074676514, |
|
"learning_rate": 3.5375e-05, |
|
"loss": 2.212, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.352059925093633, |
|
"grad_norm": 3.444700002670288, |
|
"learning_rate": 3.525e-05, |
|
"loss": 2.1676, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.3545568039950062, |
|
"grad_norm": 3.404010534286499, |
|
"learning_rate": 3.5125e-05, |
|
"loss": 2.0873, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.3570536828963795, |
|
"grad_norm": 3.3055148124694824, |
|
"learning_rate": 3.5e-05, |
|
"loss": 2.1762, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.3595505617977528, |
|
"grad_norm": 3.5183520317077637, |
|
"learning_rate": 3.4875e-05, |
|
"loss": 2.0908, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.3620474406991261, |
|
"grad_norm": 3.661907911300659, |
|
"learning_rate": 3.475e-05, |
|
"loss": 2.0899, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.3645443196004994, |
|
"grad_norm": 3.574948787689209, |
|
"learning_rate": 3.4625e-05, |
|
"loss": 2.1531, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.36704119850187267, |
|
"grad_norm": 3.4745309352874756, |
|
"learning_rate": 3.45e-05, |
|
"loss": 2.1238, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.36953807740324596, |
|
"grad_norm": 3.6377320289611816, |
|
"learning_rate": 3.4375e-05, |
|
"loss": 2.0556, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.37203495630461925, |
|
"grad_norm": 3.3080575466156006, |
|
"learning_rate": 3.4250000000000006e-05, |
|
"loss": 2.1218, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.37453183520599254, |
|
"grad_norm": 3.2884228229522705, |
|
"learning_rate": 3.4125e-05, |
|
"loss": 2.0843, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.37702871410736577, |
|
"grad_norm": 3.4235734939575195, |
|
"learning_rate": 3.4000000000000007e-05, |
|
"loss": 2.1042, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.37952559300873906, |
|
"grad_norm": 3.284189462661743, |
|
"learning_rate": 3.3875000000000003e-05, |
|
"loss": 2.0937, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.38202247191011235, |
|
"grad_norm": 3.311265468597412, |
|
"learning_rate": 3.375000000000001e-05, |
|
"loss": 2.0621, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.38451935081148564, |
|
"grad_norm": 3.5135247707366943, |
|
"learning_rate": 3.3625000000000004e-05, |
|
"loss": 2.1317, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.38701622971285893, |
|
"grad_norm": 3.428800582885742, |
|
"learning_rate": 3.35e-05, |
|
"loss": 2.1447, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.3895131086142322, |
|
"grad_norm": 3.4077048301696777, |
|
"learning_rate": 3.3375e-05, |
|
"loss": 2.1757, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.3920099875156055, |
|
"grad_norm": 3.572735071182251, |
|
"learning_rate": 3.325e-05, |
|
"loss": 2.0373, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.3945068664169788, |
|
"grad_norm": 3.465317487716675, |
|
"learning_rate": 3.3125e-05, |
|
"loss": 2.0783, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.3970037453183521, |
|
"grad_norm": 3.5976178646087646, |
|
"learning_rate": 3.3e-05, |
|
"loss": 2.2142, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.3995006242197253, |
|
"grad_norm": 3.576270341873169, |
|
"learning_rate": 3.2875e-05, |
|
"loss": 2.134, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.4019975031210986, |
|
"grad_norm": 3.536389112472534, |
|
"learning_rate": 3.275e-05, |
|
"loss": 2.1068, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.4044943820224719, |
|
"grad_norm": 3.45173716545105, |
|
"learning_rate": 3.2625e-05, |
|
"loss": 2.0449, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.4069912609238452, |
|
"grad_norm": 3.498258352279663, |
|
"learning_rate": 3.2500000000000004e-05, |
|
"loss": 2.1712, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.4094881398252185, |
|
"grad_norm": 3.2850704193115234, |
|
"learning_rate": 3.2375e-05, |
|
"loss": 2.0633, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.41198501872659177, |
|
"grad_norm": 3.3497467041015625, |
|
"learning_rate": 3.2250000000000005e-05, |
|
"loss": 2.1995, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.41448189762796506, |
|
"grad_norm": 3.4341518878936768, |
|
"learning_rate": 3.2125e-05, |
|
"loss": 2.1297, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.41697877652933835, |
|
"grad_norm": 3.3831026554107666, |
|
"learning_rate": 3.2000000000000005e-05, |
|
"loss": 2.198, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.41947565543071164, |
|
"grad_norm": 3.450352191925049, |
|
"learning_rate": 3.1875e-05, |
|
"loss": 2.092, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.42197253433208487, |
|
"grad_norm": 3.3830325603485107, |
|
"learning_rate": 3.175e-05, |
|
"loss": 2.1015, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.42446941323345816, |
|
"grad_norm": 3.548405885696411, |
|
"learning_rate": 3.1624999999999996e-05, |
|
"loss": 2.0894, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.42696629213483145, |
|
"grad_norm": 3.675614595413208, |
|
"learning_rate": 3.15e-05, |
|
"loss": 2.1759, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.42946317103620474, |
|
"grad_norm": 3.520315408706665, |
|
"learning_rate": 3.1375e-05, |
|
"loss": 2.1231, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.43196004993757803, |
|
"grad_norm": 3.604353666305542, |
|
"learning_rate": 3.125e-05, |
|
"loss": 2.048, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.4344569288389513, |
|
"grad_norm": 3.609004259109497, |
|
"learning_rate": 3.1125000000000004e-05, |
|
"loss": 2.1223, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.4369538077403246, |
|
"grad_norm": 3.4398860931396484, |
|
"learning_rate": 3.1e-05, |
|
"loss": 2.0657, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.4394506866416979, |
|
"grad_norm": 3.4411044120788574, |
|
"learning_rate": 3.0875000000000005e-05, |
|
"loss": 2.0593, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.4419475655430712, |
|
"grad_norm": 3.5043869018554688, |
|
"learning_rate": 3.075e-05, |
|
"loss": 2.157, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 3.3451061248779297, |
|
"learning_rate": 3.0625000000000006e-05, |
|
"loss": 1.996, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.4469413233458177, |
|
"grad_norm": 3.6858675479888916, |
|
"learning_rate": 3.05e-05, |
|
"loss": 2.1263, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.449438202247191, |
|
"grad_norm": 3.5967984199523926, |
|
"learning_rate": 3.0375000000000003e-05, |
|
"loss": 2.0165, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.4519350811485643, |
|
"grad_norm": 3.5284011363983154, |
|
"learning_rate": 3.025e-05, |
|
"loss": 2.1286, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.4544319600499376, |
|
"grad_norm": 3.548715353012085, |
|
"learning_rate": 3.0125000000000004e-05, |
|
"loss": 2.0539, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.45692883895131087, |
|
"grad_norm": 3.52622127532959, |
|
"learning_rate": 3e-05, |
|
"loss": 2.1254, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.45942571785268416, |
|
"grad_norm": 3.3532912731170654, |
|
"learning_rate": 2.9875000000000004e-05, |
|
"loss": 2.1257, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.46192259675405745, |
|
"grad_norm": 3.4587035179138184, |
|
"learning_rate": 2.975e-05, |
|
"loss": 2.1171, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.46441947565543074, |
|
"grad_norm": 3.5046117305755615, |
|
"learning_rate": 2.9625000000000002e-05, |
|
"loss": 2.1057, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.46691635455680397, |
|
"grad_norm": 3.33771014213562, |
|
"learning_rate": 2.95e-05, |
|
"loss": 2.0668, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.46941323345817726, |
|
"grad_norm": 3.402855157852173, |
|
"learning_rate": 2.9375000000000003e-05, |
|
"loss": 2.087, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.47191011235955055, |
|
"grad_norm": 3.417536735534668, |
|
"learning_rate": 2.925e-05, |
|
"loss": 1.9994, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.47440699126092384, |
|
"grad_norm": 3.627957582473755, |
|
"learning_rate": 2.9125000000000003e-05, |
|
"loss": 2.1349, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.4769038701622971, |
|
"grad_norm": 3.5531859397888184, |
|
"learning_rate": 2.9e-05, |
|
"loss": 2.1836, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.4794007490636704, |
|
"grad_norm": 3.5431745052337646, |
|
"learning_rate": 2.8875e-05, |
|
"loss": 2.0185, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.4818976279650437, |
|
"grad_norm": 3.592146158218384, |
|
"learning_rate": 2.8749999999999997e-05, |
|
"loss": 2.0973, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.484394506866417, |
|
"grad_norm": 3.4691829681396484, |
|
"learning_rate": 2.8625e-05, |
|
"loss": 2.1199, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.4868913857677903, |
|
"grad_norm": 3.599349021911621, |
|
"learning_rate": 2.8499999999999998e-05, |
|
"loss": 2.0849, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.4893882646691635, |
|
"grad_norm": 3.5513803958892822, |
|
"learning_rate": 2.8375000000000002e-05, |
|
"loss": 2.0859, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.4918851435705368, |
|
"grad_norm": 3.4182136058807373, |
|
"learning_rate": 2.825e-05, |
|
"loss": 2.0096, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.4943820224719101, |
|
"grad_norm": 3.347532272338867, |
|
"learning_rate": 2.8125000000000003e-05, |
|
"loss": 2.1039, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.4968789013732834, |
|
"grad_norm": 3.5202083587646484, |
|
"learning_rate": 2.8000000000000003e-05, |
|
"loss": 2.0436, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.4993757802746567, |
|
"grad_norm": 3.5688531398773193, |
|
"learning_rate": 2.7875e-05, |
|
"loss": 2.0736, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.50187265917603, |
|
"grad_norm": 3.531306743621826, |
|
"learning_rate": 2.7750000000000004e-05, |
|
"loss": 2.0244, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.5043695380774033, |
|
"grad_norm": 3.595841884613037, |
|
"learning_rate": 2.7625e-05, |
|
"loss": 2.0808, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.5068664169787765, |
|
"grad_norm": 3.610381603240967, |
|
"learning_rate": 2.7500000000000004e-05, |
|
"loss": 2.0863, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.5093632958801498, |
|
"grad_norm": 3.2456705570220947, |
|
"learning_rate": 2.7375e-05, |
|
"loss": 2.1102, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.5118601747815231, |
|
"grad_norm": 3.5384883880615234, |
|
"learning_rate": 2.725e-05, |
|
"loss": 2.0491, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.5143570536828964, |
|
"grad_norm": 3.4325191974639893, |
|
"learning_rate": 2.7125000000000002e-05, |
|
"loss": 2.0562, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.5168539325842697, |
|
"grad_norm": 3.428877115249634, |
|
"learning_rate": 2.7000000000000002e-05, |
|
"loss": 2.0329, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.519350811485643, |
|
"grad_norm": 3.451172351837158, |
|
"learning_rate": 2.6875e-05, |
|
"loss": 2.1357, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.5218476903870163, |
|
"grad_norm": 3.5145444869995117, |
|
"learning_rate": 2.6750000000000003e-05, |
|
"loss": 2.133, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.5243445692883895, |
|
"grad_norm": 3.452249050140381, |
|
"learning_rate": 2.6625e-05, |
|
"loss": 2.033, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.5268414481897628, |
|
"grad_norm": 3.493157148361206, |
|
"learning_rate": 2.6500000000000004e-05, |
|
"loss": 2.0387, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.529338327091136, |
|
"grad_norm": 3.4883360862731934, |
|
"learning_rate": 2.6375e-05, |
|
"loss": 2.0182, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.5318352059925093, |
|
"grad_norm": 3.377523183822632, |
|
"learning_rate": 2.625e-05, |
|
"loss": 2.0963, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.5343320848938826, |
|
"grad_norm": 3.4430289268493652, |
|
"learning_rate": 2.6124999999999998e-05, |
|
"loss": 2.0959, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.5368289637952559, |
|
"grad_norm": 3.380902051925659, |
|
"learning_rate": 2.6000000000000002e-05, |
|
"loss": 2.0883, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.5393258426966292, |
|
"grad_norm": 3.437469005584717, |
|
"learning_rate": 2.5875e-05, |
|
"loss": 2.0548, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.5418227215980025, |
|
"grad_norm": 3.628032684326172, |
|
"learning_rate": 2.5750000000000002e-05, |
|
"loss": 2.0784, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.5443196004993758, |
|
"grad_norm": 3.2870864868164062, |
|
"learning_rate": 2.5625e-05, |
|
"loss": 2.0463, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.5468164794007491, |
|
"grad_norm": 3.5962319374084473, |
|
"learning_rate": 2.5500000000000003e-05, |
|
"loss": 2.0625, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.5493133583021224, |
|
"grad_norm": 3.3193917274475098, |
|
"learning_rate": 2.5375e-05, |
|
"loss": 2.0249, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.5518102372034956, |
|
"grad_norm": 3.4375598430633545, |
|
"learning_rate": 2.525e-05, |
|
"loss": 2.0802, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.5543071161048689, |
|
"grad_norm": 3.500824451446533, |
|
"learning_rate": 2.5124999999999997e-05, |
|
"loss": 2.0309, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.5568039950062422, |
|
"grad_norm": 3.5241281986236572, |
|
"learning_rate": 2.5e-05, |
|
"loss": 2.1013, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.5593008739076155, |
|
"grad_norm": 3.4705541133880615, |
|
"learning_rate": 2.4875e-05, |
|
"loss": 2.0883, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.5617977528089888, |
|
"grad_norm": 3.4071896076202393, |
|
"learning_rate": 2.4750000000000002e-05, |
|
"loss": 2.0965, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.5642946317103621, |
|
"grad_norm": 3.315619707107544, |
|
"learning_rate": 2.4625000000000002e-05, |
|
"loss": 2.0697, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.5667915106117354, |
|
"grad_norm": 3.280471086502075, |
|
"learning_rate": 2.45e-05, |
|
"loss": 2.1179, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.5692883895131086, |
|
"grad_norm": 3.7654521465301514, |
|
"learning_rate": 2.4375e-05, |
|
"loss": 2.0855, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.5717852684144819, |
|
"grad_norm": 3.7220211029052734, |
|
"learning_rate": 2.425e-05, |
|
"loss": 2.0319, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.5742821473158551, |
|
"grad_norm": 3.419952154159546, |
|
"learning_rate": 2.4125e-05, |
|
"loss": 2.0362, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.5767790262172284, |
|
"grad_norm": 3.46474552154541, |
|
"learning_rate": 2.4e-05, |
|
"loss": 2.0325, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.5792759051186017, |
|
"grad_norm": 3.272773027420044, |
|
"learning_rate": 2.3875e-05, |
|
"loss": 2.0735, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.581772784019975, |
|
"grad_norm": 3.659705877304077, |
|
"learning_rate": 2.375e-05, |
|
"loss": 2.1283, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.5842696629213483, |
|
"grad_norm": 3.303420305252075, |
|
"learning_rate": 2.3624999999999998e-05, |
|
"loss": 2.1808, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.5867665418227216, |
|
"grad_norm": 3.622915744781494, |
|
"learning_rate": 2.35e-05, |
|
"loss": 2.0583, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.5892634207240949, |
|
"grad_norm": 3.3635072708129883, |
|
"learning_rate": 2.3375000000000002e-05, |
|
"loss": 2.0519, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.5917602996254682, |
|
"grad_norm": 3.5549280643463135, |
|
"learning_rate": 2.3250000000000003e-05, |
|
"loss": 2.0682, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.5942571785268415, |
|
"grad_norm": 3.578655958175659, |
|
"learning_rate": 2.3125000000000003e-05, |
|
"loss": 2.077, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.5967540574282147, |
|
"grad_norm": 3.611335277557373, |
|
"learning_rate": 2.3000000000000003e-05, |
|
"loss": 1.9911, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.599250936329588, |
|
"grad_norm": 3.3821637630462646, |
|
"learning_rate": 2.2875e-05, |
|
"loss": 2.0082, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.6017478152309613, |
|
"grad_norm": 3.4852335453033447, |
|
"learning_rate": 2.275e-05, |
|
"loss": 2.0278, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.6042446941323346, |
|
"grad_norm": 3.525944948196411, |
|
"learning_rate": 2.2625e-05, |
|
"loss": 2.0914, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.6067415730337079, |
|
"grad_norm": 3.3227696418762207, |
|
"learning_rate": 2.25e-05, |
|
"loss": 2.107, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.6092384519350812, |
|
"grad_norm": 3.669677257537842, |
|
"learning_rate": 2.2375000000000002e-05, |
|
"loss": 1.9829, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.6117353308364545, |
|
"grad_norm": 3.625420570373535, |
|
"learning_rate": 2.2250000000000002e-05, |
|
"loss": 2.0123, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.6142322097378277, |
|
"grad_norm": 3.4239187240600586, |
|
"learning_rate": 2.2125000000000002e-05, |
|
"loss": 2.0562, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.616729088639201, |
|
"grad_norm": 3.5444185733795166, |
|
"learning_rate": 2.2000000000000003e-05, |
|
"loss": 2.0755, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.6192259675405742, |
|
"grad_norm": 3.500282049179077, |
|
"learning_rate": 2.1875e-05, |
|
"loss": 2.0275, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.6217228464419475, |
|
"grad_norm": 3.4052162170410156, |
|
"learning_rate": 2.175e-05, |
|
"loss": 2.0654, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.6242197253433208, |
|
"grad_norm": 3.5831384658813477, |
|
"learning_rate": 2.1625e-05, |
|
"loss": 1.994, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.6267166042446941, |
|
"grad_norm": 3.322357654571533, |
|
"learning_rate": 2.15e-05, |
|
"loss": 2.0476, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.6292134831460674, |
|
"grad_norm": 3.543062448501587, |
|
"learning_rate": 2.1375e-05, |
|
"loss": 2.0585, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.6317103620474407, |
|
"grad_norm": 3.521042823791504, |
|
"learning_rate": 2.125e-05, |
|
"loss": 2.0239, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.634207240948814, |
|
"grad_norm": 3.395559549331665, |
|
"learning_rate": 2.1125000000000002e-05, |
|
"loss": 2.0957, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.6367041198501873, |
|
"grad_norm": 3.567340135574341, |
|
"learning_rate": 2.1e-05, |
|
"loss": 2.0649, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.6392009987515606, |
|
"grad_norm": 3.397399663925171, |
|
"learning_rate": 2.0875e-05, |
|
"loss": 2.1739, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.6416978776529338, |
|
"grad_norm": 3.4432034492492676, |
|
"learning_rate": 2.075e-05, |
|
"loss": 2.0686, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.6441947565543071, |
|
"grad_norm": 3.4990758895874023, |
|
"learning_rate": 2.0625e-05, |
|
"loss": 2.1194, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.6466916354556804, |
|
"grad_norm": 3.497478723526001, |
|
"learning_rate": 2.05e-05, |
|
"loss": 2.0429, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.6491885143570537, |
|
"grad_norm": 3.4457273483276367, |
|
"learning_rate": 2.0375e-05, |
|
"loss": 2.1038, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.651685393258427, |
|
"grad_norm": 3.3803672790527344, |
|
"learning_rate": 2.025e-05, |
|
"loss": 2.1119, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.6541822721598003, |
|
"grad_norm": 3.474200963973999, |
|
"learning_rate": 2.0125e-05, |
|
"loss": 2.0346, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.6566791510611736, |
|
"grad_norm": 3.4010913372039795, |
|
"learning_rate": 2e-05, |
|
"loss": 2.0361, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.6591760299625468, |
|
"grad_norm": 3.391678810119629, |
|
"learning_rate": 1.9875000000000002e-05, |
|
"loss": 2.0216, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.66167290886392, |
|
"grad_norm": 3.484393835067749, |
|
"learning_rate": 1.9750000000000002e-05, |
|
"loss": 2.1634, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.6641697877652933, |
|
"grad_norm": 3.491408586502075, |
|
"learning_rate": 1.9625000000000003e-05, |
|
"loss": 2.0471, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 3.562328815460205, |
|
"learning_rate": 1.9500000000000003e-05, |
|
"loss": 2.0928, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.6691635455680399, |
|
"grad_norm": 3.6033616065979004, |
|
"learning_rate": 1.9375e-05, |
|
"loss": 2.0641, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.6716604244694132, |
|
"grad_norm": 3.441002130508423, |
|
"learning_rate": 1.925e-05, |
|
"loss": 2.0484, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 0.6741573033707865, |
|
"grad_norm": 3.4448623657226562, |
|
"learning_rate": 1.9125e-05, |
|
"loss": 2.0558, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 0.6766541822721598, |
|
"grad_norm": 3.481809377670288, |
|
"learning_rate": 1.9e-05, |
|
"loss": 2.0449, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 0.6791510611735331, |
|
"grad_norm": 3.2882351875305176, |
|
"learning_rate": 1.8875e-05, |
|
"loss": 1.9441, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 0.6816479400749064, |
|
"grad_norm": 3.5338022708892822, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 2.1015, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 0.6841448189762797, |
|
"grad_norm": 3.349963665008545, |
|
"learning_rate": 1.8625000000000002e-05, |
|
"loss": 2.0868, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 0.686641697877653, |
|
"grad_norm": 3.562603712081909, |
|
"learning_rate": 1.85e-05, |
|
"loss": 2.0308, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.6891385767790262, |
|
"grad_norm": 3.484811782836914, |
|
"learning_rate": 1.8375e-05, |
|
"loss": 2.0972, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 0.6916354556803995, |
|
"grad_norm": 3.4616386890411377, |
|
"learning_rate": 1.825e-05, |
|
"loss": 2.0084, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 0.6941323345817728, |
|
"grad_norm": 3.5441646575927734, |
|
"learning_rate": 1.8125e-05, |
|
"loss": 2.0395, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 0.6966292134831461, |
|
"grad_norm": 3.5133986473083496, |
|
"learning_rate": 1.8e-05, |
|
"loss": 2.0973, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 0.6991260923845194, |
|
"grad_norm": 3.6960537433624268, |
|
"learning_rate": 1.7875e-05, |
|
"loss": 1.9632, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.7016229712858927, |
|
"grad_norm": 3.499337911605835, |
|
"learning_rate": 1.775e-05, |
|
"loss": 2.0925, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 0.704119850187266, |
|
"grad_norm": 3.451720952987671, |
|
"learning_rate": 1.7625e-05, |
|
"loss": 2.0816, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 0.7066167290886392, |
|
"grad_norm": 3.540565252304077, |
|
"learning_rate": 1.75e-05, |
|
"loss": 2.0853, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 0.7091136079900124, |
|
"grad_norm": 3.5862390995025635, |
|
"learning_rate": 1.7375e-05, |
|
"loss": 1.9933, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 0.7116104868913857, |
|
"grad_norm": 3.436612844467163, |
|
"learning_rate": 1.725e-05, |
|
"loss": 2.0519, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 0.714107365792759, |
|
"grad_norm": 3.5170860290527344, |
|
"learning_rate": 1.7125000000000003e-05, |
|
"loss": 2.0087, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 0.7166042446941323, |
|
"grad_norm": 3.613799571990967, |
|
"learning_rate": 1.7000000000000003e-05, |
|
"loss": 2.0785, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 0.7191011235955056, |
|
"grad_norm": 3.4286880493164062, |
|
"learning_rate": 1.6875000000000004e-05, |
|
"loss": 2.0805, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 0.7215980024968789, |
|
"grad_norm": 3.4460251331329346, |
|
"learning_rate": 1.675e-05, |
|
"loss": 2.0529, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 0.7240948813982522, |
|
"grad_norm": 3.5601372718811035, |
|
"learning_rate": 1.6625e-05, |
|
"loss": 2.0795, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 0.7265917602996255, |
|
"grad_norm": 3.3969712257385254, |
|
"learning_rate": 1.65e-05, |
|
"loss": 2.0261, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 0.7290886392009988, |
|
"grad_norm": 3.285691738128662, |
|
"learning_rate": 1.6375e-05, |
|
"loss": 2.0902, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 0.731585518102372, |
|
"grad_norm": 3.482919454574585, |
|
"learning_rate": 1.6250000000000002e-05, |
|
"loss": 2.0239, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 0.7340823970037453, |
|
"grad_norm": 3.489551544189453, |
|
"learning_rate": 1.6125000000000002e-05, |
|
"loss": 2.0235, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 0.7365792759051186, |
|
"grad_norm": 3.461838960647583, |
|
"learning_rate": 1.6000000000000003e-05, |
|
"loss": 2.0562, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 0.7390761548064919, |
|
"grad_norm": 3.3764636516571045, |
|
"learning_rate": 1.5875e-05, |
|
"loss": 2.0649, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 0.7415730337078652, |
|
"grad_norm": 3.4460861682891846, |
|
"learning_rate": 1.575e-05, |
|
"loss": 2.0674, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 0.7440699126092385, |
|
"grad_norm": 3.5665292739868164, |
|
"learning_rate": 1.5625e-05, |
|
"loss": 2.0219, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 0.7465667915106118, |
|
"grad_norm": 3.505133867263794, |
|
"learning_rate": 1.55e-05, |
|
"loss": 2.0278, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 0.7490636704119851, |
|
"grad_norm": 3.615962266921997, |
|
"learning_rate": 1.5375e-05, |
|
"loss": 2.0006, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7515605493133583, |
|
"grad_norm": 3.462989330291748, |
|
"learning_rate": 1.525e-05, |
|
"loss": 2.0423, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 0.7540574282147315, |
|
"grad_norm": 3.351733446121216, |
|
"learning_rate": 1.5125e-05, |
|
"loss": 2.0353, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 0.7565543071161048, |
|
"grad_norm": 3.3703553676605225, |
|
"learning_rate": 1.5e-05, |
|
"loss": 2.0352, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 0.7590511860174781, |
|
"grad_norm": 3.578253984451294, |
|
"learning_rate": 1.4875e-05, |
|
"loss": 2.027, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 0.7615480649188514, |
|
"grad_norm": 3.304739236831665, |
|
"learning_rate": 1.475e-05, |
|
"loss": 2.0538, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 0.7640449438202247, |
|
"grad_norm": 3.5209975242614746, |
|
"learning_rate": 1.4625e-05, |
|
"loss": 1.9996, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 0.766541822721598, |
|
"grad_norm": 3.5905096530914307, |
|
"learning_rate": 1.45e-05, |
|
"loss": 2.0141, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 0.7690387016229713, |
|
"grad_norm": 3.383382558822632, |
|
"learning_rate": 1.4374999999999999e-05, |
|
"loss": 1.9648, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 0.7715355805243446, |
|
"grad_norm": 3.4596011638641357, |
|
"learning_rate": 1.4249999999999999e-05, |
|
"loss": 2.0336, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 0.7740324594257179, |
|
"grad_norm": 3.4215636253356934, |
|
"learning_rate": 1.4125e-05, |
|
"loss": 2.0386, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 0.7765293383270911, |
|
"grad_norm": 3.341522693634033, |
|
"learning_rate": 1.4000000000000001e-05, |
|
"loss": 2.1078, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 0.7790262172284644, |
|
"grad_norm": 3.366393566131592, |
|
"learning_rate": 1.3875000000000002e-05, |
|
"loss": 2.079, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 0.7815230961298377, |
|
"grad_norm": 3.497201919555664, |
|
"learning_rate": 1.3750000000000002e-05, |
|
"loss": 2.0535, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 0.784019975031211, |
|
"grad_norm": 3.563197135925293, |
|
"learning_rate": 1.3625e-05, |
|
"loss": 2.0283, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 0.7865168539325843, |
|
"grad_norm": 3.4403908252716064, |
|
"learning_rate": 1.3500000000000001e-05, |
|
"loss": 2.0832, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 0.7890137328339576, |
|
"grad_norm": 3.4520885944366455, |
|
"learning_rate": 1.3375000000000002e-05, |
|
"loss": 2.0146, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 0.7915106117353309, |
|
"grad_norm": 3.5180275440216064, |
|
"learning_rate": 1.3250000000000002e-05, |
|
"loss": 2.004, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 0.7940074906367042, |
|
"grad_norm": 3.153761863708496, |
|
"learning_rate": 1.3125e-05, |
|
"loss": 2.0935, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 0.7965043695380774, |
|
"grad_norm": 3.468968629837036, |
|
"learning_rate": 1.3000000000000001e-05, |
|
"loss": 2.066, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 0.7990012484394506, |
|
"grad_norm": 3.5003297328948975, |
|
"learning_rate": 1.2875000000000001e-05, |
|
"loss": 1.9856, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.8014981273408239, |
|
"grad_norm": 3.5374064445495605, |
|
"learning_rate": 1.2750000000000002e-05, |
|
"loss": 2.0671, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 0.8039950062421972, |
|
"grad_norm": 3.659308433532715, |
|
"learning_rate": 1.2625e-05, |
|
"loss": 1.9864, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 0.8064918851435705, |
|
"grad_norm": 3.5156655311584473, |
|
"learning_rate": 1.25e-05, |
|
"loss": 1.9394, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 0.8089887640449438, |
|
"grad_norm": 3.326158285140991, |
|
"learning_rate": 1.2375000000000001e-05, |
|
"loss": 1.961, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 0.8114856429463171, |
|
"grad_norm": 3.4294004440307617, |
|
"learning_rate": 1.225e-05, |
|
"loss": 2.0097, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.8139825218476904, |
|
"grad_norm": 3.4218974113464355, |
|
"learning_rate": 1.2125e-05, |
|
"loss": 2.0647, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 0.8164794007490637, |
|
"grad_norm": 3.438875198364258, |
|
"learning_rate": 1.2e-05, |
|
"loss": 2.0179, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 0.818976279650437, |
|
"grad_norm": 3.523358106613159, |
|
"learning_rate": 1.1875e-05, |
|
"loss": 2.0494, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 0.8214731585518102, |
|
"grad_norm": 3.506168842315674, |
|
"learning_rate": 1.175e-05, |
|
"loss": 2.1027, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 0.8239700374531835, |
|
"grad_norm": 3.548112154006958, |
|
"learning_rate": 1.1625000000000001e-05, |
|
"loss": 2.0389, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 0.8264669163545568, |
|
"grad_norm": 3.5158162117004395, |
|
"learning_rate": 1.1500000000000002e-05, |
|
"loss": 1.978, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 0.8289637952559301, |
|
"grad_norm": 3.4753963947296143, |
|
"learning_rate": 1.1375e-05, |
|
"loss": 2.0421, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 0.8314606741573034, |
|
"grad_norm": 3.5032706260681152, |
|
"learning_rate": 1.125e-05, |
|
"loss": 1.9888, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 0.8339575530586767, |
|
"grad_norm": 3.2991061210632324, |
|
"learning_rate": 1.1125000000000001e-05, |
|
"loss": 2.0151, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 0.83645443196005, |
|
"grad_norm": 3.425863027572632, |
|
"learning_rate": 1.1000000000000001e-05, |
|
"loss": 2.0503, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 0.8389513108614233, |
|
"grad_norm": 3.469403028488159, |
|
"learning_rate": 1.0875e-05, |
|
"loss": 1.9425, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 0.8414481897627965, |
|
"grad_norm": 3.3070104122161865, |
|
"learning_rate": 1.075e-05, |
|
"loss": 2.0603, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 0.8439450686641697, |
|
"grad_norm": 3.418203115463257, |
|
"learning_rate": 1.0625e-05, |
|
"loss": 2.0018, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 0.846441947565543, |
|
"grad_norm": 3.5066540241241455, |
|
"learning_rate": 1.05e-05, |
|
"loss": 2.0122, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 0.8489388264669163, |
|
"grad_norm": 3.4425551891326904, |
|
"learning_rate": 1.0375e-05, |
|
"loss": 2.0225, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.8514357053682896, |
|
"grad_norm": 3.343202829360962, |
|
"learning_rate": 1.025e-05, |
|
"loss": 2.0755, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 0.8539325842696629, |
|
"grad_norm": 3.28155255317688, |
|
"learning_rate": 1.0125e-05, |
|
"loss": 1.9847, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 0.8564294631710362, |
|
"grad_norm": 3.3546323776245117, |
|
"learning_rate": 1e-05, |
|
"loss": 1.991, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 0.8589263420724095, |
|
"grad_norm": 3.420783758163452, |
|
"learning_rate": 9.875000000000001e-06, |
|
"loss": 2.0471, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 0.8614232209737828, |
|
"grad_norm": 3.347703456878662, |
|
"learning_rate": 9.750000000000002e-06, |
|
"loss": 2.0444, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 0.8639200998751561, |
|
"grad_norm": 3.4276270866394043, |
|
"learning_rate": 9.625e-06, |
|
"loss": 2.0548, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 0.8664169787765293, |
|
"grad_norm": 3.3684396743774414, |
|
"learning_rate": 9.5e-06, |
|
"loss": 2.0664, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 0.8689138576779026, |
|
"grad_norm": 3.5308725833892822, |
|
"learning_rate": 9.375000000000001e-06, |
|
"loss": 2.0263, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 0.8714107365792759, |
|
"grad_norm": 3.5037009716033936, |
|
"learning_rate": 9.25e-06, |
|
"loss": 2.0629, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 0.8739076154806492, |
|
"grad_norm": 3.3732502460479736, |
|
"learning_rate": 9.125e-06, |
|
"loss": 2.0872, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.8764044943820225, |
|
"grad_norm": 3.379492998123169, |
|
"learning_rate": 9e-06, |
|
"loss": 2.0494, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 0.8789013732833958, |
|
"grad_norm": 3.356729030609131, |
|
"learning_rate": 8.875e-06, |
|
"loss": 2.0578, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 0.8813982521847691, |
|
"grad_norm": 3.505227565765381, |
|
"learning_rate": 8.75e-06, |
|
"loss": 2.0547, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 0.8838951310861424, |
|
"grad_norm": 3.325188159942627, |
|
"learning_rate": 8.625e-06, |
|
"loss": 2.0479, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 0.8863920099875156, |
|
"grad_norm": 3.220914363861084, |
|
"learning_rate": 8.500000000000002e-06, |
|
"loss": 2.1145, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 3.2536563873291016, |
|
"learning_rate": 8.375e-06, |
|
"loss": 2.1226, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 0.8913857677902621, |
|
"grad_norm": 3.4424164295196533, |
|
"learning_rate": 8.25e-06, |
|
"loss": 1.9732, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 0.8938826466916354, |
|
"grad_norm": 3.210689067840576, |
|
"learning_rate": 8.125000000000001e-06, |
|
"loss": 2.0682, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 0.8963795255930087, |
|
"grad_norm": 3.6276254653930664, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.9996, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 0.898876404494382, |
|
"grad_norm": 3.284266471862793, |
|
"learning_rate": 7.875e-06, |
|
"loss": 1.9988, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.9013732833957553, |
|
"grad_norm": 3.3102824687957764, |
|
"learning_rate": 7.75e-06, |
|
"loss": 2.0986, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 0.9038701622971286, |
|
"grad_norm": 3.4718074798583984, |
|
"learning_rate": 7.625e-06, |
|
"loss": 2.0236, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 0.9063670411985019, |
|
"grad_norm": 3.356231689453125, |
|
"learning_rate": 7.5e-06, |
|
"loss": 1.9603, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 0.9088639200998752, |
|
"grad_norm": 3.488931655883789, |
|
"learning_rate": 7.375e-06, |
|
"loss": 2.0287, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 0.9113607990012484, |
|
"grad_norm": 3.370278835296631, |
|
"learning_rate": 7.25e-06, |
|
"loss": 2.055, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 0.9138576779026217, |
|
"grad_norm": 3.357985019683838, |
|
"learning_rate": 7.1249999999999995e-06, |
|
"loss": 2.0225, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 0.916354556803995, |
|
"grad_norm": 3.3029778003692627, |
|
"learning_rate": 7.000000000000001e-06, |
|
"loss": 1.9724, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 0.9188514357053683, |
|
"grad_norm": 3.6066527366638184, |
|
"learning_rate": 6.875000000000001e-06, |
|
"loss": 2.0172, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 0.9213483146067416, |
|
"grad_norm": 3.4643936157226562, |
|
"learning_rate": 6.750000000000001e-06, |
|
"loss": 2.0598, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 0.9238451935081149, |
|
"grad_norm": 3.2209830284118652, |
|
"learning_rate": 6.625000000000001e-06, |
|
"loss": 2.0395, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 0.9263420724094882, |
|
"grad_norm": 3.308652639389038, |
|
"learning_rate": 6.5000000000000004e-06, |
|
"loss": 2.0576, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 0.9288389513108615, |
|
"grad_norm": 3.378284215927124, |
|
"learning_rate": 6.375000000000001e-06, |
|
"loss": 2.1037, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 0.9313358302122348, |
|
"grad_norm": 3.408998489379883, |
|
"learning_rate": 6.25e-06, |
|
"loss": 2.0508, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 0.9338327091136079, |
|
"grad_norm": 3.4758336544036865, |
|
"learning_rate": 6.125e-06, |
|
"loss": 2.0642, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 0.9363295880149812, |
|
"grad_norm": 3.3644943237304688, |
|
"learning_rate": 6e-06, |
|
"loss": 2.104, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.9388264669163545, |
|
"grad_norm": 3.3621973991394043, |
|
"learning_rate": 5.875e-06, |
|
"loss": 2.05, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 0.9413233458177278, |
|
"grad_norm": 3.3936052322387695, |
|
"learning_rate": 5.750000000000001e-06, |
|
"loss": 1.9648, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 0.9438202247191011, |
|
"grad_norm": 3.28177809715271, |
|
"learning_rate": 5.625e-06, |
|
"loss": 2.0129, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 0.9463171036204744, |
|
"grad_norm": 3.1901121139526367, |
|
"learning_rate": 5.500000000000001e-06, |
|
"loss": 2.0564, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 0.9488139825218477, |
|
"grad_norm": 3.4150338172912598, |
|
"learning_rate": 5.375e-06, |
|
"loss": 2.0344, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.951310861423221, |
|
"grad_norm": 3.3532180786132812, |
|
"learning_rate": 5.25e-06, |
|
"loss": 2.012, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 0.9538077403245943, |
|
"grad_norm": 3.434931993484497, |
|
"learning_rate": 5.125e-06, |
|
"loss": 2.073, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 0.9563046192259675, |
|
"grad_norm": 3.34334135055542, |
|
"learning_rate": 5e-06, |
|
"loss": 2.0716, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 0.9588014981273408, |
|
"grad_norm": 3.3420920372009277, |
|
"learning_rate": 4.875000000000001e-06, |
|
"loss": 2.0691, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 0.9612983770287141, |
|
"grad_norm": 3.648798942565918, |
|
"learning_rate": 4.75e-06, |
|
"loss": 2.0198, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 0.9637952559300874, |
|
"grad_norm": 3.2924463748931885, |
|
"learning_rate": 4.625e-06, |
|
"loss": 2.015, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 0.9662921348314607, |
|
"grad_norm": 3.2124977111816406, |
|
"learning_rate": 4.5e-06, |
|
"loss": 2.1013, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 0.968789013732834, |
|
"grad_norm": 3.4171626567840576, |
|
"learning_rate": 4.375e-06, |
|
"loss": 2.0419, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 0.9712858926342073, |
|
"grad_norm": 3.4217112064361572, |
|
"learning_rate": 4.250000000000001e-06, |
|
"loss": 2.0808, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 0.9737827715355806, |
|
"grad_norm": 3.281397581100464, |
|
"learning_rate": 4.125e-06, |
|
"loss": 2.0552, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 0.9762796504369539, |
|
"grad_norm": 3.3025379180908203, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 2.0196, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 0.978776529338327, |
|
"grad_norm": 3.261284112930298, |
|
"learning_rate": 3.875e-06, |
|
"loss": 1.903, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 0.9812734082397003, |
|
"grad_norm": 3.3036253452301025, |
|
"learning_rate": 3.75e-06, |
|
"loss": 2.0455, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 0.9837702871410736, |
|
"grad_norm": 3.4235925674438477, |
|
"learning_rate": 3.625e-06, |
|
"loss": 2.0393, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 0.9862671660424469, |
|
"grad_norm": 3.347456455230713, |
|
"learning_rate": 3.5000000000000004e-06, |
|
"loss": 2.0227, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 0.9887640449438202, |
|
"grad_norm": 3.3116769790649414, |
|
"learning_rate": 3.3750000000000003e-06, |
|
"loss": 2.0891, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 0.9912609238451935, |
|
"grad_norm": 3.2966830730438232, |
|
"learning_rate": 3.2500000000000002e-06, |
|
"loss": 2.0784, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 0.9937578027465668, |
|
"grad_norm": 3.4097273349761963, |
|
"learning_rate": 3.125e-06, |
|
"loss": 2.0751, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 0.9962546816479401, |
|
"grad_norm": 3.3749406337738037, |
|
"learning_rate": 3e-06, |
|
"loss": 2.0916, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 0.9987515605493134, |
|
"grad_norm": 3.322603225708008, |
|
"learning_rate": 2.8750000000000004e-06, |
|
"loss": 2.0949, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9987515605493134, |
|
"step": 400, |
|
"total_flos": 2.847507751108608e+17, |
|
"train_loss": 2.1486885741353037, |
|
"train_runtime": 92894.5901, |
|
"train_samples_per_second": 0.241, |
|
"train_steps_per_second": 0.004 |
|
} |
|
], |
|
"logging_steps": 1.0, |
|
"max_steps": 400, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 400, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.847507751108608e+17, |
|
"train_batch_size": 14, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|