|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.7777777777777777, |
|
"eval_steps": 500, |
|
"global_step": 200, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.008888888888888889, |
|
"grad_norm": 0.5841383934020996, |
|
"learning_rate": 8.928571428571428e-07, |
|
"loss": 0.4865, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.017777777777777778, |
|
"grad_norm": 0.37317243218421936, |
|
"learning_rate": 1.7857142857142857e-06, |
|
"loss": 0.4116, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.02666666666666667, |
|
"grad_norm": 0.16240684688091278, |
|
"learning_rate": 2.6785714285714285e-06, |
|
"loss": 0.3085, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.035555555555555556, |
|
"grad_norm": 0.22618648409843445, |
|
"learning_rate": 3.5714285714285714e-06, |
|
"loss": 0.3213, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 0.33735740184783936, |
|
"learning_rate": 4.464285714285715e-06, |
|
"loss": 0.2869, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.05333333333333334, |
|
"grad_norm": 0.20259538292884827, |
|
"learning_rate": 5.357142857142857e-06, |
|
"loss": 0.3825, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.06222222222222222, |
|
"grad_norm": 0.3431294560432434, |
|
"learning_rate": 6.25e-06, |
|
"loss": 0.3896, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.07111111111111111, |
|
"grad_norm": 0.5504677891731262, |
|
"learning_rate": 7.142857142857143e-06, |
|
"loss": 0.3175, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 0.1282545030117035, |
|
"learning_rate": 8.035714285714286e-06, |
|
"loss": 0.2901, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 0.18529950082302094, |
|
"learning_rate": 8.92857142857143e-06, |
|
"loss": 0.3735, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.09777777777777778, |
|
"grad_norm": 0.2145024836063385, |
|
"learning_rate": 9.821428571428573e-06, |
|
"loss": 0.3525, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.10666666666666667, |
|
"grad_norm": 0.9437930583953857, |
|
"learning_rate": 1.0714285714285714e-05, |
|
"loss": 0.444, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.11555555555555555, |
|
"grad_norm": 0.24417199194431305, |
|
"learning_rate": 1.1607142857142857e-05, |
|
"loss": 0.2967, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.12444444444444444, |
|
"grad_norm": 0.2694835662841797, |
|
"learning_rate": 1.25e-05, |
|
"loss": 0.2967, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.20757225155830383, |
|
"learning_rate": 1.3392857142857144e-05, |
|
"loss": 0.3487, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.14222222222222222, |
|
"grad_norm": 0.2921523153781891, |
|
"learning_rate": 1.4285714285714285e-05, |
|
"loss": 0.4247, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.1511111111111111, |
|
"grad_norm": 0.20908255875110626, |
|
"learning_rate": 1.5178571428571429e-05, |
|
"loss": 0.3009, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 0.15087872743606567, |
|
"learning_rate": 1.6071428571428572e-05, |
|
"loss": 0.3145, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.1688888888888889, |
|
"grad_norm": 0.8311613202095032, |
|
"learning_rate": 1.6964285714285715e-05, |
|
"loss": 0.3868, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.27965763211250305, |
|
"learning_rate": 1.785714285714286e-05, |
|
"loss": 0.351, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.18666666666666668, |
|
"grad_norm": 0.23372279107570648, |
|
"learning_rate": 1.8750000000000002e-05, |
|
"loss": 0.3025, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.19555555555555557, |
|
"grad_norm": 0.3673437237739563, |
|
"learning_rate": 1.9642857142857145e-05, |
|
"loss": 0.388, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.20444444444444446, |
|
"grad_norm": 0.39919161796569824, |
|
"learning_rate": 2.0535714285714285e-05, |
|
"loss": 0.3611, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.21333333333333335, |
|
"grad_norm": 0.20096033811569214, |
|
"learning_rate": 2.1428571428571428e-05, |
|
"loss": 0.3244, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.27874326705932617, |
|
"learning_rate": 2.2321428571428575e-05, |
|
"loss": 0.359, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2311111111111111, |
|
"grad_norm": 0.31668320298194885, |
|
"learning_rate": 2.3214285714285715e-05, |
|
"loss": 0.3062, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 0.4131496548652649, |
|
"learning_rate": 2.4107142857142858e-05, |
|
"loss": 0.4351, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.24888888888888888, |
|
"grad_norm": 0.4470804035663605, |
|
"learning_rate": 2.5e-05, |
|
"loss": 0.4332, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.2577777777777778, |
|
"grad_norm": 0.3746657967567444, |
|
"learning_rate": 2.5892857142857148e-05, |
|
"loss": 0.3811, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.8335658311843872, |
|
"learning_rate": 2.6785714285714288e-05, |
|
"loss": 0.4618, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.27555555555555555, |
|
"grad_norm": 0.8326655626296997, |
|
"learning_rate": 2.767857142857143e-05, |
|
"loss": 0.3872, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.28444444444444444, |
|
"grad_norm": 0.7701701521873474, |
|
"learning_rate": 2.857142857142857e-05, |
|
"loss": 0.3255, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.29333333333333333, |
|
"grad_norm": 0.40850526094436646, |
|
"learning_rate": 2.9464285714285718e-05, |
|
"loss": 0.3167, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.3022222222222222, |
|
"grad_norm": 0.35397693514823914, |
|
"learning_rate": 3.0357142857142857e-05, |
|
"loss": 0.2987, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 0.7541699409484863, |
|
"learning_rate": 3.125e-05, |
|
"loss": 0.3413, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.32, |
|
"grad_norm": 0.3483797609806061, |
|
"learning_rate": 3.2142857142857144e-05, |
|
"loss": 0.2858, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.3288888888888889, |
|
"grad_norm": 0.33964604139328003, |
|
"learning_rate": 3.303571428571429e-05, |
|
"loss": 0.3036, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.3377777777777778, |
|
"grad_norm": 0.45213809609413147, |
|
"learning_rate": 3.392857142857143e-05, |
|
"loss": 0.3437, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.3466666666666667, |
|
"grad_norm": 0.4597315192222595, |
|
"learning_rate": 3.4821428571428574e-05, |
|
"loss": 0.3318, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.44681742787361145, |
|
"learning_rate": 3.571428571428572e-05, |
|
"loss": 0.3107, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.36444444444444446, |
|
"grad_norm": 0.9226369857788086, |
|
"learning_rate": 3.6607142857142853e-05, |
|
"loss": 0.3231, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.37333333333333335, |
|
"grad_norm": 0.27465879917144775, |
|
"learning_rate": 3.7500000000000003e-05, |
|
"loss": 0.3301, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.38222222222222224, |
|
"grad_norm": 0.4791021943092346, |
|
"learning_rate": 3.839285714285715e-05, |
|
"loss": 0.3527, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.39111111111111113, |
|
"grad_norm": 0.586669385433197, |
|
"learning_rate": 3.928571428571429e-05, |
|
"loss": 0.3359, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.34871557354927063, |
|
"learning_rate": 4.017857142857143e-05, |
|
"loss": 0.3268, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.4088888888888889, |
|
"grad_norm": 0.4256209135055542, |
|
"learning_rate": 4.107142857142857e-05, |
|
"loss": 0.2978, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.4177777777777778, |
|
"grad_norm": 0.30336323380470276, |
|
"learning_rate": 4.196428571428572e-05, |
|
"loss": 0.2898, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.4266666666666667, |
|
"grad_norm": 0.5795422792434692, |
|
"learning_rate": 4.2857142857142856e-05, |
|
"loss": 0.3481, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.43555555555555553, |
|
"grad_norm": 0.38410332798957825, |
|
"learning_rate": 4.375e-05, |
|
"loss": 0.3414, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.3348947763442993, |
|
"learning_rate": 4.464285714285715e-05, |
|
"loss": 0.336, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4533333333333333, |
|
"grad_norm": 0.42829787731170654, |
|
"learning_rate": 4.5535714285714286e-05, |
|
"loss": 0.2913, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.4622222222222222, |
|
"grad_norm": 4.1896653175354, |
|
"learning_rate": 4.642857142857143e-05, |
|
"loss": 0.8937, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.4711111111111111, |
|
"grad_norm": 0.7059090733528137, |
|
"learning_rate": 4.732142857142857e-05, |
|
"loss": 0.4011, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.48, |
|
"grad_norm": 0.41015371680259705, |
|
"learning_rate": 4.8214285714285716e-05, |
|
"loss": 0.3351, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 0.43993642926216125, |
|
"learning_rate": 4.910714285714286e-05, |
|
"loss": 0.3313, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.49777777777777776, |
|
"grad_norm": 1.0636693239212036, |
|
"learning_rate": 5e-05, |
|
"loss": 0.4636, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.5066666666666667, |
|
"grad_norm": 1.5080366134643555, |
|
"learning_rate": 5.089285714285714e-05, |
|
"loss": 0.5489, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.5155555555555555, |
|
"grad_norm": 0.5345658659934998, |
|
"learning_rate": 5.1785714285714296e-05, |
|
"loss": 0.3057, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.5244444444444445, |
|
"grad_norm": 0.7976881861686707, |
|
"learning_rate": 5.267857142857143e-05, |
|
"loss": 0.3581, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.726458728313446, |
|
"learning_rate": 5.3571428571428575e-05, |
|
"loss": 0.3279, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5422222222222223, |
|
"grad_norm": 0.5178459286689758, |
|
"learning_rate": 5.446428571428571e-05, |
|
"loss": 0.3217, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.5511111111111111, |
|
"grad_norm": 0.6377764940261841, |
|
"learning_rate": 5.535714285714286e-05, |
|
"loss": 0.321, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.56, |
|
"grad_norm": 0.3174062967300415, |
|
"learning_rate": 5.6250000000000005e-05, |
|
"loss": 0.3154, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.5688888888888889, |
|
"grad_norm": 0.31623443961143494, |
|
"learning_rate": 5.714285714285714e-05, |
|
"loss": 0.2735, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 0.3521466851234436, |
|
"learning_rate": 5.803571428571429e-05, |
|
"loss": 0.2878, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.5866666666666667, |
|
"grad_norm": 0.8923875689506531, |
|
"learning_rate": 5.8928571428571435e-05, |
|
"loss": 0.3906, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.5955555555555555, |
|
"grad_norm": 0.6803852915763855, |
|
"learning_rate": 5.982142857142857e-05, |
|
"loss": 0.3573, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.6044444444444445, |
|
"grad_norm": 0.5237946510314941, |
|
"learning_rate": 6.0714285714285715e-05, |
|
"loss": 0.3068, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.6133333333333333, |
|
"grad_norm": 0.6161757111549377, |
|
"learning_rate": 6.160714285714286e-05, |
|
"loss": 0.2944, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 0.3687132000923157, |
|
"learning_rate": 6.25e-05, |
|
"loss": 0.2901, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6311111111111111, |
|
"grad_norm": 0.39233818650245667, |
|
"learning_rate": 6.339285714285714e-05, |
|
"loss": 0.2821, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.64, |
|
"grad_norm": 0.7180721759796143, |
|
"learning_rate": 6.428571428571429e-05, |
|
"loss": 0.3509, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.6488888888888888, |
|
"grad_norm": 0.9132435917854309, |
|
"learning_rate": 6.517857142857143e-05, |
|
"loss": 0.4027, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.6577777777777778, |
|
"grad_norm": 0.2931051552295685, |
|
"learning_rate": 6.607142857142857e-05, |
|
"loss": 0.3303, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.4739736318588257, |
|
"learning_rate": 6.696428571428572e-05, |
|
"loss": 0.286, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.6755555555555556, |
|
"grad_norm": 0.645233154296875, |
|
"learning_rate": 6.785714285714286e-05, |
|
"loss": 0.3673, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.6844444444444444, |
|
"grad_norm": 0.6568748354911804, |
|
"learning_rate": 6.875e-05, |
|
"loss": 0.3451, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.6933333333333334, |
|
"grad_norm": 0.330121785402298, |
|
"learning_rate": 6.964285714285715e-05, |
|
"loss": 0.2916, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.7022222222222222, |
|
"grad_norm": 0.6969891786575317, |
|
"learning_rate": 7.053571428571429e-05, |
|
"loss": 0.3799, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.3836056590080261, |
|
"learning_rate": 7.142857142857143e-05, |
|
"loss": 0.2785, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.72, |
|
"grad_norm": 0.6308532357215881, |
|
"learning_rate": 7.232142857142858e-05, |
|
"loss": 0.3365, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.7288888888888889, |
|
"grad_norm": 1.3300080299377441, |
|
"learning_rate": 7.321428571428571e-05, |
|
"loss": 0.4452, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.7377777777777778, |
|
"grad_norm": 0.4857744872570038, |
|
"learning_rate": 7.410714285714286e-05, |
|
"loss": 0.3408, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.7466666666666667, |
|
"grad_norm": 0.2752129137516022, |
|
"learning_rate": 7.500000000000001e-05, |
|
"loss": 0.2481, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 0.37411218881607056, |
|
"learning_rate": 7.589285714285714e-05, |
|
"loss": 0.2714, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.7644444444444445, |
|
"grad_norm": 0.34179869294166565, |
|
"learning_rate": 7.67857142857143e-05, |
|
"loss": 0.322, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.7733333333333333, |
|
"grad_norm": 0.5072541236877441, |
|
"learning_rate": 7.767857142857144e-05, |
|
"loss": 0.3442, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.7822222222222223, |
|
"grad_norm": 0.3834559917449951, |
|
"learning_rate": 7.857142857142858e-05, |
|
"loss": 0.2904, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.7911111111111111, |
|
"grad_norm": 0.37922006845474243, |
|
"learning_rate": 7.946428571428571e-05, |
|
"loss": 0.3116, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.41435790061950684, |
|
"learning_rate": 8.035714285714287e-05, |
|
"loss": 0.2949, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8088888888888889, |
|
"grad_norm": 0.5537578463554382, |
|
"learning_rate": 8.125000000000001e-05, |
|
"loss": 0.3337, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.8177777777777778, |
|
"grad_norm": 0.24957779049873352, |
|
"learning_rate": 8.214285714285714e-05, |
|
"loss": 0.2709, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.8266666666666667, |
|
"grad_norm": 0.43642184138298035, |
|
"learning_rate": 8.30357142857143e-05, |
|
"loss": 0.3033, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.8355555555555556, |
|
"grad_norm": 0.835472583770752, |
|
"learning_rate": 8.392857142857144e-05, |
|
"loss": 0.3828, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 0.6168670654296875, |
|
"learning_rate": 8.482142857142857e-05, |
|
"loss": 0.3455, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.8533333333333334, |
|
"grad_norm": 0.46005958318710327, |
|
"learning_rate": 8.571428571428571e-05, |
|
"loss": 0.3489, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.8622222222222222, |
|
"grad_norm": 0.34383633732795715, |
|
"learning_rate": 8.660714285714287e-05, |
|
"loss": 0.2866, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.8711111111111111, |
|
"grad_norm": 0.4366074204444885, |
|
"learning_rate": 8.75e-05, |
|
"loss": 0.3239, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.33174213767051697, |
|
"learning_rate": 8.839285714285714e-05, |
|
"loss": 0.2783, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.26910072565078735, |
|
"learning_rate": 8.92857142857143e-05, |
|
"loss": 0.2807, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.8977777777777778, |
|
"grad_norm": 0.9884425401687622, |
|
"learning_rate": 9.017857142857143e-05, |
|
"loss": 0.4379, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.9066666666666666, |
|
"grad_norm": 0.24256423115730286, |
|
"learning_rate": 9.107142857142857e-05, |
|
"loss": 0.231, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.9155555555555556, |
|
"grad_norm": 0.36811211705207825, |
|
"learning_rate": 9.196428571428572e-05, |
|
"loss": 0.2668, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.9244444444444444, |
|
"grad_norm": 0.45584559440612793, |
|
"learning_rate": 9.285714285714286e-05, |
|
"loss": 0.3285, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.4034405052661896, |
|
"learning_rate": 9.375e-05, |
|
"loss": 0.2823, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.9422222222222222, |
|
"grad_norm": 0.48256492614746094, |
|
"learning_rate": 9.464285714285715e-05, |
|
"loss": 0.3389, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.9511111111111111, |
|
"grad_norm": 0.4109070897102356, |
|
"learning_rate": 9.553571428571429e-05, |
|
"loss": 0.3515, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.8162788152694702, |
|
"learning_rate": 9.642857142857143e-05, |
|
"loss": 0.5591, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.9688888888888889, |
|
"grad_norm": 0.33671724796295166, |
|
"learning_rate": 9.732142857142858e-05, |
|
"loss": 0.3512, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 0.35684165358543396, |
|
"learning_rate": 9.821428571428572e-05, |
|
"loss": 0.3087, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.9866666666666667, |
|
"grad_norm": 0.34859076142311096, |
|
"learning_rate": 9.910714285714286e-05, |
|
"loss": 0.2894, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.9955555555555555, |
|
"grad_norm": 0.3526354432106018, |
|
"learning_rate": 0.0001, |
|
"loss": 0.3052, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 1.0044444444444445, |
|
"grad_norm": 0.2897067368030548, |
|
"learning_rate": 9.999975716105452e-05, |
|
"loss": 0.3058, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 1.0133333333333334, |
|
"grad_norm": 0.4105049967765808, |
|
"learning_rate": 9.999902864657691e-05, |
|
"loss": 0.2635, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 1.0222222222222221, |
|
"grad_norm": 0.36226218938827515, |
|
"learning_rate": 9.999781446364365e-05, |
|
"loss": 0.3257, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.031111111111111, |
|
"grad_norm": 0.31135883927345276, |
|
"learning_rate": 9.999611462404875e-05, |
|
"loss": 0.2646, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 1.04, |
|
"grad_norm": 2.0171165466308594, |
|
"learning_rate": 9.999392914430371e-05, |
|
"loss": 0.5321, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 1.048888888888889, |
|
"grad_norm": 0.3250499665737152, |
|
"learning_rate": 9.999125804563732e-05, |
|
"loss": 0.3208, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 1.0577777777777777, |
|
"grad_norm": 0.2886015474796295, |
|
"learning_rate": 9.998810135399546e-05, |
|
"loss": 0.2932, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.3275207281112671, |
|
"learning_rate": 9.998445910004082e-05, |
|
"loss": 0.2971, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0755555555555556, |
|
"grad_norm": 0.5577352643013, |
|
"learning_rate": 9.998033131915266e-05, |
|
"loss": 0.3561, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 1.0844444444444445, |
|
"grad_norm": 1.0877000093460083, |
|
"learning_rate": 9.997571805142639e-05, |
|
"loss": 0.4113, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 1.0933333333333333, |
|
"grad_norm": 0.5185580253601074, |
|
"learning_rate": 9.997061934167328e-05, |
|
"loss": 0.3085, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 1.1022222222222222, |
|
"grad_norm": 0.4035893678665161, |
|
"learning_rate": 9.996503523941994e-05, |
|
"loss": 0.271, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.3941110074520111, |
|
"learning_rate": 9.995896579890784e-05, |
|
"loss": 0.2875, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.12, |
|
"grad_norm": 0.3415586054325104, |
|
"learning_rate": 9.99524110790929e-05, |
|
"loss": 0.2711, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 1.1288888888888888, |
|
"grad_norm": 0.3006160259246826, |
|
"learning_rate": 9.99453711436447e-05, |
|
"loss": 0.2718, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 1.1377777777777778, |
|
"grad_norm": 0.522278368473053, |
|
"learning_rate": 9.993784606094612e-05, |
|
"loss": 0.3306, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 1.1466666666666667, |
|
"grad_norm": 0.5858255624771118, |
|
"learning_rate": 9.992983590409246e-05, |
|
"loss": 0.3475, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 1.1555555555555554, |
|
"grad_norm": 0.5486164689064026, |
|
"learning_rate": 9.992134075089084e-05, |
|
"loss": 0.3259, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.1644444444444444, |
|
"grad_norm": 0.2975933253765106, |
|
"learning_rate": 9.991236068385941e-05, |
|
"loss": 0.2588, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 1.1733333333333333, |
|
"grad_norm": 0.6659825444221497, |
|
"learning_rate": 9.99028957902266e-05, |
|
"loss": 0.3266, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 1.1822222222222223, |
|
"grad_norm": 0.2541256248950958, |
|
"learning_rate": 9.989294616193017e-05, |
|
"loss": 0.2649, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 1.1911111111111112, |
|
"grad_norm": 0.8277371525764465, |
|
"learning_rate": 9.988251189561645e-05, |
|
"loss": 0.4076, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.3177882432937622, |
|
"learning_rate": 9.987159309263924e-05, |
|
"loss": 0.304, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.208888888888889, |
|
"grad_norm": 0.36816540360450745, |
|
"learning_rate": 9.986018985905901e-05, |
|
"loss": 0.3187, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 1.2177777777777778, |
|
"grad_norm": 0.4456408619880676, |
|
"learning_rate": 9.984830230564171e-05, |
|
"loss": 0.2769, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 1.2266666666666666, |
|
"grad_norm": 0.7157383561134338, |
|
"learning_rate": 9.983593054785776e-05, |
|
"loss": 0.382, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 1.2355555555555555, |
|
"grad_norm": 0.5327372550964355, |
|
"learning_rate": 9.982307470588098e-05, |
|
"loss": 0.2732, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 1.2444444444444445, |
|
"grad_norm": 0.581408679485321, |
|
"learning_rate": 9.980973490458728e-05, |
|
"loss": 0.3956, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.2533333333333334, |
|
"grad_norm": 0.32390910387039185, |
|
"learning_rate": 9.979591127355365e-05, |
|
"loss": 0.2905, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 1.2622222222222224, |
|
"grad_norm": 0.4703962802886963, |
|
"learning_rate": 9.978160394705668e-05, |
|
"loss": 0.2897, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 1.271111111111111, |
|
"grad_norm": 0.5023928284645081, |
|
"learning_rate": 9.976681306407148e-05, |
|
"loss": 0.328, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 1.28, |
|
"grad_norm": 0.3573106825351715, |
|
"learning_rate": 9.975153876827008e-05, |
|
"loss": 0.2687, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 1.2888888888888888, |
|
"grad_norm": 0.6152392625808716, |
|
"learning_rate": 9.973578120802025e-05, |
|
"loss": 0.3375, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.2977777777777777, |
|
"grad_norm": 0.40894556045532227, |
|
"learning_rate": 9.971954053638399e-05, |
|
"loss": 0.2888, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 1.3066666666666666, |
|
"grad_norm": 0.8339890241622925, |
|
"learning_rate": 9.970281691111598e-05, |
|
"loss": 0.4384, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 1.3155555555555556, |
|
"grad_norm": 0.41529226303100586, |
|
"learning_rate": 9.968561049466214e-05, |
|
"loss": 0.2831, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 1.3244444444444445, |
|
"grad_norm": 0.28021934628486633, |
|
"learning_rate": 9.966792145415795e-05, |
|
"loss": 0.2671, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.22352413833141327, |
|
"learning_rate": 9.964974996142698e-05, |
|
"loss": 0.2839, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3422222222222222, |
|
"grad_norm": 0.2703256905078888, |
|
"learning_rate": 9.963109619297905e-05, |
|
"loss": 0.2675, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 1.3511111111111112, |
|
"grad_norm": 0.2645833194255829, |
|
"learning_rate": 9.961196033000861e-05, |
|
"loss": 0.2708, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 1.3599999999999999, |
|
"grad_norm": 0.40951216220855713, |
|
"learning_rate": 9.959234255839298e-05, |
|
"loss": 0.3015, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 1.3688888888888888, |
|
"grad_norm": 0.3288329839706421, |
|
"learning_rate": 9.957224306869053e-05, |
|
"loss": 0.325, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.3777777777777778, |
|
"grad_norm": 0.3932753801345825, |
|
"learning_rate": 9.955166205613879e-05, |
|
"loss": 0.2716, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.3866666666666667, |
|
"grad_norm": 0.46717700362205505, |
|
"learning_rate": 9.953059972065265e-05, |
|
"loss": 0.3444, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.3955555555555557, |
|
"grad_norm": 0.20313459634780884, |
|
"learning_rate": 9.950905626682228e-05, |
|
"loss": 0.2584, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.4044444444444444, |
|
"grad_norm": 0.47062796354293823, |
|
"learning_rate": 9.948703190391131e-05, |
|
"loss": 0.372, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.4133333333333333, |
|
"grad_norm": 0.3500126898288727, |
|
"learning_rate": 9.946452684585463e-05, |
|
"loss": 0.2737, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 0.5783170461654663, |
|
"learning_rate": 9.944154131125642e-05, |
|
"loss": 0.2938, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.431111111111111, |
|
"grad_norm": 0.27551746368408203, |
|
"learning_rate": 9.941807552338804e-05, |
|
"loss": 0.3139, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.44, |
|
"grad_norm": 0.2641688287258148, |
|
"learning_rate": 9.939412971018574e-05, |
|
"loss": 0.3009, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.448888888888889, |
|
"grad_norm": 0.4430491030216217, |
|
"learning_rate": 9.936970410424857e-05, |
|
"loss": 0.2779, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.4577777777777778, |
|
"grad_norm": 0.44353199005126953, |
|
"learning_rate": 9.934479894283606e-05, |
|
"loss": 0.2694, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.4666666666666668, |
|
"grad_norm": 0.3297507166862488, |
|
"learning_rate": 9.931941446786594e-05, |
|
"loss": 0.2638, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.4755555555555555, |
|
"grad_norm": 0.5766128897666931, |
|
"learning_rate": 9.92935509259118e-05, |
|
"loss": 0.3052, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.4844444444444445, |
|
"grad_norm": 0.3493499755859375, |
|
"learning_rate": 9.92672085682006e-05, |
|
"loss": 0.2728, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.4933333333333334, |
|
"grad_norm": 0.34138888120651245, |
|
"learning_rate": 9.924038765061042e-05, |
|
"loss": 0.2679, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.5022222222222221, |
|
"grad_norm": 0.40943869948387146, |
|
"learning_rate": 9.921308843366772e-05, |
|
"loss": 0.2556, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.511111111111111, |
|
"grad_norm": 0.4275529384613037, |
|
"learning_rate": 9.918531118254507e-05, |
|
"loss": 0.3012, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.52, |
|
"grad_norm": 0.3822483718395233, |
|
"learning_rate": 9.915705616705839e-05, |
|
"loss": 0.2984, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.528888888888889, |
|
"grad_norm": 0.3507990837097168, |
|
"learning_rate": 9.912832366166442e-05, |
|
"loss": 0.2839, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.537777777777778, |
|
"grad_norm": 0.3176634907722473, |
|
"learning_rate": 9.909911394545799e-05, |
|
"loss": 0.2715, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.5466666666666666, |
|
"grad_norm": 0.7413046956062317, |
|
"learning_rate": 9.906942730216939e-05, |
|
"loss": 0.2995, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.5602743625640869, |
|
"learning_rate": 9.903926402016153e-05, |
|
"loss": 0.303, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.5644444444444443, |
|
"grad_norm": 0.3049962818622589, |
|
"learning_rate": 9.900862439242719e-05, |
|
"loss": 0.2866, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.5733333333333333, |
|
"grad_norm": 0.20894083380699158, |
|
"learning_rate": 9.89775087165862e-05, |
|
"loss": 0.2801, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.5822222222222222, |
|
"grad_norm": 0.4999159574508667, |
|
"learning_rate": 9.894591729488242e-05, |
|
"loss": 0.3153, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.5911111111111111, |
|
"grad_norm": 0.6849189400672913, |
|
"learning_rate": 9.8913850434181e-05, |
|
"loss": 0.2794, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.44084635376930237, |
|
"learning_rate": 9.888130844596524e-05, |
|
"loss": 0.2953, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.608888888888889, |
|
"grad_norm": 0.2744970917701721, |
|
"learning_rate": 9.884829164633359e-05, |
|
"loss": 0.2654, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.6177777777777778, |
|
"grad_norm": 0.7441728711128235, |
|
"learning_rate": 9.881480035599667e-05, |
|
"loss": 0.4128, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.6266666666666667, |
|
"grad_norm": 0.2983834147453308, |
|
"learning_rate": 9.878083490027406e-05, |
|
"loss": 0.3103, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.6355555555555554, |
|
"grad_norm": 0.2417658269405365, |
|
"learning_rate": 9.874639560909117e-05, |
|
"loss": 0.248, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.6444444444444444, |
|
"grad_norm": 0.4583745002746582, |
|
"learning_rate": 9.871148281697608e-05, |
|
"loss": 0.2747, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.6533333333333333, |
|
"grad_norm": 0.42793506383895874, |
|
"learning_rate": 9.867609686305617e-05, |
|
"loss": 0.282, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.6622222222222223, |
|
"grad_norm": 0.32263195514678955, |
|
"learning_rate": 9.864023809105497e-05, |
|
"loss": 0.2709, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.6711111111111112, |
|
"grad_norm": 0.35320043563842773, |
|
"learning_rate": 9.860390684928873e-05, |
|
"loss": 0.3429, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.6800000000000002, |
|
"grad_norm": 0.6125680804252625, |
|
"learning_rate": 9.856710349066307e-05, |
|
"loss": 0.2844, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.6888888888888889, |
|
"grad_norm": 0.36522263288497925, |
|
"learning_rate": 9.852982837266955e-05, |
|
"loss": 0.2413, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.6977777777777778, |
|
"grad_norm": 0.3167021870613098, |
|
"learning_rate": 9.849208185738217e-05, |
|
"loss": 0.2682, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.7066666666666666, |
|
"grad_norm": 0.46384674310684204, |
|
"learning_rate": 9.84538643114539e-05, |
|
"loss": 0.2671, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.7155555555555555, |
|
"grad_norm": 0.27667102217674255, |
|
"learning_rate": 9.841517610611309e-05, |
|
"loss": 0.2929, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.7244444444444444, |
|
"grad_norm": 0.34263694286346436, |
|
"learning_rate": 9.837601761715983e-05, |
|
"loss": 0.2837, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.7333333333333334, |
|
"grad_norm": 0.5394869446754456, |
|
"learning_rate": 9.833638922496238e-05, |
|
"loss": 0.2535, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.7422222222222223, |
|
"grad_norm": 0.30996885895729065, |
|
"learning_rate": 9.829629131445342e-05, |
|
"loss": 0.2845, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.751111111111111, |
|
"grad_norm": 0.3415825664997101, |
|
"learning_rate": 9.825572427512632e-05, |
|
"loss": 0.2525, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.76, |
|
"grad_norm": 0.23367656767368317, |
|
"learning_rate": 9.82146885010314e-05, |
|
"loss": 0.295, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.7688888888888887, |
|
"grad_norm": 0.32408076524734497, |
|
"learning_rate": 9.817318439077195e-05, |
|
"loss": 0.3182, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.4190412759780884, |
|
"learning_rate": 9.81312123475006e-05, |
|
"loss": 0.2723, |
|
"step": 200 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 1120, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.6062381766095667e+17, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|