|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 10.0, |
|
"eval_steps": 500, |
|
"global_step": 1110, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.009009009009009009, |
|
"grad_norm": 608.0, |
|
"learning_rate": 1.801801801801802e-06, |
|
"loss": 58.5641, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.04504504504504504, |
|
"grad_norm": 532.0, |
|
"learning_rate": 9.00900900900901e-06, |
|
"loss": 54.6181, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.09009009009009009, |
|
"grad_norm": 446.0, |
|
"learning_rate": 1.801801801801802e-05, |
|
"loss": 50.0236, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.13513513513513514, |
|
"grad_norm": 193.0, |
|
"learning_rate": 2.702702702702703e-05, |
|
"loss": 33.1549, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.18018018018018017, |
|
"grad_norm": 44.5, |
|
"learning_rate": 3.603603603603604e-05, |
|
"loss": 25.2428, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.22522522522522523, |
|
"grad_norm": 26.625, |
|
"learning_rate": 4.5045045045045046e-05, |
|
"loss": 22.4735, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.2702702702702703, |
|
"grad_norm": 17.25, |
|
"learning_rate": 5.405405405405406e-05, |
|
"loss": 20.4661, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.3153153153153153, |
|
"grad_norm": 7.6875, |
|
"learning_rate": 6.306306306306306e-05, |
|
"loss": 19.1401, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.36036036036036034, |
|
"grad_norm": 11.6875, |
|
"learning_rate": 7.207207207207208e-05, |
|
"loss": 18.3188, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.40540540540540543, |
|
"grad_norm": 23.5, |
|
"learning_rate": 8.108108108108109e-05, |
|
"loss": 16.7622, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.45045045045045046, |
|
"grad_norm": 56.25, |
|
"learning_rate": 9.009009009009009e-05, |
|
"loss": 12.6183, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.4954954954954955, |
|
"grad_norm": 13.0, |
|
"learning_rate": 9.90990990990991e-05, |
|
"loss": 4.3593, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.5405405405405406, |
|
"grad_norm": 3.875, |
|
"learning_rate": 0.00010810810810810812, |
|
"loss": 2.18, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.5855855855855856, |
|
"grad_norm": 2.421875, |
|
"learning_rate": 0.00011711711711711712, |
|
"loss": 1.8179, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.6306306306306306, |
|
"grad_norm": 3.265625, |
|
"learning_rate": 0.00012612612612612612, |
|
"loss": 1.5974, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.6756756756756757, |
|
"grad_norm": 2.375, |
|
"learning_rate": 0.00013513513513513514, |
|
"loss": 1.486, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.7207207207207207, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00014414414414414415, |
|
"loss": 1.361, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.7657657657657657, |
|
"grad_norm": 3.890625, |
|
"learning_rate": 0.00015315315315315314, |
|
"loss": 1.3001, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.8108108108108109, |
|
"grad_norm": 4.46875, |
|
"learning_rate": 0.00016216216216216218, |
|
"loss": 1.261, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.8558558558558559, |
|
"grad_norm": 5.03125, |
|
"learning_rate": 0.0001711711711711712, |
|
"loss": 1.2015, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.9009009009009009, |
|
"grad_norm": 32.25, |
|
"learning_rate": 0.00018018018018018018, |
|
"loss": 1.1886, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.9459459459459459, |
|
"grad_norm": 1.703125, |
|
"learning_rate": 0.0001891891891891892, |
|
"loss": 1.1679, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.990990990990991, |
|
"grad_norm": 2.984375, |
|
"learning_rate": 0.0001981981981981982, |
|
"loss": 1.1572, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 2.307225465774536, |
|
"eval_runtime": 1.0056, |
|
"eval_samples_per_second": 4.972, |
|
"eval_steps_per_second": 1.989, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 1.0360360360360361, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.00019999208860571255, |
|
"loss": 1.0473, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 1.0810810810810811, |
|
"grad_norm": 1.546875, |
|
"learning_rate": 0.0001999599507118322, |
|
"loss": 1.0618, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.1261261261261262, |
|
"grad_norm": 10.0, |
|
"learning_rate": 0.00019990309979553045, |
|
"loss": 1.0458, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 1.1711711711711712, |
|
"grad_norm": 8.4375, |
|
"learning_rate": 0.00019982154991201608, |
|
"loss": 1.0364, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.2162162162162162, |
|
"grad_norm": 2.0, |
|
"learning_rate": 0.00019971532122280464, |
|
"loss": 1.0457, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 1.2612612612612613, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.00019958443999073397, |
|
"loss": 0.9906, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.3063063063063063, |
|
"grad_norm": 18.25, |
|
"learning_rate": 0.00019942893857347128, |
|
"loss": 0.9911, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 1.3513513513513513, |
|
"grad_norm": 2.578125, |
|
"learning_rate": 0.0001992488554155135, |
|
"loss": 0.9996, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.3963963963963963, |
|
"grad_norm": 1.7734375, |
|
"learning_rate": 0.00019904423503868247, |
|
"loss": 0.9656, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.4414414414414414, |
|
"grad_norm": 5.65625, |
|
"learning_rate": 0.00019881512803111796, |
|
"loss": 0.9753, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.4864864864864864, |
|
"grad_norm": 3.78125, |
|
"learning_rate": 0.00019856159103477086, |
|
"loss": 0.9239, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.5315315315315314, |
|
"grad_norm": 0.86328125, |
|
"learning_rate": 0.00019828368673139947, |
|
"loss": 0.9428, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.5765765765765765, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 0.00019798148382707296, |
|
"loss": 0.9455, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.6216216216216215, |
|
"grad_norm": 1.8125, |
|
"learning_rate": 0.00019765505703518496, |
|
"loss": 0.9373, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.00019730448705798239, |
|
"loss": 0.9659, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.7117117117117115, |
|
"grad_norm": 3.859375, |
|
"learning_rate": 0.00019692986056661356, |
|
"loss": 0.9271, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.7567567567567568, |
|
"grad_norm": 3.34375, |
|
"learning_rate": 0.00019653127017970034, |
|
"loss": 0.9303, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.8018018018018018, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.0001961088144404403, |
|
"loss": 0.9333, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.8468468468468469, |
|
"grad_norm": 1.4453125, |
|
"learning_rate": 0.00019566259779224378, |
|
"loss": 0.8923, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.8918918918918919, |
|
"grad_norm": 3.171875, |
|
"learning_rate": 0.00019519273055291266, |
|
"loss": 0.9, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.936936936936937, |
|
"grad_norm": 14.3125, |
|
"learning_rate": 0.00019469932888736632, |
|
"loss": 0.8988, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.981981981981982, |
|
"grad_norm": 3.46875, |
|
"learning_rate": 0.0001941825147789225, |
|
"loss": 0.9296, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 2.178852081298828, |
|
"eval_runtime": 1.0053, |
|
"eval_samples_per_second": 4.973, |
|
"eval_steps_per_second": 1.989, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 2.027027027027027, |
|
"grad_norm": 1.4921875, |
|
"learning_rate": 0.00019364241599913924, |
|
"loss": 0.8696, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 2.0720720720720722, |
|
"grad_norm": 2.65625, |
|
"learning_rate": 0.0001930791660762262, |
|
"loss": 0.8363, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 2.1171171171171173, |
|
"grad_norm": 1.265625, |
|
"learning_rate": 0.00019249290426203252, |
|
"loss": 0.821, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 2.1621621621621623, |
|
"grad_norm": 2.546875, |
|
"learning_rate": 0.00019188377549761963, |
|
"loss": 0.8511, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 2.2072072072072073, |
|
"grad_norm": 0.828125, |
|
"learning_rate": 0.0001912519303774276, |
|
"loss": 0.8231, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 2.2522522522522523, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.000190597525112044, |
|
"loss": 0.8496, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 2.2972972972972974, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00018992072148958368, |
|
"loss": 0.852, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 2.3423423423423424, |
|
"grad_norm": 1.578125, |
|
"learning_rate": 0.0001892216868356904, |
|
"loss": 0.8131, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 2.3873873873873874, |
|
"grad_norm": 1.5078125, |
|
"learning_rate": 0.00018850059397216876, |
|
"loss": 0.8483, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 2.4324324324324325, |
|
"grad_norm": 1.125, |
|
"learning_rate": 0.00018775762117425777, |
|
"loss": 0.8432, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 2.4774774774774775, |
|
"grad_norm": 0.6015625, |
|
"learning_rate": 0.00018699295212655596, |
|
"loss": 0.8493, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 2.5225225225225225, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 0.00018620677587760916, |
|
"loss": 0.7998, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 2.5675675675675675, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.0001853992867931721, |
|
"loss": 0.8256, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 2.6126126126126126, |
|
"grad_norm": 0.6796875, |
|
"learning_rate": 0.00018457068450815562, |
|
"loss": 0.8162, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 2.6576576576576576, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 0.0001837211738772711, |
|
"loss": 0.8338, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 2.7027027027027026, |
|
"grad_norm": 0.9140625, |
|
"learning_rate": 0.00018285096492438424, |
|
"loss": 0.8279, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 2.7477477477477477, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00018196027279059117, |
|
"loss": 0.7962, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 2.7927927927927927, |
|
"grad_norm": 2.78125, |
|
"learning_rate": 0.0001810493176810292, |
|
"loss": 0.8192, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.8378378378378377, |
|
"grad_norm": 0.63671875, |
|
"learning_rate": 0.00018011832481043576, |
|
"loss": 0.8147, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.8828828828828827, |
|
"grad_norm": 0.56640625, |
|
"learning_rate": 0.00017916752434746856, |
|
"loss": 0.8255, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.9279279279279278, |
|
"grad_norm": 1.8046875, |
|
"learning_rate": 0.0001781971513578013, |
|
"loss": 0.8059, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.972972972972973, |
|
"grad_norm": 1.1640625, |
|
"learning_rate": 0.00017720744574600863, |
|
"loss": 0.8273, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 2.1709225177764893, |
|
"eval_runtime": 1.0054, |
|
"eval_samples_per_second": 4.973, |
|
"eval_steps_per_second": 1.989, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 3.018018018018018, |
|
"grad_norm": 0.83984375, |
|
"learning_rate": 0.00017619865219625452, |
|
"loss": 0.7934, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 3.063063063063063, |
|
"grad_norm": 1.5859375, |
|
"learning_rate": 0.00017517102011179933, |
|
"loss": 0.7096, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 3.108108108108108, |
|
"grad_norm": 0.91796875, |
|
"learning_rate": 0.00017412480355334005, |
|
"loss": 0.7203, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 3.153153153153153, |
|
"grad_norm": 1.9296875, |
|
"learning_rate": 0.00017306026117619889, |
|
"loss": 0.7237, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 3.1981981981981984, |
|
"grad_norm": 1.6328125, |
|
"learning_rate": 0.00017197765616637636, |
|
"loss": 0.738, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 3.2432432432432434, |
|
"grad_norm": 2.0625, |
|
"learning_rate": 0.00017087725617548385, |
|
"loss": 0.7214, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 3.2882882882882885, |
|
"grad_norm": 3.53125, |
|
"learning_rate": 0.0001697593332545723, |
|
"loss": 0.7549, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 4.25, |
|
"learning_rate": 0.0001686241637868734, |
|
"loss": 0.7575, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 3.3783783783783785, |
|
"grad_norm": 1.6484375, |
|
"learning_rate": 0.00016747202841946928, |
|
"loss": 0.7392, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 3.4234234234234235, |
|
"grad_norm": 1.6171875, |
|
"learning_rate": 0.00016630321199390867, |
|
"loss": 0.7251, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 3.4684684684684686, |
|
"grad_norm": 1.5546875, |
|
"learning_rate": 0.0001651180034757856, |
|
"loss": 0.7285, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 3.5135135135135136, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 0.0001639166958832985, |
|
"loss": 0.7114, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 3.5585585585585586, |
|
"grad_norm": 1.2421875, |
|
"learning_rate": 0.00016269958621480788, |
|
"loss": 0.7223, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 3.6036036036036037, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00016146697537540924, |
|
"loss": 0.7273, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 3.6486486486486487, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00016021916810254097, |
|
"loss": 0.7328, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 3.6936936936936937, |
|
"grad_norm": 1.1015625, |
|
"learning_rate": 0.00015895647289064396, |
|
"loss": 0.7409, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 3.7387387387387387, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 0.000157679201914893, |
|
"loss": 0.7247, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 3.7837837837837838, |
|
"grad_norm": 1.890625, |
|
"learning_rate": 0.0001563876709540178, |
|
"loss": 0.7446, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 3.828828828828829, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 0.0001550821993122334, |
|
"loss": 0.7421, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 3.873873873873874, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 0.00015376310974029873, |
|
"loss": 0.7362, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 3.918918918918919, |
|
"grad_norm": 0.66015625, |
|
"learning_rate": 0.00015243072835572318, |
|
"loss": 0.7398, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 3.963963963963964, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001510853845621409, |
|
"loss": 0.7586, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_loss": 2.2163968086242676, |
|
"eval_runtime": 1.0061, |
|
"eval_samples_per_second": 4.97, |
|
"eval_steps_per_second": 1.988, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 4.009009009009009, |
|
"grad_norm": 0.58203125, |
|
"learning_rate": 0.00014972741096787242, |
|
"loss": 0.7128, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 4.054054054054054, |
|
"grad_norm": 0.75, |
|
"learning_rate": 0.00014835714330369446, |
|
"loss": 0.6463, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 4.099099099099099, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 0.00014697492033983707, |
|
"loss": 0.6453, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 4.1441441441441444, |
|
"grad_norm": 0.55859375, |
|
"learning_rate": 0.00014558108380223012, |
|
"loss": 0.647, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 4.1891891891891895, |
|
"grad_norm": 1.3125, |
|
"learning_rate": 0.00014417597828801832, |
|
"loss": 0.626, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 4.2342342342342345, |
|
"grad_norm": 0.85546875, |
|
"learning_rate": 0.00014275995118036693, |
|
"loss": 0.6334, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 4.2792792792792795, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.0001413333525625784, |
|
"loss": 0.6435, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 4.324324324324325, |
|
"grad_norm": 0.8046875, |
|
"learning_rate": 0.00013989653513154165, |
|
"loss": 0.6439, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 4.36936936936937, |
|
"grad_norm": 1.0859375, |
|
"learning_rate": 0.00013844985411053492, |
|
"loss": 0.6559, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 4.414414414414415, |
|
"grad_norm": 1.3359375, |
|
"learning_rate": 0.00013699366716140435, |
|
"loss": 0.6654, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 4.45945945945946, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00013552833429613938, |
|
"loss": 0.6783, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 4.504504504504505, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 0.00013405421778786737, |
|
"loss": 0.6543, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 4.54954954954955, |
|
"grad_norm": 0.62890625, |
|
"learning_rate": 0.00013257168208128908, |
|
"loss": 0.6608, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 4.594594594594595, |
|
"grad_norm": 0.60546875, |
|
"learning_rate": 0.00013108109370257712, |
|
"loss": 0.6621, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 4.63963963963964, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 0.00012958282116876026, |
|
"loss": 0.656, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 4.684684684684685, |
|
"grad_norm": 0.65234375, |
|
"learning_rate": 0.00012807723489661495, |
|
"loss": 0.6505, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 4.72972972972973, |
|
"grad_norm": 0.921875, |
|
"learning_rate": 0.00012656470711108764, |
|
"loss": 0.6789, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 4.774774774774775, |
|
"grad_norm": 0.61328125, |
|
"learning_rate": 0.00012504561175326985, |
|
"loss": 0.6588, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 4.81981981981982, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 0.00012352032438794902, |
|
"loss": 0.6534, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 4.864864864864865, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 0.00012198922211075778, |
|
"loss": 0.6482, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 4.90990990990991, |
|
"grad_norm": 0.94140625, |
|
"learning_rate": 0.00012045268345494511, |
|
"loss": 0.6595, |
|
"step": 545 |
|
}, |
|
{ |
|
"epoch": 4.954954954954955, |
|
"grad_norm": 0.59765625, |
|
"learning_rate": 0.00011891108829779165, |
|
"loss": 0.6624, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.578125, |
|
"learning_rate": 0.00011736481776669306, |
|
"loss": 0.6613, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_loss": 2.3182225227355957, |
|
"eval_runtime": 1.0028, |
|
"eval_samples_per_second": 4.986, |
|
"eval_steps_per_second": 1.994, |
|
"step": 555 |
|
}, |
|
{ |
|
"epoch": 5.045045045045045, |
|
"grad_norm": 0.98046875, |
|
"learning_rate": 0.0001158142541449341, |
|
"loss": 0.5564, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 5.09009009009009, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00011425978077717709, |
|
"loss": 0.5273, |
|
"step": 565 |
|
}, |
|
{ |
|
"epoch": 5.135135135135135, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00011270178197468789, |
|
"loss": 0.5589, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 5.18018018018018, |
|
"grad_norm": 0.6171875, |
|
"learning_rate": 0.00011114064292032282, |
|
"loss": 0.5593, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 5.225225225225225, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 0.00010957674957330042, |
|
"loss": 0.5672, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 5.27027027027027, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 0.00010801048857378071, |
|
"loss": 0.5444, |
|
"step": 585 |
|
}, |
|
{ |
|
"epoch": 5.315315315315315, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 0.00010644224714727681, |
|
"loss": 0.5747, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 5.36036036036036, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 0.0001048724130089212, |
|
"loss": 0.5609, |
|
"step": 595 |
|
}, |
|
{ |
|
"epoch": 5.405405405405405, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 0.00010330137426761135, |
|
"loss": 0.5625, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 5.45045045045045, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 0.00010172951933005775, |
|
"loss": 0.5671, |
|
"step": 605 |
|
}, |
|
{ |
|
"epoch": 5.495495495495495, |
|
"grad_norm": 0.80859375, |
|
"learning_rate": 0.00010015723680475846, |
|
"loss": 0.564, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 5.54054054054054, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 9.858491540592382e-05, |
|
"loss": 0.5784, |
|
"step": 615 |
|
}, |
|
{ |
|
"epoch": 5.585585585585585, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 9.70129438573747e-05, |
|
"loss": 0.5672, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 5.63063063063063, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 9.54417107964389e-05, |
|
"loss": 0.5592, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 5.675675675675675, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 9.38716046778684e-05, |
|
"loss": 0.5634, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 5.7207207207207205, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 9.230301367780208e-05, |
|
"loss": 0.5691, |
|
"step": 635 |
|
}, |
|
{ |
|
"epoch": 5.7657657657657655, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 9.07363255977973e-05, |
|
"loss": 0.5722, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 5.8108108108108105, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 8.917192776895382e-05, |
|
"loss": 0.5827, |
|
"step": 645 |
|
}, |
|
{ |
|
"epoch": 5.8558558558558556, |
|
"grad_norm": 0.83203125, |
|
"learning_rate": 8.76102069561545e-05, |
|
"loss": 0.5745, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 5.900900900900901, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 8.605154926244543e-05, |
|
"loss": 0.5614, |
|
"step": 655 |
|
}, |
|
{ |
|
"epoch": 5.945945945945946, |
|
"grad_norm": 0.65625, |
|
"learning_rate": 8.449634003358022e-05, |
|
"loss": 0.5731, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 5.990990990990991, |
|
"grad_norm": 0.8828125, |
|
"learning_rate": 8.294496376275104e-05, |
|
"loss": 0.577, |
|
"step": 665 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_loss": 2.4773526191711426, |
|
"eval_runtime": 1.0034, |
|
"eval_samples_per_second": 4.983, |
|
"eval_steps_per_second": 1.993, |
|
"step": 666 |
|
}, |
|
{ |
|
"epoch": 6.036036036036036, |
|
"grad_norm": 0.8984375, |
|
"learning_rate": 8.13978039955308e-05, |
|
"loss": 0.5142, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 6.081081081081081, |
|
"grad_norm": 0.8359375, |
|
"learning_rate": 7.985524323504948e-05, |
|
"loss": 0.4725, |
|
"step": 675 |
|
}, |
|
{ |
|
"epoch": 6.126126126126126, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 7.831766284742807e-05, |
|
"loss": 0.4671, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 6.171171171171171, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.678544296749384e-05, |
|
"loss": 0.4804, |
|
"step": 685 |
|
}, |
|
{ |
|
"epoch": 6.216216216216216, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 7.525896240479976e-05, |
|
"loss": 0.4704, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 6.261261261261261, |
|
"grad_norm": 0.75, |
|
"learning_rate": 7.37385985499718e-05, |
|
"loss": 0.4659, |
|
"step": 695 |
|
}, |
|
{ |
|
"epoch": 6.306306306306306, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.222472728140695e-05, |
|
"loss": 0.4697, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 6.351351351351352, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 7.071772287234497e-05, |
|
"loss": 0.4912, |
|
"step": 705 |
|
}, |
|
{ |
|
"epoch": 6.396396396396397, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 6.921795789833723e-05, |
|
"loss": 0.4689, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 6.441441441441442, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 6.772580314513508e-05, |
|
"loss": 0.4753, |
|
"step": 715 |
|
}, |
|
{ |
|
"epoch": 6.486486486486487, |
|
"grad_norm": 0.75, |
|
"learning_rate": 6.624162751702076e-05, |
|
"loss": 0.4759, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 6.531531531531532, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 6.476579794560356e-05, |
|
"loss": 0.489, |
|
"step": 725 |
|
}, |
|
{ |
|
"epoch": 6.576576576576577, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 6.329867929910347e-05, |
|
"loss": 0.473, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 6.621621621621622, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.184063429214515e-05, |
|
"loss": 0.4793, |
|
"step": 735 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 6.039202339608432e-05, |
|
"loss": 0.5071, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 6.711711711711712, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.895320474988864e-05, |
|
"loss": 0.4741, |
|
"step": 745 |
|
}, |
|
{ |
|
"epoch": 6.756756756756757, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.752453407159522e-05, |
|
"loss": 0.4799, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 6.801801801801802, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 5.610636457036693e-05, |
|
"loss": 0.4901, |
|
"step": 755 |
|
}, |
|
{ |
|
"epoch": 6.846846846846847, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 5.469904685916861e-05, |
|
"loss": 0.4858, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 6.891891891891892, |
|
"grad_norm": 0.76953125, |
|
"learning_rate": 5.33029288680852e-05, |
|
"loss": 0.4895, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 6.936936936936937, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 5.191835575830352e-05, |
|
"loss": 0.4935, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 6.981981981981982, |
|
"grad_norm": 0.69921875, |
|
"learning_rate": 5.0545669836778144e-05, |
|
"loss": 0.4958, |
|
"step": 775 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_loss": 2.7035882472991943, |
|
"eval_runtime": 1.0058, |
|
"eval_samples_per_second": 4.971, |
|
"eval_steps_per_second": 1.988, |
|
"step": 777 |
|
}, |
|
{ |
|
"epoch": 7.027027027027027, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.918521047160308e-05, |
|
"loss": 0.4443, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 7.072072072072072, |
|
"grad_norm": 0.7734375, |
|
"learning_rate": 4.783731400811022e-05, |
|
"loss": 0.4139, |
|
"step": 785 |
|
}, |
|
{ |
|
"epoch": 7.117117117117117, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.650231368571486e-05, |
|
"loss": 0.41, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 7.162162162162162, |
|
"grad_norm": 0.90625, |
|
"learning_rate": 4.518053955552903e-05, |
|
"loss": 0.4291, |
|
"step": 795 |
|
}, |
|
{ |
|
"epoch": 7.207207207207207, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 4.387231839876349e-05, |
|
"loss": 0.4141, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 7.252252252252252, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 4.2577973645937674e-05, |
|
"loss": 0.4139, |
|
"step": 805 |
|
}, |
|
{ |
|
"epoch": 7.297297297297297, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 4.129782529691815e-05, |
|
"loss": 0.4278, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 7.342342342342342, |
|
"grad_norm": 0.73046875, |
|
"learning_rate": 4.003218984180552e-05, |
|
"loss": 0.4148, |
|
"step": 815 |
|
}, |
|
{ |
|
"epoch": 7.387387387387387, |
|
"grad_norm": 0.79296875, |
|
"learning_rate": 3.878138018268866e-05, |
|
"loss": 0.4168, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 7.4324324324324325, |
|
"grad_norm": 0.82421875, |
|
"learning_rate": 3.7545705556286126e-05, |
|
"loss": 0.4182, |
|
"step": 825 |
|
}, |
|
{ |
|
"epoch": 7.4774774774774775, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 3.632547145749395e-05, |
|
"loss": 0.4239, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 7.5225225225225225, |
|
"grad_norm": 0.78515625, |
|
"learning_rate": 3.5120979563858266e-05, |
|
"loss": 0.4137, |
|
"step": 835 |
|
}, |
|
{ |
|
"epoch": 7.5675675675675675, |
|
"grad_norm": 0.73828125, |
|
"learning_rate": 3.393252766099187e-05, |
|
"loss": 0.4111, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 7.612612612612613, |
|
"grad_norm": 0.7421875, |
|
"learning_rate": 3.2760409568952766e-05, |
|
"loss": 0.4179, |
|
"step": 845 |
|
}, |
|
{ |
|
"epoch": 7.657657657657658, |
|
"grad_norm": 0.76171875, |
|
"learning_rate": 3.1604915069603436e-05, |
|
"loss": 0.429, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 7.702702702702703, |
|
"grad_norm": 0.75, |
|
"learning_rate": 3.0466329834968233e-05, |
|
"loss": 0.4118, |
|
"step": 855 |
|
}, |
|
{ |
|
"epoch": 7.747747747747748, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.9344935356606773e-05, |
|
"loss": 0.4049, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 7.792792792792793, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 2.8241008876021215e-05, |
|
"loss": 0.413, |
|
"step": 865 |
|
}, |
|
{ |
|
"epoch": 7.837837837837838, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 2.7154823316113932e-05, |
|
"loss": 0.4071, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 7.882882882882883, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.60866472137129e-05, |
|
"loss": 0.4073, |
|
"step": 875 |
|
}, |
|
{ |
|
"epoch": 7.927927927927928, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 2.5036744653181753e-05, |
|
"loss": 0.4124, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 7.972972972972973, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 2.4005375201130274e-05, |
|
"loss": 0.4205, |
|
"step": 885 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_loss": 2.9689488410949707, |
|
"eval_runtime": 1.0053, |
|
"eval_samples_per_second": 4.973, |
|
"eval_steps_per_second": 1.989, |
|
"step": 888 |
|
}, |
|
{ |
|
"epoch": 8.018018018018019, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.29927938422419e-05, |
|
"loss": 0.4012, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 8.063063063063064, |
|
"grad_norm": 1.0078125, |
|
"learning_rate": 2.199925091623418e-05, |
|
"loss": 0.3781, |
|
"step": 895 |
|
}, |
|
{ |
|
"epoch": 8.108108108108109, |
|
"grad_norm": 0.8671875, |
|
"learning_rate": 2.102499205596743e-05, |
|
"loss": 0.3809, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 8.153153153153154, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 2.0070258126717e-05, |
|
"loss": 0.3699, |
|
"step": 905 |
|
}, |
|
{ |
|
"epoch": 8.198198198198199, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.913528516662452e-05, |
|
"loss": 0.3742, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 8.243243243243244, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.8220304328342252e-05, |
|
"loss": 0.378, |
|
"step": 915 |
|
}, |
|
{ |
|
"epoch": 8.288288288288289, |
|
"grad_norm": 0.70703125, |
|
"learning_rate": 1.7325541821885384e-05, |
|
"loss": 0.3842, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.75390625, |
|
"learning_rate": 1.6451218858706374e-05, |
|
"loss": 0.3894, |
|
"step": 925 |
|
}, |
|
{ |
|
"epoch": 8.378378378378379, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.5597551597004966e-05, |
|
"loss": 0.3758, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 8.423423423423424, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 1.476475108828762e-05, |
|
"loss": 0.3717, |
|
"step": 935 |
|
}, |
|
{ |
|
"epoch": 8.468468468468469, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 1.3953023225189243e-05, |
|
"loss": 0.3771, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 8.513513513513514, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 1.3162568690570743e-05, |
|
"loss": 0.3759, |
|
"step": 945 |
|
}, |
|
{ |
|
"epoch": 8.558558558558559, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 1.23935829079042e-05, |
|
"loss": 0.3786, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 8.603603603603604, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.1646255992958466e-05, |
|
"loss": 0.3734, |
|
"step": 955 |
|
}, |
|
{ |
|
"epoch": 8.64864864864865, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.0920772706797167e-05, |
|
"loss": 0.3809, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 8.693693693693694, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 1.0217312410100089e-05, |
|
"loss": 0.3767, |
|
"step": 965 |
|
}, |
|
{ |
|
"epoch": 8.73873873873874, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 9.536049018820192e-06, |
|
"loss": 0.3786, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 8.783783783783784, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 8.87715096118642e-06, |
|
"loss": 0.3786, |
|
"step": 975 |
|
}, |
|
{ |
|
"epoch": 8.82882882882883, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 8.240781136063346e-06, |
|
"loss": 0.3868, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 8.873873873873874, |
|
"grad_norm": 0.72265625, |
|
"learning_rate": 7.6270968726777414e-06, |
|
"loss": 0.3767, |
|
"step": 985 |
|
}, |
|
{ |
|
"epoch": 8.91891891891892, |
|
"grad_norm": 0.7578125, |
|
"learning_rate": 7.03624989172228e-06, |
|
"loss": 0.3791, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 8.963963963963964, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 6.468386267845717e-06, |
|
"loss": 0.382, |
|
"step": 995 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_loss": 3.2251663208007812, |
|
"eval_runtime": 1.0069, |
|
"eval_samples_per_second": 4.966, |
|
"eval_steps_per_second": 1.986, |
|
"step": 999 |
|
}, |
|
{ |
|
"epoch": 9.00900900900901, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 5.9236463935389065e-06, |
|
"loss": 0.3794, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 9.054054054054054, |
|
"grad_norm": 0.671875, |
|
"learning_rate": 5.402164944425758e-06, |
|
"loss": 0.3777, |
|
"step": 1005 |
|
}, |
|
{ |
|
"epoch": 9.0990990990991, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 4.904070845967468e-06, |
|
"loss": 0.3779, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 9.144144144144144, |
|
"grad_norm": 0.703125, |
|
"learning_rate": 4.429487241588304e-06, |
|
"loss": 0.3744, |
|
"step": 1015 |
|
}, |
|
{ |
|
"epoch": 9.18918918918919, |
|
"grad_norm": 0.6953125, |
|
"learning_rate": 3.9785314622310495e-06, |
|
"loss": 0.3694, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 9.234234234234235, |
|
"grad_norm": 0.71875, |
|
"learning_rate": 3.5513149973492976e-06, |
|
"loss": 0.3751, |
|
"step": 1025 |
|
}, |
|
{ |
|
"epoch": 9.27927927927928, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 3.1479434673440167e-06, |
|
"loss": 0.3685, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 9.324324324324325, |
|
"grad_norm": 0.68359375, |
|
"learning_rate": 2.7685165974510986e-06, |
|
"loss": 0.3653, |
|
"step": 1035 |
|
}, |
|
{ |
|
"epoch": 9.36936936936937, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 2.4131281930864002e-06, |
|
"loss": 0.3728, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 9.414414414414415, |
|
"grad_norm": 0.734375, |
|
"learning_rate": 2.0818661166542074e-06, |
|
"loss": 0.3693, |
|
"step": 1045 |
|
}, |
|
{ |
|
"epoch": 9.45945945945946, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.7748122658251876e-06, |
|
"loss": 0.3764, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 9.504504504504505, |
|
"grad_norm": 0.7265625, |
|
"learning_rate": 1.4920425532888526e-06, |
|
"loss": 0.3654, |
|
"step": 1055 |
|
}, |
|
{ |
|
"epoch": 9.54954954954955, |
|
"grad_norm": 0.66796875, |
|
"learning_rate": 1.2336268879856727e-06, |
|
"loss": 0.3747, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 9.594594594594595, |
|
"grad_norm": 0.69140625, |
|
"learning_rate": 9.996291578236228e-07, |
|
"loss": 0.3711, |
|
"step": 1065 |
|
}, |
|
{ |
|
"epoch": 9.63963963963964, |
|
"grad_norm": 0.71484375, |
|
"learning_rate": 7.901072138831511e-07, |
|
"loss": 0.3722, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 9.684684684684685, |
|
"grad_norm": 0.7109375, |
|
"learning_rate": 6.051128561147756e-07, |
|
"loss": 0.3612, |
|
"step": 1075 |
|
}, |
|
{ |
|
"epoch": 9.72972972972973, |
|
"grad_norm": 0.74609375, |
|
"learning_rate": 4.44691820532539e-07, |
|
"loss": 0.3647, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 9.774774774774775, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 3.0888376790679795e-07, |
|
"loss": 0.3672, |
|
"step": 1085 |
|
}, |
|
{ |
|
"epoch": 9.81981981981982, |
|
"grad_norm": 0.6484375, |
|
"learning_rate": 1.977222739588891e-07, |
|
"loss": 0.3659, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 9.864864864864865, |
|
"grad_norm": 0.67578125, |
|
"learning_rate": 1.1123482106021322e-07, |
|
"loss": 0.3692, |
|
"step": 1095 |
|
}, |
|
{ |
|
"epoch": 9.90990990990991, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 4.9442791437848136e-08, |
|
"loss": 0.3663, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 9.954954954954955, |
|
"grad_norm": 0.6875, |
|
"learning_rate": 1.2361461888166226e-08, |
|
"loss": 0.3673, |
|
"step": 1105 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.6640625, |
|
"learning_rate": 0.0, |
|
"loss": 0.372, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_loss": 3.242992401123047, |
|
"eval_runtime": 1.0031, |
|
"eval_samples_per_second": 4.984, |
|
"eval_steps_per_second": 1.994, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 1110, |
|
"total_flos": 1.697049221804327e+18, |
|
"train_loss": 1.8630313719715084, |
|
"train_runtime": 9058.6901, |
|
"train_samples_per_second": 1.957, |
|
"train_steps_per_second": 0.123 |
|
} |
|
], |
|
"logging_steps": 5, |
|
"max_steps": 1110, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": false |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.697049221804327e+18, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|