{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 1.0,
  "eval_steps": 119,
  "global_step": 475,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.0,
      "grad_norm": 0.7395525574684143,
      "learning_rate": 2e-05,
      "loss": 1.1916,
      "step": 1
    },
    {
      "epoch": 0.0,
      "eval_loss": 1.3024712800979614,
      "eval_runtime": 11.3342,
      "eval_samples_per_second": 8.823,
      "eval_steps_per_second": 8.823,
      "step": 1
    },
    {
      "epoch": 0.0,
      "grad_norm": 0.7760837078094482,
      "learning_rate": 4e-05,
      "loss": 1.5154,
      "step": 2
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.630995512008667,
      "learning_rate": 6e-05,
      "loss": 2.1425,
      "step": 3
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.6175426244735718,
      "learning_rate": 8e-05,
      "loss": 0.7877,
      "step": 4
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.5972404479980469,
      "learning_rate": 0.0001,
      "loss": 1.3798,
      "step": 5
    },
    {
      "epoch": 0.01,
      "grad_norm": 0.5723439455032349,
      "learning_rate": 0.00012,
      "loss": 1.0747,
      "step": 6
    },
    {
      "epoch": 0.01,
      "grad_norm": 1.4761886596679688,
      "learning_rate": 0.00014,
      "loss": 1.1005,
      "step": 7
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.2958413362503052,
      "learning_rate": 0.00016,
      "loss": 1.1242,
      "step": 8
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.9850685000419617,
      "learning_rate": 0.00018,
      "loss": 1.1449,
      "step": 9
    },
    {
      "epoch": 0.02,
      "grad_norm": 1.666906714439392,
      "learning_rate": 0.0002,
      "loss": 1.4931,
      "step": 10
    },
    {
      "epoch": 0.02,
      "grad_norm": 0.5160439014434814,
      "learning_rate": 0.00019999771775537991,
      "loss": 1.1436,
      "step": 11
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.0970433950424194,
      "learning_rate": 0.00019999087112569246,
      "loss": 1.6171,
      "step": 12
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.7956830263137817,
      "learning_rate": 0.00019997946042345127,
      "loss": 1.3005,
      "step": 13
    },
    {
      "epoch": 0.03,
      "grad_norm": 1.2406549453735352,
      "learning_rate": 0.00019996348616949672,
      "loss": 1.6621,
      "step": 14
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.932831346988678,
      "learning_rate": 0.0001999429490929718,
      "loss": 1.784,
      "step": 15
    },
    {
      "epoch": 0.03,
      "grad_norm": 0.9084440469741821,
      "learning_rate": 0.00019991785013128923,
      "loss": 1.5638,
      "step": 16
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.491622805595398,
      "learning_rate": 0.0001998881904300884,
      "loss": 1.5091,
      "step": 17
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.6921408176422119,
      "learning_rate": 0.00019985397134318319,
      "loss": 0.9959,
      "step": 18
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.7786202430725098,
      "learning_rate": 0.0001998151944325001,
      "loss": 1.0953,
      "step": 19
    },
    {
      "epoch": 0.04,
      "grad_norm": 0.5404906272888184,
      "learning_rate": 0.00019977186146800707,
      "loss": 2.0195,
      "step": 20
    },
    {
      "epoch": 0.04,
      "grad_norm": 1.5230717658996582,
      "learning_rate": 0.00019972397442763262,
      "loss": 1.0865,
      "step": 21
    },
    {
      "epoch": 0.05,
      "grad_norm": 6.9937639236450195,
      "learning_rate": 0.00019967153549717553,
      "loss": 1.6098,
      "step": 22
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.6447493433952332,
      "learning_rate": 0.00019961454707020514,
      "loss": 1.63,
      "step": 23
    },
    {
      "epoch": 0.05,
      "grad_norm": 0.5249179601669312,
      "learning_rate": 0.00019955301174795208,
      "loss": 0.9971,
      "step": 24
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.7006235122680664,
      "learning_rate": 0.00019948693233918952,
      "loss": 1.151,
      "step": 25
    },
    {
      "epoch": 0.05,
      "grad_norm": 1.094476580619812,
      "learning_rate": 0.00019941631186010494,
      "loss": 1.0216,
      "step": 26
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.7347458004951477,
      "learning_rate": 0.0001993411535341625,
      "loss": 0.8214,
      "step": 27
    },
    {
      "epoch": 0.06,
      "grad_norm": 0.36545494198799133,
      "learning_rate": 0.00019926146079195594,
      "loss": 1.3825,
      "step": 28
    },
    {
      "epoch": 0.06,
      "grad_norm": 1.0544871091842651,
      "learning_rate": 0.0001991772372710519,
      "loss": 1.244,
      "step": 29
    },
    {
      "epoch": 0.06,
      "grad_norm": 2.498039484024048,
      "learning_rate": 0.00019908848681582391,
      "loss": 1.8747,
      "step": 30
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.9571327567100525,
      "learning_rate": 0.0001989952134772769,
      "loss": 1.3877,
      "step": 31
    },
    {
      "epoch": 0.07,
      "grad_norm": 2.0039713382720947,
      "learning_rate": 0.00019889742151286247,
      "loss": 2.0081,
      "step": 32
    },
    {
      "epoch": 0.07,
      "grad_norm": 1.1989943981170654,
      "learning_rate": 0.00019879511538628428,
      "loss": 1.4427,
      "step": 33
    },
    {
      "epoch": 0.07,
      "grad_norm": 0.8076533675193787,
      "learning_rate": 0.00019868829976729443,
      "loss": 1.3122,
      "step": 34
    },
    {
      "epoch": 0.07,
      "grad_norm": 3.5690324306488037,
      "learning_rate": 0.00019857697953148037,
      "loss": 1.5759,
      "step": 35
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.7935991883277893,
      "learning_rate": 0.00019846115976004234,
      "loss": 1.2685,
      "step": 36
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.0418881177902222,
      "learning_rate": 0.00019834084573956128,
      "loss": 1.8058,
      "step": 37
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.13619065284729,
      "learning_rate": 0.00019821604296175774,
      "loss": 1.55,
      "step": 38
    },
    {
      "epoch": 0.08,
      "grad_norm": 1.460271954536438,
      "learning_rate": 0.00019808675712324107,
      "loss": 1.3906,
      "step": 39
    },
    {
      "epoch": 0.08,
      "grad_norm": 0.8569309711456299,
      "learning_rate": 0.00019795299412524945,
      "loss": 1.3688,
      "step": 40
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2582310438156128,
      "learning_rate": 0.00019781476007338058,
      "loss": 1.093,
      "step": 41
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.4148155450820923,
      "learning_rate": 0.00019767206127731281,
      "loss": 1.3204,
      "step": 42
    },
    {
      "epoch": 0.09,
      "grad_norm": 1.2747466564178467,
      "learning_rate": 0.00019752490425051743,
      "loss": 1.0998,
      "step": 43
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.7309204339981079,
      "learning_rate": 0.000197373295709961,
      "loss": 1.6907,
      "step": 44
    },
    {
      "epoch": 0.09,
      "grad_norm": 0.5022898316383362,
      "learning_rate": 0.00019721724257579907,
      "loss": 1.4717,
      "step": 45
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.9706376194953918,
      "learning_rate": 0.00019705675197106016,
      "loss": 1.8908,
      "step": 46
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.6191427111625671,
      "learning_rate": 0.00019689183122132068,
      "loss": 1.3458,
      "step": 47
    },
    {
      "epoch": 0.1,
      "grad_norm": 0.8788885474205017,
      "learning_rate": 0.0001967224878543705,
      "loss": 1.772,
      "step": 48
    },
    {
      "epoch": 0.1,
      "grad_norm": 1.3387707471847534,
      "learning_rate": 0.00019654872959986937,
      "loss": 1.4979,
      "step": 49
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.0316563844680786,
      "learning_rate": 0.0001963705643889941,
      "loss": 1.2347,
      "step": 50
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.2246055603027344,
      "learning_rate": 0.00019618800035407658,
      "loss": 1.7885,
      "step": 51
    },
    {
      "epoch": 0.11,
      "grad_norm": 1.443323016166687,
      "learning_rate": 0.0001960010458282326,
      "loss": 1.2274,
      "step": 52
    },
    {
      "epoch": 0.11,
      "grad_norm": 0.8868201375007629,
      "learning_rate": 0.0001958097093449813,
      "loss": 0.9314,
      "step": 53
    },
    {
      "epoch": 0.11,
      "grad_norm": 2.7264301776885986,
      "learning_rate": 0.00019561399963785586,
      "loss": 1.1364,
      "step": 54
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.450035810470581,
      "learning_rate": 0.00019541392564000488,
      "loss": 1.6993,
      "step": 55
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.0921446084976196,
      "learning_rate": 0.00019520949648378443,
      "loss": 1.4098,
      "step": 56
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.580201268196106,
      "learning_rate": 0.00019500072150034137,
      "loss": 1.0974,
      "step": 57
    },
    {
      "epoch": 0.12,
      "grad_norm": 1.6637622117996216,
      "learning_rate": 0.00019478761021918728,
      "loss": 1.4646,
      "step": 58
    },
    {
      "epoch": 0.12,
      "grad_norm": 0.7193565964698792,
      "learning_rate": 0.00019457017236776373,
      "loss": 1.2315,
      "step": 59
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.8072863817214966,
      "learning_rate": 0.00019434841787099803,
      "loss": 1.1918,
      "step": 60
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.3189361095428467,
      "learning_rate": 0.00019412235685085035,
      "loss": 1.5442,
      "step": 61
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.0731955766677856,
      "learning_rate": 0.00019389199962585157,
      "loss": 0.9577,
      "step": 62
    },
    {
      "epoch": 0.13,
      "grad_norm": 0.7714097499847412,
      "learning_rate": 0.0001936573567106325,
      "loss": 1.6435,
      "step": 63
    },
    {
      "epoch": 0.13,
      "grad_norm": 1.1686166524887085,
      "learning_rate": 0.00019341843881544372,
      "loss": 1.7296,
      "step": 64
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8492275476455688,
      "learning_rate": 0.00019317525684566685,
      "loss": 1.4221,
      "step": 65
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.8079515099525452,
      "learning_rate": 0.00019292782190131677,
      "loss": 1.234,
      "step": 66
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.6675179600715637,
      "learning_rate": 0.00019267614527653488,
      "loss": 1.2457,
      "step": 67
    },
    {
      "epoch": 0.14,
      "grad_norm": 0.5377606153488159,
      "learning_rate": 0.0001924202384590736,
      "loss": 1.4115,
      "step": 68
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6757211089134216,
      "learning_rate": 0.0001921601131297721,
      "loss": 1.0735,
      "step": 69
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.585841178894043,
      "learning_rate": 0.00019189578116202307,
      "loss": 1.3859,
      "step": 70
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.6633312106132507,
      "learning_rate": 0.00019162725462123072,
      "loss": 1.1772,
      "step": 71
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7894064784049988,
      "learning_rate": 0.0001913545457642601,
      "loss": 0.9443,
      "step": 72
    },
    {
      "epoch": 0.15,
      "grad_norm": 0.7125688195228577,
      "learning_rate": 0.00019107766703887764,
      "loss": 1.2188,
      "step": 73
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.4715336561203003,
      "learning_rate": 0.00019079663108318302,
      "loss": 1.3095,
      "step": 74
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.017150402069092,
      "learning_rate": 0.00019051145072503215,
      "loss": 1.704,
      "step": 75
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.5130548477172852,
      "learning_rate": 0.00019022213898145176,
      "loss": 1.2175,
      "step": 76
    },
    {
      "epoch": 0.16,
      "grad_norm": 0.931109607219696,
      "learning_rate": 0.00018992870905804534,
      "loss": 0.9057,
      "step": 77
    },
    {
      "epoch": 0.16,
      "grad_norm": 3.687540292739868,
      "learning_rate": 0.0001896311743483901,
      "loss": 1.0161,
      "step": 78
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.5885124802589417,
      "learning_rate": 0.00018932954843342591,
      "loss": 1.2787,
      "step": 79
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.636814534664154,
      "learning_rate": 0.00018902384508083517,
      "loss": 1.0253,
      "step": 80
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.0739303827285767,
      "learning_rate": 0.0001887140782444145,
      "loss": 1.1437,
      "step": 81
    },
    {
      "epoch": 0.17,
      "grad_norm": 0.6316006183624268,
      "learning_rate": 0.00018840026206343784,
      "loss": 0.4953,
      "step": 82
    },
    {
      "epoch": 0.17,
      "grad_norm": 1.1502597332000732,
      "learning_rate": 0.00018808241086201103,
      "loss": 1.043,
      "step": 83
    },
    {
      "epoch": 0.18,
      "grad_norm": 1.3769752979278564,
      "learning_rate": 0.0001877605391484179,
      "loss": 1.2975,
      "step": 84
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.9198704957962036,
      "learning_rate": 0.00018743466161445823,
      "loss": 1.3415,
      "step": 85
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.8441985845565796,
      "learning_rate": 0.00018710479313477696,
      "loss": 0.9262,
      "step": 86
    },
    {
      "epoch": 0.18,
      "grad_norm": 0.6904205679893494,
      "learning_rate": 0.00018677094876618538,
      "loss": 1.0266,
      "step": 87
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.9781098365783691,
      "learning_rate": 0.00018643314374697378,
      "loss": 1.8946,
      "step": 88
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.5499415397644043,
      "learning_rate": 0.00018609139349621588,
      "loss": 0.8428,
      "step": 89
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.4500677585601807,
      "learning_rate": 0.0001857457136130651,
      "loss": 1.3847,
      "step": 90
    },
    {
      "epoch": 0.19,
      "grad_norm": 1.2159172296524048,
      "learning_rate": 0.00018539611987604258,
      "loss": 1.0733,
      "step": 91
    },
    {
      "epoch": 0.19,
      "grad_norm": 0.3588328957557678,
      "learning_rate": 0.00018504262824231674,
      "loss": 1.3488,
      "step": 92
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.6259031891822815,
      "learning_rate": 0.00018468525484697525,
      "loss": 1.9598,
      "step": 93
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.32454252243042,
      "learning_rate": 0.00018432401600228823,
      "loss": 0.7533,
      "step": 94
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.4529890716075897,
      "learning_rate": 0.00018395892819696389,
      "loss": 1.816,
      "step": 95
    },
    {
      "epoch": 0.2,
      "grad_norm": 0.5825135707855225,
      "learning_rate": 0.00018359000809539585,
      "loss": 1.104,
      "step": 96
    },
    {
      "epoch": 0.2,
      "grad_norm": 1.4329938888549805,
      "learning_rate": 0.0001832172725369024,
      "loss": 1.37,
      "step": 97
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.0493029356002808,
      "learning_rate": 0.00018284073853495807,
      "loss": 1.3342,
      "step": 98
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.8605116605758667,
      "learning_rate": 0.00018246042327641678,
      "loss": 1.26,
      "step": 99
    },
    {
      "epoch": 0.21,
      "grad_norm": 0.775687575340271,
      "learning_rate": 0.00018207634412072764,
      "loss": 0.8628,
      "step": 100
    },
    {
      "epoch": 0.21,
      "grad_norm": 4.881119251251221,
      "learning_rate": 0.0001816885185991424,
      "loss": 1.6938,
      "step": 101
    },
    {
      "epoch": 0.21,
      "grad_norm": 1.4081553220748901,
      "learning_rate": 0.00018129696441391522,
      "loss": 1.7014,
      "step": 102
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.636123538017273,
      "learning_rate": 0.00018090169943749476,
      "loss": 1.4061,
      "step": 103
    },
    {
      "epoch": 0.22,
      "grad_norm": 0.2846980392932892,
      "learning_rate": 0.00018050274171170836,
      "loss": 0.5465,
      "step": 104
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.0350561141967773,
      "learning_rate": 0.00018010010944693848,
      "loss": 1.7158,
      "step": 105
    },
    {
      "epoch": 0.22,
      "grad_norm": 1.09168541431427,
      "learning_rate": 0.0001796938210212915,
      "loss": 1.0047,
      "step": 106
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.8954039812088013,
      "learning_rate": 0.00017928389497975895,
      "loss": 1.7889,
      "step": 107
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.9749327301979065,
      "learning_rate": 0.00017887035003337083,
      "loss": 1.9958,
      "step": 108
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.0174291133880615,
      "learning_rate": 0.00017845320505834175,
      "loss": 1.5635,
      "step": 109
    },
    {
      "epoch": 0.23,
      "grad_norm": 1.013778805732727,
      "learning_rate": 0.0001780324790952092,
      "loss": 1.1228,
      "step": 110
    },
    {
      "epoch": 0.23,
      "grad_norm": 0.6990927457809448,
      "learning_rate": 0.0001776081913479645,
      "loss": 1.2748,
      "step": 111
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.4960625171661377,
      "learning_rate": 0.0001771803611831762,
      "loss": 1.1509,
      "step": 112
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7278648018836975,
      "learning_rate": 0.0001767490081291062,
      "loss": 1.1221,
      "step": 113
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.7677181363105774,
      "learning_rate": 0.0001763141518748182,
      "loss": 1.7333,
      "step": 114
    },
    {
      "epoch": 0.24,
      "grad_norm": 1.0699597597122192,
      "learning_rate": 0.0001758758122692791,
      "loss": 1.3511,
      "step": 115
    },
    {
      "epoch": 0.24,
      "grad_norm": 0.5539786219596863,
      "learning_rate": 0.00017543400932045307,
      "loss": 1.171,
      "step": 116
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.4356689751148224,
      "learning_rate": 0.0001749887631943882,
      "loss": 1.2168,
      "step": 117
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.6579217910766602,
      "learning_rate": 0.00017454009421429597,
      "loss": 1.331,
      "step": 118
    },
    {
      "epoch": 0.25,
      "grad_norm": 2.777512311935425,
      "learning_rate": 0.00017408802285962368,
      "loss": 1.4826,
      "step": 119
    },
    {
      "epoch": 0.25,
      "eval_loss": 1.2498269081115723,
      "eval_runtime": 11.2195,
      "eval_samples_per_second": 8.913,
      "eval_steps_per_second": 8.913,
      "step": 119
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.47785863280296326,
      "learning_rate": 0.00017363256976511972,
      "loss": 1.4644,
      "step": 120
    },
    {
      "epoch": 0.25,
      "grad_norm": 0.5930947661399841,
      "learning_rate": 0.00017317375571989158,
      "loss": 1.591,
      "step": 121
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.7645952701568604,
      "learning_rate": 0.00017271160166645695,
      "loss": 1.2038,
      "step": 122
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.7254743576049805,
      "learning_rate": 0.0001722461286997879,
      "loss": 1.1559,
      "step": 123
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.39301592111587524,
      "learning_rate": 0.00017177735806634789,
      "loss": 1.1492,
      "step": 124
    },
    {
      "epoch": 0.26,
      "grad_norm": 0.8771683573722839,
      "learning_rate": 0.00017130531116312203,
      "loss": 1.0438,
      "step": 125
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.6636186242103577,
      "learning_rate": 0.0001708300095366405,
      "loss": 1.0158,
      "step": 126
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.7356956601142883,
      "learning_rate": 0.00017035147488199482,
      "loss": 1.2417,
      "step": 127
    },
    {
      "epoch": 0.27,
      "grad_norm": 3.8965201377868652,
      "learning_rate": 0.00016986972904184784,
      "loss": 1.3653,
      "step": 128
    },
    {
      "epoch": 0.27,
      "grad_norm": 0.6314372420310974,
      "learning_rate": 0.00016938479400543658,
      "loss": 0.9501,
      "step": 129
    },
    {
      "epoch": 0.27,
      "grad_norm": 1.1143038272857666,
      "learning_rate": 0.00016889669190756868,
      "loss": 1.2585,
      "step": 130
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.9126316905021667,
      "learning_rate": 0.00016840544502761176,
      "loss": 0.9933,
      "step": 131
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.7595810294151306,
      "learning_rate": 0.0001679110757884769,
      "loss": 1.3224,
      "step": 132
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.7331580519676208,
      "learning_rate": 0.00016741360675559473,
      "loss": 1.451,
      "step": 133
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.37914204597473145,
      "learning_rate": 0.00016691306063588583,
      "loss": 1.0334,
      "step": 134
    },
    {
      "epoch": 0.28,
      "grad_norm": 0.7935598492622375,
      "learning_rate": 0.00016640946027672392,
      "loss": 1.401,
      "step": 135
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.3880078196525574,
      "learning_rate": 0.00016590282866489319,
      "loss": 1.2831,
      "step": 136
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.7817143201828003,
      "learning_rate": 0.0001653931889255391,
      "loss": 1.2609,
      "step": 137
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.7870498299598694,
      "learning_rate": 0.0001648805643211127,
      "loss": 0.8674,
      "step": 138
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.5795213580131531,
      "learning_rate": 0.00016436497825030884,
      "loss": 0.9604,
      "step": 139
    },
    {
      "epoch": 0.29,
      "grad_norm": 0.5516975522041321,
      "learning_rate": 0.00016384645424699835,
      "loss": 0.6344,
      "step": 140
    },
    {
      "epoch": 0.3,
      "grad_norm": 1.6294902563095093,
      "learning_rate": 0.00016332501597915352,
      "loss": 1.0385,
      "step": 141
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.3721281886100769,
      "learning_rate": 0.00016280068724776797,
      "loss": 1.0667,
      "step": 142
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.632927656173706,
      "learning_rate": 0.0001622734919857702,
      "loss": 0.5996,
      "step": 143
    },
    {
      "epoch": 0.3,
      "grad_norm": 0.7089216709136963,
      "learning_rate": 0.0001617434542569313,
      "loss": 1.0407,
      "step": 144
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.2346296310424805,
      "learning_rate": 0.0001612105982547663,
      "loss": 1.4111,
      "step": 145
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.5850281119346619,
      "learning_rate": 0.00016067494830143014,
      "loss": 1.1949,
      "step": 146
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.6999644041061401,
      "learning_rate": 0.00016013652884660723,
      "loss": 1.2583,
      "step": 147
    },
    {
      "epoch": 0.31,
      "grad_norm": 0.6640311479568481,
      "learning_rate": 0.0001595953644663957,
      "loss": 0.8627,
      "step": 148
    },
    {
      "epoch": 0.31,
      "grad_norm": 1.467826008796692,
      "learning_rate": 0.00015905147986218547,
      "loss": 1.4436,
      "step": 149
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.5436001420021057,
      "learning_rate": 0.00015850489985953076,
      "loss": 1.1029,
      "step": 150
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.3373098373413086,
      "learning_rate": 0.000157955649407017,
      "loss": 1.0907,
      "step": 151
    },
    {
      "epoch": 0.32,
      "grad_norm": 0.8601608276367188,
      "learning_rate": 0.00015740375357512195,
      "loss": 1.285,
      "step": 152
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.090238332748413,
      "learning_rate": 0.0001568492375550715,
      "loss": 1.1262,
      "step": 153
    },
    {
      "epoch": 0.32,
      "grad_norm": 1.0742828845977783,
      "learning_rate": 0.00015629212665768978,
      "loss": 0.9301,
      "step": 154
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.6687757968902588,
      "learning_rate": 0.00015573244631224365,
      "loss": 1.3763,
      "step": 155
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.322456032037735,
      "learning_rate": 0.00015517022206528233,
      "loss": 1.157,
      "step": 156
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.552617073059082,
      "learning_rate": 0.00015460547957947104,
      "loss": 1.5864,
      "step": 157
    },
    {
      "epoch": 0.33,
      "grad_norm": 1.0862557888031006,
      "learning_rate": 0.0001540382446324198,
      "loss": 1.2378,
      "step": 158
    },
    {
      "epoch": 0.33,
      "grad_norm": 0.6755203008651733,
      "learning_rate": 0.00015346854311550673,
      "loss": 1.1782,
      "step": 159
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.5506089329719543,
      "learning_rate": 0.00015289640103269625,
      "loss": 1.5313,
      "step": 160
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.5224264860153198,
      "learning_rate": 0.0001523218444993522,
      "loss": 1.1505,
      "step": 161
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.8278419971466064,
      "learning_rate": 0.00015174489974104574,
      "loss": 1.4319,
      "step": 162
    },
    {
      "epoch": 0.34,
      "grad_norm": 0.9323415160179138,
      "learning_rate": 0.00015116559309235825,
      "loss": 1.3218,
      "step": 163
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.1334632635116577,
      "learning_rate": 0.00015058395099567935,
      "loss": 1.0519,
      "step": 164
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.3949350118637085,
      "learning_rate": 0.00015000000000000001,
      "loss": 1.0169,
      "step": 165
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.9846246242523193,
      "learning_rate": 0.0001494137667597006,
      "loss": 1.8383,
      "step": 166
    },
    {
      "epoch": 0.35,
      "grad_norm": 0.7132427096366882,
      "learning_rate": 0.0001488252780333342,
      "loss": 1.1292,
      "step": 167
    },
    {
      "epoch": 0.35,
      "grad_norm": 1.259857177734375,
      "learning_rate": 0.00014823456068240558,
      "loss": 0.929,
      "step": 168
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.3703701496124268,
      "learning_rate": 0.00014764164167014451,
      "loss": 1.5655,
      "step": 169
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.5980277061462402,
      "learning_rate": 0.0001470465480602756,
      "loss": 1.1459,
      "step": 170
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.2204481363296509,
      "learning_rate": 0.00014644930701578253,
      "loss": 0.8177,
      "step": 171
    },
    {
      "epoch": 0.36,
      "grad_norm": 0.325509637594223,
      "learning_rate": 0.00014584994579766865,
      "loss": 1.2372,
      "step": 172
    },
    {
      "epoch": 0.36,
      "grad_norm": 1.206936001777649,
      "learning_rate": 0.0001452484917637122,
      "loss": 1.3148,
      "step": 173
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.0160785913467407,
      "learning_rate": 0.00014464497236721778,
      "loss": 1.1832,
      "step": 174
    },
    {
      "epoch": 0.37,
      "grad_norm": 1.3579516410827637,
      "learning_rate": 0.00014403941515576344,
      "loss": 1.3798,
      "step": 175
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.47968143224716187,
      "learning_rate": 0.00014343184776994289,
      "loss": 1.0797,
      "step": 176
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.9022389650344849,
      "learning_rate": 0.00014282229794210404,
      "loss": 1.3824,
      "step": 177
    },
    {
      "epoch": 0.37,
      "grad_norm": 0.9592376947402954,
      "learning_rate": 0.0001422107934950832,
      "loss": 0.7374,
      "step": 178
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8611739873886108,
      "learning_rate": 0.0001415973623409351,
      "loss": 1.4377,
      "step": 179
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.4279676079750061,
      "learning_rate": 0.00014098203247965875,
      "loss": 1.7355,
      "step": 180
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.669457733631134,
      "learning_rate": 0.00014036483199791948,
      "loss": 1.3662,
      "step": 181
    },
    {
      "epoch": 0.38,
      "grad_norm": 0.8061684370040894,
      "learning_rate": 0.00013974578906776684,
      "loss": 1.2989,
      "step": 182
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.6208318471908569,
      "learning_rate": 0.00013912493194534874,
      "loss": 1.4503,
      "step": 183
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.9276289343833923,
      "learning_rate": 0.0001385022889696218,
      "loss": 1.7339,
      "step": 184
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.5193164944648743,
      "learning_rate": 0.0001378778885610576,
      "loss": 1.0053,
      "step": 185
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.7712852954864502,
      "learning_rate": 0.00013725175922034565,
      "loss": 0.7669,
      "step": 186
    },
    {
      "epoch": 0.39,
      "grad_norm": 0.6915028691291809,
      "learning_rate": 0.00013662392952709228,
      "loss": 1.2908,
      "step": 187
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.8135898113250732,
      "learning_rate": 0.00013599442813851632,
      "loss": 1.2639,
      "step": 188
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.9134801626205444,
      "learning_rate": 0.00013536328378814093,
      "loss": 1.3689,
      "step": 189
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.3660939037799835,
      "learning_rate": 0.00013473052528448201,
      "loss": 1.196,
      "step": 190
    },
    {
      "epoch": 0.4,
      "grad_norm": 3.6361420154571533,
      "learning_rate": 0.00013409618150973348,
      "loss": 2.6822,
      "step": 191
    },
    {
      "epoch": 0.4,
      "grad_norm": 0.4731757938861847,
      "learning_rate": 0.0001334602814184486,
      "loss": 1.1966,
      "step": 192
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.3311164081096649,
      "learning_rate": 0.00013282285403621864,
      "loss": 1.4858,
      "step": 193
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.9196639060974121,
      "learning_rate": 0.00013218392845834787,
      "loss": 1.2163,
      "step": 194
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.2580674886703491,
      "learning_rate": 0.00013154353384852558,
      "loss": 1.467,
      "step": 195
    },
    {
      "epoch": 0.41,
      "grad_norm": 1.3060836791992188,
      "learning_rate": 0.00013090169943749476,
      "loss": 2.2125,
      "step": 196
    },
    {
      "epoch": 0.41,
      "grad_norm": 0.5120143294334412,
      "learning_rate": 0.00013025845452171807,
      "loss": 1.3174,
      "step": 197
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.9528945684432983,
      "learning_rate": 0.00012961382846204055,
      "loss": 1.7378,
      "step": 198
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.5604123473167419,
      "learning_rate": 0.00012896785068234926,
      "loss": 1.287,
      "step": 199
    },
    {
      "epoch": 0.42,
      "grad_norm": 0.5821733474731445,
      "learning_rate": 0.00012832055066823038,
      "loss": 1.4721,
      "step": 200
    },
    {
      "epoch": 0.42,
      "grad_norm": 1.4492989778518677,
      "learning_rate": 0.0001276719579656236,
      "loss": 1.2461,
      "step": 201
    },
    {
      "epoch": 0.43,
      "grad_norm": 1.423685908317566,
      "learning_rate": 0.00012702210217947288,
      "loss": 0.9973,
      "step": 202
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5945116877555847,
      "learning_rate": 0.0001263710129723757,
      "loss": 0.6836,
      "step": 203
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.8192650079727173,
      "learning_rate": 0.00012571872006322888,
      "loss": 1.236,
      "step": 204
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.35752618312835693,
      "learning_rate": 0.00012506525322587207,
      "loss": 1.2113,
      "step": 205
    },
    {
      "epoch": 0.43,
      "grad_norm": 0.5102562308311462,
      "learning_rate": 0.00012441064228772874,
      "loss": 0.835,
      "step": 206
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.6736109852790833,
      "learning_rate": 0.0001237549171284447,
      "loss": 1.389,
      "step": 207
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.7494972348213196,
      "learning_rate": 0.00012309810767852433,
      "loss": 1.0185,
      "step": 208
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.34725117683410645,
      "learning_rate": 0.0001224402439179643,
      "loss": 0.9023,
      "step": 209
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.5315357446670532,
      "learning_rate": 0.00012178135587488515,
      "loss": 0.9621,
      "step": 210
    },
    {
      "epoch": 0.44,
      "grad_norm": 0.4834609031677246,
      "learning_rate": 0.00012112147362416076,
      "loss": 0.9703,
      "step": 211
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.5364122986793518,
      "learning_rate": 0.0001204606272860454,
      "loss": 1.4784,
      "step": 212
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.008988380432129,
      "learning_rate": 0.00011979884702479909,
      "loss": 1.6889,
      "step": 213
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.8513673543930054,
      "learning_rate": 0.00011913616304731063,
      "loss": 1.4391,
      "step": 214
    },
    {
      "epoch": 0.45,
      "grad_norm": 1.0368411540985107,
      "learning_rate": 0.00011847260560171896,
      "loss": 1.4572,
      "step": 215
    },
    {
      "epoch": 0.45,
      "grad_norm": 0.43699911236763,
      "learning_rate": 0.00011780820497603215,
      "loss": 0.9995,
      "step": 216
    },
    {
      "epoch": 0.46,
      "grad_norm": 1.1479334831237793,
      "learning_rate": 0.00011714299149674537,
      "loss": 0.9971,
      "step": 217
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.6493399739265442,
      "learning_rate": 0.00011647699552745628,
      "loss": 1.1328,
      "step": 218
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.8177739977836609,
      "learning_rate": 0.00011581024746747924,
      "loss": 1.2741,
      "step": 219
    },
    {
      "epoch": 0.46,
      "grad_norm": 0.31355175375938416,
      "learning_rate": 0.00011514277775045768,
      "loss": 1.2813,
      "step": 220
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.5200531482696533,
      "learning_rate": 0.00011447461684297504,
      "loss": 1.4285,
      "step": 221
    },
    {
      "epoch": 0.47,
      "grad_norm": 1.6473718881607056,
      "learning_rate": 0.00011380579524316406,
      "loss": 1.5263,
      "step": 222
    },
    {
      "epoch": 0.47,
      "grad_norm": 2.004498243331909,
      "learning_rate": 0.00011313634347931466,
      "loss": 0.9576,
      "step": 223
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.9827050566673279,
      "learning_rate": 0.00011246629210848061,
      "loss": 1.3642,
      "step": 224
    },
    {
      "epoch": 0.47,
      "grad_norm": 0.6069352626800537,
      "learning_rate": 0.00011179567171508463,
      "loss": 1.3768,
      "step": 225
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.4674800634384155,
      "learning_rate": 0.00011112451290952237,
      "loss": 0.9445,
      "step": 226
    },
    {
      "epoch": 0.48,
      "grad_norm": 1.1005616188049316,
      "learning_rate": 0.00011045284632676536,
      "loss": 1.5748,
      "step": 227
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.578959584236145,
      "learning_rate": 0.00010978070262496247,
      "loss": 1.3462,
      "step": 228
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.9835721254348755,
      "learning_rate": 0.00010910811248404065,
      "loss": 2.2544,
      "step": 229
    },
    {
      "epoch": 0.48,
      "grad_norm": 0.37735217809677124,
      "learning_rate": 0.00010843510660430447,
      "loss": 1.585,
      "step": 230
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.3374781608581543,
      "learning_rate": 0.00010776171570503499,
      "loss": 0.7627,
      "step": 231
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.3360700607299805,
      "learning_rate": 0.0001070879705230873,
      "loss": 1.8169,
      "step": 232
    },
    {
      "epoch": 0.49,
      "grad_norm": 0.6257541179656982,
      "learning_rate": 0.00010641390181148772,
      "loss": 1.0015,
      "step": 233
    },
    {
      "epoch": 0.49,
      "grad_norm": 1.234805703163147,
      "learning_rate": 0.00010573954033803007,
      "loss": 1.3024,
      "step": 234
    },
    {
      "epoch": 0.49,
      "grad_norm": 2.442201852798462,
      "learning_rate": 0.00010506491688387127,
      "loss": 1.3141,
      "step": 235
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.39670243859291077,
      "learning_rate": 0.00010439006224212628,
      "loss": 0.9339,
      "step": 236
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.37090030312538147,
      "learning_rate": 0.00010371500721646261,
      "loss": 1.3281,
      "step": 237
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.6054628491401672,
      "learning_rate": 0.0001030397826196943,
      "loss": 1.2919,
      "step": 238
    },
    {
      "epoch": 0.5,
      "eval_loss": 1.2451566457748413,
      "eval_runtime": 11.2081,
      "eval_samples_per_second": 8.922,
      "eval_steps_per_second": 8.922,
      "step": 238
    },
    {
      "epoch": 0.5,
      "grad_norm": 0.8905054926872253,
      "learning_rate": 0.00010236441927237535,
      "loss": 1.5113,
      "step": 239
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.8593491911888123,
      "learning_rate": 0.0001016889480013931,
      "loss": 0.9025,
      "step": 240
    },
    {
      "epoch": 0.51,
      "grad_norm": 0.9700817465782166,
      "learning_rate": 0.00010101339963856111,
      "loss": 1.2504,
      "step": 241
    },
    {
      "epoch": 0.51,
      "grad_norm": 1.2512820959091187,
      "learning_rate": 0.00010033780501921164,
      "loss": 1.769,
      "step": 242
    },
    {
      "epoch": 0.51,
      "grad_norm": 3.91457462310791,
      "learning_rate": 9.966219498078839e-05,
      "loss": 2.3025,
      "step": 243
    },
    {
      "epoch": 0.51,
      "grad_norm": 2.128582239151001,
      "learning_rate": 9.898660036143893e-05,
      "loss": 2.0598,
      "step": 244
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.3125651478767395,
      "learning_rate": 9.83110519986069e-05,
      "loss": 1.0118,
      "step": 245
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.2437289953231812,
      "learning_rate": 9.763558072762468e-05,
      "loss": 1.2138,
      "step": 246
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.653993546962738,
      "learning_rate": 9.696021738030575e-05,
      "loss": 0.983,
      "step": 247
    },
    {
      "epoch": 0.52,
      "grad_norm": 1.0555764436721802,
      "learning_rate": 9.62849927835374e-05,
      "loss": 1.4967,
      "step": 248
    },
    {
      "epoch": 0.52,
      "grad_norm": 0.6005884408950806,
      "learning_rate": 9.560993775787373e-05,
      "loss": 0.9319,
      "step": 249
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.776595413684845,
      "learning_rate": 9.493508311612874e-05,
      "loss": 1.2568,
      "step": 250
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.8884586691856384,
      "learning_rate": 9.426045966196993e-05,
      "loss": 1.4376,
      "step": 251
    },
    {
      "epoch": 0.53,
      "grad_norm": 0.7831926345825195,
      "learning_rate": 9.358609818851229e-05,
      "loss": 1.2657,
      "step": 252
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.2989636659622192,
      "learning_rate": 9.291202947691271e-05,
      "loss": 1.9303,
      "step": 253
    },
    {
      "epoch": 0.53,
      "grad_norm": 1.6510024070739746,
      "learning_rate": 9.223828429496499e-05,
      "loss": 1.361,
      "step": 254
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.6142978668212891,
      "learning_rate": 9.156489339569554e-05,
      "loss": 1.2702,
      "step": 255
    },
    {
      "epoch": 0.54,
      "grad_norm": 1.1854060888290405,
      "learning_rate": 9.089188751595936e-05,
      "loss": 0.6902,
      "step": 256
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.6005362868309021,
      "learning_rate": 9.021929737503757e-05,
      "loss": 1.0575,
      "step": 257
    },
    {
      "epoch": 0.54,
      "grad_norm": 0.7906481027603149,
      "learning_rate": 8.954715367323468e-05,
      "loss": 0.9277,
      "step": 258
    },
    {
      "epoch": 0.55,
      "grad_norm": 2.450592041015625,
      "learning_rate": 8.887548709047764e-05,
      "loss": 1.6923,
      "step": 259
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.780250072479248,
      "learning_rate": 8.820432828491542e-05,
      "loss": 1.0708,
      "step": 260
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.6275424957275391,
      "learning_rate": 8.753370789151941e-05,
      "loss": 1.2547,
      "step": 261
    },
    {
      "epoch": 0.55,
      "grad_norm": 1.6233257055282593,
      "learning_rate": 8.686365652068535e-05,
      "loss": 1.2188,
      "step": 262
    },
    {
      "epoch": 0.55,
      "grad_norm": 0.9274204969406128,
      "learning_rate": 8.619420475683597e-05,
      "loss": 1.2182,
      "step": 263
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.7996556162834167,
      "learning_rate": 8.552538315702498e-05,
      "loss": 0.953,
      "step": 264
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8006198406219482,
      "learning_rate": 8.485722224954237e-05,
      "loss": 0.8871,
      "step": 265
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.3429837226867676,
      "learning_rate": 8.418975253252078e-05,
      "loss": 1.3951,
      "step": 266
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.6119269728660583,
      "learning_rate": 8.352300447254372e-05,
      "loss": 1.4484,
      "step": 267
    },
    {
      "epoch": 0.56,
      "grad_norm": 0.8381280303001404,
      "learning_rate": 8.285700850325467e-05,
      "loss": 1.4779,
      "step": 268
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.9022213816642761,
      "learning_rate": 8.219179502396787e-05,
      "loss": 1.6646,
      "step": 269
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.6201476454734802,
      "learning_rate": 8.15273943982811e-05,
      "loss": 1.6252,
      "step": 270
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.8065648674964905,
      "learning_rate": 8.086383695268938e-05,
      "loss": 1.0879,
      "step": 271
    },
    {
      "epoch": 0.57,
      "grad_norm": 0.40957608819007874,
      "learning_rate": 8.020115297520093e-05,
      "loss": 1.4822,
      "step": 272
    },
    {
      "epoch": 0.57,
      "grad_norm": 2.3640942573547363,
      "learning_rate": 7.953937271395464e-05,
      "loss": 1.4912,
      "step": 273
    },
    {
      "epoch": 0.58,
      "grad_norm": 1.3901112079620361,
      "learning_rate": 7.887852637583926e-05,
      "loss": 1.2811,
      "step": 274
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.41163328289985657,
      "learning_rate": 7.821864412511485e-05,
      "loss": 1.4811,
      "step": 275
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.8646697402000427,
      "learning_rate": 7.755975608203572e-05,
      "loss": 1.1372,
      "step": 276
    },
    {
      "epoch": 0.58,
      "grad_norm": 0.7892725467681885,
      "learning_rate": 7.690189232147566e-05,
      "loss": 1.239,
      "step": 277
    },
    {
      "epoch": 0.59,
      "grad_norm": 1.0216518640518188,
      "learning_rate": 7.624508287155533e-05,
      "loss": 1.9391,
      "step": 278
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.6296453475952148,
      "learning_rate": 7.558935771227129e-05,
      "loss": 1.283,
      "step": 279
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.7754496335983276,
      "learning_rate": 7.493474677412794e-05,
      "loss": 1.5106,
      "step": 280
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.9793290495872498,
      "learning_rate": 7.428127993677115e-05,
      "loss": 1.6032,
      "step": 281
    },
    {
      "epoch": 0.59,
      "grad_norm": 0.9508350491523743,
      "learning_rate": 7.362898702762433e-05,
      "loss": 1.4869,
      "step": 282
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.551931619644165,
      "learning_rate": 7.297789782052717e-05,
      "loss": 1.268,
      "step": 283
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.45385247468948364,
      "learning_rate": 7.232804203437644e-05,
      "loss": 1.0128,
      "step": 284
    },
    {
      "epoch": 0.6,
      "grad_norm": 0.7140306830406189,
      "learning_rate": 7.16794493317696e-05,
      "loss": 1.24,
      "step": 285
    },
    {
      "epoch": 0.6,
      "grad_norm": 5.549449920654297,
      "learning_rate": 7.10321493176508e-05,
      "loss": 1.6162,
      "step": 286
    },
    {
      "epoch": 0.6,
      "grad_norm": 1.3684600591659546,
      "learning_rate": 7.038617153795948e-05,
      "loss": 1.8522,
      "step": 287
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.850640058517456,
      "learning_rate": 6.974154547828191e-05,
      "loss": 1.9203,
      "step": 288
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.7197821140289307,
      "learning_rate": 6.909830056250527e-05,
      "loss": 1.2131,
      "step": 289
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.7707614898681641,
      "learning_rate": 6.845646615147445e-05,
      "loss": 1.7302,
      "step": 290
    },
    {
      "epoch": 0.61,
      "grad_norm": 0.7183483242988586,
      "learning_rate": 6.781607154165218e-05,
      "loss": 0.676,
      "step": 291
    },
    {
      "epoch": 0.61,
      "grad_norm": 2.40576171875,
      "learning_rate": 6.717714596378137e-05,
      "loss": 1.399,
      "step": 292
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.5833460688591003,
      "learning_rate": 6.653971858155141e-05,
      "loss": 1.3112,
      "step": 293
    },
    {
      "epoch": 0.62,
      "grad_norm": 1.4274028539657593,
      "learning_rate": 6.590381849026655e-05,
      "loss": 1.7495,
      "step": 294
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.43348875641822815,
      "learning_rate": 6.526947471551798e-05,
      "loss": 1.5504,
      "step": 295
    },
    {
      "epoch": 0.62,
      "grad_norm": 0.9081869721412659,
      "learning_rate": 6.463671621185908e-05,
      "loss": 1.0873,
      "step": 296
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.0856722593307495,
      "learning_rate": 6.40055718614837e-05,
      "loss": 1.2858,
      "step": 297
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.3667042851448059,
      "learning_rate": 6.337607047290774e-05,
      "loss": 1.1236,
      "step": 298
    },
    {
      "epoch": 0.63,
      "grad_norm": 1.053734540939331,
      "learning_rate": 6.274824077965438e-05,
      "loss": 1.0311,
      "step": 299
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.639960765838623,
      "learning_rate": 6.21221114389424e-05,
      "loss": 1.2877,
      "step": 300
    },
    {
      "epoch": 0.63,
      "grad_norm": 0.3543897569179535,
      "learning_rate": 6.149771103037821e-05,
      "loss": 1.2895,
      "step": 301
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.5615779161453247,
      "learning_rate": 6.0875068054651266e-05,
      "loss": 1.3834,
      "step": 302
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.6555135846138,
      "learning_rate": 6.0254210932233176e-05,
      "loss": 1.1616,
      "step": 303
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.647588312625885,
      "learning_rate": 5.9635168002080564e-05,
      "loss": 1.3614,
      "step": 304
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.6642404794692993,
      "learning_rate": 5.901796752034128e-05,
      "loss": 1.321,
      "step": 305
    },
    {
      "epoch": 0.64,
      "grad_norm": 0.41330766677856445,
      "learning_rate": 5.8402637659064895e-05,
      "loss": 1.1208,
      "step": 306
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.7018295526504517,
      "learning_rate": 5.7789206504916816e-05,
      "loss": 1.408,
      "step": 307
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.5185045599937439,
      "learning_rate": 5.717770205789601e-05,
      "loss": 1.2841,
      "step": 308
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.5689573287963867,
      "learning_rate": 5.656815223005714e-05,
      "loss": 0.8656,
      "step": 309
    },
    {
      "epoch": 0.65,
      "grad_norm": 2.1489999294281006,
      "learning_rate": 5.596058484423656e-05,
      "loss": 1.0203,
      "step": 310
    },
    {
      "epoch": 0.65,
      "grad_norm": 0.6517840027809143,
      "learning_rate": 5.535502763278222e-05,
      "loss": 1.2159,
      "step": 311
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.47839996218681335,
      "learning_rate": 5.4751508236287865e-05,
      "loss": 0.9904,
      "step": 312
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.5880618691444397,
      "learning_rate": 5.415005420233141e-05,
      "loss": 0.6293,
      "step": 313
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.8745140433311462,
      "learning_rate": 5.355069298421747e-05,
      "loss": 0.9696,
      "step": 314
    },
    {
      "epoch": 0.66,
      "grad_norm": 0.5589584112167358,
      "learning_rate": 5.2953451939724454e-05,
      "loss": 0.7707,
      "step": 315
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.5342937111854553,
      "learning_rate": 5.2358358329855516e-05,
      "loss": 1.0788,
      "step": 316
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.537293016910553,
      "learning_rate": 5.1765439317594466e-05,
      "loss": 1.4954,
      "step": 317
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.48457396030426025,
      "learning_rate": 5.1174721966665774e-05,
      "loss": 1.0569,
      "step": 318
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.46171680092811584,
      "learning_rate": 5.058623324029944e-05,
      "loss": 1.6199,
      "step": 319
    },
    {
      "epoch": 0.67,
      "grad_norm": 0.9995810985565186,
      "learning_rate": 5.000000000000002e-05,
      "loss": 1.5058,
      "step": 320
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.8011060357093811,
      "learning_rate": 4.941604900432065e-05,
      "loss": 1.0958,
      "step": 321
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.5803133845329285,
      "learning_rate": 4.8834406907641784e-05,
      "loss": 0.8619,
      "step": 322
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.2584335803985596,
      "learning_rate": 4.825510025895429e-05,
      "loss": 1.4993,
      "step": 323
    },
    {
      "epoch": 0.68,
      "grad_norm": 0.43015971779823303,
      "learning_rate": 4.767815550064778e-05,
      "loss": 0.8414,
      "step": 324
    },
    {
      "epoch": 0.68,
      "grad_norm": 1.1480287313461304,
      "learning_rate": 4.710359896730379e-05,
      "loss": 1.4084,
      "step": 325
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.6011125445365906,
      "learning_rate": 4.65314568844933e-05,
      "loss": 0.9027,
      "step": 326
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.9287271499633789,
      "learning_rate": 4.596175536758024e-05,
      "loss": 1.3626,
      "step": 327
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.5835272669792175,
      "learning_rate": 4.539452042052901e-05,
      "loss": 0.9265,
      "step": 328
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.9826067686080933,
      "learning_rate": 4.482977793471769e-05,
      "loss": 0.9542,
      "step": 329
    },
    {
      "epoch": 0.69,
      "grad_norm": 0.4515445828437805,
      "learning_rate": 4.426755368775637e-05,
      "loss": 1.1758,
      "step": 330
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.791499674320221,
      "learning_rate": 4.3707873342310254e-05,
      "loss": 0.6814,
      "step": 331
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.8545525074005127,
      "learning_rate": 4.3150762444928473e-05,
      "loss": 1.4359,
      "step": 332
    },
    {
      "epoch": 0.7,
      "grad_norm": 0.8281897306442261,
      "learning_rate": 4.259624642487805e-05,
      "loss": 1.5126,
      "step": 333
    },
    {
      "epoch": 0.7,
      "grad_norm": 1.3756364583969116,
      "learning_rate": 4.204435059298303e-05,
      "loss": 0.8866,
      "step": 334
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.1280224323272705,
      "learning_rate": 4.149510014046922e-05,
      "loss": 0.9567,
      "step": 335
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.3786430060863495,
      "learning_rate": 4.094852013781456e-05,
      "loss": 1.2275,
      "step": 336
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.3916789293289185,
      "learning_rate": 4.040463553360431e-05,
      "loss": 3.2689,
      "step": 337
    },
    {
      "epoch": 0.71,
      "grad_norm": 0.828298032283783,
      "learning_rate": 3.9863471153392804e-05,
      "loss": 1.0928,
      "step": 338
    },
    {
      "epoch": 0.71,
      "grad_norm": 1.3147966861724854,
      "learning_rate": 3.9325051698569925e-05,
      "loss": 1.7501,
      "step": 339
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.8213374018669128,
      "learning_rate": 3.878940174523371e-05,
      "loss": 1.5596,
      "step": 340
    },
    {
      "epoch": 0.72,
      "grad_norm": 2.037734270095825,
      "learning_rate": 3.8256545743068725e-05,
      "loss": 1.3332,
      "step": 341
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.9790574908256531,
      "learning_rate": 3.772650801422982e-05,
      "loss": 1.0873,
      "step": 342
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.4515441954135895,
      "learning_rate": 3.719931275223205e-05,
      "loss": 1.3623,
      "step": 343
    },
    {
      "epoch": 0.72,
      "grad_norm": 0.9941519498825073,
      "learning_rate": 3.6674984020846504e-05,
      "loss": 1.0522,
      "step": 344
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.93306565284729,
      "learning_rate": 3.615354575300166e-05,
      "loss": 1.5876,
      "step": 345
    },
    {
      "epoch": 0.73,
      "grad_norm": 2.297698497772217,
      "learning_rate": 3.5635021749691166e-05,
      "loss": 0.8665,
      "step": 346
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.9025039076805115,
      "learning_rate": 3.511943567888732e-05,
      "loss": 0.4777,
      "step": 347
    },
    {
      "epoch": 0.73,
      "grad_norm": 0.38800546526908875,
      "learning_rate": 3.460681107446091e-05,
      "loss": 0.8782,
      "step": 348
    },
    {
      "epoch": 0.73,
      "grad_norm": 1.42990243434906,
      "learning_rate": 3.4097171335106824e-05,
      "loss": 1.1503,
      "step": 349
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.4309168756008148,
      "learning_rate": 3.3590539723276083e-05,
      "loss": 1.4906,
      "step": 350
    },
    {
      "epoch": 0.74,
      "grad_norm": 1.5898098945617676,
      "learning_rate": 3.308693936411421e-05,
      "loss": 1.2899,
      "step": 351
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.8075172305107117,
      "learning_rate": 3.258639324440527e-05,
      "loss": 0.7296,
      "step": 352
    },
    {
      "epoch": 0.74,
      "grad_norm": 0.7422662973403931,
      "learning_rate": 3.2088924211523144e-05,
      "loss": 1.5174,
      "step": 353
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.38478884100914,
      "learning_rate": 3.1594554972388265e-05,
      "loss": 1.3737,
      "step": 354
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.49486249685287476,
      "learning_rate": 3.110330809243134e-05,
      "loss": 1.3679,
      "step": 355
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.4216090738773346,
      "learning_rate": 3.061520599456341e-05,
      "loss": 1.3637,
      "step": 356
    },
    {
      "epoch": 0.75,
      "grad_norm": 0.46424904465675354,
      "learning_rate": 3.0130270958152197e-05,
      "loss": 1.3429,
      "step": 357
    },
    {
      "epoch": 0.75,
      "eval_loss": 1.2427868843078613,
      "eval_runtime": 10.4613,
      "eval_samples_per_second": 9.559,
      "eval_steps_per_second": 9.559,
      "step": 357
    },
    {
      "epoch": 0.75,
      "grad_norm": 1.1968390941619873,
      "learning_rate": 2.964852511800519e-05,
      "loss": 1.3649,
      "step": 358
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.45987701416015625,
      "learning_rate": 2.9169990463359555e-05,
      "loss": 1.4336,
      "step": 359
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.5114177465438843,
      "learning_rate": 2.869468883687798e-05,
      "loss": 1.4723,
      "step": 360
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.3620365560054779,
      "learning_rate": 2.8222641933652117e-05,
      "loss": 1.6468,
      "step": 361
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.9834029078483582,
      "learning_rate": 2.7753871300212142e-05,
      "loss": 0.963,
      "step": 362
    },
    {
      "epoch": 0.76,
      "grad_norm": 0.6030866503715515,
      "learning_rate": 2.7288398333543064e-05,
      "loss": 1.1532,
      "step": 363
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.7282407879829407,
      "learning_rate": 2.6826244280108437e-05,
      "loss": 1.2677,
      "step": 364
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.549340009689331,
      "learning_rate": 2.6367430234880284e-05,
      "loss": 1.1274,
      "step": 365
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.6679304838180542,
      "learning_rate": 2.591197714037631e-05,
      "loss": 1.5468,
      "step": 366
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.3590414822101593,
      "learning_rate": 2.5459905785704042e-05,
      "loss": 1.7081,
      "step": 367
    },
    {
      "epoch": 0.77,
      "grad_norm": 0.8412752747535706,
      "learning_rate": 2.5011236805611814e-05,
      "loss": 1.3058,
      "step": 368
    },
    {
      "epoch": 0.78,
      "grad_norm": 1.4232923984527588,
      "learning_rate": 2.4565990679546914e-05,
      "loss": 1.3649,
      "step": 369
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.7082968950271606,
      "learning_rate": 2.4124187730720917e-05,
      "loss": 1.3421,
      "step": 370
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.8737981915473938,
      "learning_rate": 2.368584812518184e-05,
      "loss": 0.8252,
      "step": 371
    },
    {
      "epoch": 0.78,
      "grad_norm": 0.6763678193092346,
      "learning_rate": 2.3250991870893835e-05,
      "loss": 1.8269,
      "step": 372
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.49625009298324585,
      "learning_rate": 2.2819638816823797e-05,
      "loss": 1.7375,
      "step": 373
    },
    {
      "epoch": 0.79,
      "grad_norm": 1.235646367073059,
      "learning_rate": 2.2391808652035517e-05,
      "loss": 1.0455,
      "step": 374
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.6838667988777161,
      "learning_rate": 2.1967520904790827e-05,
      "loss": 1.2117,
      "step": 375
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.5402644872665405,
      "learning_rate": 2.154679494165829e-05,
      "loss": 1.4113,
      "step": 376
    },
    {
      "epoch": 0.79,
      "grad_norm": 0.9066751599311829,
      "learning_rate": 2.1129649966629184e-05,
      "loss": 1.1857,
      "step": 377
    },
    {
      "epoch": 0.8,
      "grad_norm": 1.1335337162017822,
      "learning_rate": 2.0716105020241072e-05,
      "loss": 1.4199,
      "step": 378
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7712799310684204,
      "learning_rate": 2.0306178978708514e-05,
      "loss": 1.7568,
      "step": 379
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.3476005494594574,
      "learning_rate": 1.9899890553061562e-05,
      "loss": 1.4962,
      "step": 380
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.7450062036514282,
      "learning_rate": 1.9497258288291654e-05,
      "loss": 1.5029,
      "step": 381
    },
    {
      "epoch": 0.8,
      "grad_norm": 0.9961157441139221,
      "learning_rate": 1.9098300562505266e-05,
      "loss": 1.2156,
      "step": 382
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.5865272283554077,
      "learning_rate": 1.8703035586084816e-05,
      "loss": 0.8954,
      "step": 383
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.7002312541007996,
      "learning_rate": 1.831148140085762e-05,
      "loss": 1.3208,
      "step": 384
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.5696095824241638,
      "learning_rate": 1.7923655879272393e-05,
      "loss": 1.6606,
      "step": 385
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.6654199361801147,
      "learning_rate": 1.753957672358324e-05,
      "loss": 1.0694,
      "step": 386
    },
    {
      "epoch": 0.81,
      "grad_norm": 0.4102238714694977,
      "learning_rate": 1.7159261465041952e-05,
      "loss": 1.2681,
      "step": 387
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.5439934134483337,
      "learning_rate": 1.6782727463097624e-05,
      "loss": 1.018,
      "step": 388
    },
    {
      "epoch": 0.82,
      "grad_norm": 1.4591739177703857,
      "learning_rate": 1.6409991904604173e-05,
      "loss": 0.8686,
      "step": 389
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.4289948046207428,
      "learning_rate": 1.60410718030361e-05,
      "loss": 1.1516,
      "step": 390
    },
    {
      "epoch": 0.82,
      "grad_norm": 0.46624648571014404,
      "learning_rate": 1.5675983997711795e-05,
      "loss": 1.3106,
      "step": 391
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.7769433259963989,
      "learning_rate": 1.5314745153024766e-05,
      "loss": 1.205,
      "step": 392
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.6348716020584106,
      "learning_rate": 1.495737175768326e-05,
      "loss": 1.0937,
      "step": 393
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.4135714769363403,
      "learning_rate": 1.4603880123957447e-05,
      "loss": 1.0782,
      "step": 394
    },
    {
      "epoch": 0.83,
      "grad_norm": 0.7596187591552734,
      "learning_rate": 1.425428638693489e-05,
      "loss": 1.6273,
      "step": 395
    },
    {
      "epoch": 0.83,
      "grad_norm": 1.391519546508789,
      "learning_rate": 1.3908606503784139e-05,
      "loss": 1.4292,
      "step": 396
    },
    {
      "epoch": 0.84,
      "grad_norm": 1.8692115545272827,
      "learning_rate": 1.356685625302625e-05,
      "loss": 0.5871,
      "step": 397
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.47545358538627625,
      "learning_rate": 1.3229051233814637e-05,
      "loss": 1.0054,
      "step": 398
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.2672554552555084,
      "learning_rate": 1.2895206865223064e-05,
      "loss": 0.6172,
      "step": 399
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.6463977694511414,
      "learning_rate": 1.2565338385541792e-05,
      "loss": 2.0793,
      "step": 400
    },
    {
      "epoch": 0.84,
      "grad_norm": 0.5812274813652039,
|
"learning_rate": 1.2239460851582118e-05, |
|
"loss": 0.8392, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.5981879234313965, |
|
"learning_rate": 1.1917589137989005e-05, |
|
"loss": 1.459, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.9397989511489868, |
|
"learning_rate": 1.1599737936562149e-05, |
|
"loss": 1.3638, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.3462386429309845, |
|
"learning_rate": 1.1285921755585504e-05, |
|
"loss": 1.1605, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 1.203017234802246, |
|
"learning_rate": 1.097615491916485e-05, |
|
"loss": 1.5022, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 0.85, |
|
"grad_norm": 0.7160519957542419, |
|
"learning_rate": 1.0670451566574102e-05, |
|
"loss": 1.0726, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 0.9885064959526062, |
|
"learning_rate": 1.0368825651609893e-05, |
|
"loss": 1.0344, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.1007866859436035, |
|
"learning_rate": 1.007129094195468e-05, |
|
"loss": 1.4191, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.0664376020431519, |
|
"learning_rate": 9.777861018548251e-06, |
|
"loss": 1.8957, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 0.86, |
|
"grad_norm": 1.2938302755355835, |
|
"learning_rate": 9.488549274967872e-06, |
|
"loss": 1.181, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.7518212199211121, |
|
"learning_rate": 9.203368916817012e-06, |
|
"loss": 1.4975, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 1.3200393915176392, |
|
"learning_rate": 8.92233296112236e-06, |
|
"loss": 1.0157, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.677116334438324, |
|
"learning_rate": 8.645454235739903e-06, |
|
"loss": 1.1984, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.9666435122489929, |
|
"learning_rate": 8.372745378769309e-06, |
|
"loss": 1.1112, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 0.87, |
|
"grad_norm": 0.8352184295654297, |
|
"learning_rate": 8.10421883797694e-06, |
|
"loss": 1.0512, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 2.5026190280914307, |
|
"learning_rate": 7.839886870227909e-06, |
|
"loss": 1.1279, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.36853912472724915, |
|
"learning_rate": 7.5797615409264335e-06, |
|
"loss": 1.1051, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5076872110366821, |
|
"learning_rate": 7.32385472346514e-06, |
|
"loss": 1.597, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.9718438982963562, |
|
"learning_rate": 7.072178098683246e-06, |
|
"loss": 1.1807, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 0.88, |
|
"grad_norm": 0.5310528874397278, |
|
"learning_rate": 6.824743154333157e-06, |
|
"loss": 1.2764, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.9790662527084351, |
|
"learning_rate": 6.581561184556295e-06, |
|
"loss": 1.5014, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 2.499530553817749, |
|
"learning_rate": 6.342643289367522e-06, |
|
"loss": 1.3249, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.6649277210235596, |
|
"learning_rate": 6.108000374148448e-06, |
|
"loss": 1.432, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.8835659623146057, |
|
"learning_rate": 5.87764314914967e-06, |
|
"loss": 1.0439, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 0.89, |
|
"grad_norm": 0.7555325627326965, |
|
"learning_rate": 5.651582129001986e-06, |
|
"loss": 0.9622, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.9321297407150269, |
|
"learning_rate": 5.429827632236284e-06, |
|
"loss": 1.1174, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.7446428537368774, |
|
"learning_rate": 5.212389780812732e-06, |
|
"loss": 1.2175, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.4157393276691437, |
|
"learning_rate": 4.999278499658666e-06, |
|
"loss": 1.3221, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.4439353048801422, |
|
"learning_rate": 4.790503516215572e-06, |
|
"loss": 1.3804, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6780000329017639, |
|
"learning_rate": 4.586074359995119e-06, |
|
"loss": 1.5498, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6816204190254211, |
|
"learning_rate": 4.386000362144138e-06, |
|
"loss": 0.8413, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.4683542847633362, |
|
"learning_rate": 4.190290655018736e-06, |
|
"loss": 1.5352, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.4905780553817749, |
|
"learning_rate": 3.998954171767422e-06, |
|
"loss": 1.5878, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 0.91, |
|
"grad_norm": 0.6626597046852112, |
|
"learning_rate": 3.811999645923414e-06, |
|
"loss": 1.5102, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.5728758573532104, |
|
"learning_rate": 3.6294356110059157e-06, |
|
"loss": 1.3155, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.9096332788467407, |
|
"learning_rate": 3.451270400130646e-06, |
|
"loss": 1.5012, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.9138293266296387, |
|
"learning_rate": 3.277512145629502e-06, |
|
"loss": 1.0071, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 0.7803674340248108, |
|
"learning_rate": 3.10816877867931e-06, |
|
"loss": 1.1, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 0.92, |
|
"grad_norm": 1.2517226934432983, |
|
"learning_rate": 2.943248028939838e-06, |
|
"loss": 1.2342, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.6347147822380066, |
|
"learning_rate": 2.7827574242009437e-06, |
|
"loss": 1.1261, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.4368477165699005, |
|
"learning_rate": 2.626704290039017e-06, |
|
"loss": 1.1669, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.7963775396347046, |
|
"learning_rate": 2.4750957494826033e-06, |
|
"loss": 1.3863, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 1.0838688611984253, |
|
"learning_rate": 2.327938722687184e-06, |
|
"loss": 1.1262, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 0.93, |
|
"grad_norm": 0.48085644841194153, |
|
"learning_rate": 2.1852399266194314e-06, |
|
"loss": 1.7383, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5789294242858887, |
|
"learning_rate": 2.0470058747505516e-06, |
|
"loss": 1.2789, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 1.7602370977401733, |
|
"learning_rate": 1.9132428767589473e-06, |
|
"loss": 2.2284, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.6166039109230042, |
|
"learning_rate": 1.7839570382422787e-06, |
|
"loss": 1.1418, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 0.94, |
|
"grad_norm": 0.5087316036224365, |
|
"learning_rate": 1.6591542604387445e-06, |
|
"loss": 0.7367, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 4.2447404861450195, |
|
"learning_rate": 1.538840239957684e-06, |
|
"loss": 2.1703, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.3775041103363037, |
|
"learning_rate": 1.4230204685196203e-06, |
|
"loss": 1.4201, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 1.3511093854904175, |
|
"learning_rate": 1.3117002327055927e-06, |
|
"loss": 1.5358, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.6467747092247009, |
|
"learning_rate": 1.20488461371574e-06, |
|
"loss": 1.0678, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 0.95, |
|
"grad_norm": 0.7409128546714783, |
|
"learning_rate": 1.102578487137529e-06, |
|
"loss": 1.5819, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.4174960851669312, |
|
"learning_rate": 1.004786522723089e-06, |
|
"loss": 1.3788, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.54808509349823, |
|
"learning_rate": 9.11513184176116e-07, |
|
"loss": 1.1524, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 0.3604092299938202, |
|
"learning_rate": 8.227627289481121e-07, |
|
"loss": 1.246, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.0738741159439087, |
|
"learning_rate": 7.385392080440534e-07, |
|
"loss": 0.9884, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 0.96, |
|
"grad_norm": 1.2032325267791748, |
|
"learning_rate": 6.588464658374815e-07, |
|
"loss": 2.1638, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.5337828993797302, |
|
"learning_rate": 5.836881398950667e-07, |
|
"loss": 1.1283, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.2100324630737305, |
|
"learning_rate": 5.130676608104845e-07, |
|
"loss": 1.5026, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 1.029868483543396, |
|
"learning_rate": 4.469882520479196e-07, |
|
"loss": 1.5626, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.9376425743103027, |
|
"learning_rate": 3.8545292979486057e-07, |
|
"loss": 1.4563, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 0.97, |
|
"grad_norm": 0.3371017575263977, |
|
"learning_rate": 3.2846450282447703e-07, |
|
"loss": 1.0665, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.8099949359893799, |
|
"learning_rate": 2.760255723673888e-07, |
|
"loss": 1.0242, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.5782180428504944, |
|
"learning_rate": 2.2813853199292746e-07, |
|
"loss": 1.1465, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 0.8454908132553101, |
|
"learning_rate": 1.8480556749991274e-07, |
|
"loss": 1.0718, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 0.98, |
|
"grad_norm": 1.310767650604248, |
|
"learning_rate": 1.460286568168212e-07, |
|
"loss": 1.0671, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.9640721678733826, |
|
"learning_rate": 1.1180956991160286e-07, |
|
"loss": 0.9314, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 1.6716411113739014, |
|
"learning_rate": 8.214986871076802e-08, |
|
"loss": 1.202, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.5227134823799133, |
|
"learning_rate": 5.705090702819993e-08, |
|
"loss": 0.6217, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.6409493684768677, |
|
"learning_rate": 3.6513830503293045e-08, |
|
"loss": 1.3795, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 0.99, |
|
"grad_norm": 0.9884561896324158, |
|
"learning_rate": 2.0539576548717076e-08, |
|
"loss": 1.0896, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.44622230529785156, |
|
"learning_rate": 9.128874307551272e-09, |
|
"loss": 1.4132, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.5369505286216736, |
|
"learning_rate": 2.282244620088747e-09, |
|
"loss": 0.7652, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.6095150709152222, |
|
"learning_rate": 0.0, |
|
"loss": 1.8662, |
|
"step": 475 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 475, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 500, |
|
"total_flos": 777748162805760.0, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|